Keras + CV

Thanks @anokas for the starter code at https://www.kaggle.com/anokas/planet-understanding-the-amazon-from-space/simple-keras-starter/

In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint

import cv2
from tqdm import tqdm

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import fbeta_score
import time

Pre-processing the train and test data

In [10]:
x_train = []
x_test = []
y_train = []

df_train = pd.read_csv('./input/train.csv')
df_test = pd.read_csv('./input/sample_submission.csv')

flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

labels = ['blow_down',
 'bare_ground',
 'conventional_mine',
 'blooming',
 'cultivation',
 'artisinal_mine',
 'haze',
 'primary',
 'slash_burn',
 'habitation',
 'clear',
 'road',
 'selective_logging',
 'partly_cloudy',
 'agriculture',
 'water',
 'cloudy']

label_map = {'agriculture': 14,
 'artisinal_mine': 5,
 'bare_ground': 1,
 'blooming': 3,
 'blow_down': 0,
 'clear': 10,
 'cloudy': 16,
 'conventional_mine': 2,
 'cultivation': 4,
 'habitation': 9,
 'haze': 6,
 'partly_cloudy': 13,
 'primary': 7,
 'road': 11,
 'selective_logging': 12,
 'slash_burn': 8,
 'water': 15}

for f, tags in tqdm(df_train.values, miniters=1000):
    img = cv2.imread('./input/train-jpg/{}.jpg'.format(f))
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    x_train.append(cv2.resize(img, (64, 64)))
    y_train.append(targets)

for f, tags in tqdm(df_test.values, miniters=1000):
    img = cv2.imread('./input/test-jpg/{}.jpg'.format(f))
    x_test.append(cv2.resize(img, (64, 64)))
    
y_train = np.array(y_train, np.uint8)
x_train = np.array(x_train, np.float32) / 255.
x_test  = np.array(x_test, np.float32) / 255.

print(x_train.shape)
print(y_train.shape)

100%|██████████| 40479/40479 [04:09<00:00, 161.92it/s]
100%|██████████| 40669/40669 [04:13<00:00, 160.64it/s]


(40479, 64, 64, 3)
(40479, 17)


Transpose the data if use Theano

In [None]:
#x_train = x_train.transpose((0, 3, 1, 2))
#x_test = x_test.transpose((0, 3, 1, 2))

Create n-folds cross-validation

In [11]:
# https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/32475
import numpy as np
from sklearn.metrics import fbeta_score

def optimise_f2_thresholds(y, p, verbose=True, resolution=100):
  def mf(x):
    p2 = np.zeros_like(p)
    for i in range(17):
      p2[:, i] = (p[:, i] > x[i]).astype(np.int)
    score = fbeta_score(y, p2, beta=2, average='samples')
    return score

  x = [0.2]*17
  for i in range(17):
    best_i2 = 0
    best_score = 0
    for i2 in range(resolution):
      i2 /= resolution
      x[i] = i2
      score = mf(x)
      if score > best_score:
        best_i2 = i2
        best_score = score
    x[i] = best_i2
    if verbose:
      print(i, best_i2, best_score)

  return x

In [12]:
from keras.layers.normalization import BatchNormalization

nfolds = 3

num_fold = 0
sum_score = 0

yfull_test = []
yfull_train =[]

kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=1)

for train_index, test_index in kf:
        start_time_model_fitting = time.time()
        
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        X_valid = x_train[test_index]
        Y_valid = y_train[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        
        kfold_weights_path = os.path.join('', 'weights_kfold_' + str(num_fold) + '.h5')
        
        model = Sequential()
        model.add(BatchNormalization(input_shape=(64, 64, 3)))
        model.add(Conv2D(8, 1, 1, activation='relu'))
        model.add(Conv2D(16, 2, 2, activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(32, 3, 3, activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Conv2D(64, 3, 3, activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(17, activation='sigmoid'))

        model.compile(loss='binary_crossentropy', 
                      optimizer='adam',
                      metrics=['accuracy'])
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=2, verbose=0),
            ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0)]
        
        model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=128,verbose=2, nb_epoch=10,callbacks=callbacks,
                  shuffle=True)
        
        if os.path.isfile(kfold_weights_path):
            model.load_weights(kfold_weights_path)
        
        p_valid = model.predict(X_valid, batch_size = 128, verbose=2)
        print(fbeta_score(Y_valid, np.array(p_valid) > 0.2, beta=2, average='samples'))
        print("Optimizing prediction threshold")
        print(optimise_f2_thresholds(Y_valid, p_valid))
        
        p_test = model.predict(x_train, batch_size = 128, verbose=2)
        yfull_train.append(p_test)
        
        p_test = model.predict(x_test, batch_size = 128, verbose=2)
        yfull_test.append(p_test)

Start KFold number 1 from 3
Split train:  26986 26986
Split valid:  13493 13493




Train on 26986 samples, validate on 13493 samples
Epoch 1/10
183s - loss: 0.2351 - acc: 0.9104 - val_loss: 0.2152 - val_acc: 0.9169
Epoch 2/10
166s - loss: 0.1897 - acc: 0.9256 - val_loss: 0.1757 - val_acc: 0.9336
Epoch 3/10
185s - loss: 0.1734 - acc: 0.9324 - val_loss: 0.1654 - val_acc: 0.9368
Epoch 4/10
189s - loss: 0.1642 - acc: 0.9358 - val_loss: 0.1565 - val_acc: 0.9391
Epoch 5/10
185s - loss: 0.1567 - acc: 0.9391 - val_loss: 0.1479 - val_acc: 0.9432
Epoch 6/10
194s - loss: 0.1517 - acc: 0.9409 - val_loss: 0.1433 - val_acc: 0.9450
Epoch 7/10
179s - loss: 0.1462 - acc: 0.9427 - val_loss: 0.1413 - val_acc: 0.9453
Epoch 8/10
173s - loss: 0.1438 - acc: 0.9439 - val_loss: 0.1357 - val_acc: 0.9471
Epoch 9/10
176s - loss: 0.1405 - acc: 0.9447 - val_loss: 0.1336 - val_acc: 0.9474
Epoch 10/10
170s - loss: 0.1387 - acc: 0.9458 - val_loss: 0.1320 - val_acc: 0.9476
0.881289939425
Optimizing prediction threshold
0 0.09 0.881289939425
1 0.1 0.881792108131
2 0.14 0.881792108131
3 0.08 0.88179618

  'precision', 'predicted', average, warn_for)


16 0.08 0.883868763052
[0.09, 0.1, 0.14, 0.08, 0.21, 0.2, 0.27, 0.22, 0.1, 0.2, 0.14, 0.23, 0.14, 0.11, 0.19, 0.22, 0.08]
Start KFold number 2 from 3
Split train:  26986 26986
Split valid:  13493 13493
Train on 26986 samples, validate on 13493 samples
Epoch 1/10
207s - loss: 0.2343 - acc: 0.9106 - val_loss: 0.2242 - val_acc: 0.9082
Epoch 2/10
179s - loss: 0.1901 - acc: 0.9262 - val_loss: 0.1807 - val_acc: 0.9295
Epoch 3/10
190s - loss: 0.1749 - acc: 0.9319 - val_loss: 0.1677 - val_acc: 0.9322
Epoch 4/10
198s - loss: 0.1658 - acc: 0.9358 - val_loss: 0.1537 - val_acc: 0.9393
Epoch 5/10
181s - loss: 0.1573 - acc: 0.9388 - val_loss: 0.1500 - val_acc: 0.9398
Epoch 6/10
213s - loss: 0.1543 - acc: 0.9395 - val_loss: 0.1435 - val_acc: 0.9422
Epoch 7/10
215s - loss: 0.1495 - acc: 0.9415 - val_loss: 0.1408 - val_acc: 0.9442
Epoch 8/10
222s - loss: 0.1457 - acc: 0.9429 - val_loss: 0.1368 - val_acc: 0.9454
Epoch 9/10
214s - loss: 0.1449 - acc: 0.9432 - val_loss: 0.1359 - val_acc: 0.9464
Epoch 10/1

Averaging the prediction from each fold

In [13]:
result = np.array(yfull_test[0])
for i in range(1, nfolds):
    result += np.array(yfull_test[i])
result /= nfolds
result = pd.DataFrame(result, columns = labels)
result

Unnamed: 0,blow_down,bare_ground,conventional_mine,blooming,cultivation,artisinal_mine,haze,primary,slash_burn,habitation,clear,road,selective_logging,partly_cloudy,agriculture,water,cloudy
0,2.478743e-03,0.001692,0.000069,1.266691e-02,0.018855,7.268426e-05,0.004101,0.999649,0.000841,0.002808,0.980633,0.016790,4.652178e-03,0.007742,0.027856,0.038720,6.111130e-04
1,9.629046e-03,0.003728,0.000130,4.308486e-02,0.058522,2.176049e-04,0.000981,0.999894,0.002572,0.007717,0.986431,0.041556,2.863836e-02,0.024408,0.056382,0.069617,1.399609e-04
2,2.334380e-05,0.000427,0.000007,8.183181e-06,0.034296,2.433891e-06,0.000019,0.999507,0.000149,0.001640,0.000369,0.027132,2.803963e-06,0.992888,0.081290,0.041145,1.046471e-03
3,1.138566e-02,0.006968,0.000213,4.030065e-02,0.085808,4.103550e-04,0.002506,0.999700,0.004448,0.011470,0.943470,0.049940,3.101859e-02,0.068006,0.084585,0.090162,3.798110e-04
4,1.087785e-05,0.001010,0.000112,3.569243e-06,0.030711,2.783286e-05,0.000427,0.838189,0.000154,0.005646,0.000292,0.071725,1.058586e-06,0.920085,0.123395,0.079657,1.961616e-01
5,5.208685e-03,0.001416,0.000079,3.120751e-02,0.014662,1.117924e-04,0.001156,0.999848,0.000919,0.002611,0.993369,0.015397,1.046067e-02,0.005291,0.016214,0.027779,2.664905e-04
6,6.018067e-03,0.038735,0.002695,5.011777e-03,0.277943,3.430425e-03,0.091940,0.990401,0.022110,0.157809,0.538266,0.322887,1.003304e-02,0.351704,0.582758,0.313010,6.599638e-03
7,4.449791e-08,0.036233,0.002614,1.201757e-08,0.011494,3.714860e-02,0.007966,0.817683,0.000005,0.916558,0.993355,0.973756,6.522833e-05,0.000469,0.268333,0.206005,4.193643e-07
8,4.707221e-03,0.002233,0.000103,2.423514e-02,0.023034,1.356843e-04,0.002218,0.999715,0.001276,0.003750,0.985624,0.020848,9.975935e-03,0.009872,0.028778,0.040249,4.545439e-04
9,4.982810e-04,0.005471,0.000548,6.987713e-04,0.061033,1.347629e-04,0.677785,0.970258,0.001024,0.017578,0.209109,0.087055,6.874840e-04,0.060300,0.171922,0.139570,2.845017e-02


Output prediction for submission

In [17]:
from tqdm import tqdm
#thres = [0.07, 0.17, 0.2, 0.04, 0.23, 0.33, 0.24, 0.22, 0.1, 0.19, 0.23, 0.24, 0.12, 0.14, 0.25, 0.26, 0.16]
thres = [0.0475, 0.2225, 0.0875, 0.19, 0.265, 0.1375, 0.1925, 0.2625, 0.085, 0.2175, 0.2375, 0.21, 0.14, 0.1625, 0.245, 0.205, 0.12]
preds = []
for i in tqdm(range(result.shape[0]), miniters=1000):
    a = result.ix[[i]]
    a = a.apply(lambda x: x > thres, axis=1)
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))

100%|██████████| 40669/40669 [01:47<00:00, 377.70it/s]


In [18]:
df_test['tags'] = preds
df_test

Unnamed: 0,image_name,tags
0,test_0,primary clear
1,test_1,primary clear
2,test_2,primary partly_cloudy
3,test_3,primary clear
4,test_4,primary partly_cloudy cloudy
5,test_5,primary clear
6,test_6,cultivation primary clear road partly_cloudy a...
7,test_7,primary habitation clear road agriculture water
8,test_8,primary clear
9,test_9,haze primary


In [19]:
df_test.to_csv('submission_1.csv', index=False)