In [1]:
import numpy as np
import pandas as pd
import os
import gc

import keras as k

import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm
from multiprocessing import Pool, cpu_count    

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [3]:
np.empty((1, 256, 256, 3), dtype=np.float16).nbytes * 40000 / (1024*1024)

15000.0

# 1. Data Preprocessing

In [4]:
img_height = 64
img_width  = 64

In [5]:
df_train = pd.read_csv('../input/train_v2.csv')

flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

Y = np.empty((df_train.shape[0], 17), dtype=np.uint8)
i=0
for tags in tqdm(df_train['tags'].values, miniters=1000):
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    Y[i,:] = targets
    i += 1
print(Y.shape)    

def get_images(names):
    i = 0
    X = np.empty((names.shape[0], img_height, img_width, 3), dtype=np.float16)
    for f in tqdm(names.values, miniters=1000):
        img = cv2.imread('../input/train-jpg/{}.jpg'.format(f))
        if img_height != img.shape[0]:
            img = cv2.resize(img, (img_height, img_width))
        X[i,:,:,:] = np.array(img, np.float16)
        i += 1
    return X / 255.

#multiply cpu_count if cannot fit memory
pool = Pool(cpu_count())
X = np.concatenate(pool.map(
    get_images, 
    np.array_split(df_train['image_name'], cpu_count())
))
pool.close()
pool.join()
print(X.shape)

100%|██████████| 40479/40479 [00:00<00:00, 269305.93it/s]


(40479, 17)


100%|██████████| 10120/10120 [00:29<00:00, 342.81it/s]
100%|██████████| 10120/10120 [00:29<00:00, 341.77it/s]
100%|██████████| 10119/10119 [00:29<00:00, 340.09it/s]
100%|██████████| 10120/10120 [00:29<00:00, 339.73it/s]


(40479, 64, 64, 3)


In [None]:
plt.figure(figsize=(12,8))
for i in range(6):
    plt.subplot(2,3,i+1)
    plt.imshow(plt.imread('../input/train-jpg/train_{}.jpg'.format(i)))
    plt.title(str(df_train.loc[i].tags))

In [29]:
inv_label_map

{0: 'blooming',
 1: 'partly_cloudy',
 2: 'blow_down',
 3: 'road',
 4: 'bare_ground',
 5: 'agriculture',
 6: 'habitation',
 7: 'slash_burn',
 8: 'artisinal_mine',
 9: 'cloudy',
 10: 'clear',
 11: 'conventional_mine',
 12: 'haze',
 13: 'water',
 14: 'cultivation',
 15: 'selective_logging',
 16: 'primary'}

# 2. Model Building

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
split = 35000
#x_train, x_valid, y_train, y_valid = X[:split], X[split:], Y[:split], Y[split:]
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)

In [20]:
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

In [21]:
def fbeta(y_true, y_pred):
    beta = 2
    threshold_shift = -0.3

    # just in case of hipster activation at the final layer
    y_pred = K.clip(y_pred, 0, 1)

    # shifting the prediction threshold from .5 if needed
    y_pred_bin = K.round(y_pred + threshold_shift)

    tp = K.sum(K.round(y_true * y_pred_bin), axis=1) + K.epsilon()
    fp = K.sum(K.round(K.clip(y_pred_bin - y_true, 0, 1)), axis=1)
    fn = K.sum(K.round(K.clip(y_true - y_pred, 0, 1)), axis=1)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    beta_squared = beta ** 2
    return K.mean((beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon()))

In [18]:
model = Sequential()
model.add(BatchNormalization(input_shape=(64, 64,3)))
model.add(Conv2D(32, kernel_size=(3, 3),padding='same', activation='relu'))
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, kernel_size=(3, 3),padding='same', activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(128, kernel_size=(3, 3),padding='same', activation='relu'))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(256, kernel_size=(3, 3),padding='same', activation='relu'))
model.add(Conv2D(256, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))

In [22]:
epochs_arr = [20, 5, 5]
learn_rates = [0.001, 0.0001, 0.00001]
kfold_weights_path = os.path.join('', 'weights.h5')

for learn_rate, epochs in zip(learn_rates, epochs_arr):
    opt  = Adam(lr=learn_rate)
    model.compile(loss='binary_crossentropy', # We NEED binary here, since categorical_crossentropy l1 norms the output before calculating loss.
                  optimizer=opt,
                  metrics=['accuracy', fbeta])
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=2, verbose=2),
        ModelCheckpoint(kfold_weights_path, monitor='val_loss', 
                        save_best_only=True, verbose=2)
    ]

    model.fit(x = x_train, y= y_train, validation_data=(x_valid, y_valid),
          batch_size=128,verbose=1, epochs=epochs,callbacks=callbacks,shuffle=True)

Train on 32383 samples, validate on 8096 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 00015: early stopping
Train on 32383 samples, validate on 8096 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 32383 samples, validate on 8096 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
from sklearn.metrics import fbeta_score, accuracy_score

In [24]:
if os.path.isfile(kfold_weights_path):
    model.load_weights(kfold_weights_path)

In [26]:
p_valid = model.predict(x_valid, batch_size=128, verbose=1)



In [45]:
print(fbeta_score(y_valid, np.array(p_valid) > 0.16, beta=2, average='samples'))

0.912473230662


In [28]:
score = fbeta_score(y_valid, np.array(p_valid) > 0.2, beta=2, average=None)
print('F2 test scores per tag:')
[(inv_label_map[l], score[l]) for l in score.argsort()[::-1]]

F2 test scores per tag:


  'precision', 'predicted', average, warn_for)


[('primary', 0.9904487247327759),
 ('clear', 0.97583707283396637),
 ('partly_cloudy', 0.92655908096280104),
 ('cloudy', 0.88376560999039389),
 ('agriculture', 0.8780657103192967),
 ('road', 0.81247754759908986),
 ('haze', 0.75657894736842113),
 ('water', 0.73514144822216454),
 ('artisinal_mine', 0.65934065934065922),
 ('habitation', 0.6344601412714429),
 ('cultivation', 0.63124617581072817),
 ('bare_ground', 0.27744982290436837),
 ('selective_logging', 0.24561403508771931),
 ('blooming', 0.019230769230769232),
 ('slash_burn', 0.0),
 ('conventional_mine', 0.0),
 ('blow_down', 0.0)]

In [44]:
blow_down = p_valid
for i in np.arange(0.1,0.2,0.01):
    print(i, '\t:', accuracy_score(y_valid, blow_down>i))
    #print(i, '\t:', fbeta_score(y_valid, blow_down > i, beta=2, average='samples'))
#pd.Series(p_valid[y_valid[2]==1][2]).describe()

0.1 	: 0.436511857708
0.11 	: 0.447504940711
0.12 	: 0.458992094862
0.13 	: 0.469738142292
0.14 	: 0.479866600791
0.15 	: 0.48851284585
0.16 	: 0.499505928854
0.17 	: 0.507164031621
0.18 	: 0.512475296443
0.19 	: 0.518404150198


In [None]:
thres = 0.16

# 3. Make Prediction

In [None]:
df_submission = pd.read_csv('../input/sample_submission_v2.csv')

def get_images(names):
    i = 0
    X = np.empty((names.shape[0], img_height, img_width, 3), dtype=np.float16)
    for f in tqdm(names.values, miniters=1000):
        img = cv2.imread('../input/test-jpg/{}.jpg'.format(f))
        if img_height != img.shape[0]:
            img = cv2.resize(img, (img_height, img_width))
        X[i,:,:,:] = np.array(img, np.float16)
        i += 1
    return X / 255.

pool = Pool(cpu_count())
X_submission = np.concatenate(pool.map(
    get_images, 
    np.array_split(df_submission['image_name'], cpu_count())
))
pool.close()
pool.join()
print(X_submission.shape)

In [None]:
predict = model.predict(X_submission, batch_size = 128, verbose=1)

In [None]:
result = pd.DataFrame(np.zeros((df_submission.shape[0], 17)))
result[1] = 1
result[10] = 1
preds = []
sorted_tags = pd.Series(inv_label_map)

for i in tqdm(range(result.shape[0]), miniters=1000):
    preds.append(' '.join(list(
        sorted_tags[np.where(result.loc[i] == 1)[0]]
    )))

In [None]:
df_submission['tags'] = preds
df_submission.to_csv('submission#.csv', index=False)