In [162]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Convolution2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model
from keras.callbacks import EarlyStopping

from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm import tqdm

from utils.file import makedirs
from utils.recorder import record_model_medata, record_model_scores
from utils.loader import load_training_set, load_test_set
from utils.imagegen import *
from utils.models import *

from datetime import datetime
import time
import configparser
import json
import sys

In [165]:
timestr = time.strftime("%Y%m%d-%H%M%S")
start_time = datetime.now()

In [166]:
config_file = 'cfg/default.cfg'

if len(sys.argv) > 1 and '.cfg' in sys.argv[1]:
    config_file = sys.argv[1]

print('reading configurations from config file: {}'.format(config_file))

settings = configparser.ConfigParser()
settings.read(config_file)
data_dir = settings.get('data', 'data_dir')

df_train = pd.read_csv(data_dir + 'train_v2.csv')
model_filename = 'aggregate_model_'+ timestr +'.h5'
model_filepath = data_dir + 'models/' + model_filename
sample_submission_filepath = data_dir + 'sample_submission_v2.csv'
number_of_samples = len(df_train.index)
print('total number of training samples: {}'.format(number_of_samples))

# WARNING: keras allow either 1, 3, or 4 channels per pixel. Other numbers not allowed.
data_mask_label = np.array(['R', 'G', 'B', 'NDVI', 'NDWI', 'NIR'])
#print(settings.get('data', 'data_mask'))
data_mask_list = json.loads(settings.get('data', 'data_mask'))

data_mask = ma.make_mask(data_mask_list)
print(data_mask)

num_channels = np.sum(data_mask)
need_norm_stats = False

model_id = settings.get('model', 'model_id')
print('model: {}'.format(model_id))
num_samples_per_epoch = number_of_samples * 3 # TODO understand the implications of this
rescaled_dim = 64
number_epoch = settings.getint('model', 'number_epoch')  # TODO Keep increasing
batch_size = settings.getint('model', 'batch_size') # depends on model size and GPU memory
print('batch size: {}'.format(batch_size))
classifier_threshold = 0.2

split = int(number_of_samples * 0.80)  # TODO we may want to increase to 0.90 eventually
number_validations = number_of_samples - split

reading configurations from config file: cfg/default.cfg




total number of training samples: 40479
[ True False False  True  True False]
model: JAGG_1
batch size: 512


In [156]:
flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

print(labels)
print(len(labels))

['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 'agriculture', 'road', 'selective_logging']
17


In [157]:
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

In [158]:
x_train, y_train = load_training_set(df_train, rescaled_dim)
print(x_train.shape)
print(y_train.shape)

(40479L, 64L, 64L, 6L)
(40479L, 17L)


In [159]:
x_train = x_train[:, :, :, data_mask]

x_train = x_train.transpose(0,3,1,2)  # https://github.com/fchollet/keras/issues/2681
print(x_train.shape)

(40479L, 3L, 64L, 64L)


In [121]:
# shuffle the samples because 
# 1) the original samples may not be randomized & 
# 2) to avoid the possiblility of overfitting the validation data while we tune the model
from sklearn.utils import shuffle
x_train, y_train = shuffle(x_train, y_train, random_state=0)

x_train, x_valid, y_train, y_valid = x_train[:split], x_train[split:], y_train[:split], y_train[split:]

In [122]:
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)

(32383L, 3L, 64L, 64L)
(32383L, 17L)
(8096L, 3L, 64L, 64L)
(8096L, 17L)


In [123]:
image_generator = ScaledDown() # NormalizedByFeature() offers seemingly no improvement but cost 30 min more to run.

In [124]:
# this is the augmentation configuration we will use for training
# TODO augment with random rotations for rare classes
train_datagen = image_generator.getTrainGenenerator()

In [125]:
if (need_norm_stats):
    # need to compute internal stats like featurewise std and zca whitening
    train_datagen.fit(x_train)

In [126]:
train_generator = train_datagen.flow(
        x_train, 
        y_train, 
        batch_size=batch_size,
        shuffle=True) 

In [127]:
validation_datagen = image_generator.getValidationGenenerator()

In [128]:
# workaround to provide your own stats: 
# http://stackoverflow.com/questions/41855512/how-does-data-normalization-work-in-keras-during-prediction/43069409#43069409
if (need_norm_stats):
    # need to compute internal stats like featurewise std and zca whitening
    validation_datagen.fit(x_valid)

In [129]:
validation_generator = validation_datagen.flow(
        x_valid,
        y_valid,
        batch_size=batch_size,
        shuffle=False)

In [130]:
model = get_model(model_id, num_channels, rescaled_dim, rescaled_dim)

# TODO 
# Use custom loss function to optimize F2 score.
# https://github.com/fchollet/keras/issues/369
# https://github.com/fchollet/keras/blob/master/keras/losses.py
model.compile(loss='binary_crossentropy', # Is this the best loss function?
              optimizer='adam',
              metrics=['accuracy', 'recall', 'precision'])

In [131]:
# BUG when resuming training, the learning rate need to be decreased.
# let's load an existing trained model and continue training more epoch gives 0.01 improvement in LB score.
# model = load_model(data_dir + 'models/aggregate_model_20170507-124128.h5') # 0.86
# model = load_model(data_dir + 'models/aggregate_model_20170507-184232.h5') # 0.87
# model = load_model(data_dir + 'models/aggregate_model_20170511-133235.h5')
# model = load_model(data_dir + 'models/aggregate_model_20170515-062741.h5')
#number_epoch = 2

In [132]:
def compute_f2_measure(l_model):
    val_generator_f2 = validation_datagen.flow(
        x_valid,
        y_valid,
        batch_size=batch_size,
        shuffle=False)
    raw_pred = l_model.predict_generator(val_generator_f2, number_of_samples - split)
    thresholded_pred = (np.array(raw_pred) > classifier_threshold).astype(int)  # TODO how does a threshold affect metric?
    l_f2_score = fbeta_score(y_valid, thresholded_pred, beta=2, average='samples')
    return l_f2_score
    
class F2_Validation(k.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.f2_measures = []
    def on_epoch_end(self, epoch, logs={}):
        self.f2_measures.append(compute_f2_measure(self.model))

f2_score_val = F2_Validation()

In [133]:
# prevent overfitting on training data
early_stop = EarlyStopping(monitor='val_loss',patience=6, min_delta=0, verbose=0, mode='auto')  # TODO patience and min_delta

In [134]:
training_start_time = datetime.now()
# fits the model on batches with real-time data augmentation:
history = model.fit_generator(train_generator,
                    samples_per_epoch=num_samples_per_epoch,
                    nb_epoch=number_epoch,
                    validation_data=validation_generator,
                    nb_val_samples=number_validations,
                    callbacks=[f2_score_val, early_stop])

model.save(model_filepath)  # always save your model and weights after training or during training
time_spent_trianing = datetime.now() - training_start_time

print('model training complete')

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
model training complete


In [135]:
#np.set_printoptions(threshold='nan')

# use the validation data to compute some stats which tell us how the model is performing on the validation data set.
val_generator_score_board = validation_datagen.flow(
    x_valid,
    y_valid,
    batch_size=batch_size,
    shuffle=False)
p_valid = model.predict_generator(val_generator_score_board, number_validations)

print(y_valid)
print(p_valid)

y_predictions = (np.array(p_valid) > classifier_threshold).astype(int)  # TODO calculate this threshold unique per label
print(y_predictions)

# see how many positives samples per label for truth vs prediction
print(np.sum(y_valid, axis=0))
print(np.sum(y_predictions, axis=0))

# F2 score, which gives twice the weight to recall emphasising recall higher than precision
# 'samples' is what the evaluation criteria is for the contest
f2_score = fbeta_score(y_valid, y_predictions, beta=2, average='samples')
print('f2 score over validation set using samples averaging ' , f2_score)

filtered_data_mask_label = data_mask_label[data_mask]

record_model_scores(model_filepath, 
                    model_id, 
                    history, 
                    f2_score, 
                    time_spent_trianing, 
                    num_channels, 
                    np.array_str(filtered_data_mask_label))

[[0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 1 1 0]
 [0 0 0 ..., 1 0 0]]
[[ 0.07144569  0.6632278   0.03226416 ...,  0.45637     0.29341534
   0.06081685]
 [ 0.00186764  0.97120464  0.00131295 ...,  0.14553931  0.07041587
   0.0015439 ]
 [ 0.0311518   0.64984947  0.03588678 ...,  0.21825695  0.20191094
   0.02163785]
 ..., 
 [ 0.02562239  0.61320764  0.02034554 ...,  0.33294511  0.26330003
   0.0175603 ]
 [ 0.0805354   0.39007246  0.02406379 ...,  0.46833941  0.34671462
   0.0524301 ]
 [ 0.05245041  0.49983412  0.02244233 ...,  0.45227399  0.33804035
   0.03869794]]
[[0 1 0 ..., 1 1 0]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 1 1 0]
 ..., 
 [0 1 0 ..., 1 1 0]
 [0 1 0 ..., 1 1 0]
 [0 1 0 ..., 1 1 0]]
[  44 5684   66 7451  452   22 1513  527  874 1433   70  732  170   23 2449
 1635   74]
[   0 7708    0 8096  603    0 3148   41 1914 3669    0   24    1    0 5204
 4284    0]
('f2 score over validation set using samples averaging ', 0.64956261

In [136]:
figures_dir = 'figures/' + model_id
makedirs(figures_dir)

# list all data in history
print('training history stats:')
print(history.history.keys())

# summarize history for f2 score
fig = plt.figure(figsize=(15, 10))
subplot0 = fig.add_subplot(231)
if hasattr(f2_score_val, 'f2_measures'):
    subplot0.plot(f2_score_val.f2_measures)
subplot0.set_title('f2 score')
subplot0.set_ylabel('f2 score')
subplot0.set_xlabel('epoch')
subplot0.legend(['val'], loc='upper left')

# summarize history for recall
subplot3 = fig.add_subplot(232)
subplot3.plot(history.history['recall'])
subplot3.plot(history.history['val_recall'])
subplot3.set_title('recall')
subplot3.set_ylabel('recall')
subplot3.set_xlabel('epoch')
subplot3.legend(['train', 'val'], loc='upper left')

# summarize history for precision
subplot2 = fig.add_subplot(233)
subplot2.plot(history.history['precision'])
subplot2.plot(history.history['val_precision'])
subplot2.set_title('precision')
subplot2.set_ylabel('precision')
subplot2.set_xlabel('epoch')
subplot2.legend(['train', 'val'], loc='upper left')

# summarize history for accuracy
subplot1 = fig.add_subplot(234)
subplot1.plot(history.history['acc'])
subplot1.plot(history.history['val_acc'])
subplot1.set_title('accuracy')
subplot1.set_ylabel('accuracy')
subplot1.set_xlabel('epoch')
subplot1.legend(['train', 'val'], loc='upper left')

# summarize history for loss
subplot4 = fig.add_subplot(235)
subplot4.plot(history.history['loss'])
subplot4.plot(history.history['val_loss'])
subplot4.set_title('model loss')
subplot4.set_ylabel('loss')
subplot4.set_xlabel('epoch')
subplot4.legend(['train', 'val'], loc='upper left')

fig.savefig(figures_dir + '/stats_' + timestr + '.png')
#plt.show()

training history stats:
['acc', 'loss', 'recall', 'precision', 'val_acc', 'val_recall', 'val_precision', 'val_loss']


In [137]:
#model = load_model(model_filepath)

In [138]:
def f2score(truth, predict, label_index):
    return fbeta_score(truth[:, label_index], predict[:, label_index], beta=2, average='macro')
    
def precision_for_label_index2(truth, predict, label_index):
    return precision_score(truth[:, label_index], predict[:, label_index], average='macro')

def recall_for_label_index(truth, predict, label_index):
    return recall_score(truth[:, label_index], predict[:, label_index], average='macro')

In [139]:
# calculate f2 score for each label. find out which labels model is performing badly.
# print stats for each label
for x in range(0, len(labels)):
    f2_score = f2score(y_valid, y_predictions, x)
    precision_s = precision_for_label_index2(y_valid, y_predictions, x)
    recall_s = recall_for_label_index(y_valid, y_predictions, x)
    label = labels[x]
    print(label)
    print('    f2 score : ' , f2_score)
    print('    precision: ' , precision_s)
    print('    recall   : ' , recall_s)

slash_burn
('    f2 score : ', 0.49945414847161573)
('    precision: ', 0.49728260869565216)
('    recall   : ', 0.5)
clear
('    f2 score : ', 0.4680189813583463)
('    precision: ', 0.47889192644942458)
('    recall   : ', 0.49539541326909903)
blooming
('    f2 score : ', 0.49917943107221013)
('    precision: ', 0.49592391304347827)
('    recall   : ', 0.5)
primary
('    f2 score : ', 0.49149076517150397)
('    precision: ', 0.46016551383399207)
('    recall   : ', 0.5)
cloudy
('    f2 score : ', 0.48774353264263681)
('    precision: ', 0.49044459184570055)
('    recall   : ', 0.48750422565213969)
conventional_mine
('    f2 score : ', 0.49972766884531583)
('    precision: ', 0.49864130434782611)
('    recall   : ', 0.5)
water
('    f2 score : ', 0.48166560102301798)
('    precision: ', 0.50018040223149218)
('    recall   : ', 0.50028212627630764)
haze
('    f2 score : ', 0.49467206729887858)
('    precision: ', 0.50405898472392541)
('    recall   : ', 0.50033606067693981)
cultivation

In [25]:
# https://www.kaggle.com/paulorzp/find-best-f2-score-threshold
# TODO should this threshold be unique per label?
def estimate_f2score_threshold(p_valid, y_valid, try_all=False, verbose=False):
    best = 0
    best_score = -1
    totry = np.arange(0,1,0.005) if try_all is False else np.unique(p_valid)
    for t in totry:
        score = f2_score(y_valid, p_valid > t)
        if score > best_score:
            best_score = score
            best = t
    if verbose is True: 
        print('Best score: ', round(best_score, 5), ' @ threshold =', best)
    return best

In [26]:
testset_dir = data_dir + 'test'

df_test_list = pd.read_csv(sample_submission_filepath)

x_test = load_test_set(df_test_list, rescaled_dim)

x_test = x_test[:, :, :, data_mask]

In [27]:
#x_test = np.array(x_test, np.uint8)
print(x_test.shape)
x_test = x_test.transpose(0,3,1,2)  # https://github.com/fchollet/keras/issues/2681
print(x_test.shape)

(61191L, 64L, 64L, 3L)
(61191L, 3L, 64L, 64L)


In [28]:
# this is the configuration we will use for testing:
testset_datagen = image_generator.getTestGenenerator()

if (need_norm_stats):
    # need to compute internal stats like featurewise std and zca whitening
    testset_datagen.fit(x_test)

In [29]:
testset_generator = testset_datagen.flow(
    x_test,
    y=None,
    batch_size=batch_size,
    shuffle=False)
    
# ??? There may be a bug below that casues LB score to be 0.5-0.6
# testset_generator = testset_datagen.flow_from_directory(
#         testset_dir,
#         target_size=(rescaled_dim, rescaled_dim),
#         batch_size=batch_size,
#         class_mode=None,
#         shuffle=False)

In [30]:
#from keras.models import load_model
# model = load_model(data_dir + 'models/aggregate_model_20170507-184232.h5')
# model = load_model(data_dir + 'models/aggregate_model_20170509-215809.h5')
# model = load_model(data_dir + 'models/aggregate_model_20170511-001322.h5')
# model = load_model(data_dir + 'models/aggregate_model_20170511-150149.h5')

In [31]:
# run predictions on test set
testset_predict = model.predict_generator(testset_generator, x_test.shape[0]) # number of test samples

y_testset_predictions = (np.array(testset_predict) > classifier_threshold).astype(int)

result = pd.DataFrame(y_testset_predictions, columns = labels)

preds = []
for i in tqdm(range(result.shape[0]), miniters=1000):
    a = result.ix[[i]]
    a = a.transpose()
    a = a.loc[a[i] == 1]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))

df_test = pd.read_csv(sample_submission_filepath)
df_test['tags'] = preds
df_test
print('done')

100%|██████████████████████████████████████████████████████████████████████████| 61191/61191 [00:57<00:00, 1069.55it/s]


done


In [32]:
#test code
# nums_ones = np.ones((1, 17))
# nums_zeros = np.zeros((1, 17))
# haha = np.array([[1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0]])

# y_testset_predictions = haha
# result = pd.DataFrame(y_testset_predictions, columns = labels)

# preds = []
# for i in tqdm(range(result.shape[0]), miniters=1000):
#     a = result.ix[[i]]
#     #print(a)
#     a = a.transpose()
#     print(a)
#     a = a.loc[a[i] == 1]
#     print(a)
#     ' '.join(list(a.index))
#     preds.append(' '.join(list(a.index)))
    
# print(preds)

In [33]:
df_test.to_csv(data_dir + 'my_submissions/submission_' + timestr + '.csv', index=False)

In [34]:
total_exec_time = datetime.now() - start_time
print ('time spent to complete execution: {}'.format(total_exec_time))

time spent to complete execution: 0:26:16.470000
