In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import os
import gc
import configparser
from datetime import datetime
import time

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import fbeta_score, precision_score, recall_score

from utils.f2thresholdfinder import *
from utils.loaderjpg import *
from utils.generator import *
from utils.custommetrics import *
from utils.visualization import *
from utils.predictorjpg import *
from utils.file import *

from pretrained.vgg16 import *
from pretrained.resnet50 import *

Using Theano backend.
Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 1060 6GB (0000:01:00.0)


In [2]:
# --- Run configuration -------------------------------------------------------
config_file = 'cfg/default.cfg'

print('reading configurations from config file: {}'.format(config_file))

settings = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of
# hitting a confusing NoSectionError on the settings.get() call below.
if not settings.read(config_file):
    raise IOError('could not read config file: {}'.format(config_file))
# Paths below are built as `data_dir + '<relative path>'`, so the configured
# value is expected to end with a path separator.
data_dir = settings.get('data', 'data_dir')

# image size passed to the loading / submission helpers
rescaled_dim = 224

#model_name = 'vgg16'
model_name = 'resnet50'

# vgg16 -> False, every other model -> True (same outcome as the former
# assign-then-if/else sequence, without the redundant first assignment).
train_top_classifer = model_name != 'vgg16'

# timestamp used to tag the model / figure / submission files of this run
file_uuid = time.strftime("%Y%m%d-%H%M%S")

# forwarded to fit_generator(verbose=...)
verbose_level = 0

labels = ['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 'agriculture', 'road', 'selective_logging']

reading configurations from config file: cfg/default.cfg




In [3]:
# Read the training label sheet, then let the project loader build the image
# and label arrays at the configured size.
train_csv_path = data_dir + 'train_v2.csv'
df_train = pd.read_csv(train_csv_path)
x_train, y_train = load_training_set(df_train, rescaled_dim)

# show the array shapes as a quick sanity check
for loaded_array in (x_train, y_train):
    print(loaded_array.shape)

(40479L, 224L, 224L, 3L)
(40479L, 17L)


In [4]:
# subtract the mean
# Reference: https://gist.github.com/baraldilorenzo/07d7802847aaad0a35d3
# BGR mean values [103.94, 116.78, 123.68] should be subtracted before feeding into the model

#x_train[:,:,:,0] -= 104
#x_train[:,:,:,1] -= 117
#x_train[:,:,:,2] -= 124

In [6]:
# Hold out the trailing 10% of the samples (in load order, no shuffling) as the
# validation set.
# NOTE: this cell rebinds x_train / y_train, so running it a second time
# without reloading the data splits the already-reduced training set again.
number_of_samples = x_train.shape[0]
split = int(number_of_samples * 0.90)
number_validations = number_of_samples - split

# take the validation slices first, while x_train / y_train are still complete
x_valid, y_valid = x_train[split:], y_train[split:]
x_train, y_train = x_train[:split], y_train[:split]

In [8]:
# Build the network selected by `model_name`; anything else is rejected up front.
if model_name not in ('vgg16', 'resnet50'):
    raise ValueError('Unsupported Model : {}'.format(model_name))

if model_name == 'vgg16':
    frozen_layers = 11 # Tried: 25
    vgg16_top_weights = data_dir + 'bottleneck/vgg16/bottleneck_fc_model.h5'
    model = vgg16_model_custom_fc_layers(vgg16_top_weights,
                                         channel=3,
                                         num_classes=17,
                                         num_frozen_layers=frozen_layers,
                                         learning_rate=0.005)
else:
    frozen_layers = 175 # 175 : classifier top layers only
    model = resnet50_model_custom_top(num_classes=17, num_frozen_layers=frozen_layers)

# # Step 2
# frozen_layers = 18
# # load_model AssertionError possibly due to : https://github.com/fchollet/keras/pull/4338/files
# model = custom_vgg16_model(data_dir + 'bottleneck/vgg16/frozen25_20170629-222829.h5',
#                            num_frozen_layers=frozen_layers)

#print(model.summary())
# List every layer with its trainable flag; layers without the attribute are
# reported as not trainable.
for layer_index, current_layer in enumerate(model.layers):
    print(layer_index, current_layer.name, getattr(current_layer, 'trainable', False))

(0, 'input_1', False)
(1, 'zeropadding2d_1', False)
(2, 'conv1', False)
(3, 'bn_conv1', False)
(4, 'activation_1', False)
(5, 'maxpooling2d_1', False)
(6, 'res2a_branch2a', False)
(7, 'bn2a_branch2a', False)
(8, 'activation_2', False)
(9, 'res2a_branch2b', False)
(10, 'bn2a_branch2b', False)
(11, 'activation_3', False)
(12, 'res2a_branch2c', False)
(13, 'res2a_branch1', False)
(14, 'bn2a_branch2c', False)
(15, 'bn2a_branch1', False)
(16, 'merge_1', False)
(17, 'activation_4', False)
(18, 'res2b_branch2a', False)
(19, 'bn2b_branch2a', False)
(20, 'activation_5', False)
(21, 'res2b_branch2b', False)
(22, 'bn2b_branch2b', False)
(23, 'activation_6', False)
(24, 'res2b_branch2c', False)
(25, 'bn2b_branch2c', False)
(26, 'merge_2', False)
(27, 'activation_7', False)
(28, 'res2c_branch2a', False)
(29, 'bn2c_branch2a', False)
(30, 'activation_8', False)
(31, 'res2c_branch2b', False)
(32, 'bn2c_branch2b', False)
(33, 'activation_9', False)
(34, 'res2c_branch2c', False)
(35, 'bn2c_branch2c', Fa

In [9]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 3, 224, 224)   0                                            
____________________________________________________________________________________________________
zeropadding2d_1 (ZeroPadding2D)  (None, 3, 230, 230)   0           input_1[0][0]                    
____________________________________________________________________________________________________
conv1 (Convolution2D)            (None, 64, 112, 112)  9472        zeropadding2d_1[0][0]            
____________________________________________________________________________________________________
bn_conv1 (BatchNormalization)    (None, 64, 112, 112)  256         conv1[0][0]                      
___________________________________________________________________________________________

In [10]:
# Note: by default the threshold is fixed (not optimized per label)
def compute_f2_measure(l_model, x_data, y_data, threshold=None, batch_size=64):
    """Return the F2 score of `l_model` on (`x_data`, `y_data`).

    Predictions are produced through CustomImgGenerator.validationGen, turned
    into 0/1 labels by comparing against `threshold`, and scored with
    sklearn's fbeta_score(beta=2).

    Parameters
    ----------
    l_model : model object exposing predict_generator(generator, num_samples)
    x_data, y_data : inputs and ground-truth labels; x_data.shape[0] is used
        as the number of samples to predict.
    threshold : optional cut-off applied to the raw predictions (anything that
        can be compared against the prediction array, e.g. a scalar or one
        value per label). When None, the notebook-level `classifier_threshold`
        is used, which was the only behaviour before this parameter existed.
    batch_size : batch size handed to the validation generator (default 64,
        the previously hard-coded value).

    The averaging mode is still read from the notebook-level
    `score_averaging_method`.
    """
    if threshold is None:
        # fall back to the global so existing three-argument callers are unaffected
        threshold = classifier_threshold

    custom_val_gen = CustomImgGenerator()
    val_generator_f2 = custom_val_gen.validationGen(x_data, y_data, batch_size)
    raw_pred = l_model.predict_generator(val_generator_f2, x_data.shape[0]) # VGG16 GpuArrayException: out of memory
    thresholded_pred = (np.array(raw_pred) > threshold).astype(int)
    l_f2_score = fbeta_score(y_data, thresholded_pred, beta=2, average=score_averaging_method)
    return l_f2_score

class F2_Validation(k.callbacks.Callback):
    """Callback that records an F2 score on a small validation subset after every epoch.

    The scores are collected in `self.f2_measures`, which is reset each time
    training begins.
    """

    def __init__(self, x_data, y_data):
        """Keep at most the first 640 samples of `x_data` / `y_data` for scoring."""
        super(F2_Validation, self).__init__()
        # Ran into MemoryError when training DAGG_2 with 4 channels at epoch 50.
        # To reduce memory usage, limit the number of samples to an arbitrary small number
        validation_num_samples = min(640, x_data.shape[0])
        self.x_data = x_data[:validation_num_samples]
        self.y_data = y_data[:validation_num_samples]

    # `logs` defaults to None instead of a shared mutable {}; the argument is
    # not read by either hook.
    def on_train_begin(self, logs=None):
        """Start a fresh list of per-epoch F2 scores."""
        self.f2_measures = []

    def on_epoch_end(self, epoch, logs=None):
        """Append the F2 score of the current model on the stored subset."""
        self.f2_measures.append(compute_f2_measure(self.model, self.x_data, self.y_data))

In [11]:
# Early stopping guards against overfitting on the training data: watch
# val_loss with a patience of 3 epochs.
early_stop = EarlyStopping(monitor='val_loss',
                           min_delta=0,
                           patience=3,
                           mode='auto',
                           verbose=0)

# Checkpoint file name encodes the model, the number of frozen layers and the run id.
checkpoint_filename = 'bottleneck/{}/frozen{}_{}.h5'.format(model_name, frozen_layers, file_uuid)
model_filepath = data_dir + checkpoint_filename

# Keep only the best model (by val_loss) on disk, not the latest epoch's model.
checkpoint = ModelCheckpoint(model_filepath,
                             monitor='val_loss',
                             save_best_only=True,
                             verbose=1)

In [12]:
# Batch size shared by the training and validation generators below.
batch_size = 32

# Only referenced by commented-out code further down in this notebook.
custom_gen = CustomImgGenerator()
#f2_score_val = F2_Validation(x_valid, y_valid)

# Separate generator objects for the training and validation streams that are
# later passed to fit_generator.
train_datagen = BottleNeckImgGenerator()
train_gen = train_datagen.trainGen(x_train, y_train, batch_size)
valid_datagen = BottleNeckImgGenerator()
valid_gen = valid_datagen.validationGen(x_valid, y_valid, batch_size)

In [13]:
# Accumulated Keras history (metric name -> list of per-epoch values) across
# all learning-rate stages of this cell.
history = {}

# train the top classifer only from the full model
if train_top_classifer:
    training_start_time = datetime.now()

    # one fit_generator run per (learning rate, max epochs) pair
    learning_rate_schedule = [0.001, 0.0001]
    max_epoch_per_learning_rate = [7, 10]
    num_samples_per_epoch = x_train.shape[0]

    for learn_rate, epochs in zip(learning_rate_schedule, max_epoch_per_learning_rate):
        print('learning rate :{}'.format(learn_rate))
        # update the learning rate of the already-compiled optimizer in place
        model.optimizer.lr.set_value(learn_rate)

        tmp_history = model.fit_generator(train_gen,
                        samples_per_epoch=num_samples_per_epoch,
                        nb_epoch=epochs,
                        validation_data=valid_gen,
                        nb_val_samples=number_validations,
                        verbose=verbose_level,
                        callbacks=[early_stop, checkpoint])

        # The loop variables must not be called `k`: that name is used at
        # notebook level as a module alias (`k.callbacks.Callback`), and
        # rebinding it here broke re-running the callback cell afterwards.
        # .items() works on both Python 2 and 3, unlike .iteritems().
        for metric_name, metric_values in tmp_history.history.items():
            history.setdefault(metric_name, []).extend(metric_values)

    time_spent_trianing = datetime.now() - training_start_time
    print('top classifier layers training complete. Time taken: {}'.format(time_spent_trianing))


learning rate :0.001
Epoch 00000: val_loss improved from inf to 0.11627, saving model to D:/Downloads/amazon/bottleneck/resnet50/frozen175_20170702-140130.h5
learning rate :0.0001
Epoch 00000: val_loss improved from 0.11627 to 0.11223, saving model to D:/Downloads/amazon/bottleneck/resnet50/frozen175_20170702-140130.h5
top classifier layers training complete. Time taken: 0:10:01.194000


In [15]:
#TODO put this in a utils script
def normalize_images(images):
    """Cast `images` to float16, apply subtract_mean, and move axis 3 to position 1.

    subtract_mean is called for its effect on the float16 copy; its return
    value is ignored (presumably it works in place - confirm in utils).
    """
    as_float16 = images.astype(np.float16)
    subtract_mean(as_float16)
    # theano expects channels come before dims: reorder axes to (0, 3, 1, 2)
    return as_float16.transpose(0, 3, 1, 2)

In [17]:

if train_top_classifer:
    # Score the top-classifier stage on the validation set using one fixed
    # cut-off of 0.2 for every label.
    valid_datagen = BottleNeckImgGenerator()
    valid_gen = valid_datagen.validationGen(x_valid, y_valid, batch_size)

    p_valid = model.predict_generator(valid_gen, x_valid.shape[0])
    above_cutoff = np.array(p_valid) > 0.2
    y_predictions = above_cutoff.astype(int)

    # all three scores are sample-averaged
    precision_s = precision_score(y_valid, y_predictions, average='samples')
    print('>>>> Overall precision score over validation set ', precision_s)

    recall_s = recall_score(y_valid, y_predictions, average='samples')
    print('>>>> Overall recall score over validation set ', recall_s)

    f2_score = fbeta_score(y_valid, y_predictions, average='samples', beta=2)
    print('>>>> Overall F2 score over validation set ', f2_score)

('>>>> Overall precision score over validation set ', 0.83815366396888136)
('>>>> Overall recall score over validation set ', 0.93734413231695846)
('>>>> Overall F2 score over validation set ', 0.90345309239206284)


In [18]:
# Second stage: lower the number of frozen layers so more of the network is trained.

if train_top_classifer:
    # will loading the best model from Checkpoint improve performance?
    frozen_layers = 164
    model = freeze_layers(model, num_frozen_layers=frozen_layers)

    # New checkpoint target for this stage; as before, only the best model
    # (by val_loss) is saved, not the latest epoch's model.
    stage_filename = 'bottleneck/{}/frozen{}_{}.h5'.format(model_name, frozen_layers, file_uuid)
    model_filepath = data_dir + stage_filename
    checkpoint = ModelCheckpoint(model_filepath,
                                 monitor='val_loss',
                                 save_best_only=True,
                                 verbose=1)

    # List every layer with its trainable flag; layers without the attribute
    # are reported as not trainable.
    for layer_index, current_layer in enumerate(model.layers):
        print(layer_index, current_layer.name, getattr(current_layer, 'trainable', False))

(0, 'input_1', False)
(1, 'zeropadding2d_1', False)
(2, 'conv1', False)
(3, 'bn_conv1', False)
(4, 'activation_1', False)
(5, 'maxpooling2d_1', False)
(6, 'res2a_branch2a', False)
(7, 'bn2a_branch2a', False)
(8, 'activation_2', False)
(9, 'res2a_branch2b', False)
(10, 'bn2a_branch2b', False)
(11, 'activation_3', False)
(12, 'res2a_branch2c', False)
(13, 'res2a_branch1', False)
(14, 'bn2a_branch2c', False)
(15, 'bn2a_branch1', False)
(16, 'merge_1', False)
(17, 'activation_4', False)
(18, 'res2b_branch2a', False)
(19, 'bn2b_branch2a', False)
(20, 'activation_5', False)
(21, 'res2b_branch2b', False)
(22, 'bn2b_branch2b', False)
(23, 'activation_6', False)
(24, 'res2b_branch2c', False)
(25, 'bn2b_branch2c', False)
(26, 'merge_2', False)
(27, 'activation_7', False)
(28, 'res2c_branch2a', False)
(29, 'bn2c_branch2a', False)
(30, 'activation_8', False)
(31, 'res2c_branch2b', False)
(32, 'bn2c_branch2b', False)
(33, 'activation_9', False)
(34, 'res2c_branch2c', False)
(35, 'bn2c_branch2c', Fa

In [19]:
training_start_time = datetime.now()

# one fit_generator run per (learning rate, max epochs) pair; early stopping
# is expected to end each run well before the epoch cap
learning_rate_schedule = [0.001, 0.0002]
max_epoch_per_learning_rate = [100, 100]

# NOTE: this restarts the accumulated history, so values recorded by an
# earlier training cell are dropped from the plot data.
history = {}
f2_history = []

num_samples_per_epoch = x_train.shape[0]

for learn_rate, epochs in zip(learning_rate_schedule, max_epoch_per_learning_rate):
    print('learning rate :{}'.format(learn_rate))
    model.optimizer.lr.set_value(learn_rate) # https://github.com/fchollet/keras/issues/888

    # DividByZero BUG.
    # TODO split x_train into smaller batches for larger models
    #train_gen = custom_gen.trainGen(x_train, y_train, batch_size, scale=False)
    #valid_gen = custom_gen.validationGen(x_valid, y_valid, batch_size, scale=False)

    tmp_history = model.fit_generator(train_gen,
                        samples_per_epoch=num_samples_per_epoch,
                        nb_epoch=epochs,
                        validation_data=valid_gen,
                        nb_val_samples=number_validations,
                        verbose=verbose_level,
                        callbacks=[early_stop, checkpoint])

    # The loop variables must not be called `k`: that name is used at notebook
    # level as a module alias (`k.callbacks.Callback`), and rebinding it here
    # broke re-running the callback cell afterwards.
    # .items() works on both Python 2 and 3, unlike .iteritems().
    for metric_name, metric_values in tmp_history.history.items():
        history.setdefault(metric_name, []).extend(metric_values)

time_spent_trianing = datetime.now() - training_start_time
print('{} model training complete. Time taken: {}'.format(model_name, time_spent_trianing))

learning rate :0.001
Epoch 00000: val_loss improved from inf to 0.11198, saving model to D:/Downloads/amazon/bottleneck/resnet50/frozen164_20170702-140130.h5
learning rate :0.0002
Epoch 00000: val_loss improved from 0.11198 to 0.11183, saving model to D:/Downloads/amazon/bottleneck/resnet50/frozen164_20170702-140130.h5
resnet50 model training complete. Time taken: 0:10:25.288000


In [20]:
#model_filepath = data_dir + 'bottleneck/vgg16/frozen{}_{}_model.h5'.format(frozen_layers, 'manual_notbest_001')
#model.save(model_filepath)

In [21]:
# Predict on the validation set with a freshly built validation generator.
valid_datagen = BottleNeckImgGenerator()
valid_gen = valid_datagen.validationGen(x_valid, y_valid, batch_size)
p_valid = model.predict_generator(valid_gen, number_validations)

# NOTE(review): the thresholds are tuned on the same validation data that is
# scored below, so these numbers are likely optimistic.
optimized_thresholds = f2_optimized_thresholds(y_valid, p_valid)

passes_threshold = np.array(p_valid) > optimized_thresholds
y_predictions = passes_threshold.astype(int)

precision_s = precision_score(y_valid, y_predictions, average='samples')
print('>>>> Overall precision score over validation set ', precision_s)

recall_s = recall_score(y_valid, y_predictions, average='samples')
print('>>>> Overall recall score over validation set ', recall_s)

# F2 weights recall higher than precision; 'samples' averaging is what the
# contest evaluation uses.
f2_score = fbeta_score(y_valid, y_predictions, average='samples', beta=2)
print('>>>> Overall F2 score over validation set ', f2_score)

  'precision', 'predicted', average, warn_for)


label:0 threshold:0.88 score:0.870573637412
label:1 threshold:0.24 score:0.874631425717
label:2 threshold:0.26 score:0.874654341986
label:3 threshold:0.22 score:0.877748530552
label:4 threshold:0.21 score:0.880519950454
label:5 threshold:0.21 score:0.880543582376
label:6 threshold:0.19 score:0.886213823079
label:7 threshold:0.2 score:0.887133824277
label:8 threshold:0.21 score:0.891908324863
label:9 threshold:0.19 score:0.894887948477
label:10 threshold:0.11 score:0.895102473598
label:11 threshold:0.15 score:0.89689576404
label:12 threshold:0.32 score:0.897445318492
label:13 threshold:0.21 score:0.897515900085
label:14 threshold:0.21 score:0.902355689508
label:15 threshold:0.15 score:0.906250140735
label:16 threshold:0.29 score:0.906329763224
('>>>> Overall precision score over validation set ', 0.83687525487797221)
('>>>> Overall recall score over validation set ', 0.94147550818746473)
('>>>> Overall F2 score over validation set ', 0.90632976322427072)


In [22]:
# Pair each label name with the threshold found for it.
threshold_columns = {
    'label': labels,
    'optimized_threshold': optimized_thresholds,
}
threshold_df = pd.DataFrame(threshold_columns)
print(threshold_df)

                label  optimized_threshold
0          slash_burn                 0.88
1               clear                 0.24
2            blooming                 0.26
3             primary                 0.22
4              cloudy                 0.21
5   conventional_mine                 0.21
6               water                 0.19
7                haze                 0.20
8         cultivation                 0.21
9       partly_cloudy                 0.19
10     artisinal_mine                 0.11
11         habitation                 0.15
12        bare_ground                 0.32
13          blow_down                 0.21
14        agriculture                 0.21
15               road                 0.15
16  selective_logging                 0.29


In [23]:
# Per-label precision / recall / F2 for the thresholded validation predictions.
precision_l, recall_l, f2_score_l = calculate_stats_for_prediction(y_valid, y_predictions)

# Column order chosen for easier reading; it is passed to the constructor
# instead of re-selecting the columns afterwards.
stats_column_order = ['label', 'f2', 'recall', 'precision', 'true_sum', 'predict_sum']

prediction_stats_df = pd.DataFrame(
    {
        'label': labels,
        'f2': f2_score_l,
        'recall': recall_l,
        'precision': precision_l,
        # column sums: positives per label in the ground truth and in the predictions
        'true_sum': np.sum(y_valid, axis=0),
        'predict_sum': np.sum(y_predictions, axis=0),
    },
    columns=stats_column_order)
print(prediction_stats_df)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


                label    f2  recall  precision  true_sum  predict_sum
0          slash_burn  0.00    0.00       0.00        24            0
1               clear  0.97    0.99       0.91      2873         3132
2            blooming  0.09    0.07       0.22        27            9
3             primary  0.99    0.99       0.96      3759         3879
4              cloudy  0.85    0.94       0.60       199          314
5   conventional_mine  0.53    0.57       0.40         7           10
6               water  0.79    0.84       0.63       716          964
7                haze  0.73    0.83       0.50       262          440
8         cultivation  0.61    0.67       0.46       451          650
9       partly_cloudy  0.87    0.91       0.74       714          884
10     artisinal_mine  0.80    0.86       0.63        35           48
11         habitation  0.72    0.82       0.48       346          589
12        bare_ground  0.29    0.26       0.47       102           57
13          blow_dow

In [26]:
# Figures for each model go into their own sub-directory.
figures_dir = 'figures/' + model_name
makedirs(figures_dir)

# the plot file is tagged with this run's timestamp id
plot_file_path = '{}/stats_{}.png'.format(figures_dir, file_uuid)
trainHistoryPlot(plot_file_path, history, f2_history, prediction_stats_df)

In [29]:
# Sample-submission file supplied with the data set.
sample_submission_filepath = data_dir + 'sample_submission_v2.csv'

# Output file for this run, tagged with the model name and the run id.
submission_filename = 'submission_{}_{}.csv'.format(model_name, file_uuid)
real_submission_filepath = data_dir + 'my_submissions/' + submission_filename
print(real_submission_filepath)

D:/Downloads/amazon/my_submissions/submission_resnet50_20170702-140130.csv


In [36]:
# Generate the submission file: hands the trained model, the per-label
# thresholds, the image size, the label names, the sample-submission path and
# the output path to the project's make_submission helper.
make_submission(model,
                optimized_thresholds,
                rescaled_dim, 
                labels, 
                sample_submission_filepath,
                real_submission_filepath)

100%|███████████████████████████████████████████████████████████████████████████| 61191/61191 [01:04<00:00, 942.59it/s]


submission file generated: D:/Downloads/amazon/my_submissions/submission_resnet50_20170702-140130.csv
