In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import os
import gc
import configparser
from datetime import datetime
import time

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import fbeta_score, precision_score, recall_score

from utils.f2thresholdfinder import *
from utils.loaderjpg import *
from utils.generator import *
from utils.custommetrics import *
from utils.visualization import *
from utils.predictorjpg import *
from utils.file import *

from pretrained.vgg16 import *
from pretrained.resnet50 import *

Using Theano backend.
Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 1060 6GB (0000:01:00.0)


In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
config_file = 'cfg/default.cfg'

print('reading configurations from config file: {}'.format(config_file))

settings = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail fast here so a wrong path does not surface
# later as a confusing "no section: data" error.
if not settings.read(config_file):
    raise IOError('could not read config file: {}'.format(config_file))
data_dir = settings.get('data', 'data_dir')

# Side length (pixels) images are rescaled to; matches the 224x224 model input
# shown by model.summary().
rescaled_dim = 224

# Which pretrained backbone to use; the model-construction cell accepts
# 'vgg16' or 'resnet50'.
model_name = 'vgg16'
#model_name = 'resnet50'

# When True, the training and validation-scoring cells below are executed.
train_top_classifer = True

# Timestamp used to give this run's checkpoint and figure files unique names.
file_uuid = time.strftime("%Y%m%d-%H%M%S")

# Passed as `verbose` to fit_generator (0 = no per-batch progress output).
verbose_level = 0

# The 17 target tags, one per output column.
# NOTE(review): spellings such as 'artisinal_mine' presumably mirror the tags in
# the dataset CSV -- confirm before "correcting" them.
labels = ['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 'agriculture', 'road', 'selective_logging']

reading configurations from config file: cfg/default.cfg




In [3]:
# Load the label table and the rescaled training images.
# os.path.join yields the same path as plain concatenation when data_dir ends
# with a separator, and still works when it does not.
df_train = pd.read_csv(os.path.join(data_dir, 'train_v2.csv'))
x_train, y_train = load_training_set(df_train, rescaled_dim)
print(x_train.shape)
print(y_train.shape)

(40479L, 224L, 224L, 3L)
(40479L, 17L)


In [4]:
# subtract the mean
# Reference: https://gist.github.com/baraldilorenzo/07d7802847aaad0a35d3
# BGR mean values [103.94, 116.78, 123.68] should be subtracted before feeding into the model

#x_train[:,:,:,0] -= 104
#x_train[:,:,:,1] -= 117
#x_train[:,:,:,2] -= 124

In [5]:
# Sequential (unshuffled) hold-out split: the first 90% of the samples stay in
# the training set and the remaining tail becomes the validation set.
number_of_samples = x_train.shape[0]
split = int(number_of_samples * 0.90)
number_validations = number_of_samples - split

# Carve off the validation tail first, because x_train / y_train are rebound to
# their own head slices immediately afterwards.
x_valid, y_valid = x_train[split:], y_train[split:]
x_train, y_train = x_train[:split], y_train[:split]

In [6]:
# Build the pretrained backbone with a custom classifier top. Layers below
# `frozen_layers` are frozen so that only the top layers are trained.
# The number of outputs is taken from `labels` so it cannot drift from the tag list.
if model_name == 'vgg16':
    frozen_layers = 19 # train top layers only
    model = vgg16_model_custom_top(num_classes=len(labels),
                                   num_frozen_layers=frozen_layers)
elif model_name == 'resnet50':
    frozen_layers = 175 # 175 : classifier top layers only
    model = resnet50_model_custom_top(num_classes=len(labels),
                                      num_frozen_layers=frozen_layers)
else:
    raise ValueError('Unsupported Model : {}'.format(model_name))

# summary() prints the table itself and returns None; wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

# check trainability of all layers (layers without the attribute are reported as False)
for i, layer in enumerate(model.layers):
    print(i, layer.name, getattr(layer, 'trainable', False))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 3, 224, 224)   0                                            
____________________________________________________________________________________________________
block1_conv1 (Convolution2D)     (None, 64, 224, 224)  1792        input_1[0][0]                    
____________________________________________________________________________________________________
block1_conv2 (Convolution2D)     (None, 64, 224, 224)  36928       block1_conv1[0][0]               
____________________________________________________________________________________________________
block1_pool (MaxPooling2D)       (None, 64, 112, 112)  0           block1_conv2[0][0]               
___________________________________________________________________________________________

In [7]:
# Show the layer table again. summary() prints by itself and returns None, so it
# is called directly instead of being wrapped in print().
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 3, 224, 224)   0                                            
____________________________________________________________________________________________________
block1_conv1 (Convolution2D)     (None, 64, 224, 224)  1792        input_1[0][0]                    
____________________________________________________________________________________________________
block1_conv2 (Convolution2D)     (None, 64, 224, 224)  36928       block1_conv1[0][0]               
____________________________________________________________________________________________________
block1_pool (MaxPooling2D)       (None, 64, 112, 112)  0           block1_conv2[0][0]               
___________________________________________________________________________________________

In [8]:
# early stopping prevents overfitting on training data: a fit run is halted once
# val_loss stops improving (patience=1, so almost no plateau is tolerated).
early_stop = EarlyStopping(monitor='val_loss',patience=1, min_delta=0, verbose=0, mode='auto')

# Checkpoint file for this run, keyed by backbone, freeze depth and run timestamp.
model_filepath = os.path.join(
    data_dir,
    'bottleneck/{}/frozen{}_{}.h5'.format(model_name, frozen_layers, file_uuid))

# Make sure the target directory exists up front; otherwise the first checkpoint
# save would fail only after a full training epoch has already been spent.
checkpoint_dir = os.path.dirname(model_filepath)
if checkpoint_dir and not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# save only the best model, not the latest epoch model.
checkpoint = ModelCheckpoint(model_filepath, monitor='val_loss', verbose=1, save_best_only=True)

In [9]:
# Mini-batch size shared by the training and validation generators.
batch_size = 64

# NOTE(review): custom_gen is not referenced by any other visible cell -- confirm
# whether it is still needed.
custom_gen = CustomImgGenerator()

# Batch stream over the training split, consumed by fit_generator.
train_datagen = BottleNeckImgGenerator()
train_gen = train_datagen.trainGen(x_train, y_train, batch_size)
# Batch stream over the held-out split, passed to fit_generator as validation data.
valid_datagen = BottleNeckImgGenerator()
valid_gen = valid_datagen.validationGen(x_valid, y_valid, batch_size)

In [10]:
# Per-metric training curves, accumulated across every learning-rate stage.
history = {}
f2_history = []

# train the top classifier only from the full model
if train_top_classifer:
    training_start_time = datetime.now()

    # Stage-wise schedule: each learning rate gets its own fit run with its own
    # epoch cap. Starting at 0.001 does not yield good val_loss.
    learning_rate_schedule = [0.0001, 0.00005]
    max_epoch_per_learning_rate = [70, 10]
    num_samples_per_epoch = x_train.shape[0]

    for learn_rate, epochs in zip(learning_rate_schedule, max_epoch_per_learning_rate):
        print('learning rate :{}'.format(learn_rate))
        # Update the optimizer's learning rate in place (set_value on the backend
        # variable), so the already-compiled model is reused between stages.
        model.optimizer.lr.set_value(learn_rate)

        tmp_history = model.fit_generator(train_gen,
                        samples_per_epoch=num_samples_per_epoch,
                        nb_epoch=epochs,
                        validation_data=valid_gen,
                        nb_val_samples=number_validations,
                        verbose=verbose_level,
                        callbacks=[early_stop, checkpoint])

        # Append this stage's curves to the running history.
        # .items() is used instead of the Python-2-only .iteritems() so the cell
        # runs unchanged on both Python 2 and Python 3.
        for k, v in tmp_history.history.items():
            history.setdefault(k, []).extend(v)

    time_spent_trianing = datetime.now() - training_start_time
    print('top classifier layers training complete. Time taken: {}'.format(time_spent_trianing))


learning rate :0.0001
Epoch 00000: val_loss improved from inf to 0.11933, saving model to D:/Downloads/amazon/bottleneck/vgg16/frozen19_20170704-132538.h5
Epoch 00001: val_loss improved from 0.11933 to 0.11524, saving model to D:/Downloads/amazon/bottleneck/vgg16/frozen19_20170704-132538.h5
Epoch 00002: val_loss improved from 0.11524 to 0.11293, saving model to D:/Downloads/amazon/bottleneck/vgg16/frozen19_20170704-132538.h5
Epoch 00003: val_loss improved from 0.11293 to 0.11057, saving model to D:/Downloads/amazon/bottleneck/vgg16/frozen19_20170704-132538.h5
Epoch 00004: val_loss did not improve
Epoch 00005: val_loss did not improve
learning rate :5e-05
Epoch 00000: val_loss did not improve
Epoch 00001: val_loss did not improve
Epoch 00002: val_loss did not improve
top classifier layers training complete. Time taken: 1:06:43.661000


In [11]:
# #TODO put this in a utils script
# def normalize_images(images):
#     # int8 to float16, subtract mean, transpose
#     x_result = images.astype(np.float16)
#     subtract_mean(x_result)
#     x_result = x_result.transpose(0,3,1,2) # theano expects channels come before dims
#     return x_result

In [12]:
if train_top_classifer:
    # Fresh validation generator so that prediction starts from the first batch.
    valid_datagen = BottleNeckImgGenerator()
    valid_gen = valid_datagen.validationGen(x_valid, y_valid, batch_size)

    p_valid = model.predict_generator(valid_gen, x_valid.shape[0])

    # Binarise the predicted probabilities with one fixed cut-off for every label.
    above_cutoff = np.array(p_valid) > 0.2
    y_predictions = above_cutoff.astype(int)

    # Sample-averaged scores over the validation split.
    precision_s = precision_score(y_valid, y_predictions, average='samples')
    recall_s = recall_score(y_valid, y_predictions, average='samples')
    f2_score = fbeta_score(y_valid, y_predictions, beta=2, average='samples')

    report = [
        ('>>>> Overall precision score over validation set ', precision_s),
        ('>>>> Overall recall score over validation set ', recall_s),
        ('>>>> Overall F2 score over validation set ', f2_score),
    ]
    for caption, value in report:
        print(caption , value)

('>>>> Overall precision score over validation set ', 0.85523911474998426)
('>>>> Overall recall score over validation set ', 0.93329833662714101)
('>>>> Overall F2 score over validation set ', 0.90642164315693663)


In [13]:
# Save the training-history figure under figures/<model_name>/, named after the
# freeze depth and the run timestamp.
figures_dir = 'figures/{}'.format(model_name)
makedirs(figures_dir)

plot_file_path = ''.join([
    figures_dir,
    '/stats_frozen{}_{}.png'.format(frozen_layers, file_uuid),
])
# No F2 history or per-label statistics exist at this stage, hence [] and None.
trainHistoryPlot(plot_file_path, history, [], None)

In [14]:
# # unfreeze more layers for second stage training

# if train_top_classifer:
#     # will loading the best model from Checkpoint improve performance?
#     frozen_layers = 164
#     model = freeze_layers(model, num_frozen_layers=frozen_layers)

#     model_filepath = data_dir + 'bottleneck/{}/frozen{}_{}.h5'.format(model_name, frozen_layers, file_uuid)
#     # save only the best model, not the latest epoch model.
#     checkpoint = ModelCheckpoint(model_filepath, monitor='val_loss', verbose=1, save_best_only=True)
#     # check trainability of all layers
#     for i, layer in enumerate(model.layers):
#        print(i, layer.name, layer.trainable if hasattr(layer, 'trainable') else False)

In [15]:
# training_start_time = datetime.now()

# learning_rate_schedule = [0.001, 0.0002]
# max_epoch_per_learning_rate = [100, 100]

# history = {}
# f2_history = []

# num_samples_per_epoch = x_train.shape[0]

# for learn_rate, epochs in zip(learning_rate_schedule, max_epoch_per_learning_rate):
#     print('learning rate :{}'.format(learn_rate))
#     model.optimizer.lr.set_value(learn_rate)
    
#     tmp_history = model.fit_generator(train_gen,
#                         samples_per_epoch=num_samples_per_epoch,
#                         nb_epoch=epochs,
#                         validation_data=valid_gen,
#                         nb_val_samples=number_validations,              
#                         verbose=verbose_level,
#                         callbacks=[early_stop, checkpoint])
    
#     for k, v in tmp_history.history.iteritems():
#         history.setdefault(k, []).extend(v)

# time_spent_trianing = datetime.now() - training_start_time
# print('{} model training complete. Time taken: {}'.format(model_name, time_spent_trianing))

In [16]:
# # load your best model before making any final predictions
# import gc
# del model
# gc.collect()
# print('loading model: {}'.format(model_filepath))
# model = load_model(model_filepath)

In [17]:
# valid_datagen = BottleNeckImgGenerator()
# valid_gen = valid_datagen.validationGen(x_valid, y_valid, batch_size)

# p_valid = model.predict_generator(valid_gen, number_validations)

# optimized_thresholds = f2_optimized_thresholds(y_valid, p_valid)

# y_predictions = (np.array(p_valid) > optimized_thresholds).astype(int)

# precision_s = precision_score(y_valid, y_predictions, average='samples')
# print('>>>> Overall precision score over validation set ' , precision_s)

# recall_s = recall_score(y_valid, y_predictions, average='samples')
# print('>>>> Overall recall score over validation set ' , recall_s)

# # F2 score, which gives twice the weight to recall
# # 'samples' is what the evaluation criteria is for the contest
# f2_score = fbeta_score(y_valid, y_predictions, beta=2, average='samples')
# print('>>>> Overall F2 score over validation set ' , f2_score)

In [18]:
# threshold_df = pd.DataFrame({'label':labels, 
#                              'optimized_threshold':optimized_thresholds})
# print(threshold_df)

In [19]:
# precision_l, recall_l, f2_score_l = calculate_stats_for_prediction(y_valid, y_predictions)

# prediction_stats_df = pd.DataFrame({
#     'label': labels, 
#     'true_sum': np.sum(y_valid, axis=0),
#     'predict_sum': np.sum(y_predictions, axis=0),
#     'f2': f2_score_l,
#     'recall': recall_l,
#     'precision': precision_l
# })

# # reordering the columns for easier reading
# prediction_stats_df = prediction_stats_df[['label', 'f2', 'recall', 'precision', 'true_sum', 'predict_sum']]
# print(prediction_stats_df)

In [20]:
# figures_dir = 'figures/{}'.format(model_name)
# makedirs(figures_dir)

# plot_file_path = figures_dir + '/stats_frozen{}_{}.png'.format(frozen_layers, file_uuid)
# trainHistoryPlot(plot_file_path, history, f2_history, prediction_stats_df)

In [21]:
# sample_submission_filepath = data_dir + 'sample_submission_v2.csv'

# real_submission_filepath = data_dir + 'my_submissions/submission_{}_{}.csv'.format(model_name, file_uuid)

In [22]:
# make_submission(model,
#                 optimized_thresholds,
#                 rescaled_dim, 
#                 labels, 
#                 sample_submission_filepath,
#                 real_submission_filepath)