In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd

from keras.models import load_model

from utils.ensemble import *
from utils.f2thresholdfinder import *
from utils.loaderjpg import *
from utils.generator import *

from pretrained.custommodels import *

Using Theano backend.
Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 1060 6GB (0000:01:00.0)


The purpose of this notebook is to find the optimal weight distribution to use when ensembling results from non-correlated models. I hand-picked the best models I have for the ensemble. Since the F2 calculation is super fast, we use a brute-force approach to find the weights that optimize a metric (in this case, the F2 score).

In [2]:
# Edge length (in pixels) handed to load_training_set when loading the images.
rescaled_dim = 224
data_dir = 'D:/Downloads/amazon/'

df_train = pd.read_csv(data_dir + 'train_v2.csv')
x_train, y_train = load_training_set(df_train, rescaled_dim)
print(x_train.shape)
print(y_train.shape)

# Hold out the last 10% of the samples as the validation set.
number_of_samples = x_train.shape[0]
split = int(number_of_samples * 0.90)

# Assuming load_training_set returns NumPy arrays (the printed shapes suggest so),
# a plain slice is only a view that keeps the whole parent buffer alive, so
# `del x_train` on its own would not release the training images. Materialise the
# hold-out slices as independent arrays so the delete below really frees the memory.
x_valid, y_valid = np.array(x_train[split:]), np.array(y_train[split:])

del x_train, y_train

(40479L, 224L, 224L, 3L)
(40479L, 17L)


In [3]:
def predict_from_model(x, y, model_name, model):
    """
    Return the predictions of `model` for the inputs `x`.

    The image normalization is looked up from `model_name`, wrapped in a
    BottleNeckImgGenerator, and handed to predict_with_optimal_thresholds together
    with `x`, `y` and `model`. That helper returns a (predictions, thresholds) pair;
    only the predictions are returned here, the thresholds are discarded.
    """
    generator = BottleNeckImgGenerator(normalization=image_normalization_func(model_name))
    predictions, _ = predict_with_optimal_thresholds(x, y, generator, model)
    return predictions

In [4]:


y_predictions = []

# One entry per ensemble member: (model_name, model_filepath, weights_filepath).
# Exactly one of the two paths is set: a weights-only file is loaded into a freshly
# built custom_top_model, while a full-model file is restored with load_model.
# The order here is the order of y_predictions, and therefore the order in which
# ensemble weights are matched to models later on.
ensemble_members = [
    ('resnet50',
     None,
     'D:/Downloads/amazon/bottleneck/resnet50/frozen38_20170717-092206_weights_only.h5'),
    ('densenet121',
     None,
     'D:/Downloads/amazon/bottleneck/densenet121/frozen73_20170716-171641_weights_only.h5'),
    ('vgg16',
     'D:/Downloads/amazon/bottleneck/vgg16/frozen11_20170706-011852.h5',
     None),
]

for model_name, model_filepath, weights_filepath in ensemble_members:
    if model_filepath is not None:
        model = load_model(model_filepath)
    else:
        model = custom_top_model(model_name, num_classes=17, num_frozen_layers=0)
        model.load_weights(weights_filepath)
    # predict_from_model derives the image normalization from model_name itself,
    # so no separate normalization lookup is needed in this cell.
    y_predictions.append(predict_from_model(x_valid, y_valid, model_name, model))


  'precision', 'predicted', average, warn_for)


label:0 threshold:0.29 score:0.901941455181
label:1 threshold:0.08 score:0.90539498334
label:2 threshold:0.31 score:0.905609688139
label:3 threshold:0.17 score:0.907871945561
label:4 threshold:0.04 score:0.911936839873
label:5 threshold:0.25 score:0.911977083764
label:6 threshold:0.14 score:0.914077703266
label:7 threshold:0.17 score:0.916296101521
label:8 threshold:0.21 score:0.919021713075
label:9 threshold:0.11 score:0.920021282337
label:10 threshold:0.14 score:0.92023644765
label:11 threshold:0.17 score:0.922349162987
label:12 threshold:0.15 score:0.923297033475
label:13 threshold:0.21 score:0.92330687569
label:14 threshold:0.2 score:0.927807214006
label:15 threshold:0.18 score:0.930156605232
label:16 threshold:0.12 score:0.930250803913
('>>>> Overall precision score over validation set ', 0.87324594940084077)
('>>>> Overall recall score over validation set ', 0.95798483672124968)
('>>>> Overall F2 score over validation set ', 0.93025080391310422)
label:0 threshold:0.26 score:0.901

AttributeError: 'list' object has no attribute 'shape'

In [14]:
# Predictions stored on disk as a .npy file.
# NOTE(review): presumably row-aligned with x_valid / y_valid -- confirm before trusting the blend.
ekami_predictions = np.load('D:/Downloads/amazon/temp/valid_set_predictions.npy')

# Added after the three Keras models, so it is the last entry of y_predictions.
# Re-running this cell adds the array again; rebuild y_predictions first if needed.
y_predictions += [ekami_predictions]

In [23]:
# Sanity check: show the first row of the loaded predictions.
print(ekami_predictions[0])

[0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [6]:


def weighted_ensemble_f2_score_optimizer2(weights_combo, y_valid, y_predictions):
	"""
		Score one candidate set of ensemble weights with the F2 metric.

		Each prediction array is cast to float32, scaled by the weight at the same
		position in weights_combo, and accumulated. A label is predicted when its
		weighted sum reaches 0.5. The sample-averaged F-beta score (beta=2) against
		y_valid is printed together with the weights and then returned.
	"""
	rows, cols = y_valid.shape[0], y_valid.shape[1]
	blended = np.zeros((rows, cols), dtype=np.float32)
	for member_weight, member_predictions in zip(weights_combo, y_predictions):
		scaled = member_predictions.astype(np.float32) * member_weight
		blended = blended + scaled
	votes = (np.asarray(blended) >= 0.5).astype(int)
	score = fbeta_score(y_valid, votes, beta=2, average='samples')
	print('> F2 score : {} for weights: {}'.format(score, weights_combo))
	return score

In [33]:
# Candidate weightings, one row per candidate. Each row holds one weight per entry of
# y_predictions, in the same order (the optimizer zips the two together).
weights_combos = [
    [0.3333, 0.3333, 0.3333, 0.0],
    [0.4,    0.2,    0.2,    0.2],
    [0.3333, 0.3333, 0.2333, 0.1],
    [0.3333, 0.2333, 0.2333, 0.2],
    [0.35,   0.3,    0.15,   0.2],
    [0.4,    0.4,    0,      0.2],
    [0.35,   0.35,   0,      0.3],
    [0.4,    0.4,    0.1,    0.1],
    [0.3,    0.3,    0.2,    0.2],
    [0.35,   0.35,   0.1,    0.2],
    [0.4,    0.3,    0.2,    0.1],
    [0.35,   0.35,   0.15,   0.15],
    [0.25,   0.25,   0.25,   0.25],
    [0.3,    0.3,    0.1,    0.3],
    [0.4,    0.3,    0.2,    0.1],    # same row as four lines above
    [0.3,    0.3,    0.3,    0.1],
    [0.3,    0.25,   0.25,   0.2],
    [0.35,   0.3,    0.2,    0.15],
    [0.35,   0.35,   0.15,   0.15],  # same row as seven lines above
    [0.35,   0.15,   0.35,   0.15],
]

# Three-weight grid kept for reference; it only applies when y_predictions
# holds three entries:
# weights_combos = [
#     [0.2, 0.5, 0.3],
#     [0.3333, 0.3333, 0.3333],
#     [0.4, 0.3, 0.3],
#     [0.4, 0.35, 0.25],
#     [0.4, 0.4, 0.2],
#     [0.45, 0.35, 0.2],
#     [0.5, 0.25, 0.25],
#     [0.5, 0.3, 0.2],
#     [0.6, 0.2, 0.2],
#     [0.7, 0.2, 0.1],
# ]

optimal_weights = eval_optimal_ensemble_weights(
    weights_combos, y_predictions, y_valid, weighted_ensemble_f2_score_optimizer2)

print('optimal_weights: {}'.format(optimal_weights))

> F2 score : 0.934267925187 for weights: [0.3333, 0.3333, 0.3333, 0.0]
> F2 score : 0.933530810491 for weights: [0.4, 0.2, 0.2, 0.2]
> F2 score : 0.934267925187 for weights: [0.3333, 0.3333, 0.2333, 0.1]
> F2 score : 0.933530810491 for weights: [0.3333, 0.2333, 0.2333, 0.2]
> F2 score : 0.9337629471 for weights: [0.35, 0.3, 0.15, 0.2]
> F2 score : 0.933585076014 for weights: [0.4, 0.4, 0, 0.2]
> F2 score : 0.933585076014 for weights: [0.35, 0.35, 0, 0.3]
> F2 score : 0.933812090054 for weights: [0.4, 0.4, 0.1, 0.1]
> F2 score : 0.933812090054 for weights: [0.3, 0.3, 0.2, 0.2]
> F2 score : 0.933585076014 for weights: [0.35, 0.35, 0.1, 0.2]
> F2 score : 0.933579662732 for weights: [0.4, 0.3, 0.2, 0.1]
> F2 score : 0.933812090054 for weights: [0.35, 0.35, 0.15, 0.15]
> F2 score : 0.933190461032 for weights: [0.25, 0.25, 0.25, 0.25]
> F2 score : 0.933585076014 for weights: [0.3, 0.3, 0.1, 0.3]
> F2 score : 0.933579662732 for weights: [0.4, 0.3, 0.2, 0.1]
> F2 score : 0.934267925187 for wei

In [12]:
def generate_ensemble_submission2(ensemble_submission_filename, submission_files, weights):
	"""
		Generate a submission file from a weighted vote amongst the submission files.

		Each file's space-separated 'tags' column is expanded into a 0/1 matrix with
		one column per class. The matrices are summed using weights[n] for
		submission_files[n], and a label is kept when its weighted vote reaches 0.5.
		The result is converted with submission_dataframe (using the first file as the
		template) and written to ensemble_output_dir + ensemble_submission_filename.

		Relies on the module-level names submission_dir, ensemble_output_dir and
		submission_dataframe being defined before the call.

		Args:
			ensemble_submission_filename: file name of the CSV to write.
			submission_files: file names (relative to submission_dir) to blend.
			weights: one weight per submission file, in the same order.

		Raises:
			ValueError: if submission_files is empty, if weights and submission_files
				differ in length, or if a file's first column (the row identifiers)
				does not match the first file's, since blending misaligned rows would
				silently produce wrong labels.
	"""
	print('ensembling kaggle submission files: {}'.format(submission_files))
	class_names = ['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 
		'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 
		'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 
		'agriculture', 'road', 'selective_logging']
	num_classes = len(class_names)

	if len(submission_files) == 0:
		raise ValueError('submission_files must not be empty')
	if len(weights) != len(submission_files):
		raise ValueError('expected one weight per submission file, got {} weights for {} files'.format(
			len(weights), len(submission_files)))

	names = None
	predictions = None
	for submission_file, weight in zip(submission_files, weights):
		df = pd.read_csv(submission_dir + submission_file)

		# A row without any tag is read back from CSV as NaN; treat it as "no labels"
		# instead of failing on NaN.split().
		tag_sets = [set(tags.split(' ')) for tags in df['tags'].fillna('').astype(str)]
		votes = np.array(
			[[c in tags for c in class_names] for tags in tag_sets],
			dtype=np.float32).reshape(len(tag_sets), num_classes)

		row_ids = df.iloc[:, 0].values
		if predictions is None:
			names = row_ids
			predictions = np.zeros((df.shape[0], num_classes), dtype=np.float32)
		elif not np.array_equal(row_ids, names):
			raise ValueError('rows of {} are not aligned with {}'.format(
				submission_file, submission_files[0]))

		predictions = predictions + (votes * weight)

	binary_predictions = (np.array(predictions) >= 0.5).astype(int)
	predict_df = pd.DataFrame(binary_predictions, columns = class_names)
	df_submission = pd.read_csv(submission_dir + submission_files[0])
	submit_df = submission_dataframe(df_submission, predict_df)

	ensemble_submission_filepath = ensemble_output_dir + ensemble_submission_filename
	submit_df.to_csv(ensemble_submission_filepath, index=False)
	print('submission file generated: {}'.format(ensemble_submission_filepath))

In [13]:
# WARNING!!!: ONLY USE FILES generated AFTER 6/25/2017 at 7pm (post Kaggle test data patch).
# Order matters: generate_ensemble_submission2 applies weights[n] to submission_files[n].
submission_files = [
    'tta_submission_resnet50_20170718-085651_score_092912.csv',
    'tta_submission_densenet121_20170717-161234_score_092785.csv',
    'tta_submission_vgg16_20170717-171015_score_092680.csv',
    # earlier candidate for this slot: 'submission_20170626-025551_score_091200.csv'
    'submission_20170718-163707.csv'
]

generate_ensemble_submission2(
    ensemble_submission_filename='weighted_resnet50_densenet121_vgg16_ekami_tta.csv',
    submission_files=submission_files,
    weights=optimal_weights)

ensembling kaggle submission files: ['tta_submission_resnet50_20170718-085651_score_092912.csv', 'tta_submission_densenet121_20170717-161234_score_092785.csv', 'tta_submission_vgg16_20170717-171015_score_092680.csv']


100%|██████████████████████████████████████████████████████████████████████████| 61191/61191 [00:57<00:00, 1067.55it/s]


submission file generated: D:/Downloads/amazon/my_submissions/ensemble/weighted_resnet50_densenet121_vgg16_tta.csv
