In this notebook, I use each of the feature sets previously extracted with the pre-trained deep models of the Keras library as data to train an XGBoost model, and then ensemble the per-feature-set probability predictions by averaging. (Note: calibrating the probability predictions with Scikit-Learn's CalibratedClassifierCV is not done in the cells below; the raw `predict_proba` outputs are used as-is.)

In [1]:
from __future__ import print_function

In [3]:
#Data source
# Directory holding the CSV files and the precomputed .npy feature arrays.
# Set the DATA_DIR environment variable to run on another machine; when it is
# unset, the original location is used. os.path.join(..., "") guarantees the
# trailing separator that the `dpath + "file"` concatenations rely on.
import os
dpath = os.path.join(os.environ.get("DATA_DIR", "/home/ubuntu/data/"), "")
import pandas as pd, numpy as np

#Metrics to use
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import cross_val_score

#Classifier
from xgboost import XGBClassifier

The Datasets

In [4]:
# Read the train and test image tables from the data directory in one pass.
train_images, test_images = (
    pd.read_csv(dpath + csv_name)
    for csv_name in ("train_set.csv", "test_set.csv")
)

In [5]:
# Image identifiers of the test set; used later as the index of every submission.
test_image_list = test_images["Image"]
# Training labels as a NumPy array. `Series.as_matrix()` was deprecated and then
# removed in pandas 1.0; `.values` returns the same array on old and new pandas.
y_train = train_images["Class"].values

In [6]:
# Names of the precomputed deep-feature sets; each name is interpolated into
# the "features_%s_train.npy" / "features_%s_test.npy" file names when loading.
feature_sets = [
    "inception_v3_2a",
    "vgg19_2a",
    "resnet50_2a",
]

For each set of deep features, we'll train an XGBoost model

In [7]:
%%time
#This will take some time...

# Maps feature-set name -> array of class probabilities predicted for the test set.
test_predictions = {}

for feat in feature_sets:
    print("- Processing Features:", feat)

    #Loading the precomputed train/test features for this feature set
    X_train = np.load(dpath+"features_%s_train.npy" % (feat))
    X_test = np.load(dpath+"features_%s_test.npy" % (feat))

    #Fitting an XGBoost classifier with its default settings
    #(no probability calibration is applied here)
    print("training...")
    model = XGBClassifier().fit(X_train, y_train)

    #Predicting class probabilities and keeping them for the ensemble
    print("predicting...")
    prediction = test_predictions[feat] = model.predict_proba(X_test)

    #Creating an individual submission: one column per class, one row per test image
    submission = pd.DataFrame(prediction, columns=model.classes_).set_index(test_image_list)
    submission.to_csv(dpath + "submission_xgb_with_%s.csv" % (feat))
    

- Processing Features: inception_v3_2a
training...
predicting...
- Processing Features: vgg19_2a
training...
predicting...
- Processing Features: resnet50_2a
training...
predicting...
CPU times: user 24min 1s, sys: 220 ms, total: 24min 2s
Wall time: 3min


In [8]:
# pd.Panel was deprecated and later removed from pandas (0.25+). A plain dict of
# DataFrames keeps the same `predictions[feat]` -> DataFrame lookup used by the
# ensembling step, without depending on the removed class.
predictions = {
    name: pd.DataFrame(probabilities)
    for name, probabilities in test_predictions.items()
}

Ensembling Predictions (Average)

In [9]:
#Average ensemble of the predictions

# Per-feature-set weights; equal weights give a plain arithmetic mean.
weights = {name: 1 for name in feature_sets}
total_weight = sum(weights.values())

# Build the weighted sum directly from the prediction frames instead of from a
# hard-coded (1000, 8) zero frame, so the result always has the same rows and
# columns as the predictions, whatever the test-set size or number of classes.
average = sum(predictions[name] * weights[name] for name in feature_sets)
average = average / total_weight

In [11]:
#Create submission

# Destination of the ensembled submission file.
ensemble_csv = dpath + "average_ensemble-deep_featsX2-simple_xgb.csv"

# Label the probability columns with the class names. `model` is the last
# classifier fitted in the training loop.
average.columns = model.classes_

# Index the rows by test image and write the result to disk.
average = average.set_index(test_image_list)
average.to_csv(ensemble_csv)