# Classifier on top of the augmented training data predictions
### A replacement for the mean of the TTA predictions

In [8]:
# print_function for compatibility with Python 3
from __future__ import print_function
print('print function is ready to serve')

# Matplotlib for visualization
from matplotlib import pyplot as plt

# display plots in the notebook
%matplotlib inline

# NumPy for numerical computing
import numpy as np
# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(123)
import random
# Seed Python's built-in RNG as well; the two generators are independent.
random.seed(123)

# Pandas for DataFrames
import pandas as pd
# Show up to 100 columns when a DataFrame is displayed.
# NOTE(review): the registered pandas option is 'display.max_columns'; the
# singular spelling here presumably resolves through pandas' option-name
# pattern matching -- confirm on the installed pandas version.
pd.set_option('display.max_column', 100)

# sklearn for model training
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import fbeta_score, make_scorer


import os
import gc
from skimage import io
from skimage.transform import rescale, resize, downscale_local_mean

print function is ready to serve


In [2]:
import sys
# Make the project-local helper modules importable. The path is relative, so
# it resolves against the kernel's current working directory.
sys.path.append('./utils')

# Project-local modules loaded from ./utils.
from data import Data
from models import Models
from tags import Tags
# Shared Tags helper; later cells call tags.y_train(...) and tags.idx_to_tag(...).
tags = Tags()

Using TensorFlow backend.


In [3]:
# Root directory of the Planet Kaggle data.
# An explicit PLANET_KAGGLE_ROOT environment variable takes precedence so the
# notebook can run on other machines without editing this cell. When it is
# unset, fall back to the original behaviour: use the server path if it
# exists, otherwise the original author's local path.
PLANET_KAGGLE_ROOT = os.environ.get('PLANET_KAGGLE_ROOT')
if not PLANET_KAGGLE_ROOT:
    PLANET_KAGGLE_ROOT = '/data/planet-data/'
    if not os.path.exists(PLANET_KAGGLE_ROOT):
        PLANET_KAGGLE_ROOT = '/Users/jiayou/Documents/Kaggle Data/Amazon'

# Number of tag columns in the label matrix.
N_TAGS = 17
# Number of training samples (rows of the label matrix).
N_TRAIN = 40479
# N_TRAIN = 10
# Sizes of the two test subsets and their total.
# NOTE(review): what the _T / _F suffixes stand for is not shown here -- confirm.
N_TEST_T = 40669
N_TEST_F = 20522
N_TEST = N_TEST_T + N_TEST_F

In [4]:
# Index substituted into the 'f{}' part of the prediction file name.
# NOTE(review): presumably the validation fold number -- confirm.
val = 4
pred_filename = 'raw_train_pred.v9.f{}.tta8.npy'.format(val)
pred_path = os.path.join(PLANET_KAGGLE_ROOT, 'ensemble', pred_filename)

# Raw predictions on the training set, loaded from the ensemble directory.
train_pred = np.load(pred_path)
# Ground-truth labels for the first N_TRAIN training samples.
y_true = tags.y_train(range(N_TRAIN))

In [5]:
# Sanity check: expect (N_TRAIN, N_TAGS), i.e. one row per training sample
# and one column per tag.
y_true.shape

(40479, 17)

In [6]:
# Candidate model families, keyed by the short names that `hyperparameters`
# also uses. Every pipeline standardises the inputs first, so the estimator is
# always the second step (index 1).
#
# The logistic-regression solver is pinned to 'liblinear': the L1 penalty is
# not supported by every solver, and relying on the library default makes the
# 'l1' pipeline fail on scikit-learn versions whose default solver is 'lbfgs'.
# It is pinned for 'l2' too so both models keep using the same solver.
pipelines = {
    'l1': make_pipeline(
        StandardScaler(),
        LogisticRegression(random_state=123, penalty='l1', solver='liblinear'),
    ),
    'l2': make_pipeline(
        StandardScaler(),
        LogisticRegression(random_state=123, penalty='l2', solver='liblinear'),
    ),
    'rf': make_pipeline(
        StandardScaler(),
        RandomForestClassifier(random_state=123),
    ),
    'gb': make_pipeline(
        StandardScaler(),
        GradientBoostingClassifier(random_state=123),
    ),
}


In [7]:
# Grid-search spaces, one per pipeline. Parameter names are prefixed with the
# step name that make_pipeline derives from the estimator class (lower-cased),
# followed by a double underscore.

# Inverse regularisation strength for logistic regression.
# NOTE(review): this grid is linearly spaced, so nine of the ten values lie
# between ~111 and 1000 and only one is below 100. If a grid spanning orders
# of magnitude was intended, np.logspace(-3, 3, 10) would give that -- confirm
# before changing, since it alters which C is selected.
l1_hyperparameters = {
    'logisticregression__C': np.linspace(1e-3, 1e3, 10)
}

l2_hyperparameters = {
    'logisticregression__C': np.linspace(1e-3, 1e3, 10)
}

# 'auto' was dropped from max_features: for RandomForestClassifier it is an
# alias of 'sqrt', so keeping both refits an identical configuration for every
# n_estimators value, and newer scikit-learn releases no longer accept it.
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [20, 100, 200],
    'randomforestclassifier__max_features': ['sqrt', 0.33]
}

gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [20, 100, 200],
    'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [1, 3, 5]
}

# Lookup keyed by the same short names as `pipelines`.
hyperparameters = {
    'l1': l1_hyperparameters,
    'l2': l2_hyperparameters,
    'rf': rf_hyperparameters,
    'gb': gb_hyperparameters
}

In [18]:
# F-beta scorer with beta=2, used as the model-selection metric in the grid search.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

def top_classifier(tag_id, threshold=0.2):
    """Fit and evaluate stacked classifiers for a single tag.

    The eight per-sample predictions for the tag are sorted within each row
    and used as features. Every pipeline in ``pipelines`` is tuned with a
    10-fold grid search (F2 scoring) on a stratified 80% split, then compared
    on the remaining 20% against the baseline of averaging the eight
    predictions.

    Parameters
    ----------
    tag_id : int
        Column index of the tag in ``train_pred`` / ``y_true``.
    threshold : float, optional
        Cut-off applied to both the averaged baseline and the model's
        positive-class probability when computing F2. Defaults to 0.2.

    Returns
    -------
    dict
        Maps each pipeline name to its fitted ``GridSearchCV`` object.

    Side effects
    ------------
    Prints one "fitted" line per pipeline and one F2 comparison line per
    pipeline.
    """
    tag_name = tags.idx_to_tag(tag_id)

    # One row per training sample, eight predictions per row. np.sort returns
    # a sorted copy; the in-place ndarray.sort would, whenever the reshape is
    # a view, silently reorder the global train_pred as well.
    feature = np.sort(train_pred[:, tag_id].reshape((N_TRAIN, 8)), axis=1)
    X = pd.DataFrame(feature)
    y = y_true[:, tag_id]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1234, test_size=0.2, stratify=y)

    # Baseline: plain mean of the eight predictions. It does not depend on the
    # model, so it is scored once rather than inside the reporting loop.
    X_average = X_test.mean(axis=1)
    f2_average = fbeta_score(
        y_true=y_test, y_pred=(X_average > threshold).astype('int'), beta=2)

    fitted_models = {}
    for name in pipelines:
        model = GridSearchCV(pipelines[name], hyperparameters[name],
                             scoring=ftwo_scorer, cv=10, n_jobs=-1)
        model.fit(X_train, y_train)
        fitted_models[name] = model
        print('{} has been fitted on {}'.format(name, tag_name))

    for name, model in fitted_models.items():
        # Keep the second predict_proba column as an ndarray: comparing a
        # plain Python list with a float does not give an element-wise mask,
        # so the thresholding below would fail.
        pred = model.predict_proba(X_test)[:, 1]
        f2_modeled = fbeta_score(
            y_true=y_test, y_pred=(pred > threshold).astype('int'), beta=2)
        print('{} on {}: best_score is {}; average score is {}'.format(tag_name, name, f2_modeled, f2_average))
    return fitted_models
  

In [None]:
# Run the stacking experiment for every tag; result_f2[i] holds the dict of
# fitted grid searches returned by top_classifier(i).
# The list is filled incrementally, so tags that finished before an
# interruption remain available.
# NOTE(review): the saved output stops after three "fitted" lines, so this
# cell does not appear to have run to completion.
result_f2 = []
for tag_id in range(N_TAGS):
    result_f2.append(top_classifier(tag_id))

l1 has been fitted on haze
l2 has been fitted on haze
rf has been fitted on haze


In [111]:
# Same per-tag loop as the result_f2 cell, collected under a different name.
# NOTE(review): the saved output below reports AUC values, which matches the
# commented-out print inside top_classifier rather than its current F2 print,
# so this output appears to come from an earlier version of the function.
result = []
for tag_id in range(N_TAGS):
    result.append(top_classifier(tag_id))

l1 has been fitted on haze
l2 has been fitted on haze
rf has been fitted on haze
gb has been fitted on haze
haze on l1: best_score is 0.965012506562085; auc is 0.9796537042042628; average auc is0.9797801397075486
haze on l2: best_score is 0.9639934533551555; auc is 0.9797875048824972; average auc is0.9797801397075486
haze on rf: best_score is 0.962943519748016; auc is 0.9715138356038941; average auc is0.9797801397075486
haze on gb: best_score is 0.9647037025599852; auc is 0.978871277118881; average auc is0.9797801397075486
l1 has been fitted on primary
l2 has been fitted on primary
rf has been fitted on primary
gb has been fitted on primary
primary on l1: best_score is 0.972114998610382; auc is 0.9874770721278662; average auc is0.9875386551394058
primary on l2: best_score is 0.972084118210172; auc is 0.9872318638592904; average auc is0.9875386551394058
primary on rf: best_score is 0.9702312941975728; auc is 0.9797546973341074; average auc is0.9875386551394058
primary on gb: best_score 

In [104]:
# Show the tuned estimator chosen by each grid search. Index 1 is the
# estimator step; index 0 is the StandardScaler.
# NOTE(review): fitted_models is not assigned at notebook level in the cells
# shown (it is a local inside top_classifier), so this relies on leftover
# kernel state and will fail on a fresh Restart & Run All.
for name, model in fitted_models.items():
    print(model.best_estimator_.steps[1])

('logisticregression', LogisticRegression(C=111.11200000000001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=123,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))
('logisticregression', LogisticRegression(C=111.11200000000001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=123,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))
('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.33, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=123,
            verbose=0, warm

In [103]:
# Held-out ROC AUC for each fitted grid search.
# NOTE(review): fitted_models, X_test and y_test are not assigned at notebook
# level in the cells shown; this cell depends on leftover kernel state.
for name, model in fitted_models.items():
    # Second predict_proba column, i.e. what a per-row p[1] would select.
    pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    print(name, auc(fpr, tpr))

l1 0.95162624817
l2 0.951631553238
rf 0.940551867853
gb 0.937436670752


In [55]:
# Baseline score: row-wise mean of the held-out feature columns.
# NOTE(review): X_test is not assigned at notebook level in the cells shown;
# this depends on leftover kernel state.
X_average = X_test.mean(axis = 1)
# Display the shape as a quick check of the number of held-out rows.
X_average.shape

(8096,)

In [56]:
# ROC AUC of the averaged baseline, for comparison with the per-model AUCs.
# NOTE(review): y_test is not assigned at notebook level in the cells shown;
# this depends on leftover kernel state.
fpr, tpr, thresholds = roc_curve(y_test, X_average)
print(auc(fpr, tpr))

0.950677151113


array([[ 0.99384856,  0.00615144],
       [ 0.51562938,  0.48437062],
       [ 0.99173061,  0.00826939],
       ..., 
       [ 0.81506768,  0.18493232],
       [ 0.78540844,  0.21459156],
       [ 0.94157646,  0.05842354]])