In [1]:
%matplotlib inline

# Model Training
Code for finding the best predictive model   
_Author: Jimmy Charité_  
_Email: jimmy.charite@gmail.com_  
_Date: January 8, 2017_

### Directory & Initial Packages

In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import Image
from IPython.core.display import HTML

The default directory is the code subdirectory. Changing to the main repo directory above.

### Upload Data

In [3]:
retval=os.chdir("..")

In [4]:
clean_data=pd.read_csv("./clean_data/modeling_data.csv")
clean_data.head()

Unnamed: 0,height_0t50,height_100t150,height_150t200,height_200t250,height_250t300,height_300t350,height_350t400,height_400t,height_50t100,width_0t50,...,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you,image_type
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1


### Random States

In [5]:
my_rand_state=0

### Training and Testing Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X = (clean_data.iloc[:,:-1]).as_matrix()
y = (clean_data.iloc[:,-1]).tolist()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                              random_state=my_rand_state)

### Class Imbalance Corrections

In [9]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks



In [77]:
ros = RandomOverSampler(random_state=my_rand_state)
smote = SMOTE(random_state=my_rand_state)
rus = RandomUnderSampler(random_state=my_rand_state)
tl = TomekLinks(random_state=my_rand_state)

### Feature Selection

In [11]:
from sklearn.feature_selection import VarianceThreshold

In [12]:
vt = VarianceThreshold()
threshold=[p*(1-p) for p in [0, 0.05, 0.1, 0.15]]

Note, since the formula for the variance of binary variables is p*(1-p), where p is the proportion of times that the binary variable is 1, I use the proportion to define the variance thresholds. The max variance is 0.25 at p=0.5.

### Classification Models

In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

Although tuning is not necessary for Naive Bayes, I pass the default parameters of those models to GridSearchCV anyway so that I can do a direct pair-wise comparison with the other models across the different steps of the cross-validation.

In [16]:
nb_clf=GaussianNB()
priors=[None]

In [35]:
qda_clf=QuadraticDiscriminantAnalysis()
reg_param=[0.0, 0.25, 0.5, 0.75]

In [38]:
log_clf=LogisticRegression()
C=[0.001 , 0.01, 10, 100,1000]

In [53]:
knn_clf=KNeighborsClassifier(n_jobs=4)
n_neighbors=list(range(1,17,2))
weights=['uniform','distance']

In [58]:
rf_clf=RandomForestClassifier()
n_estimators=[100]
max_features=[.1,.3,.5]

In [23]:
class_weight=['balanced']
class_weight.extend([{1: w} for w in [1, 2, 10]])

### Creating Pipelines

In [24]:
from imblearn import pipeline #needed if blending imblearn with sklearn classes
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [25]:
n_jobs=4

In [26]:
n_folds=10
skfold = StratifiedKFold(n_splits=n_folds,random_state=my_rand_state, shuffle=False)

#### Naive Bayes Estimators

In [30]:
nb_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',nb_clf)])
nb_clf_est_b = GridSearchCV(estimator=nb_clf_b,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,clf__priors=priors))

In [75]:
nb_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
                                      ('clf',nb_clf)])
nb_clf_est_ros = GridSearchCV(estimator=nb_clf_ros,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
                              clf__priors=priors))

In [76]:
nb_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
                                      ('clf',nb_clf)])
nb_clf_est_smote = GridSearchCV(estimator=nb_clf_smote,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
                              clf__priors=priors))

In [78]:
nb_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
                                      ('clf',nb_clf)])
nb_clf_est_rus = GridSearchCV(estimator=nb_clf_rus,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
                              clf__priors=priors))

In [79]:
nb_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
                                      ('clf',nb_clf)])
nb_clf_est_tl = GridSearchCV(estimator=nb_clf_tl,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
                              clf__priors=priors))

#### QDA Estimators

In [32]:
qda_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',qda_clf)])
qda_clf_est_b = GridSearchCV(estimator=qda_clf_b,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,clf__reg_param=reg_param))

In [80]:
qda_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
                                       ('clf',qda_clf)])
qda_clf_est_ros = GridSearchCV(estimator=qda_clf_ros,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
                              clf__reg_param=reg_param))

In [81]:
qda_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
                                       ('clf',qda_clf)])
qda_clf_est_smote = GridSearchCV(estimator=qda_clf_smote,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
                              clf__reg_param=reg_param))

In [137]:
qda_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
                                       ('clf',qda_clf)])
qda_clf_est_rus = GridSearchCV(estimator=qda_clf_rus,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
                              clf__reg_param=reg_param))

In [83]:
qda_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
                                       ('clf',qda_clf)])
qda_clf_est_tl = GridSearchCV(estimator=qda_clf_tl,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
                              clf__reg_param=reg_param))

#### Logistic Estimators

In [51]:
log_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',log_clf)])
log_clf_est_b = GridSearchCV(estimator=log_clf_b,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,clf__C=C,
              clf__class_weight=class_weight))

In [105]:
log_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
                                     ('clf',log_clf)])
log_clf_est_ros = GridSearchCV(estimator=log_clf_ros,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,clf__C=C,
              clf__class_weight=class_weight))

In [106]:
log_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
                                     ('clf',log_clf)])
log_clf_est_smote = GridSearchCV(estimator=log_clf_smote,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,clf__C=C,
              clf__class_weight=class_weight))

In [107]:
log_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
                                     ('clf',log_clf)])
log_clf_est_rus = GridSearchCV(estimator=log_clf_rus,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,clf__C=C,
              clf__class_weight=class_weight))

In [108]:
log_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
                                     ('clf',log_clf)])
log_clf_est_tl = GridSearchCV(estimator=log_clf_tl,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,clf__C=C,
              clf__class_weight=class_weight))

#### KNN Estimators

In [42]:
knn_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',knn_clf)])
knn_clf_est_b = GridSearchCV(estimator=knn_clf_b,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_neighbors=n_neighbors,
              clf__weights=weights))

In [109]:
knn_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
                                     ('clf',knn_clf)])
knn_clf_est_ros = GridSearchCV(estimator=knn_clf_ros,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_neighbors=n_neighbors,
              clf__weights=weights))

In [110]:
knn_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
                                     ('clf',knn_clf)])
knn_clf_est_smote = GridSearchCV(estimator=knn_clf_smote,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_neighbors=n_neighbors,
              clf__weights=weights))

In [111]:
knn_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
                                     ('clf',knn_clf)])
knn_clf_est_rus = GridSearchCV(estimator=knn_clf_rus,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_neighbors=n_neighbors,
              clf__weights=weights))

In [112]:
knn_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
                                     ('clf',knn_clf)])
knn_clf_est_tl = GridSearchCV(estimator=knn_clf_tl,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_neighbors=n_neighbors,
              clf__weights=weights))

#### Random Forest Estimators

In [48]:
rf_clf_b = pipeline.Pipeline(steps=[('vt',vt),('clf',rf_clf)])
rf_clf_est_b = GridSearchCV(estimator=rf_clf_b,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_estimators=n_estimators,
              clf__max_features=max_features,
              clf__class_weight=class_weight))

In [113]:
rf_clf_ros = pipeline.Pipeline(steps=[('ros',ros),('vt',vt),
                                    ('clf',rf_clf)])
rf_clf_est_ros = GridSearchCV(estimator=rf_clf_ros,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_estimators=n_estimators,
              clf__max_features=max_features,
              clf__class_weight=class_weight))

In [114]:
rf_clf_smote = pipeline.Pipeline(steps=[('smote',smote),('vt',vt),
                                    ('clf',rf_clf)])
rf_clf_est_smote = GridSearchCV(estimator=rf_clf_smote,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_estimators=n_estimators,
              clf__max_features=max_features,
              clf__class_weight=class_weight))

In [115]:
rf_clf_rus = pipeline.Pipeline(steps=[('rus',rus),('vt',vt),
                                    ('clf',rf_clf)])
rf_clf_est_rus = GridSearchCV(estimator=rf_clf_rus,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_estimators=n_estimators,
              clf__max_features=max_features,
              clf__class_weight=class_weight))

In [116]:
rf_clf_tl = pipeline.Pipeline(steps=[('tl',tl),('vt',vt),
                                    ('clf',rf_clf)])
rf_clf_est_tl = GridSearchCV(estimator=rf_clf_tl,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_estimators=n_estimators,
              clf__max_features=max_features,
              clf__class_weight=class_weight))

### Fitting Estimators

In [92]:
from sklearn.externals import joblib

#### Basic Estimators

#### Naive Bayes Estimators

In [31]:
nb_clf_est_b.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('vt', VarianceThreshold(threshold=0.0)), ('clf', GaussianNB(priors=None))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__priors': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [93]:
joblib.dump(nb_clf_est_b, './other_output/nb_clf_est_b.pkl')

['./other_output/nb_clf_est_b.pkl']

In [84]:
nb_clf_est_ros.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('ros', RandomOverSampler(random_state=0, ratio='auto')), ('vt', VarianceThreshold(threshold=0.0)), ('clf', GaussianNB(priors=None))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__priors': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [94]:
joblib.dump(nb_clf_est_ros, './other_output/nb_clf_est_ros.pkl')

['./other_output/nb_clf_est_ros.pkl']

In [85]:
nb_clf_est_smote.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('smote', SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=0,
   ratio='auto')), ('vt', VarianceThreshold(threshold=0.0)), ('clf', GaussianNB(priors=None))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__priors': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [95]:
joblib.dump(nb_clf_est_smote, './other_output/nb_clf_est_smote.pkl')

['./other_output/nb_clf_est_smote.pkl']

In [86]:
nb_clf_est_rus.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('rus', RandomUnderSampler(random_state=0, ratio='auto', replacement=True,
          return_indices=False)), ('vt', VarianceThreshold(threshold=0.0)), ('clf', GaussianNB(priors=None))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__priors': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [96]:
joblib.dump(nb_clf_est_rus, './other_output/nb_clf_est_rus.pkl')

['./other_output/nb_clf_est_rus.pkl']

In [87]:
nb_clf_est_tl.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('tl', TomekLinks(n_jobs=-1, random_state=0, return_indices=False)), ('vt', VarianceThreshold(threshold=0.0)), ('clf', GaussianNB(priors=None))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__priors': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [97]:
joblib.dump(nb_clf_est_tl, './other_output/nb_clf_est_tl.pkl')

['./other_output/nb_clf_est_tl.pkl']

#### QDA Estimators

In [None]:
qda_clf_est_b.fit(X_train,y_train)

In [98]:
joblib.dump(qda_clf_est_b, './other_output/qda_clf_est_b.pkl')

['./other_output/qda_clf_est_b.pkl']

In [None]:
qda_clf_est_ros.fit(X_train,y_train)

In [99]:
joblib.dump(qda_clf_est_ros, './other_output/qda_clf_est_ros.pkl')

['./other_output/qda_clf_est_ros.pkl']

In [None]:
qda_clf_est_smote.fit(X_train,y_train)

In [100]:
joblib.dump(qda_clf_est_smote, './other_output/qda_clf_est_smote.pkl')

['./other_output/qda_clf_est_smote.pkl']

In [None]:
qda_clf_est_rus.fit(X_train,y_train)

In [139]:
joblib.dump(qda_clf_est_rus, './other_output/qda_clf_est_rus.pkl')

['./other_output/qda_clf_est_rus.pkl']

In [None]:
qda_clf_est_tl.fit(X_train,y_train)

In [102]:
joblib.dump(qda_clf_est_tl, './other_output/qda_clf_est_tl.pkl')

['./other_output/qda_clf_est_tl.pkl']

#### Logistic Estimators

In [52]:
log_clf_est_b.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('vt', VarianceThreshold(threshold=0.0)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__C': [0.001, 0.01, 10, 100, 1000], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [103]:
joblib.dump(log_clf_est_b, './other_output/log_clf_est_b.pkl')

['./other_output/log_clf_est_b.pkl']

In [121]:
log_clf_est_ros.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('ros', RandomOverSampler(random_state=0, ratio='auto')), ('vt', VarianceThreshold(threshold=0.0)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__C': [0.001, 0.01, 10, 100, 1000], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [122]:
joblib.dump(log_clf_est_ros, './other_output/log_clf_est_ros.pkl')

['./other_output/log_clf_est_ros.pkl']

In [123]:
log_clf_est_smote.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('smote', SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=0,
   ratio='auto')), ('vt', VarianceThreshold(threshold=0.0)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__C': [0.001, 0.01, 10, 100, 1000], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [124]:
joblib.dump(log_clf_est_smote, './other_output/log_clf_est_smote.pkl')

['./other_output/log_clf_est_smote.pkl']

In [125]:
log_clf_est_rus.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('rus', RandomUnderSampler(random_state=0, ratio='auto', replacement=True,
          return_indices=False)), ('vt', VarianceThreshold(threshold=0.0)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__C': [0.001, 0.01, 10, 100, 1000], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [126]:
joblib.dump(log_clf_est_rus, './other_output/log_clf_est_rus.pkl')

['./other_output/log_clf_est_rus.pkl']

In [127]:
log_clf_est_tl.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('tl', TomekLinks(n_jobs=-1, random_state=0, return_indices=False)), ('vt', VarianceThreshold(threshold=0.0)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__C': [0.001, 0.01, 10, 100, 1000], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [128]:
joblib.dump(log_clf_est_tl, './other_output/log_clf_est_tl.pkl')

['./other_output/log_clf_est_tl.pkl']

#### KNN Estimators

In [43]:
knn_clf_est_b.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('vt', VarianceThreshold(threshold=0.0)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__weights': ['uniform', 'distance'], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [149]:
joblib.dump(knn_clf_est_b, './other_output/knn_clf_est_b.pkl')

['./other_output/knn_clf_est_b.pkl']

In [150]:
knn_clf_est_ros.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('ros', RandomOverSampler(random_state=0, ratio='auto')), ('vt', VarianceThreshold(threshold=0.0)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=4, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__weights': ['uniform', 'distance'], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [151]:
joblib.dump(knn_clf_est_ros, './other_output/knn_clf_est_ros.pkl')

['./other_output/knn_clf_est_ros.pkl']

In [152]:
knn_clf_est_smote.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('smote', SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=0,
   ratio='auto')), ('vt', VarianceThreshold(threshold=0.0)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=4, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__weights': ['uniform', 'distance'], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [153]:
joblib.dump(knn_clf_est_smote, './other_output/knn_clf_est_smote.pkl')

['./other_output/knn_clf_est_smote.pkl']

In [154]:
knn_clf_est_rus.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('rus', RandomUnderSampler(random_state=0, ratio='auto', replacement=True,
          return_indices=False)), ('vt', VarianceThreshold(threshold=0.0)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=4, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__weights': ['uniform', 'distance'], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [155]:
joblib.dump(knn_clf_est_rus, './other_output/knn_clf_est_rus.pkl')

['./other_output/knn_clf_est_rus.pkl']

In [156]:
knn_clf_est_tl.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('tl', TomekLinks(n_jobs=-1, random_state=0, return_indices=False)), ('vt', VarianceThreshold(threshold=0.0)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=4, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__weights': ['uniform', 'distance'], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [157]:
joblib.dump(knn_clf_est_tl, './other_output/knn_clf_est_tl.pkl')

['./other_output/knn_clf_est_tl.pkl']

#### Random Forest Estimators

In [59]:
rf_clf_est_b.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('vt', VarianceThreshold(threshold=0.0)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__max_features': [0.1, 0.3, 0.5], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_estimators': [100, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [158]:
joblib.dump(rf_clf_est_b, './other_output/rf_clf_est_b.pkl')

['./other_output/rf_clf_est_b.pkl']

In [159]:
rf_clf_est_ros.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('ros', RandomOverSampler(random_state=0, ratio='auto')), ('vt', VarianceThreshold(threshold=0.0)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impu...mators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__max_features': [0.1, 0.3, 0.5], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_estimators': [100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [160]:
joblib.dump(rf_clf_est_ros, './other_output/rf_clf_est_ros.pkl')

['./other_output/rf_clf_est_ros.pkl']

In [161]:
rf_clf_est_smote.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('smote', SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=0,
   ratio='auto')), ('vt', VarianceThreshold(threshold=0.0)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__max_features': [0.1, 0.3, 0.5], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_estimators': [100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [162]:
joblib.dump(rf_clf_est_smote, './other_output/rf_clf_est_smote.pkl')

['./other_output/rf_clf_est_smote.pkl']

In [163]:
rf_clf_est_rus.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('rus', RandomUnderSampler(random_state=0, ratio='auto', replacement=True,
          return_indices=False)), ('vt', VarianceThreshold(threshold=0.0)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__max_features': [0.1, 0.3, 0.5], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_estimators': [100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [164]:
joblib.dump(rf_clf_est_rus, './other_output/rf_clf_est_rus.pkl')

['./other_output/rf_clf_est_rus.pkl']

In [165]:
rf_clf_est_tl.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(steps=[('tl', TomekLinks(n_jobs=-1, random_state=0, return_indices=False)), ('vt', VarianceThreshold(threshold=0.0)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_spl...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}], 'clf__max_features': [0.1, 0.3, 0.5], 'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_estimators': [100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [166]:
joblib.dump(rf_clf_est_tl, './other_output/rf_clf_est_tl.pkl')

['./other_output/rf_clf_est_tl.pkl']

In [148]:
np.max(pd.DataFrame(log_clf_est_tl.cv_results_)['mean_test_score'])

0.9774792476399301