In [1]:
import pandas as pd
import pipeline as p
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

import assess_clf_models as acm

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV

#### Load & Prep data

In [3]:
X_train = p.open_pkl('Data/X_train_unprocessed.pkl')
y_train = p.open_pkl('Data/y_train_unprocessed.pkl')

In [4]:
preproc1 = p.open_pkl('Data/preprocess_wo_sub.pkl')
preproc2 = p.open_pkl('Data/preprocess_with_sub.pkl')

### Logisitic Regression

In [6]:
log_reg = (LogisticRegression(solver='liblinear'))

In [43]:
lr1_row = acm.assess_model(preproc1, log_reg, X_train, y_train)

In [44]:
lr1_row

Precision-0                   0.327107
Recall-0 (Specificty)         0.315016
F1score-0                     0.326450
Precision-1                   0.918088
Recall-1 (Sensitivity)        0.917305
F1score-1                     0.918082
TN                          617.000000
FN                          344.600000
FP                         4450.200000
TP                        25685.800000
AUC                           0.733301
dtype: float64

### SVM

*Neither linear or rbF gave good results & is slow to run (see MVP-02). Skip*

### Random Forest

In [18]:
rand_for = RandomForestClassifier(n_estimators=10)

In [45]:
rf1_row = acm.assess_model(preproc1, rand_for, X_train, y_train)

### Gradient Boosting

In [5]:
grad_boost = GradientBoostingClassifier()

In [46]:
gb1_row = acm.assess_model(preproc1, grad_boost, X_train, y_train)

### Naive Bayes

In [22]:
nbg = GaussianNB()

In [47]:
nb1_row = acm.assess_model(preproc1, nbg, X_train, y_train)

### AdaBoost + Decision Tree

In [24]:
ada = AdaBoostClassifier()

In [48]:
ab1_row = acm.assess_model(preproc1, ada, X_train, y_train)

### Compare them:

In [49]:
comp_df = pd.DataFrame([lr1_row, rf1_row, gb1_row, nb1_row, ab1_row],
             index=['LR','RF','GB','NB','ADA'])

# FNR = 1 - Recall
comp_df['FNR'] = comp_df['FN'] / (comp_df['TP'] + comp_df['FN'])

In [50]:
comp_df

Unnamed: 0,Precision-0,Recall-0 (Specificty),F1score-0,Precision-1,Recall-1 (Sensitivity),F1score-1,TN,FN,FP,TP,AUC,FNR
LR,0.327107,0.315016,0.32645,0.918088,0.917305,0.918082,617.0,344.6,4450.2,25685.8,0.733301,0.013238
RF,0.309627,0.319091,0.327053,0.887247,0.888294,0.889237,1358.0,2273.8,3709.2,23756.6,0.663475,0.087352
GB,0.362546,0.351866,0.363848,0.918476,0.917631,0.918725,879.2,515.8,4188.0,25514.6,0.754076,0.019815
NB,0.342748,0.341545,0.352036,0.899577,0.899948,0.899976,1351.2,1739.8,3716.0,24290.6,0.714498,0.066837
ADA,0.36219,0.346193,0.359682,0.918569,0.91744,0.918339,854.2,509.8,4213.0,25520.6,0.750114,0.019585


In [29]:
p.pkl_this('Data/model_comparison_df.pkl', comp_df)

**Best Models**:
- Logistic Regression 
- Gradient Boost
- AdaBoost

Because they have:
- high Sensitivity (Recall-1) & low FNR (these are related: a = 1 - b)
- high Precision-0 (TN / TN + FN)
- higher AUC

However, they also have:
- lower Specificty (Recall-0)

**How to score?**
- need to calculate FNR by hand. Also, is this a thing?
- Recall-1 is already high (0.98) because # pos's vs neg's is lopsided
- Precision-0 is also a candidate but not as straightforward to extract
- Define scoring function
- **Roberto recommends: AUC** as a catch-all 

### Polynomial Features

In [54]:
from sklearn.preprocessing import PolynomialFeatures

In [57]:
pipe = Pipeline(steps=[('preprocessor', preproc1),
                       ('polynomial', PolynomialFeatures(degree=2)),
                      ('classifier', log_reg)])
acm.assess_pipe_auc(pipe, X_train, y_train)

0.75151595706868035

an improvement, but still not better than Gradient Boosting

In [58]:
pipe = Pipeline(steps=[('preprocessor', preproc1),
                       ('polynomial', PolynomialFeatures(degree=2)),
                      ('classifier', GradientBoostingClassifier())])
acm.assess_pipe_auc(pipe, X_train, y_train)

0.7544916958393556

In [61]:
pipe = Pipeline(steps=[('preprocessor', preproc1),
                       ('polynomial', PolynomialFeatures(degree=2)),
                      ('classifier', nbg)])
acm.assess_pipe_auc(pipe, X_train, y_train)

0.71444251364919875

### Optimize parameters

In [3]:
class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x): raise BaseException('Use newer skopt')

In [71]:
gb_opt = BayesSearchCV(grad_boost, {'learning_rate': (1e-5, 1, 'log-uniform'),
                                    'max_depth': (3,7),
                                   }, 
                       n_iter=10, cv=3, scoring='roc_auc')

gb_opt.fit(X_train, y_train)

BayesSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=True, n_iter=10, n_jobs=1, n_points=1,
       optimizer_kwargs=None, pre_dispatch='2*n_jobs', random_state=None,
       refit=True, return_train_score=False, scoring='roc_auc',
       search_spaces={'learning_rate': (1e-05, 1, 'log-uniform'), 'max_depth': (3, 7)},
       verbose=0)

In [72]:
gb_opt.best_params_

{'learning_rate': 0.24237221346429236, 'max_depth': 4}

In [73]:
acm.assess_preproc_model_auc(preproc1, GradientBoostingClassifier(**gb_opt.best_params_), X_train, y_train)

0.75395118008609063

No improvement over baseline

**Randomized Search CV - Gradient Boosting**

I'm curious to see how this compares to BayesSearchCV

In [36]:
gb_opt = RandomizedSearchCV(grad_boost, {'learning_rate': np.logspace(-5, 1),
                                         'n_estimators': np.arange(50, 550, 50),
                                         'max_depth': np.arange(3,8)}, 
                       n_iter=10, cv=5, scoring='roc_auc')

gb_opt.fit(X_train, y_train)
gb_opt.cv_results_['mean_test_score']

array([ 0.72667351,  0.71126289,  0.6681285 ,  0.52917014,  0.75350935,
        0.58678682,  0.75403409,  0.7519298 ,  0.7519134 ,  0.57360011])

In [63]:
gb_opt.cv_results_['param_max_depth']

masked_array(data = [7 3 6 6 4 7 3 3 4 3],
             mask = [False False False False False False False False False False],
       fill_value = ?)

In [64]:
gb_opt.cv_results_['param_learning_rate']

masked_array(data = [0.19306977288832497 0.00039069399370546167 1.0481131341546852
 2.4420530945486498 0.25595479226995332 5.689866029018293
 0.25595479226995332 0.33932217718953295 0.0625055192527397
 4.2919342601287784],
             mask = [False False False False False False False False False False],
       fill_value = ?)

In [67]:
gb_opt.best_params_

{'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.25595479226995332}

In [40]:
acm.assess_preproc_model_auc(preproc1, GradientBoostingClassifier(**gb_opt.best_params_), X_train, y_train)

0.75424620841247525

They arrive at more or less the same parameters. But, my search space wasn't that complex

**Log Reg**

LogReg wouldn't play nice with BayesSearchCV:

In [70]:
lr_opt = BayesSearchCV(log_reg, {'penalty': ('l1','l2'),
                                      'C': np.logspace(-3,2),
                                      'class_weight':(None, 'balanced')
                                   }, 
                       n_iter=10, cv=5, scoring='roc_auc')

lr_opt.fit(X_train, y_train)
acm.assess_model(preproc1, LogisticRegression(**lr_opt.best_params_), X_train, y_train)

ValueError: could not convert string to float: 'l2'

In [None]:
lr_opt = RandomizedSearchCV(log_reg, {'penalty': ['l1','l2'],
                                      'C': np.logspace(-3,2),
                                      'class_weight':[None, 'balanced']
                                   }, 
                       n_iter=10, cv=3, scoring='roc_auc')

lr_opt.fit(X_train, y_train)

In [76]:
acm.assess_preproc_model_auc(preproc1, LogisticRegression(solver = 'liblinear', **lr_opt.best_params_), X_train, y_train)

0.73513452533803902

In [28]:
lr_opt.best_params_

{'penalty': 'l2', 'class_weight': 'balanced', 'C': 79.060432109077013}

In [14]:
lr_opt.cv_results_['param_C']

masked_array(data = [0.0025595479226995358 0.016768329368110083 5.9636233165946431
 79.060432109077013 0.0082864277285468416 30.88843596477485
 0.0012648552168552957 0.068664884500430012 0.042919342601287783 0.001],
             mask = [False False False False False False False False False False],
       fill_value = ?)

## Feature Importance

In [10]:
pipe =  Pipeline(steps=[('preprocessor', preproc1), ('classifier', grad_boost)])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', StandardScaler(copy=True, with_mean=True, with_std=True), [2, 4, 7]), ('cat', OneHotEncoder(categorical_features=None, categories='a...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])

In [22]:
fi = pipe.named_steps['classifier'].feature_importances_
fs = X_train.columns
fi_list = list(zip(fs, fi))
fi_list

[('MAR_STAT_MOD', 0.33442504732183481),
 ('RACE_MOD', 0.19985317797358021),
 ('AGE_DX', 0.17990343283991236),
 ('GRADE', 0.00064410819169437338),
 ('TUMSIZ', 0.0087121978440028586),
 ('SURG', 0.00036701424624263905),
 ('SEQ_NUM', 0.02180110364499285),
 ('POS_NODES', 0.063869192645609404),
 ('HST_STGA', 0.0039115455700251519),
 ('INVAS', 0.017598492725618517)]