# Project 4: Predict West Nile Virus
### Section 5. Model Exploration

## Problem Statement

1. As an employee of Disease And Treatment Agency, division of Societal Cures In Epidemiology and New Creative Engineering (DATA-SCIENCE), we are tasked to better understand the mosquito population and advise on appropriate interventions which are beneficial and cost-effective for the city.


2. Through this project, we hope to:
- Identify features which are most important to predict presence of West Nile Virus (which can be done by ranking the coefficients of each feature in a logistic regression model)
- Predict the probability of West Nile Virus by location to provide decision makers an effective plan to deploy pesticides throughout the city, which consequently can help to reduce cost.

## Import Libraries

In [1]:
#!pip install shapely
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# from shapely import geometry
# from shapely.geometry import Point, Polygon
# import geopandas as gpd
# from datetime import timedelta
# import math

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import (confusion_matrix, plot_confusion_matrix, classification_report, 
                             plot_roc_curve, roc_auc_score, accuracy_score, precision_score, 
                             recall_score, f1_score)
from imblearn.over_sampling import SMOTE

## Load Data

In [2]:
# Load datasets
df = pd.read_csv('../data/final_df.csv', index_col='Unnamed: 0')

### Split into train and test (Kaggle) data

In [3]:
# Split into train and test (kaggle) data 
train = df[df['dataset']=='train'].copy()
test = df[df['dataset']=='test'].copy()
print(train.shape)
print(test.shape)

(8304, 240)
(43035, 240)


In [4]:
train.drop(columns='dataset', inplace=True)
test.drop(columns='dataset', inplace=True)

In [5]:
train.describe()

Unnamed: 0,latitude,longitude,tavg,preciptotal,stnpressure,resultdir,avgspeed,is_spray,stnpressure_7,stnpressure_10,...,codesum_TSRA BR HZ VCTS,codesum_TSRA FG+ BR HZ,codesum_TSRA RA,codesum_TSRA RA BR,codesum_TSRA RA BR HZ,codesum_TSRA RA BR HZ VCTS,codesum_TSRA RA BR VCTS,codesum_TSRA RA VCTS,codesum_VCTS,wnvpresent
count,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,...,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0
mean,41.8458,-87.696229,72.093931,0.182431,29.26211,17.842245,7.476903,0.008911,29.2772,29.26599,...,0.006142,0.0,0.029383,0.037211,0.0,0.0,0.010597,0.0,0.003974,0.055034
std,0.106658,0.08444,7.63033,0.47045,0.118606,9.433945,2.543438,0.093984,0.126573,0.142227,...,0.078132,0.0,0.168889,0.18929,0.0,0.0,0.102402,0.0,0.062918,0.22806
min,41.644612,-87.930995,50.0,0.0,28.89,1.0,2.1,0.0,28.89,28.91,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,41.750498,-87.752411,69.0,0.0,29.18,8.0,5.8,0.0,29.21,29.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,41.862292,-87.696269,73.0,0.0,29.26,19.0,7.1,0.0,29.28,29.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,41.947227,-87.648064,78.0,0.16,29.34,25.0,9.4,0.0,29.38,29.36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,42.01743,-87.531635,87.0,3.97,29.65,36.0,16.3,1.0,29.65,29.65,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8304 entries, 0 to 8303
Columns: 239 entries, latitude to wnvpresent
dtypes: float64(11), int64(228)
memory usage: 15.2 MB


### Further Split Train Data into Train and Holdout

In [7]:
# Split train data into X (all features except wnvpresent) and y (wnvpresent)
features = [col for col in train.columns if col != 'wnvpresent']
X = train[features]
y = train['wnvpresent']

In [8]:
X.columns[X.isna().any()].tolist()

[]

In [9]:
y.value_counts(normalize = True)

0.0    0.944966
1.0    0.055034
Name: wnvpresent, dtype: float64

- wnvpresent is highly inbalance, with only about 5.5% of the data points having West Nile Virus. 
    - It is important to stratify proportionally to ensure that our train and holdout dataset have about the same proportion of presence and absence of West Nile Virus.
    - We also need to use SMOTE as the accuracy of our models without SMOTE is about 95% (i.e. close to the proportion of absence of WNV)

In [10]:
# Further split train data into train and holdout data
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, 
    y,
    stratify = y,
    random_state=42
)

In [11]:
# Scale our data
ss = StandardScaler()
Xs_train = ss.fit_transform(X_train)
Xs_holdout = ss.transform(X_holdout)

## Create synthetic data with SMOTE

In [12]:
# create synthetic data for train set
sm = SMOTE()
Xsm_train, ysm_train = sm.fit_resample(Xs_train, y_train)

In [13]:
Xsm_train.shape

(11770, 238)

## Model Exploration

### Metrics to Evaluate Models

In [14]:
# Creacting a summary dataframe.
summary_df = pd.DataFrame(columns=[
    'model', 
    'best_score', 
    'train_score',
    'holdout_score',
    'sensitivity',
    'specificity',
    'f1_score',
    'best_params', 
])

In [15]:
summary_df

Unnamed: 0,model,best_score,train_score,holdout_score,sensitivity,specificity,f1_score,best_params


In [16]:
def model_metrics(gs, X_train, y_train, X_test, y_test, modelname):
    '''Generates confusion matrix and adds scores to summary_df'''
    
    #Generate confusion matrix
    y_pred = gs.predict(X_test)
    confusion_matrix(y_test, # True values.
                     y_pred)  # Predicted values.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel();
    y_train_pred_prob = gs.predict_proba(X_train)[:,1]
    y_test_pred_prob = gs.predict_proba(X_test)[:,1]
    
    summary_df_rows = summary_df.shape[0]
    # Adding the scores into summary_df
    summary_df.loc[summary_df_rows] = [
        modelname,
        #'CountVec/LogisticRegression',
        round(gs.best_score_, 3),
        round(metrics.roc_auc_score(y_train, y_train_pred_prob),3),
        round(metrics.roc_auc_score(y_test, y_test_pred_prob),3),
        round(metrics.recall_score(y_test, y_pred),3),
        round(tn/(tn+fp),3),
        round(metrics.f1_score(y_test, y_pred),3),
        str(gs.best_params_),
    ]
    
    
    
    # Plot Confusion Matrix
    plot_confusion_matrix(gs, X_test, y_test, cmap='Blues', 
                          display_labels=['WNV Present', 'WNV Not Present'],
                          normalize='true');  
    plt.title(label=modelname, fontsize=14)
    plt.grid(False)

### Log-Reg: Testing for importance of week data

In [56]:
df2 = df.copy()

In [57]:
list(df2)

['latitude',
 'longitude',
 'tavg',
 'preciptotal',
 'stnpressure',
 'resultdir',
 'avgspeed',
 'is_spray',
 'stnpressure_7',
 'stnpressure_10',
 'preciptotal_7',
 'preciptotal_10',
 'DZ',
 'BR',
 'TSRA',
 'VCTS',
 'BCFG',
 'FG',
 'TS',
 'FG+',
 'HZ',
 'RA',
 'year_2008',
 'year_2009',
 'year_2010',
 'year_2011',
 'year_2012',
 'year_2013',
 'year_2014',
 'week_23',
 'week_24',
 'week_25',
 'week_26',
 'week_27',
 'week_28',
 'week_29',
 'week_30',
 'week_31',
 'week_32',
 'week_33',
 'week_34',
 'week_35',
 'week_36',
 'week_37',
 'week_38',
 'week_39',
 'week_40',
 'week_41',
 'dayofweek_1',
 'dayofweek_2',
 'dayofweek_3',
 'dayofweek_4',
 'species_CULEX PIPIENS/RESTUANS',
 'species_CULEX RESTUANS',
 'trap_T002',
 'trap_T002A',
 'trap_T002B',
 'trap_T003',
 'trap_T004',
 'trap_T005',
 'trap_T006',
 'trap_T007',
 'trap_T008',
 'trap_T009',
 'trap_T009A',
 'trap_T011',
 'trap_T012',
 'trap_T013',
 'trap_T014',
 'trap_T015',
 'trap_T016',
 'trap_T017',
 'trap_T018',
 'trap_T019',
 'trap

In [58]:
df2.drop(columns = ['week_23',
 'week_24',
 'week_25',
 'week_26',
 'week_27',
 'week_28',
 'week_29',
 'week_30',
 'week_31',
 'week_32',
 'week_33',
 'week_34',
 'week_35',
 'week_36',
 'week_37',
 'week_38',
 'week_39',
 'week_40',
 'week_41'], inplace = True)

In [59]:
train2 = df2[df2['dataset']=='train'].copy()
test2 = df2[df2['dataset']=='test'].copy()
print(train2.shape)
print(test2.shape)

(8304, 221)
(43035, 221)


In [61]:
# Split train data into X (all features except wnvpresent) and y (wnvpresent)
features = [col for col in train2.columns if col != 'wnvpresent']
X2 = train2[features]
y2 = train2['wnvpresent']

In [62]:
X2.drop(columns = ["dataset"],inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [63]:
# Further split train data into train and holdout data
X_train2, X_holdout2, y_train2, y_holdout2 = train_test_split(
    X2, 
    y2,
    stratify = y2,
    random_state=42
)

In [64]:
# Scale our data
ss = StandardScaler()
Xs_train2 = ss.fit_transform(X_train2)
Xs_holdout2 = ss.transform(X_holdout2)

In [65]:
# create synthetic data for train set
sm = SMOTE()
Xsm_train2, ysm_train2 = sm.fit_resample(Xs_train2, y_train2)

In [66]:
# Instantiate model
logreg = LogisticRegression()

# Fit model
logreg.fit(Xs_train2, y_train2)

print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [-4.48467519]
Logistic Regression Coefficient: [[-2.84539335e-01 -2.42277797e-01  4.84172474e-01 -9.22336824e-01
   1.64020640e-01 -4.32521780e-02 -2.22748182e-01  3.73633499e-02
   1.47731157e-01 -2.30512490e-01 -2.21067959e-01  7.85127024e-02
  -4.45918678e-02  1.79736404e-02  9.40293021e-02 -1.22251185e-01
   0.00000000e+00  2.75973388e-02  1.05858532e-01  2.75973388e-02
  -1.00507319e-01  1.42261960e-02  0.00000000e+00 -3.31278442e-01
   0.00000000e+00  4.36152471e-02  0.00000000e+00  5.92570781e-01
   0.00000000e+00  6.24633727e-01  5.18538132e-01  2.32622824e-01
   2.17583247e-01 -1.10945522e-01 -5.85637578e-01  2.21551636e-01
   0.00000000e+00  0.00000000e+00  2.89941854e-01 -1.39768626e-01
  -1.67168059e-01  6.51022077e-02 -2.32254187e-01  1.86073368e-01
   6.62779790e-02  1.16660966e-02  1.41525824e-01  3.06517016e-02
   2.14595795e-01  8.79375855e-02  4.33173566e-02  1.03212026e-01
  -3.73360970e-01 -2.80398676e-01 -2.01125437e-01 -2.46521462e-0

In [67]:
logreg.score(Xs_train2, y_train2), logreg.score(Xs_holdout2, y_holdout2)

(0.945247270391779, 0.9446050096339114)

In [69]:
# Display Coefficients
coefs = pd.DataFrame({'variable':X2.columns,
                            'coef':logreg.coef_[0],
                            'abs_coef':np.abs(logreg.coef_[0])
                     })

coefs.sort_values('abs_coef', inplace=True, ascending=False)
coefs.head(40)

Unnamed: 0,variable,coef,abs_coef
3,preciptotal,-0.922337,0.922337
29,dayofweek_1,0.624634,0.624634
27,year_2013,0.592571,0.592571
34,species_CULEX RESTUANS,-0.585638,0.585638
30,dayofweek_2,0.518538,0.518538
2,tavg,0.484172,0.484172
52,trap_T017,-0.373361,0.373361
71,trap_T046,-0.37222,0.37222
183,trap_T900,0.366521,0.366521
210,codesum_TSRA BR HZ VCTS,-0.347207,0.347207


### Log-Reg Smote: Testing of importance of week data

In [75]:
# Smote-ing
logreg = LogisticRegression(max_iter=10_000)

# Fit model
logreg.fit(Xsm_train2, ysm_train2)

print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [-2.68930626]
Logistic Regression Coefficient: [[-7.39264541e-01 -3.36484294e-01  6.94687030e-01 -9.78585268e-01
   2.12214099e-01  4.56613975e-02 -3.25204655e-01  1.08251806e-02
   3.25023704e-01 -1.56176731e-01 -2.84403867e-01  2.84222221e-01
  -7.92633610e-02  2.26357345e-02  9.18743011e-02 -2.16048003e-01
   0.00000000e+00  2.53640321e-02  8.52633079e-02  2.53640321e-02
  -1.53301957e-01  2.95397131e-03  0.00000000e+00 -4.33890813e-01
   0.00000000e+00  1.11830653e-01  0.00000000e+00  7.77180224e-01
   0.00000000e+00  8.60721829e-01  5.82677011e-01  3.15506577e-01
   2.50425350e-01 -1.17804278e-01 -7.52136591e-01  4.80098447e-01
   0.00000000e+00  0.00000000e+00  5.13228136e-01 -1.83426220e-01
  -2.17098668e-01  1.01505453e-01 -3.11268754e-01  4.09105273e-01
   1.27292578e-01 -8.48815492e-04  3.07905335e-01  1.25021133e-01
   4.35414587e-01  1.24161846e-01  8.86097259e-02  1.49069879e-01
  -4.95413815e-01 -3.44942113e-01 -2.60622727e-01 -3.27283804e-0

In [76]:
#logreg of no-week data, with SMOTE
logreg.score(Xsm_train2, ysm_train2), logreg.score(Xs_holdout2, y_holdout2)

(0.8415463041631266, 0.7726396917148363)

In [77]:
# Display Coefficients
coefs = pd.DataFrame({'variable':X2.columns,
                            'coef':logreg.coef_[0],
                            'abs_coef':np.abs(logreg.coef_[0])
                     })

coefs.sort_values('abs_coef', inplace=True, ascending=False)
coefs.head(40)

Unnamed: 0,variable,coef,abs_coef
3,preciptotal,-0.978585,0.978585
29,dayofweek_1,0.860722,0.860722
183,trap_T900,0.808328,0.808328
27,year_2013,0.77718,0.77718
34,species_CULEX RESTUANS,-0.752137,0.752137
0,latitude,-0.739265,0.739265
2,tavg,0.694687,0.694687
30,dayofweek_2,0.582677,0.582677
38,trap_T003,0.513228,0.513228
210,codesum_TSRA BR HZ VCTS,-0.510484,0.510484


### Logistic Regression

In [78]:
# Instantiate model
logreg = LogisticRegression()

# Fit model
logreg.fit(Xs_train, y_train)

print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [-5.18323133]
Logistic Regression Coefficient: [[-2.90981683e-01 -2.39494869e-01  1.20605658e-01 -4.46864105e-02
  -1.77380567e-01  9.33355749e-02  4.64534896e-03  1.82344515e-02
   2.41123229e-01  3.66215617e-01 -2.51145035e-01  1.07436511e-01
   7.66391295e-02  2.13631312e-02  8.83433505e-02  4.47363859e-02
   0.00000000e+00  1.90894301e-03  1.23503032e-01  1.90894301e-03
  -1.58690488e-03 -1.24579015e-01  0.00000000e+00 -3.25159298e-01
   0.00000000e+00 -1.45101636e-01  0.00000000e+00  7.98163586e-01
   0.00000000e+00 -5.35010920e-01 -6.57602378e-01 -4.93489871e-01
  -5.27740017e-01 -2.67095294e-01 -1.76984073e-01 -3.81070128e-01
   1.45818271e-01  6.15254514e-01  6.89091224e-01  7.61921264e-01
   5.50280090e-01  4.46966814e-01  4.50268033e-01  4.20930769e-01
   3.64039416e-01 -2.12861050e-01  2.55178077e-01 -1.60453802e-01
   1.26126812e-01 -1.19745888e-03 -5.79895061e-01  9.20801592e-02
  -4.43876973e-02 -4.58541676e-01  2.23141936e-01  0.00000000e+0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [79]:
# Display Coefficients
coefs = pd.DataFrame({'variable':X.columns,
                            'coef':logreg.coef_[0],
                            'abs_coef':np.abs(logreg.coef_[0])
                     })

coefs.sort_values('abs_coef', inplace=True, ascending=False)
coefs.head(40)

Unnamed: 0,variable,coef,abs_coef
27,year_2013,0.798164,0.798164
39,week_33,0.761921,0.761921
38,week_32,0.689091,0.689091
30,week_24,-0.657602,0.657602
37,week_31,0.615255,0.615255
50,dayofweek_3,-0.579895,0.579895
40,week_34,0.55028,0.55028
212,codesum_RA BR,-0.541623,0.541623
29,week_23,-0.535011,0.535011
32,week_26,-0.52774,0.52774


In [80]:
logreg.score(Xs_train, y_train), logreg.score(Xs_holdout, y_holdout)

(0.9449261400128453, 0.9436416184971098)

In [21]:
# Summary scores of CountVectorizer and LogisticRegression.
print("LogisticRegression summary of accuracy scores:")
#print(f"GridSearchCV best accuracy = {round(gs_pipe.best_score_, 3)}")
print("\nUsing GridSearchCV best params suggested,")
print(f"Training corpus accuracy = {round(logreg.score(X_train, y_train), 3)}")
print(f"Testing corpus accuracy = {round(logreg.score(X_holdout, y_holdout), 3)}")

LogisticRegression summary of accuracy scores:

Using GridSearchCV best params suggested,
Training corpus accuracy = 0.055
Testing corpus accuracy = 0.055


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


### Logistic Regression - SMOTE

In [81]:
# Instantiate model
logreg = LogisticRegression()

# Fit model
logreg.fit(Xsm_train, ysm_train)

print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [-4.0393546]
Logistic Regression Coefficient: [[-0.7854128  -0.3540895   0.18976067 -0.26150111 -0.32217405  0.1409549
  -0.05158282 -0.03938765  0.40711489  0.5193404  -0.15713833  0.23154066
   0.10696026  0.06936505  0.10788299  0.10492512  0.         -0.00770498
   0.20037573 -0.00770498 -0.03383628 -0.16691723  0.         -0.53671252
   0.         -0.37623478  0.          1.08769044  0.         -1.0252048
  -0.96177798 -0.74243898 -0.39709599 -0.39176784  0.26683796 -0.64129842
   0.5978474   1.1932077   1.23122561  1.43604273  1.08548406  0.88142001
   0.81564063  0.89629061  0.84941832 -0.08996238  0.60263988 -0.22123119
   0.15748479 -0.14029242 -0.84769282  0.13066833 -0.0243645  -0.60596868
   0.52484629  0.          0.          0.55668629 -0.16942742 -0.20616249
   0.09037879 -0.30509608  0.48930478  0.10580869 -0.01293925  0.35616158
   0.14988916  0.42151377  0.12623674  0.11358225  0.20168006 -0.52514597
  -0.3675839  -0.26014399 -0.32321204

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [84]:
#log-reg of with-week data, with SMOTE
logreg.score(Xsm_train, ysm_train), logreg.score(Xs_holdout, y_holdout)

(0.8620220900594733, 0.7952793834296724)

In [82]:
# Display Coefficients
coefs = pd.DataFrame({'variable':X.columns,
                            'coef':logreg.coef_[0],
                            'abs_coef':np.abs(logreg.coef_[0])
                     })

coefs.sort_values('abs_coef', inplace=True, ascending=False)
coefs.head(40)

Unnamed: 0,variable,coef,abs_coef
39,week_33,1.436043,1.436043
38,week_32,1.231226,1.231226
37,week_31,1.193208,1.193208
27,year_2013,1.08769,1.08769
40,week_34,1.085484,1.085484
29,week_23,-1.025205,1.025205
30,week_24,-0.961778,0.961778
43,week_37,0.896291,0.896291
41,week_35,0.88142,0.88142
44,week_38,0.849418,0.849418


(0.8620220900594733, 0.7952793834296724)

### Multinomial NB

### Random Forest

### SVM

### AdaBoostClassifier

In [48]:
ada = AdaBoostClassifier(
    n_estimators = 100,
    learning_rate = 0.9
    # in general, if n_estimators is high, then learning_rate should be low
    # but we can use GridSearchCV
)
ada.fit(X_train, y_train)
ada.score(X_holdout, y_holdout)

0.9450867052023122

In [49]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ada_params = {
    'n_estimators': [50,100, 150],
    'base_estimator__max_depth': [1,2],
    'learning_rate': [.8, .9, 1.]
    # you can also tune by all hyperparameters of DecisionTree
}
gs = GridSearchCV(
    ada, 
    param_grid=ada_params, 
    cv=5
)
gs.fit(X_train, y_train)
gs.score(X_holdout, y_holdout)

0.9421965317919075

In [50]:
tn, fp, fn, tp = confusion_matrix(y_holdout, gs.predict(Xs_holdout)).ravel()

  "X does not have valid feature names, but"


In [51]:
sensitivity = tp / (tp + fn)
sensitivity

0.0

In [52]:
precision = tp / (tp + fp)
precision

  """Entry point for launching an IPython kernel.


nan

In [53]:
ada = AdaBoostClassifier(
    n_estimators = 100,
    learning_rate = 0.9
    # in general, if n_estimators is high, then learning_rate should be low
    # but we can use GridSearchCV
)
ada.fit(Xsm_train, ysm_train)
ada.score(Xs_holdout, y_holdout)

0.9089595375722543

In [None]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ada_params = {
    'n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [1,2],
    'learning_rate': [.8, .9, 1.]
    # you can also tune by all hyperparameters of DecisionTree
}
gs = GridSearchCV(
    ada, 
    param_grid=ada_params, 
    cv=5
)
gs.fit(Xsm_train, ysm_train)
gs.score(Xs_holdout, y_holdout)

In [None]:
tn, fp, fn, tp = confusion_matrix(y_holdout, gs.predict(Xs_holdout)).ravel()

In [None]:
sensitivity = tp / (tp + fn)
sensitivity

In [None]:
precision = tp / (tp + fp)
precision

In [None]:
model_metrics(gs, Xsm_train, ysm_train, Xs_holdout, y_holdout, 'ADA Boost Classifier')

In [None]:
summary_df

In [None]:
print(gs.best_score_)
gs.best_params_

In [None]:
pd.Series(gs.best_estimator_.feature_importances_).plot.bar()

### Gradient Boosting Classifier

In [None]:
gboost = GradientBoostingClassifier()
gboost_params = {
    'max_depth': [2, 3, 4],
    'n_estimators': [100, 125, 150],
    'learning_rate': [.08, .1, .12]
}
gb_gs = GridSearchCV(
    gboost, 
    param_grid=gboost_params, 
    cv=3
)
gb_gs.fit(Xsm_train, ysm_train)
gb_gs.score(Xs_holdout, y_holdout)

In [None]:
print(gb_gs.best_score_)
gb_gs.best_params_

In [None]:
tn, fp, fn, tp = confusion_matrix(y_holdout, gb_gs.predict(Xs_holdout)).ravel()

In [None]:
sensitivity = tp / (tp + fn)
sensitivity

In [None]:
precision = tp / (tp + fp)
precision

In [None]:
confusion_matrix(y_holdout, gb_gs.predict(Xs_holdout))

In [None]:
model_metrics(gb_gs, X_train, y_train, X_holdout, y_holdout, 'Gradient Boosting Classifier')

### Neutro Network

In [None]:
model = Sequential()
model.add(Dense(32,
                input_shape=(238,),
                activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

In [None]:
history = model.fit(Xsm_train, ysm_train, validation_data=(Xs_holdout, y_holdout), epochs=10, batch_size=512, verbose=0)

In [None]:
pd.DataFrame(model.predict_classes(Xs_holdout))[0].value_counts()

In [None]:
plt.plot(history.history['loss'], label='Train loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend();

In [None]:
plt.plot(history.history['accuracy'], label='Train accuracy')
plt.plot(history.history['val_accuracy'], label='Val accuracy')
plt.legend();

### AUC-ROC Curve

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12,10))
plot_roc_curve(cvec_lr_gs, X_test, y_test, ax=ax, name='LogisticRegression-CVEC(GS)', color='lightgrey')
plot_roc_curve(cvec_nb_gs, X_test, y_test, ax=ax, name='MultinomialNB-CVEC(GS)', color='lightgrey')
plot_roc_curve(tvec_svc_gs, X_test, y_test, ax=ax, name='SupportVectorClassifier-TVEC(GS)', color='lightgrey')
plot_roc_curve(tvec_lr_gs, X_test, y_test, ax=ax, name='LogisticRegression-TVEC(GS)', color='blue')
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random Guess')
plt.legend()