### This notebook will use the Train and Test CSV's to build and tune the following models: Decision Tree, Bagged Tree, Boosted Tree, Extra Tree, and Random Forest.

### Imports

In [155]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
import os
from imblearn.over_sampling import RandomOverSampler
os.chdir('/Users/gil/desktop/notebooks/DSI-US-4/Project4/')
pd.set_option('display.max_columns', 500)

### Import CSV's to a pandas Dataframe and set X and y

In [156]:
train = pd.read_csv('./merged_train_weather-finalv2')

### Do some EDA on the Cleaned Data

In [157]:
train.head()

Unnamed: 0.1,Unnamed: 0,Date,BCFG,BR,CALM,DZ,FG,FG+,FU,GR,HZ,MIFG,RA,SN,SQ,TS,TSRA,VCFG,VCTS,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Latitude,Longitude,WnvPresent,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,month_10,month_5,month_6,month_7,month_8,month_9,daytime
0,0,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,0.0,29.415,30.1,5.8,17.0,6.95,41.95469,-87.800991,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1496
1,1,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,0.0,29.415,30.1,5.8,17.0,6.95,41.95469,-87.800991,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1496
2,2,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,0.0,29.415,30.1,5.8,17.0,6.95,41.994991,-87.769279,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1496
3,3,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,0.0,29.415,30.1,5.8,17.0,6.95,41.974089,-87.824812,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1496
4,4,2007-05-29,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,88.0,62.5,75.25,21.25,58.5,65.5,0.0,10.5,0.0,29.415,30.1,5.8,17.0,6.95,41.974089,-87.824812,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1496


### Drop Unnamed and Date Column and Split Dataset into categorical and numerical for scaling.

In [158]:
train = train.drop(columns=['Unnamed: 0','Date'])

In [159]:
train.columns

Index(['BCFG', 'BR', 'CALM', 'DZ', 'FG', 'FG+', 'FU', 'GR', 'HZ', 'MIFG', 'RA',
       'SN', 'SQ', 'TS', 'TSRA', 'VCFG', 'VCTS', 'Tmax', 'Tmin', 'Tavg',
       'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'PrecipTotal',
       'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
       'Latitude', 'Longitude', 'WnvPresent', 'Species_CULEX ERRATICUS',
       'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS',
       'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS',
       'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS', 'month_10',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'daytime'],
      dtype='object')

In [160]:
dummy = train.drop(columns=['Tmax','Tmin','Tavg','Depart','DewPoint','WetBulb','Heat','Cool','daytime','PrecipTotal','StnPressure','SeaLevel','ResultSpeed','ResultDir','AvgSpeed','Latitude','Longitude'])

In [161]:
numerical = train.drop(columns=['BCFG', 'BR', 'CALM', 'DZ', 'FG', 'FG+', 'FU', 'GR', 'HZ', 'MIFG', 'RA',
       'SN', 'SQ', 'TS', 'TSRA', 'VCFG', 'VCTS', 'WnvPresent',
       'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS',
       'Species_CULEX TERRITANS', 'month_10', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9'])

In [162]:
y = train['WnvPresent']

In [163]:
ss = StandardScaler()
scaled_num = ss.fit_transform(numerical,y)

In [164]:
scaled_num = pd.DataFrame(scaled_num,columns=numerical.columns)

In [165]:
fin_train = pd.concat([scaled_num,dummy], axis=1, ignore_index=False)

In [166]:
fin_train = fin_train.drop(columns=['WnvPresent'])

In [167]:
X = fin_train
y = train['WnvPresent']

### Train / Test Split the Dataset 70 / 30

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Balance the Dataset

In [169]:
rus = RandomOverSampler(random_state=42)
X_res, y_res = rus.fit_sample(X_train, y_train)

In [170]:
X_bal = pd.DataFrame(X_res,columns=fin_train.columns)

In [171]:
y_bal = pd.DataFrame(y_res,columns=['WnvPresent'])

### Decision Tree Model

In [172]:
dt = DecisionTreeClassifier()
dt_params = {'max_depth':[None,1,2,3,4,5]}
gs = GridSearchCV(dt, param_grid=dt_params)
gs.fit(X_bal,y_bal['WnvPresent'])
print(gs.best_score_)
print(gs.best_params_)

0.945952023988006
{'max_depth': None}


In [173]:
dt_2 = DecisionTreeClassifier(max_depth=None)
dt_2.fit(X_bal,y_bal)
dt_2.score(X_test,y_test)

0.893567926160946

In [174]:
confusion_matrix(y_test, dt_2.predict(X_test), labels=None, sample_weight=None)

array([[3041,  244],
       [ 125,   57]])

In [175]:
roc_auc_score(y_test, dt_2.predict(X_test))

0.6194548982220215

In [176]:
feature = pd.DataFrame(dt_2.feature_importances_)
feature['names'] = X_train.columns
feature.columns = ['values', 'names']
feature.sort_values(by=['values'],ascending=False).head(20)

Unnamed: 0,values,names
16,0.264007,daytime
15,0.224428,Longitude
14,0.158373,Latitude
0,0.07356,Tmax
36,0.037076,Species_CULEX PIPIENS/RESTUANS
35,0.035559,Species_CULEX PIPIENS
11,0.034237,ResultSpeed
9,0.030662,StnPressure
37,0.018748,Species_CULEX RESTUANS
10,0.015353,SeaLevel


### Bagged Ensemble

In [177]:
ba= BaggingClassifier()
ba_params = {'n_estimators':[25,30,35]}
gs = GridSearchCV(ba, param_grid=ba_params)
gs.fit(X_bal,y_bal['WnvPresent'])
print(gs.best_score_)
print(gs.best_params_)

0.9536731634182909
{'n_estimators': 30}


In [178]:
ba_2 = BaggingClassifier(n_estimators=30)
ba_2.fit(X_bal,y_bal['WnvPresent'])
ba_2.score(X_test,y_test)

0.901067205076435

In [179]:
confusion_matrix(y_test, ba_2.predict(X_test), labels=None, sample_weight=None)

array([[3065,  220],
       [ 123,   59]])

In [180]:
roc_auc_score(y_test, ba_2.predict(X_test))

0.6286023717530567

### Boosted Tree Model

In [181]:
gt = GradientBoostingClassifier()
gt_params = {}
gt = GridSearchCV(gt, param_grid=dt_params)
gs.fit(X_bal,y_bal['WnvPresent'])
print(gs.best_score_)
print(gs.best_params_)

0.9532983508245877
{'n_estimators': 35}


In [182]:
gt_2 = GradientBoostingClassifier(max_depth= None, max_features=4, min_samples_leaf=2, min_samples_split= 3, n_estimators= 2)
gt_2.fit(X_bal,y_bal['WnvPresent'])
gt_2.score(X_test,y_test)

0.8932794923565042

In [183]:
confusion_matrix(y_test, gt_2.predict(X_test), labels=None, sample_weight=None)

array([[3037,  248],
       [ 122,   60]])

In [184]:
roc_auc_score(y_test, gt_2.predict(X_test))

0.6270878284576914

In [185]:
feature = pd.DataFrame(gt_2.feature_importances_)
feature['names'] = X_train.columns
feature.columns = ['values', 'names']
feature.sort_values(by=['values'],ascending=False).head(20)

Unnamed: 0,values,names
15,0.24463,Longitude
14,0.216543,Latitude
16,0.121397,daytime
5,0.044408,WetBulb
2,0.037853,Tavg
37,0.02891,Species_CULEX RESTUANS
7,0.02708,Cool
11,0.024686,ResultSpeed
45,0.022256,month_8
0,0.021854,Tmax


### Random Forest Ensemble

In [186]:
rf = RandomForestClassifier()
rf_params = {'n_estimators': [4,5,6,7,8,9,10],'max_features':[3,4,5],'max_depth':[None,2,3,4]}
gs = GridSearchCV(rf, param_grid=rf_params)
gs.fit(X_bal,y_bal['WnvPresent'])
print(gs.best_score_)
print(gs.best_params_)

0.9562968515742128
{'max_depth': None, 'max_features': 3, 'n_estimators': 4}


In [187]:
rf_2 = RandomForestClassifier(max_depth=None,max_features=4,n_estimators=8)
rf_2.fit(X_bal,y_bal['WnvPresent'])
rf_2.score(X_test,y_test)

0.8973175656186905

In [188]:
confusion_matrix(y_test, rf_2.predict(X_test), labels=None, sample_weight=None)

array([[3059,  226],
       [ 130,   52]])

In [189]:
roc_auc_score(y_test, rf_2.predict(X_test))

0.6084583605131549

In [190]:
feature = pd.DataFrame(rf_2.feature_importances_)
feature['names'] = X_train.columns
feature.columns = ['values', 'names']
feature.sort_values(by=['values'],ascending=False).head(20)

Unnamed: 0,values,names
15,0.257453,Longitude
14,0.234146,Latitude
16,0.065692,daytime
7,0.038125,Cool
13,0.035065,AvgSpeed
5,0.031697,WetBulb
2,0.031101,Tavg
35,0.026602,Species_CULEX PIPIENS
0,0.026572,Tmax
10,0.023705,SeaLevel


### Extra Tree Ensemble

In [191]:
ex = ExtraTreesClassifier()
ex_params = {'n_estimators': [1,2,3,4,5],'max_features':[3,4,5],'max_depth':[None,2,3,4],'min_samples_leaf':[1,2,3],'min_samples_split':[2,3,4]}
gs = GridSearchCV(ex, param_grid=ex_params)
gs.fit(X_bal,y_bal['WnvPresent'])
print(gs.best_score_)
print(gs.best_params_)

0.9625937031484257
{'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 2}


In [192]:
ex_2 = ExtraTreesClassifier(max_depth=None,max_features=3,n_estimators=2,min_samples_split=2,min_samples_leaf=1)
ex_2.fit(X_bal,y_bal['WnvPresent'])
ex_2.score(X_test,y_test)

0.9045284107297376

In [193]:
confusion_matrix(y_test, ex_2.predict(X_test), labels=None, sample_weight=None)

array([[3089,  196],
       [ 135,   47]])

In [194]:
roc_auc_score(y_test, ex_2.predict(X_test))

0.5992883068225534

In [195]:
feature = pd.DataFrame(ex_2.feature_importances_)
feature['names'] = X_train.columns
feature.columns = ['values', 'names']
feature.sort_values(by=['values'],ascending=False).head(20)

Unnamed: 0,values,names
15,0.256434,Longitude
14,0.234043,Latitude
16,0.073693,daytime
45,0.067148,month_8
7,0.052162,Cool
37,0.034014,Species_CULEX RESTUANS
3,0.027545,Depart
35,0.027076,Species_CULEX PIPIENS
43,0.024566,month_6
36,0.022407,Species_CULEX PIPIENS/RESTUANS
