# Import Libraries

In [1]:
import pandas as pd
# Raise pandas' display caps so wide / long frames are shown without truncation.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, 500)

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import svm

# Basic Model
## Import Train/Test Data

In [2]:
# Load the train and test CSVs, using each file's first column as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./data/west_nile_train.csv', './data/west_nile_test.csv')
)

## Set X and y variables

In [3]:
# Target: the WnvPresent column.
# Features: every other column except Date and NumMosquitos.
y = train['WnvPresent']
X = train.drop(columns=['Date', 'NumMosquitos', 'WnvPresent'])

## Train Test Split

In [4]:
# Hold out part of the data for a final check of the chosen model.
# random_state pins the shuffle so Restart & Run All reproduces the same
# partition (and therefore the same scores); stratify=y keeps the proportion
# of WnvPresent classes the same in the train and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

## Cross Val Score Potential Classification Models

In [5]:
# Cross-validate every candidate classifier on the training split and print
# the mean score for each one.
# NOTE(review): cross_val_score uses each estimator's default scorer
# (accuracy). If WnvPresent is imbalanced in this file, accuracy will be
# dominated by the majority class -- consider adding scoring='roc_auc'.

# Seed for the estimators that use randomness, so scores are reproducible.
SEED = 42

lr = LogisticRegression(random_state=SEED)
knn = KNeighborsClassifier()
# MultinomialNB is deliberately left out: it won't take negative feature values.
dt = DecisionTreeClassifier(random_state=SEED)
et = ExtraTreeClassifier(random_state=SEED)
bag = BaggingClassifier(random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)
gb = GradientBoostingClassifier(random_state=SEED)
ad = AdaBoostClassifier(random_state=SEED)
svm1 = svm.SVC()

# Label -> estimator, in the order the scores should be printed.
candidate_models = {
    'lr': lr,
    'knn': knn,
    'dt': dt,
    'et': et,
    'bag': bag,
    'rf': rf,
    'gb': gb,
    'ad': ad,
    'svm': svm1,
}

for label, model in candidate_models.items():
    print(label, cross_val_score(model, X_train, y_train).mean())

lr 0.9473283716764404
knn 0.9455515608074526
dt 0.9464399179222774
et 0.9464399179222774
bag 0.9463129821517772
rf 0.9465668536927776
gb 0.9468207252337778
ad 0.9473283716764404
svm 0.9473283716764404


- All models produce very similar cross-validation scores (roughly 0.945–0.947).
- Logistic regression was chosen for ease of interpretability.

In [6]:
# Train logistic regression on the training split, then display its score
# on the held-out split.
lr.fit(X_train, y_train).score(X_test, y_test)

0.948229920060906

In [7]:
# One row per feature with its fitted coefficient, shown largest first.
# Higher coefficients are more likely to indicate WNV.
lr_coefs = pd.DataFrame(lr.coef_.T, index=X.columns, columns=['coef'])
lr_coefs['coef'].sort_values(ascending=False)

Species_CULEX PIPIENS             1.440582
Trap_T228                         1.287088
Trap_T003                         1.193752
zipcode_60631                     1.172928
Trap_T014                         1.095087
Trap_T096                         0.998186
Trap_T082                         0.967534
Trap_T230                         0.888810
Trap_T225                         0.867154
Species_CULEX PIPIENS/RESTUANS    0.782485
Trap_T143                         0.775290
Trap_T114                         0.767602
Trap_T005                         0.714212
Trap_T027                         0.692716
Trap_T231                         0.670558
Trap_T002                         0.660451
Trap_T011                         0.653571
zipcode_60656                     0.625830
Trap_T235                         0.622289
zipcode_60621                     0.622289
Trap_T227                         0.564517
Trap_T215                         0.532436
zipcode_60666                     0.510498
Trap_T900  

# Balanced Model
## Import Data

In [8]:
# Load the balanced training CSV; its first column becomes the index.
balenced_train = pd.read_csv(
    './data/BALANCED_west_nile_train.csv',
    index_col=0,
)

In [9]:
# Confirm the two WnvPresent classes are (nearly) equally represented.
balenced_train['WnvPresent'].value_counts()

0    9955
1    9951
Name: WnvPresent, dtype: int64

## Set X and y variables

In [18]:
# Target: the WnvPresent column of the balanced set.
# Features: every other column except Date, NumMosquitos, Latitude and Longitude.
y = balenced_train['WnvPresent']
X = balenced_train.drop(
    columns=['Date', 'NumMosquitos', 'WnvPresent', 'Latitude', 'Longitude']
)

## Train Test Split

In [19]:
# Hold out part of the balanced data for evaluation.
# random_state pins the shuffle so Restart & Run All reproduces the same
# partition; stratify=y keeps the class proportions equal in both portions.
# NOTE(review): if the balanced file was produced by oversampling (duplicating
# positive rows), identical rows can land on both sides of this split and
# inflate scores -- confirm how BALANCED_west_nile_train.csv was built.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

## Cross Val Score Potential Classification Models

In [20]:
# Cross-validate every candidate classifier on the balanced training split
# and print the mean score (each estimator's default scorer) for each one.

# Seed for the estimators that use randomness, so scores are reproducible.
SEED = 42

lr = LogisticRegression(random_state=SEED)
knn = KNeighborsClassifier()
# MultinomialNB is deliberately left out: it won't take negative feature values.
dt = DecisionTreeClassifier(random_state=SEED)
et = ExtraTreeClassifier(random_state=SEED)
bag = BaggingClassifier(random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)
gb = GradientBoostingClassifier(random_state=SEED)
ad = AdaBoostClassifier(random_state=SEED)
# svm.SVC is skipped here: it takes a long time and doesn't score
# particularly well (about .644 in an earlier run).

# Label -> estimator, in the order the scores should be printed.
candidate_models = {
    'lr': lr,
    'knn': knn,
    'dt': dt,
    'et': et,
    'bag': bag,
    'rf': rf,
    'gb': gb,
    'ad': ad,
}

for label, model in candidate_models.items():
    print(label, cross_val_score(model, X_train, y_train).mean())


lr 0.6881901118133769
knn 0.6634062911256734
dt 0.7114337824302618
et 0.7111658430495176
bag 0.7131754355135445
rf 0.711031651276472
gb 0.6825635732515337
ad 0.6687646480347648


In [21]:
# Fit logistic regression on the balanced training split, then list one
# coefficient per feature, largest first.
lr.fit(X_train, y_train)
lr_coefs = pd.DataFrame(lr.coef_.T, index=X.columns, columns=['coef'])
lr_coefs['coef'].sort_values(ascending=False)
#higher coefs are more likely to indicate WNV

Trap_T228                         2.287452
Trap_T225                         2.241649
Trap_T003                         2.239990
Species_CULEX PIPIENS             2.147821
Trap_T230                         1.612484
Species_CULEX PIPIENS/RESTUANS    1.595746
Trap_T096                         1.495935
zipcode_60631                     1.480597
Trap_T014                         1.457076
Trap_T128                         1.397276
Trap_T231                         1.392636
Trap_T215                         1.345216
Trap_T155                         1.327030
Trap_T047                         1.250585
Trap_T086                         1.170282
Trap_T027                         1.148324
Trap_T005                         1.135671
Trap_T070                         1.127253
Trap_T002                         1.046191
zipcode_60656                     1.033332
Trap_T013                         1.018753
Trap_T082                         0.970078
Trap_T114                         0.960377
Trap_T107  