# Import Libraries

In [60]:
import pandas as pd
# Raise pandas' display limits so frames with up to 500 columns / 500 rows
# are rendered in full instead of being truncated with "...".
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import svm

# Import Train/Test Data

In [47]:
# Load the prepared modelling tables.
# Paths are relative to the notebook's working directory.
train_csv = './west-nile-project-4/data/weather-data/02_weather_spray_train.csv'
test_csv = './west-nile-project-4/data/weather-data/02_weather_spray_test.csv'

# The training file uses its first column as the row index; the test file is
# indexed by its 'Id' column, which is reused later as the index of the export.
train = pd.read_csv(train_csv, index_col=0)
test = pd.read_csv(test_csv, index_col='Id')

In [48]:
# Count rows per target class to check how balanced the training data is.
# NOTE(review): the recorded counts are almost exactly 50/50. If that balance
# comes from oversampling done upstream, duplicated rows could land on both
# sides of the train/test split made later in this notebook -- confirm.
train['WnvPresent'].value_counts()

0    9955
1    9951
Name: WnvPresent, dtype: int64

# Set X and y variables

In [64]:
# Separate the predictors from the target.
# 'WnvPresent' is the label itself; the other listed columns are excluded from
# the feature matrix.
# NOTE(review): 'NumMosquitos' is presumably dropped because it is not available
# in the test file (it is absent from the test-side drop list) -- confirm.
non_feature_cols = ['NumMosquitos', 'WnvPresent', 'Latitude', 'Longitude', 'Sprays_In_Last_Month']
X = train.drop(columns=non_feature_cols)
y = train['WnvPresent']

# Train Test Split

In [65]:
# Hold out part of the labelled data (default 25%) for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same scores, on every run.
# stratify=y keeps the WnvPresent class proportions identical in both halves.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

# Cross Val Score Potential Classification Models

In [66]:
# Cross-validate every candidate classifier on the training split and print the
# mean score (cross_val_score's default scorer for classifiers is accuracy).
#
# Every estimator that uses randomness gets a fixed random_state so the printed
# scores, and the models fitted in later cells, are reproducible across runs.
RANDOM_STATE = 42

lr = LogisticRegression(random_state=RANDOM_STATE)
knn = KNeighborsClassifier()  # deterministic, takes no random_state
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
et = ExtraTreeClassifier(random_state=RANDOM_STATE)
bag = BaggingClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
ad = AdaBoostClassifier(random_state=RANDOM_STATE)

# Deliberately excluded candidates:
#   - MultinomialNB: rejects the negative values present in the features.
#   - svm.SVC: too slow to cross-validate here and not a strong performer.

# (label, estimator) pairs in the order they are reported.
candidates = [
    ('lr', lr),
    ('knn', knn),
    ('dt', dt),
    ('et', et),
    ('bag', bag),
    ('rf', rf),
    ('gb', gb),
    ('ad', ad),
]

for candidate_name, candidate in candidates:
    print(candidate_name, cross_val_score(candidate, X_train, y_train).mean())

lr 0.7384941989432798
knn 0.8717254748728572
dt 0.9283933295840935
et 0.9322782689820409
bag 0.9304025451751423
rf 0.9409198426665509
gb 0.8032005584146032
ad 0.7670299390597082
svm 0.7757370066459819


# Making an Interpretable Equation

In [67]:
# Fit the logistic regression on the training split and list its coefficients,
# largest first. A larger positive coefficient pushes the prediction towards
# WnvPresent == 1.
# NOTE(review): no feature scaling is visible in this notebook, so coefficient
# magnitudes may not be directly comparable across features -- confirm.
lr.fit(X_train, y_train)

# coef_ has one row for a binary problem; store it as a single 'coef' column
# indexed by feature name.
lr_coefs = pd.DataFrame({'coef': lr.coef_[0]}, index=X.columns)
lr_coefs['coef'].sort_values(ascending=False)
#higher coefs are more likely to indicate WNV

FG_dummy                          4.930031
SeaLevel                          2.869533
Trap_T003                         2.470125
Species_CULEX PIPIENS             2.018140
Trap_T215                         1.907735
Trap_T225                         1.843590
Trap_T228                         1.841689
Species_CULEX PIPIENS/RESTUANS    1.603058
Trap_T230                         1.586091
Trap_T013                         1.563639
Trap_T002                         1.472549
zipcode_60631                     1.319873
Trap_T096                         1.286431
Trap_T014                         1.243646
Trap_T107                         1.184685
Trap_T231                         1.163873
Trap_T226                         1.130865
Trap_T223                         1.116895
Trap_T086                         1.084546
Trap_T047                         1.075692
Trap_T147                         1.065068
Trap_T070                         1.055791
TS_dummy                          1.051935
Trap_T027  

# ROC AUC Score

In [68]:
# ROC AUC of the logistic regression on the held-out split.
# predict_proba returns one column per class; column 1 is the probability of
# the positive class (WnvPresent == 1), which is what roc_auc_score ranks on.
lr_proba = lr.predict_proba(X_test)
lr_positive_proba = lr_proba[:, 1]
roc_auc_score(y_test, lr_positive_proba)

0.8133292112303783

# Random Forest Model

In [69]:
# Fit the random forest on the training split and report its mean accuracy on
# the held-out split (fit returns the estimator itself, so the calls chain).
rf.fit(X_train, y_train).score(X_test, y_test)

0.9574040586698814

In [70]:
# ROC AUC of the random forest on the held-out split, computed from the
# predicted probability of the positive class (column 1 of predict_proba).
rf_proba = rf.predict_proba(X_test)
rf_positive_proba = rf_proba[:, 1]
roc_auc_score(y_test, rf_positive_proba)

0.9889877195996791

# Prepping for Export

In [71]:
# Predict WnvPresent for the competition test set with the fitted random forest.
#
# The feature matrix is built by selecting exactly the columns the model was
# trained on, in the same order, rather than by dropping a second hand-written
# list ('Date', 'Latitude', 'Longitude', 'Sprays_In_Last_Month'). The two lists
# can drift apart: a differing column order or set with the same feature count
# would otherwise produce silently wrong predictions. If the test file lacks a
# training column, this selection raises a KeyError naming it.
test_features = test[list(X.columns)]
rf_preds = rf.predict(test_features)
rf_preds

array([0, 0, 0, ..., 0, 0, 0])

In [72]:
# Write the predictions to disk as a one-column CSV ('WnvPresent'),
# keyed by the test set's index so each row can be matched back to its record.
submission = pd.DataFrame({'WnvPresent': rf_preds}, index=test.index)
submission.to_csv('./preds3.csv')