# Import Libraries

In [4]:
import pandas as pd
# Let pandas render up to 500 columns and 500 rows before truncating a frame
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import svm

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Import Train/Test Data

In [5]:
# Load the prepared training and test tables.
# The training file uses its first column as the index; the test file is indexed by 'Id'.
train_csv = './west-nile-project-4/data/weather-data/02_weather_spray_train.csv'
test_csv = './west-nile-project-4/data/weather-data/02_weather_spray_test.csv'

train = pd.read_csv(train_csv, index_col=0)
test = pd.read_csv(test_csv, index_col='Id')

In [6]:
# Row count per target class in the training table
train['WnvPresent'].value_counts()

0    9955
1    9951
Name: WnvPresent, dtype: int64

# Set X and y variables

In [7]:
# Target is WnvPresent; the feature matrix is every remaining column except
# the mosquito count, the coordinates and the spray indicator.
excluded_cols = ['NumMosquitos', 'WnvPresent', 'Latitude', 'Longitude', 'Sprays_In_Last_Month']
X = train.drop(columns=excluded_cols)
y = train['WnvPresent']

# Train Test Split

In [8]:
# Hold out a validation split (train_test_split's default proportions).
# random_state is pinned so a fresh "Restart & Run All" reproduces the same
# split, and therefore the same scores, in every cell below.
# NOTE(review): WnvPresent is almost exactly 50/50 in `train`, which looks like
# the positive class was upsampled before this file was written. If so, copies
# of the same row can land on both sides of this split and inflate the
# held-out scores. Confirm against the upstream preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standard Scaler

In [9]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both splits.
ss = StandardScaler()
ss.fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

# Cross Val Score Potential Classification Models

In [10]:
# Baseline comparison: mean cross-validated score (cross_val_score defaults)
# of each candidate classifier on the scaled training split.
# Every estimator that accepts a random_state gets a fixed one so the printed
# ranking is reproducible from a fresh kernel.
lr = LogisticRegression(random_state=42)
knn = KNeighborsClassifier()  # takes no random_state
#nb = MultinomialNB() won't take neg vals
dt = DecisionTreeClassifier(random_state=42)
et = ExtraTreeClassifier(random_state=42)
bag = BaggingClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
ad = AdaBoostClassifier(random_state=42)
#svm1 = svm.SVC() #too slow, not strong performer

# (label, estimator) pairs in the order they should be reported.
# nb and svm are intentionally left out (see the notes above).
candidates = [
    ('lr', lr),
    ('knn', knn),
    ('dt', dt),
    ('et', et),
    ('bag', bag),
    ('rf', rf),
    ('gb', gb),
    ('ad', ad),
]

for clf_name, clf in candidates:
    print(clf_name, cross_val_score(clf, X_train, y_train).mean())

lr 0.7447239007979419
knn 0.8621470085677198
dt 0.931609194589921
et 0.9335517585247247
bag 0.9309397163299588
rf 0.9462790073870794
gb 0.8005243685442146
ad 0.7711843121575327


# Making an Interpretable Equation

In [11]:
# Fit the logistic regression on the scaled training split and rank its
# coefficients from most positive to most negative.
lr.fit(X_train, y_train)

# One row per feature, a single 'coef' column (coef_ holds one row of weights here)
lr_coefs = pd.DataFrame({'coef': lr.coef_[0]}, index=X.columns)
lr_coefs['coef'].sort_values(ascending=False)
# Larger positive coefficients push the prediction toward WnvPresent = 1

WetBulb                           1.384336
SeaLevel                          1.187198
TS_dummy                          0.561065
FG_dummy                          0.545424
Tmin                              0.372405
Species_CULEX PIPIENS             0.367546
AvgSpeed                          0.293973
Trap_T002                         0.182876
Trap_T003                         0.171800
Species_CULEX PIPIENS/RESTUANS    0.149579
ResultDir                         0.141900
zipcode_60666                     0.141460
Trap_T900                         0.141460
Trap_T228                         0.139513
Trap_T013                         0.132356
Trap_T225                         0.129771
Trap_T230                         0.122289
Trap_T223                         0.122037
Trap_T215                         0.116259
Trap_T155                         0.114328
Trap_T096                         0.103182
Trap_T143                         0.103176
zipcode_60631                     0.100405
Trap_T027  

# ROC AUC Score

In [12]:
# ROC AUC of the logistic regression on the held-out split,
# scored on the predicted probability of the positive class (column 1)
lr_proba = lr.predict_proba(X_test)
lr_positive_proba = lr_proba[:, 1]
roc_auc_score(y_test, lr_positive_proba)

0.8069875020629653

# Random Forest Model

In [13]:
# Fit the random forest on the scaled training split, then report its accuracy
# on the held-out split. The score call is the cell's last expression so the
# displayed value is the accuracy rather than the fitted estimator's repr.
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.951979103877838

In [14]:
# ROC AUC of the random forest on the held-out split,
# scored on the predicted probability of the positive class (column 1)
rf_proba = rf.predict_proba(X_test)
rf_positive_proba = rf_proba[:, 1]
roc_auc_score(y_test, rf_positive_proba)

0.9883315284740902

# Prepping for Export

In [15]:
# Build the scaled feature matrix for the test table.
# First drop the columns that are not model inputs, then select the training
# feature columns by name, in training order. This guarantees each column is
# scaled with the statistics learned for that same feature, and raises a
# KeyError if the test table is missing any training feature instead of
# silently misaligning columns.
test_features = test.drop(['Date', 'Latitude', 'Longitude', 'Sprays_In_Last_Month'], axis=1)
test_ss = ss.transform(test_features[X.columns])

In [16]:
# Class labels predicted by the fitted random forest for every scaled test row
rf_preds = rf.predict(X=test_ss)
rf_preds

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
# Write the predictions to CSV: one 'WnvPresent' column, indexed by the test table's index
submission = pd.DataFrame({'WnvPresent': rf_preds}, index=test.index)
submission.to_csv('./preds4.csv')