# Import Libraries

In [2]:
import pandas as pd
pd.set_option("display.max_columns",500)
pd.set_option("display.max_rows",500)

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import svm

# Import Train/Test Data

In [14]:
#import data
balanced_train = pd.read_csv('./west-nile-project-4/data/BALANCED_west_nile_train.csv', index_col=0)
test = pd.read_csv('./west-nile-project-4/data/west_nile_test.csv', index_col='Id')
spray = pd.read_csv('./west-nile-project-4/data/spray.csv')
train_spray_dummy = pd.read_csv('./west-nile-project-4/data/west_nile_train_spray.csv', index_col=0)
pd.read_csv('./', index_col=0)

# Set X and y variables

In [16]:
#id X and y
X = train_spray_dummy.drop(['Date', 'NumMosquitos', 'WnvPresent', 'Latitude', 'Longitude'], axis=1)
y = train_spray_dummy.WnvPresent

# Train Test Split

In [17]:
#train split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Cross Val Score Potential Classification Models

In [18]:
#cross val score all potential classification models
lr = LogisticRegression() 
knn = KNeighborsClassifier() 
#nb = MultinomialNB() won't take neg vals
dt = DecisionTreeClassifier() 
et = ExtraTreeClassifier()
bag = BaggingClassifier()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
ad = AdaBoostClassifier()
svm1 = svm.SVC()


print('lr', cross_val_score(lr, X_train, y_train).mean())
print('knn', cross_val_score(knn, X_train, y_train).mean())
#print('nb', cross_val_score(nb, X_train, y_train).mean()) 
print('dt', cross_val_score(dt, X_train, y_train).mean()) 
print('et', cross_val_score(et, X_train, y_train).mean()) 
print('bag', cross_val_score(bag, X_train, y_train).mean())
print('rf', cross_val_score(rf, X_train, y_train).mean()) 
print('gb', cross_val_score(gb, X_train, y_train).mean()) 
print('ad', cross_val_score(ad, X_train, y_train).mean()) 
print('svm', cross_val_score(svm1, X_train, y_train).mean())

lr 0.7133769250664524
knn 0.7285814182538713
dt 0.7499498900731144
et 0.7488781998479178
bag 0.7492800214319203
rf 0.7512894389217194
gb 0.7086209909635771
ad 0.7041328616458863
svm 0.6918750825070781


# Making an Interpretable Equation

In [19]:
#printing the model coefs
lr.fit(X_train, y_train)
lr_coefs = pd.DataFrame(lr.coef_, columns=X.columns, index=['coef']).T
lr_coefs.coef.sort_values(ascending=False)
#higher coefs are more likely to indicate WNV

Species_CULEX PIPIENS             2.127675
Trap_T228                         2.051289
Trap_T003                         2.034390
Trap_T014                         1.990741
Trap_T225                         1.865276
Trap_T070                         1.815498
Trap_T107                         1.582188
zipcode_60631                     1.513486
Trap_T096                         1.489098
Species_CULEX PIPIENS/RESTUANS    1.477714
Trap_T230                         1.451553
Trap_T128                         1.440091
Trap_T155                         1.439869
Trap_T215                         1.398097
Trap_T086                         1.330856
Trap_T005                         1.275161
Sprays_In_Last_Month              1.243696
Trap_T081                         1.233224
zipcode_60656                     1.228005
Trap_T006                         1.049547
Trap_T002                         1.030661
zipcode_60666                     0.984612
Trap_T900                         0.984612
Trap_T035  

# Accuracy Score

In [20]:
#accuracy score
lr.score(X_test, y_test)

0.7185051235684147

# Prepping for Export

In [26]:
#prepping the test data
test.drop(['Unnamed: 0', 'Date', 'Latitude', 'Longitude'], axis=1, inplace=True)

In [28]:
#predictions for test data
preds = lr.predict(test)

In [36]:
#exporting to csv
pd.DataFrame({'WnvPresent':preds}, index=test.index).to_csv('./preds1.csv')