# Import Libraries

In [1]:
import pandas as pd
# Show up to 500 columns/rows so wide, one-hot-encoded frames are not truncated.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 500)

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import svm

# Import Train/Test Data

In [16]:
# Load the training frame (first CSV column becomes the index) and the
# test frame (indexed by its 'Id' column).
train_path = './west-nile-project-4/data/BALANCED_west_nile_train.csv'
test_path = './west-nile-project-4/data/west_nile_test.csv'

balenced_train = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path, index_col='Id')

In [3]:
# Count rows per target class to check how balanced the training data is.
balenced_train['WnvPresent'].value_counts()

0    9955
1    9951
Name: WnvPresent, dtype: int64

# Set X and y variables

In [5]:
# Split the training frame into features (X) and target (y).
# The target itself plus these four columns are excluded from the features.
non_feature_cols = ['Date', 'NumMosquitos', 'WnvPresent', 'Latitude', 'Longitude']
X = balenced_train.drop(columns=non_feature_cols)
y = balenced_train['WnvPresent']

# Train Test Split

In [6]:
#train split
# A fixed random_state makes the hold-out split (and every score computed from
# it) repeatable across kernel restarts; stratify=y keeps the class ratio the
# same in both partitions.
# NOTE(review): the input file is named BALANCED_* and the classes are almost
# exactly even -- if that balance came from resampling with duplicated rows,
# copies of the same observation can land on both sides of this split and
# inflate the scores. Confirm how the balanced file was built.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

# Cross Val Score Potential Classification Models

In [7]:
#cross val score all potential classification models
# Seed every estimator that has internal randomness so the comparison below
# gives the same numbers on each run.
CV_SEED = 42

lr = LogisticRegression(random_state=CV_SEED)
knn = KNeighborsClassifier()
#nb = MultinomialNB() won't take neg vals
dt = DecisionTreeClassifier(random_state=CV_SEED)
et = ExtraTreeClassifier(random_state=CV_SEED)
bag = BaggingClassifier(random_state=CV_SEED)
rf = RandomForestClassifier(random_state=CV_SEED)
gb = GradientBoostingClassifier(random_state=CV_SEED)
ad = AdaBoostClassifier(random_state=CV_SEED)
svm1 = svm.SVC()

# Label -> estimator, in the order the scores are reported.
# The individual names above stay bound because later cells use them (e.g. lr).
candidate_models = {
    'lr': lr,
    'knn': knn,
    'dt': dt,
    'et': et,
    'bag': bag,
    'rf': rf,
    'gb': gb,
    'ad': ad,
    'svm': svm1,
}

# Mean cross-validated score per model on the training partition only;
# X_test / y_test are left untouched here.
for label, model in candidate_models.items():
    print(label, cross_val_score(model, X_train, y_train).mean())

lr 0.6834352006367554
knn 0.6613975331541168
dt 0.7076166254373549
et 0.7072817113060917
bag 0.7083531457997249
rf 0.7093581170059661
gb 0.6801531148319785
ad 0.6668905260016009
svm 0.6486038887133763


# Making an Interpretable Equation

In [8]:
# Fit the logistic regression on the training partition and tabulate its
# coefficients, one row per feature.
lr.fit(X_train, y_train)

# lr.coef_ has a single row here, so transposing yields one 'coef' column
# indexed by feature name.
lr_coefs = pd.DataFrame(lr.coef_.T, index=X.columns, columns=['coef'])

# Largest first: higher coefs are more likely to indicate WNV.
lr_coefs['coef'].sort_values(ascending=False)

Trap_T003                         2.232976
Trap_T225                         2.226825
Species_CULEX PIPIENS             2.126706
Trap_T228                         2.062375
zipcode_60631                     1.572814
Trap_T230                         1.551791
Trap_T231                         1.550477
Species_CULEX PIPIENS/RESTUANS    1.547181
Trap_T014                         1.496524
Trap_T128                         1.476684
Trap_T096                         1.370413
Trap_T215                         1.247445
Trap_T047                         1.235089
Trap_T005                         1.229313
Trap_T086                         1.125798
Trap_T002                         1.090446
zipcode_60656                     1.088002
Trap_T013                         1.085761
Trap_T223                         1.063234
Trap_T008                         1.046680
Trap_T155                         1.035123
Trap_T082                         0.991742
Trap_T103                         0.975406
Trap_T070  

# Accuracy Score

In [10]:
# Accuracy of the fitted logistic regression on the held-out partition.
holdout_preds = lr.predict(X_test)
accuracy_score(y_test, holdout_preds)

0.6833433795459112

# Prepping for Export

In [26]:
#prepping the test data
# Rebind rather than drop in place, and ignore already-missing labels, so this
# cell can be re-run without raising KeyError once the columns are gone.
test = test.drop(
    columns=['Unnamed: 0', 'Date', 'Latitude', 'Longitude'],
    errors='ignore',
)

In [28]:
#predictions for test data
# Select the model's training features by name, in training order, so the
# prediction never depends on the test CSV's column order. A feature missing
# from the test frame raises KeyError here instead of being silently
# mismatched by position.
preds = lr.predict(test[X.columns])

In [36]:
# Write the predictions to CSV, one row per test record, keyed by the test
# frame's index.
submission = pd.DataFrame({'WnvPresent': preds}, index=test.index)
submission.to_csv('./preds1.csv')