# Import Libraries

In [2]:
import pandas as pd
pd.set_option("display.max_columns",500)
pd.set_option("display.max_rows",500)

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import svm

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Import Train/Test Data

In [3]:
#import data
train = pd.read_csv('./west-nile-project-4/data/weather-data/02_weather_spray_train.csv', index_col=0)
test = pd.read_csv('./west-nile-project-4/data/weather-data/02_weather_spray_test.csv', index_col='Id')

In [4]:
train.WnvPresent.value_counts()

0    9955
1    9951
Name: WnvPresent, dtype: int64

# Set X and y variables

In [5]:
#id X and y
X = train.drop(['NumMosquitos', 'WnvPresent', 'Latitude', 'Longitude', 'Sprays_In_Last_Month'], axis=1)
y = train.WnvPresent

# Train Test Split

In [6]:
#train split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Standard Scaler

In [7]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# PCA

In [8]:
pca = PCA()

Z_train = pca.fit_transform(X_train)
Z_test = pca.transform(X_test)
Z_train.shape

(14929, 258)

# Cross Val Score Potential Classification Models

In [9]:
#cross val score all potential classification models
lr = LogisticRegression() 
knn = KNeighborsClassifier() 
#nb = MultinomialNB() won't take neg vals
dt = DecisionTreeClassifier() 
et = ExtraTreeClassifier()
bag = BaggingClassifier()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
ad = AdaBoostClassifier()
#svm1 = svm.SVC() #too slow, not strong performer


print('lr', cross_val_score(lr, Z_train, y_train).mean())
print('knn', cross_val_score(knn, Z_train, y_train).mean())
#print('nb', cross_val_score(nb, X_train, y_train).mean()) 
print('dt', cross_val_score(dt, Z_train, y_train).mean()) 
print('et', cross_val_score(et, Z_train, y_train).mean()) 
print('bag', cross_val_score(bag, Z_train, y_train).mean())
print('rf', cross_val_score(rf, Z_train, y_train).mean()) 
print('gb', cross_val_score(gb, Z_train, y_train).mean()) 
print('ad', cross_val_score(ad, Z_train, y_train).mean()) 
#print('svm', cross_val_score(svm1, X_train, y_train).mean()) #too slow, not strong performer

lr 0.7431176848659246
knn 0.865697602863849
dt 0.9338201977757922
et 0.929801201281522
bag 0.954049345101077
rf 0.9586043414928391
gb 0.8875344672309343
ad 0.8200822255042003


# Making an Interpretable Equation

# ROC AUC Score

In [11]:
#printing the model coefs
lr.fit(Z_train, y_train)
lr_coefs = pd.DataFrame(lr.coef_, columns=X.columns, index=['coef']).T
lr_coefs.coef.sort_values(ascending=False)
#higher coefs are more likely to indicate WNV

Trap_T149                         8.903825e-01
Trap_T154                         8.277625e-01
Trap_T151                         3.942377e-01
zipcode_60612                     3.875950e-01
Trap_T115                         3.771950e-01
Trap_T129                         3.209799e-01
Trap_T013                         3.140627e-01
Trap_T072                         2.975615e-01
Trap_T103                         2.819704e-01
Trap_T073                         2.617306e-01
Trap_T030                         2.615404e-01
Trap_T145                         2.553208e-01
Trap_T044                         2.538420e-01
Trap_T049                         2.240524e-01
Trap_T070                         2.112273e-01
Trap_T107                         2.045096e-01
Trap_T071                         1.991253e-01
zipcode_60106                     1.921862e-01
Trap_T005                         1.848916e-01
Trap_T011                         1.700044e-01
zipcode_60625                     1.668054e-01
Trap_T095    

In [12]:
#roc auc score
lr_proba = lr.predict_proba(Z_test)
roc_auc_score(y_test, lr_proba[:, 1])

0.8044638546142754

# Random Forest Model

In [13]:
rf.fit(Z_train, y_train)
#rf.score(X_test, y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
rf_proba = rf.predict_proba(Z_test)
roc_auc_score(y_test, rf_proba[:, 1])

0.9886957560842485

# Gradient Boost

In [15]:
gb.fit(Z_train, y_train)
gb_proba = gb.predict_proba(Z_test)
roc_auc_score(y_test, gb_proba[:, 1])

0.950856986450022

# Prepping for Export

In [16]:
test_ss = ss.transform(test.drop(['Date', 'Latitude', 'Longitude', 'Sprays_In_Last_Month'], axis=1))
test_pca = pca.transform(test_ss)

In [17]:
gb_preds = gb.predict(test_pca)
gb_preds

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
#exporting to csv
pd.DataFrame({'WnvPresent':gb_preds}, index=test.index).to_csv('./preds5.csv')