# Modeling

In [82]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.combine import SMOTEENN

plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [83]:
# Load data
train = pd.read_csv('feat_train.csv')
test = pd.read_csv('feat_test.csv')
spray = pd.read_csv('feat_spray.csv')

#### Balance Classes

Since our target variable only makes up for 5% of the data, we will need to artificially balance the classes so that our models perform better. We want to use a combination of under and over sampling, so we will use the SMOTE method to accomplish this. 

In [84]:
# Define X and y
X = train.drop('WnvPresent', axis = 1)
y = train['WnvPresent']

In [85]:
# Convert Year and Month to Dummies
X = pd.get_dummies(X, columns = ['Year','Month'])

In [86]:
# Use SMOTE method to balance classes
sme = SMOTEENN()
X_res, y_res = sme.fit_sample(X, y)

In [87]:
# Check class balance 
pd.Series(y_res).value_counts(normalize = True)

1    0.506225
0    0.493775
dtype: float64

With the balanced data, the baseline accuracy score to beat is 50%

#### Logistic Regression

In [88]:
# Train test split and standardize data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, random_state = 42)

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [89]:
# Fit model and accuracy score
logreg = LogisticRegression()
model_logreg = logreg.fit(X_train, y_train)
model_logreg.score(X_train, y_train), model_logreg.score(X_test, y_test)

(0.8147, 0.8092381523695261)

In [90]:
# Evaluate model with AUC-ROC score
y_score = model_logreg.predict(X_test)
metrics.roc_auc_score(y_test, y_score)

0.8088389836608364

In [104]:
# Gridsearch on logistic regression model above
lr_params = {'penalty':['l1', 'l2'], 
             'C': [0.2, 0.3, 0.35, 0.5]}
gs = GridSearchCV(LogisticRegression(), param_grid = lr_params)
gs.fit(X_train, y_train)

# Results 
gs.best_score_, gs.best_params_

(0.8155, {'C': 0.3, 'penalty': 'l2'})

In [105]:
# Gridsearch test accuracy score
gs.score(X_test, y_test)

0.8092381523695261

In [106]:
# Evaluate gridsearched model with AUC-ROC score
y_score = gs.predict(X_test)
metrics.roc_auc_score(y_test, y_score)

0.8088389836608364

Gridsearching did not seem to improve the AUC-ROC score. Neither model was overfit. 

In [108]:
# Determine top predictors in model 
coefs = pd.DataFrame(model_logreg.coef_[0], index = X.columns, columns = ['coef'])
coefs['coef'] = np.exp(coefs['coef'])
coefs.sort_values(by='coef', ascending = False, inplace=True)
coefs.head()

Unnamed: 0,coef
Month_8,7224.185052
Year_2013,4196.963922
Month_9,116.975105
Tavg,113.768122
Day,80.3318


In [None]:
# ROC Plot 