# Modeling

## Importing modules and data

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('darkgrid')



In [33]:
np.random.seed(32)

In [2]:
df = pd.read_csv('../data/train_weather_merged.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10440 entries, 0 to 10439
Data columns (total 31 columns):
date                      10440 non-null object
address                   10440 non-null object
species                   10440 non-null object
block                     10440 non-null int64
street                    10440 non-null object
trap                      10440 non-null object
addressnumberandstreet    10440 non-null object
latitude                  10440 non-null float64
longitude                 10440 non-null float64
addressaccuracy           10440 non-null int64
nummosquitos              10440 non-null int64
wnvpresent                10440 non-null int64
closest_station           10440 non-null int64
station                   10440 non-null int64
tmax                      10440 non-null int64
tmin                      10440 non-null int64
tavg                      10440 non-null int64
depart                    10440 non-null object
dewpoint                  10440 no

In [4]:
df.preciptotal = df.preciptotal.map(lambda x: x.replace('T', '0.001')).astype(float)

In [5]:
df_dummied = pd.get_dummies(df, columns=['species'])

## Creating Validation Set, Scaling

In [6]:
df_dummied.drop(columns=['date', 'address', 'block', 'street', 'trap', 'addressnumberandstreet', 'nummosquitos', 'codesum', 'depart', 'sunrise', 'sunset'], axis=1, inplace=True)

In [7]:
features = [col for col in df_dummied if col != 'wnvpresent']

In [8]:
X = df_dummied[features]
y = df_dummied.wnvpresent

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

## Logistic Regression

In [11]:
logreg = LogisticRegression()
gs_logreg = GridSearchCV(logreg, param_grid = {'penalty': ['l1', 'l2'],
                                               'C': np.logspace(0.0001, 1, 20)}, scoring='roc_auc')

In [12]:
gs_logreg.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([ 1.00023,  1.12908,  1.27454,  1.43873,  1.62407,  1.83329,
        2.06946,  2.33606,  2.637  ,  2.97671,  3.36018,  3.79306,
        4.2817 ,  4.83328,  5.45593,  6.15878,  6.95218,  7.84779,
        8.85878, 10.     ])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [13]:
gs_logreg.best_params_

{'C': 6.158780653245749, 'penalty': 'l2'}

In [14]:
gs_logreg.score(X_train, y_train), gs_logreg.score(X_test, y_test)

(0.730117329066541, 0.7539636275569973)

## Decision Tree

In [110]:
gs_dt = GridSearchCV(DecisionTreeClassifier(), param_grid={'min_samples_split' : [2, 3, 4],
                                                           'min_samples_leaf': [2]},
                                               scoring='roc_auc')

In [111]:
gs_dt.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [112]:
gs_dt.score(X_train, y_train), gs_dt.score(X_test, y_test)

(0.984363359071514, 0.6559400510204082)

## Random Forest

In [29]:
gs_rf = GridSearchCV(RandomForestClassifier(), param_grid={'min_samples_split': [2,3,4],
                                                           'min_samples_leaf': [2],
                                                           'class_weight': ['balanced'],},
                                                           scoring='roc_auc')

In [30]:
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2], 'class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [31]:
gs_rf.score(X_train, y_train), gs_rf.score(X_test, y_test)

(0.9735798588688256, 0.7687635171668018)

In [32]:
gs_rf.best_params_

{'class_weight': 'balanced', 'min_samples_leaf': 2, 'min_samples_split': 3}