# Modeling

## Importing modules and data

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'darkgrid' style to subsequent plots.
sns.set_style('darkgrid')

In [2]:
# Seed NumPy's global random number generator.
# NOTE(review): scikit-learn calls that leave random_state=None draw from this
# global generator, so their results depend on this cell having run first and on
# what ran in between — confirm the notebook is executed top to bottom.
np.random.seed(32)

In [3]:
# Load the training table from disk (per its file name, the train data already
# merged with weather and spray data — verify against the upstream notebook).
train_csv_path = '../data/train_weather_spray_merged.csv'
df = pd.read_csv(train_csv_path)

In [4]:
# One-hot encode the 'species' column; columns not listed here are left as they are.
categorical_cols = ['species']
df_dummied = pd.get_dummies(data=df, columns=categorical_cols)

## Creating Validation Set, Scaling

In [5]:
# Remove the columns that are not used as model inputs.
# `columns=` already targets the column axis, so no `axis` argument is needed, and
# reassigning the result (instead of `inplace=True`) keeps the drop an ordinary
# expression that can be chained or inspected.
# NOTE: re-running this cell without first re-creating df_dummied raises KeyError,
# because the listed columns have already been removed.
df_dummied = df_dummied.drop(columns=[
    'date', 'address', 'block', 'street', 'trap', 'addressnumberandstreet',
    'nummosquitos', 'sunrise', 'sunset',
])

In [6]:
# Every remaining column except the target label is used as a model input.
target_col = 'wnvpresent'
features = [name for name in df_dummied.columns if name != target_col]

In [7]:
# Display the columns left after the drop (the target 'wnvpresent' is still included here).
df_dummied.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'wnvpresent',
       'spray_nearby', 'station', 'tmax', 'tmin', 'tavg', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'preciptotal', 'stnpressure', 'sealevel',
       'resultspeed', 'resultdir', 'avgspeed', 'tsra', 'sn', 'br', 'vcfg',
       'bcfg', 'hz', 'ra', 'dz', 'gr', 'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts',
       'fu', 'species_CULEX OTHER', 'species_CULEX PIPIENS',
       'species_CULEX PIPIENS/RESTUANS', 'species_CULEX RESTUANS'],
      dtype='object')

In [8]:
# Design matrix: only the columns named in `features`.
X = df_dummied.loc[:, features]
# Target vector: the 'wnvpresent' column.
y = df_dummied['wnvpresent']

In [9]:
# Sanity check: list the feature columns to confirm the target is not among them.
X.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'spray_nearby', 'station',
       'tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir',
       'avgspeed', 'tsra', 'sn', 'br', 'vcfg', 'bcfg', 'hz', 'ra', 'dz', 'gr',
       'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts', 'fu', 'species_CULEX OTHER',
       'species_CULEX PIPIENS', 'species_CULEX PIPIENS/RESTUANS',
       'species_CULEX RESTUANS'],
      dtype='object')

In [10]:
# Hold out a test split (scikit-learn's default test size applies).
# random_state is pinned so the split no longer depends on the state of NumPy's
# global generator at the moment this cell happens to run. It uses the same
# value (32) as the notebook's global seed, which should reproduce the split of
# a fresh top-to-bottom run.
# NOTE(review): consider adding stratify=y if 'wnvpresent' is imbalanced — confirm class balance first.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32)

In [11]:
# Learn the scaling statistics from the training split only, then apply the same
# fitted transformation to both splits so the test data does not influence the scaler.
ss = StandardScaler().fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

In [29]:
# Persist the fitted scaler so the identical transformation can be reloaded later.
with open('../models/scaler.pkl', 'wb') as scaler_out:
    pickle.dump(ss, scaler_out)

## AdaBoost

In [30]:
np.random.seed(41)

ada = AdaBoostClassifier()
params = {
    'n_estimators': range(40, 70),
#     'learning_rate': np.linspace()
    
}
gs_ada = GridSearchCV(ada, param_grid = params, scoring='roc_auc', verbose=1)

%time  gs_ada.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   29.4s finished


CPU times: user 27.5 s, sys: 276 ms, total: 27.8 s
Wall time: 29.8 s


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(40, 70)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=1)

In [31]:
# Show the parameter combination with the best mean cross-validated score.
gs_ada.best_params_

{'n_estimators': 67}

In [32]:
# Train vs. held-out test score as a (train, test) tuple; GridSearchCV.score uses
# the search's configured scoring ('roc_auc') on the refit best estimator.
gs_ada.score(X_train, y_train), gs_ada.score(X_test, y_test)

(0.8445409051521444, 0.8257814338073762)

In [33]:
# Serialize the whole fitted AdaBoost grid search (not just the best estimator).
model_file = 'ada.pkl'
with open(f'../models/{model_file}', 'wb') as model_out:
    pickle.dump(gs_ada, model_out)

## Bagging

In [40]:
np.random.seed(41)

bag = BaggingClassifier()
params = {
    'n_estimators': range(40, 70),
#     'learning_rate': np.linspace()
}
gs_bag = GridSearchCV(bag, param_grid = params, scoring='roc_auc', verbose=1)
%time gs_bag.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  1.6min finished


CPU times: user 1min 29s, sys: 3.01 s, total: 1min 32s
Wall time: 1min 38s


GridSearchCV(cv=None, error_score='raise',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(40, 70)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=1)

In [41]:
# Train vs. held-out test score as a (train, test) tuple; GridSearchCV.score uses
# the search's configured scoring ('roc_auc') on the refit best estimator.
gs_bag.score(X_train, y_train), gs_bag.score(X_test, y_test)

(0.9916244960662787, 0.7796593353950467)

In [42]:
# Serialize the whole fitted bagging grid search (not just the best estimator).
model_file = 'bag.pkl'
with open(f'../models/{model_file}', 'wb') as model_out:
    pickle.dump(gs_bag, model_out)

## KNN

In [38]:
# Search over neighborhood size and vote weighting for k-nearest neighbors.
knn = KNeighborsClassifier()
params = {
    'n_neighbors': range(25, 100),
    'weights': ['uniform', 'distance'],
}
# Constructing the search object does no fitting, so it is not timed here;
# the expensive step is the later gs_knn.fit(...) call.
gs_knn = GridSearchCV(knn, param_grid=params, scoring='roc_auc', verbose=1)

CPU times: user 148 µs, sys: 51 µs, total: 199 µs
Wall time: 204 µs


In [None]:
# Fit the KNN grid search on the training data; this is the slow step of the KNN section.
gs_knn.fit(X_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


In [None]:
# Show the parameter combination with the best mean cross-validated score.
gs_knn.best_params_

In [None]:
# Train vs. held-out test score as a (train, test) tuple; GridSearchCV.score uses
# the search's configured scoring ('roc_auc') on the refit best estimator.
gs_knn.score(X_train, y_train), gs_knn.score(X_test, y_test)

In [None]:
# Serialize the whole fitted KNN grid search (not just the best estimator).
model_file = 'knn.pkl'
with open(f'../models/{model_file}', 'wb') as model_out:
    pickle.dump(gs_knn, model_out)