# Modeling

## Importing modules and data

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Use seaborn's 'darkgrid' theme for every plot drawn after this point.
sns.set_style('darkgrid')

In [12]:
# Seed NumPy's global random generator so that anything drawing from it
# gives the same result on every full run of the notebook.
SEED = 32
np.random.seed(SEED)

In [13]:
# Load the merged training table from the data directory.
merged_csv_path = '../data/train_weather_spray_merged.csv'
df = pd.read_csv(merged_csv_path)

In [15]:
# One-hot encode the categorical `species` column; every other column of
# `df` is carried over unchanged.
categorical_columns = ['species']
df_dummied = pd.get_dummies(data=df, columns=categorical_columns)

## Creating Validation Set, Scaling

In [16]:
# Remove the columns that are not used as model inputs.
# - Reassign instead of `inplace=True`, so the frame is not mutated behind
#   the back of earlier cells that displayed it.
# - `errors='ignore'` makes the cell safe to re-run: on a second execution the
#   columns are already gone and the call is a no-op instead of a KeyError.
# - `axis=1` was redundant once `columns=` is given, so it is omitted.
df_dummied = df_dummied.drop(
    columns=['date', 'address', 'block', 'street', 'trap',
             'addressnumberandstreet', 'nummosquitos', 'sunrise', 'sunset'],
    errors='ignore',
)

In [17]:
# Predictor names: every column of df_dummied except the target `wnvpresent`.
features = df_dummied.columns[df_dummied.columns != 'wnvpresent'].tolist()

In [18]:
# Display the columns left in df_dummied (predictors plus the `wnvpresent` target).
df_dummied.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'wnvpresent',
       'spray_nearby', 'station', 'tmax', 'tmin', 'tavg', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'preciptotal', 'stnpressure', 'sealevel',
       'resultspeed', 'resultdir', 'avgspeed', 'tsra', 'sn', 'br', 'vcfg',
       'bcfg', 'hz', 'ra', 'dz', 'gr', 'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts',
       'fu', 'species_CULEX OTHER', 'species_CULEX PIPIENS',
       'species_CULEX PIPIENS/RESTUANS', 'species_CULEX RESTUANS'],
      dtype='object')

In [19]:
# Split the frame into the target series (`wnvpresent`) and the predictor
# matrix (the columns listed in `features`).
y = df_dummied['wnvpresent']
X = df_dummied.loc[:, features]

In [20]:
# Sanity check: the predictor columns should not include the `wnvpresent` target.
X.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'spray_nearby', 'station',
       'tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir',
       'avgspeed', 'tsra', 'sn', 'br', 'vcfg', 'bcfg', 'hz', 'ra', 'dz', 'gr',
       'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts', 'fu', 'species_CULEX OTHER',
       'species_CULEX PIPIENS', 'species_CULEX PIPIENS/RESTUANS',
       'species_CULEX RESTUANS'],
      dtype='object')

In [21]:
# Hold out a test split (scikit-learn's default test fraction is used).
# - `random_state=32` (the same value passed to np.random.seed earlier in the
#   notebook) pins the split, so re-running this cell alone reproduces it
#   rather than depending on how much global RNG state was already consumed.
# - `stratify=y` keeps the proportion of `wnvpresent` classes the same in both
#   splits. NOTE(review): the target is presumably imbalanced, since the models
#   below use class_weight='balanced' -- confirm with y.value_counts().
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=32
)

In [22]:
# Standardise the predictors. The scaler learns its statistics from the
# training split only and the same fitted transform is then applied to both
# splits, so no information from the test split leaks into the scaling.
ss = StandardScaler().fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

## Logistic Regression

In [23]:
# Logistic regression tuned over penalty type and inverse regularisation
# strength C, with candidates ranked by cross-validated ROC AUC.
# - solver='liblinear' is set explicitly because the grid includes 'l1':
#   the default solver in scikit-learn >= 0.22 ('lbfgs') only supports 'l2',
#   so every l1 candidate would fail to fit. liblinear supports both.
# - random_state pins liblinear's internal shuffling.
# - n_jobs=-1 runs the candidate fits in parallel across the available cores.
logreg = LogisticRegression(solver='liblinear', random_state=32)
gs_logreg = GridSearchCV(
    logreg,
    param_grid={'penalty': ['l1', 'l2'],
                'C': np.linspace(1, 10, 10),
                'class_weight': ['balanced']},
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

In [24]:
%%time
# Run the logistic-regression grid search on the training split; %%time
# reports how long the whole cell takes.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so
# the recorded run never finished -- re-run before relying on the cells below.
gs_logreg.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

In [None]:
# Parameter combination selected by the grid search (available only after a completed fit).
gs_logreg.best_params_

In [None]:
# ROC AUC (the search's `scoring`) of the refit best model on the training
# and held-out splits; a large gap between the two indicates overfitting.
logreg_train_auc = gs_logreg.score(X_train, y_train)
logreg_test_auc = gs_logreg.score(X_test, y_test)
logreg_train_auc, logreg_test_auc

## Decision Tree

In [None]:
# Decision tree tuned over min_samples_split, ranked by cross-validated ROC AUC.
# - With min_samples_leaf=2 a node needs at least 2 * 2 = 4 samples before it
#   can be split, so min_samples_split values of 2, 3 and 4 all grow the same
#   tree. The grid therefore starts at 4 and spreads upward so that every
#   candidate is actually a different model.
# - random_state makes the fitted trees (and hence the scores) repeatable when
#   this cell is re-run on its own.
gs_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=32),
    param_grid={'min_samples_split': [4, 8, 16, 32],
                'min_samples_leaf': [2]},
    scoring='roc_auc',
)

In [None]:
# Run the decision-tree grid search on the training split.
gs_dt.fit(X_train, y_train)

In [None]:
# ROC AUC (the search's `scoring`) of the refit best tree on the training
# and held-out splits.
dt_train_auc = gs_dt.score(X_train, y_train)
dt_test_auc = gs_dt.score(X_test, y_test)
dt_train_auc, dt_test_auc

## Random Forest

In [None]:
# Random forest tuned over min_samples_split, ranked by cross-validated ROC AUC.
# - With min_samples_leaf=2 a node needs at least 4 samples before it can be
#   split, so the previous candidates 2, 3 and 4 were all the same model and
#   the search could only pick between them on random noise. The grid now
#   starts at 4 and spreads upward.
# - random_state makes the forests repeatable, so score differences between
#   candidates reflect the parameter rather than the random draw.
gs_rf = GridSearchCV(
    RandomForestClassifier(random_state=32),
    param_grid={'min_samples_split': [4, 8, 16],
                'min_samples_leaf': [2],
                'class_weight': ['balanced']},
    scoring='roc_auc',
)

In [None]:
# Run the random-forest grid search on the training split.
gs_rf.fit(X_train, y_train)

In [None]:
# ROC AUC (the search's `scoring`) of the refit best forest on the training
# and held-out splits.
rf_train_auc = gs_rf.score(X_train, y_train)
rf_test_auc = gs_rf.score(X_test, y_test)
rf_train_auc, rf_test_auc

In [None]:
# Parameter combination selected by the random-forest grid search (available only after a completed fit).
gs_rf.best_params_