# Modeling

## Importing modules and data

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('darkgrid')

In [2]:
np.random.seed(32)

In [3]:
df = pd.read_csv('../data/train_weather_spray_merged.csv')

In [4]:
df_dummied = pd.get_dummies(df, columns=['species'])

## Creating Validation Set, Scaling

In [5]:
df_dummied.drop(columns=['date', 'address', 'block', 'street', 'trap', 'addressnumberandstreet', 'nummosquitos', 'sunrise', 'sunset'], axis=1, inplace=True)

In [6]:
features = [col for col in df_dummied if col != 'wnvpresent']

In [7]:
df_dummied.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'wnvpresent',
       'spray_nearby', 'station', 'tmax', 'tmin', 'tavg', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'preciptotal', 'stnpressure', 'sealevel',
       'resultspeed', 'resultdir', 'avgspeed', 'tsra', 'sn', 'br', 'vcfg',
       'bcfg', 'hz', 'ra', 'dz', 'gr', 'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts',
       'fu', 'species_CULEX OTHER', 'species_CULEX PIPIENS',
       'species_CULEX PIPIENS/RESTUANS', 'species_CULEX RESTUANS'],
      dtype='object')

In [8]:
X = df_dummied[features]
y = df_dummied.wnvpresent

In [9]:
X.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'spray_nearby', 'station',
       'tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir',
       'avgspeed', 'tsra', 'sn', 'br', 'vcfg', 'bcfg', 'hz', 'ra', 'dz', 'gr',
       'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts', 'fu', 'species_CULEX OTHER',
       'species_CULEX PIPIENS', 'species_CULEX PIPIENS/RESTUANS',
       'species_CULEX RESTUANS'],
      dtype='object')

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

## Logistic Regression

In [12]:
logreg = LogisticRegression()
gs_logreg = GridSearchCV(logreg, param_grid = {'penalty': ['l1', 'l2'],
                                               'C': np.linspace(1, 10, 10),
                                               'class_weight':['balanced']}, scoring='roc_auc', verbose=1)

In [13]:
%%time
gs_logreg.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  4.7min finished


CPU times: user 5min 3s, sys: 1.52 s, total: 5min 5s
Wall time: 5min 9s


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]), 'class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [14]:
gs_logreg.best_params_

{'C': 3.0, 'class_weight': 'balanced', 'penalty': 'l1'}

In [15]:
gs_logreg.score(X_train, y_train), gs_logreg.score(X_test, y_test)

(0.7588769609164064, 0.7250985739027895)

## Decision Tree

In [16]:
gs_dt = GridSearchCV(DecisionTreeClassifier(), param_grid={'min_samples_split' : [2, 3, 4, 5],
                                                           'min_samples_leaf': [2]},
                                               scoring='roc_auc')

In [17]:
gs_dt.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [18]:
gs_dt.score(X_train, y_train), gs_dt.score(X_test, y_test)

(0.9814505497698944, 0.6474413304571391)

## Random Forest

In [19]:
gs_rf = GridSearchCV(RandomForestClassifier(), param_grid={'min_samples_split': [2,3,4],
                                                           'min_samples_leaf': [2],
                                                           'class_weight': ['balanced'],},
                                                           scoring='roc_auc')

In [20]:
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2], 'class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [21]:
gs_rf.score(X_train, y_train), gs_rf.score(X_test, y_test)

(0.9744550845129213, 0.7755689790549944)

In [22]:
gs_rf.best_params_

{'class_weight': 'balanced', 'min_samples_leaf': 2, 'min_samples_split': 2}

#### Support Vector Classifier

In [116]:
param_grid = {
    'kernel': ['rbf', 'sigmoid'],
    'gamma': np.logspace(-5, 2, 20)
}

clf = SVC(probability=True)

grid = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', verbose=1)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

grid.score(X_test, y_test)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed: 15.7min finished


{'gamma': 0.00379269019073225, 'kernel': 'rbf'}
0.7233082500485667


0.716600383807518

In [74]:
gs_rf = GridSearchCV(RandomForestClassifier(), param_grid={'n_estimators': range(10, 51, 10),
                                                          'max_depth': [7, 15, 25, 35],
                                                          'min_samples_leaf': [2, 4, 8],},
                                                           scoring='roc_auc')

In [75]:
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(10, 51, 10), 'max_depth': [7, 15, 25, 35], 'min_samples_leaf': [2, 4, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [77]:
print(gs_rf.best_params_)
print(gs_rf.best_score_)

{'max_depth': 7, 'min_samples_leaf': 4, 'n_estimators': 40}
0.8080572265284448


In [122]:
import pickle

In [123]:
with open('../data/rf_01.pkl', 'wb+') as f:
    pickle.dump(gs_rf, f)

## New Params

In [134]:
from sklearn.metrics import roc_auc_score

In [136]:
rf_2 = RandomForestClassifier(n_estimators=1000, max_depth=7, min_samples_leaf=4)

In [137]:
rf_2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [139]:
roc_auc_score(y_train, rf_2.predict_proba(X_train)[:,1])

0.8990692747053012

In [140]:
roc_auc_score(y_test, rf_2.predict_proba(X_test)[:, 1])

0.8288513270272493

In [129]:
rf_2.score(X_train, y_train)

0.9482758620689655

In [130]:
rf_2.score(X_test, y_test)

0.9452107279693487

In [152]:
gs_rf = GridSearchCV(RandomForestClassifier(), param_grid={'n_estimators': [1000],
                                                          'max_depth': [5, 7, 15],
                                                          'min_samples_leaf': [2, 4, 8],},
                                                           scoring='roc_auc')

In [153]:
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1000], 'max_depth': [5, 7, 15], 'min_samples_leaf': [2, 4, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [154]:
print(gs_rf.best_params_)
print(gs_rf.best_score_)

{'max_depth': 7, 'min_samples_leaf': 8, 'n_estimators': 1000}
0.8083519584066448


In [108]:
test = pd.read_csv('../data/test_merged.csv')

In [111]:
test = ss.fit_transform(test)

In [119]:
preds = grid.predict_proba(test)[:, 1]

In [114]:
preds

array([0, 0, 0, ..., 0, 0, 0])

In [155]:
preds_forest = gs_rf.predict_proba(test)[:, 1]

In [156]:
submission = pd.DataFrame(preds, index=None, columns=['WnvPresent'])

index_array = np.array(range(1,116294))

submission = submission.set_index(index_array)

submission.index.rename('Id', inplace=True)

In [121]:
submission.to_csv('../data/Submission_1')

In [157]:
submission_forest = pd.DataFrame(preds_forest, index=None, columns=['WnvPresent'])

index_array = np.array(range(1,116294))

submission_forest = submission_forest.set_index(index_array)

submission_forest.index.rename('Id', inplace=True)


In [158]:
submission_forest.to_csv('../data/Submission_2')

In [109]:
preds_forest = gs_rf.predict_proba(test)[:,1]

In [91]:
logreg = LogisticRegression()
gs_logreg = GridSearchCV(logreg, param_grid = {'penalty': ['l1', 'l2'],
                                               'C': np.linspace(1, 10, 10),
                                               'class_weight':['balanced']}, scoring='roc_auc', verbose=1)

In [92]:
gs_logreg.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  4.6min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]), 'class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [93]:
print(gs_logreg.best_params_)
print(gs_logreg.best_score_)

{'C': 4.0, 'class_weight': 'balanced', 'penalty': 'l1'}
0.7398731117345887


In [99]:
preds_log = gs_logreg.predict_proba(test)[:,1]

In [100]:
submission_log = pd.DataFrame(preds_log, index=None, columns=['WnvPresent'])

index_array = np.array(range(1,116294))

submission_log = submission_log.set_index(index_array)

submission_log.index.rename('Id', inplace=True)


In [101]:
submission_log.to_csv('../data/Submission_3')

In [143]:
rf_2_preds = rf_2.predict_proba(test)[:, 1]

In [144]:
submission_rf_2 = pd.DataFrame(rf_2_preds, index=None, columns=['WnvPresent'])

index_array = np.array(range(1,116294))

submission_rf_2 = submission_rf_2.set_index(index_array)

submission_rf_2.index.rename('Id', inplace=True)


In [145]:
submission_rf_2.to_csv('../data/Submission_4')