# Modeling

## Importing modules and data

In [184]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline
sns.set_style('darkgrid')

In [185]:
np.random.seed(32)

In [186]:
df = pd.read_csv('../data/train_weather_spray_merged.csv')

In [187]:
df_dummied = pd.get_dummies(df, columns=['species'])

In [188]:
test = pd.read_csv('../data/test_merged.csv')

## Creating Validation Set, Scaling

In [189]:
df_dummied.drop(columns=['date', 'address', 'block', 'street', 'addressnumberandstreet', 'nummosquitos', 'sunrise', 'sunset', 'trap'], axis=1, inplace=True)

In [190]:
features = [col for col in df_dummied if col != 'wnvpresent']

In [191]:
df_dummied.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'wnvpresent',
       'spray_nearby', 'station', 'tmax', 'tmin', 'tavg', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'preciptotal', 'stnpressure', 'sealevel',
       'resultspeed', 'resultdir', 'avgspeed', 'tsra', 'sn', 'br', 'vcfg',
       'bcfg', 'hz', 'ra', 'dz', 'gr', 'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts',
       'fu', 'species_CULEX OTHER', 'species_CULEX PIPIENS',
       'species_CULEX PIPIENS/RESTUANS', 'species_CULEX RESTUANS'],
      dtype='object')

In [192]:
X = df_dummied[features]
y = df_dummied.wnvpresent

In [193]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [194]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

scaled_test = ss.transform(test)

## Making dataset without using all features

In [195]:
limited_cols = ['latitude', 'longitude', 'tmax', 'species', 'wnvpresent']
lim_features = [col for col in df[limited_cols] if col != 'wnvpresent']
X_limited = pd.get_dummies(df[lim_features])
X_limited.drop(columns=['species_CULEX OTHER','species_CULEX PIPIENS/RESTUANS','species_CULEX RESTUANS'], inplace=True)
y_limited = df[limited_cols]['wnvpresent']

In [196]:
X_trl, X_tsl, y_trl, y_tsl = train_test_split(X_limited, y_limited)

In [197]:
lim_features_test = ['latitude', 'longitude', 'tmax', 'species_CULEX PIPIENS']

In [198]:
test_limited = test[lim_features_test]

In [199]:
test_limited.columns

Index(['latitude', 'longitude', 'tmax', 'species_CULEX PIPIENS'], dtype='object')

In [200]:
X_limited.columns

Index(['latitude', 'longitude', 'tmax', 'species_CULEX PIPIENS'], dtype='object')

In [201]:
ss_limited = StandardScaler()
X_trl = ss_limited.fit_transform(X_trl)
X_tsl = ss_limited.transform(X_tsl)
test_scl = ss_limited.transform(test_limited)

## Creating Principal Components

In [202]:
pca = PCA(n_components=4)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

test_scaled_pca = pca.transform(scaled_test)

## Decision Tree

In [142]:
gs_dt = GridSearchCV(DecisionTreeClassifier(), param_grid={'min_samples_split' : [2, 3, 4],
                                                           'min_samples_leaf': [2, 3, 4],
                                                           'max_depth': range(5, 25),
                                                           'class_weight' : ['balanced'],
                                                           'max_features': ['auto', None],
                                                           'random_state': [32]}, scoring='roc_auc')

In [143]:
gs_dt.fit(X_train_pca, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2, 3, 4], 'max_depth': range(5, 25), 'class_weight': ['balanced'], 'max_features': ['auto', None], 'random_state': [32]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [144]:
gs_dt.score(X_train_pca, y_train), gs_dt.score(X_test_pca, y_test)

(0.8504802891245423, 0.7330525170006321)

In [145]:
gs_dt.best_params_

{'class_weight': 'balanced',
 'max_depth': 8,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'random_state': 32}

In [66]:
submission = pd.DataFrame(gs_dt.predict_proba(scaled_test))

submission['id'] = test.index
submission = submission.rename(columns={1: 'WnvPresent'})
submission.drop(0, axis=1, inplace=True)
submission.columns = ['WnvPresent', 'id']
order = ['id', 'WnvPresent']
submission = submission[order]
submission['id'] = submission['id']+1

In [67]:
submission.to_csv('../submissions/dt_pca.csv', index=False)

In [171]:
gs_dt.fit(X_trl, y_trl)
gs_dt.score(X_trl, y_trl), gs_dt.score(X_tsl, y_tsl)

(0.7029046715916514, 0.6783096214496678)

In [172]:
submission = pd.DataFrame(gs_dt.predict_proba(test_scl))

submission['id'] = test.index
submission = submission.rename(columns={1: 'WnvPresent'})
submission.drop(0, axis=1, inplace=True)
submission.columns = ['WnvPresent', 'id']
order = ['id', 'WnvPresent']
submission = submission[order]
submission['id'] = submission['id']+1

submission.to_csv('../submissions/dt_limited.csv', index=False)

In [72]:
# test_df = pd.DataFrame(X_train_pca, gs_dt.best_estimator_.feature_importances_,)

In [None]:
test_df.sort_index(ascending=False).head(10)

## Random Forest

In [177]:
gs_rf = GridSearchCV(RandomForestClassifier(), param_grid={'min_samples_split': [3],
                                                           'min_samples_leaf': range(5, 11, 1),
                                                           'class_weight': ['balanced'],
                                                           'n_estimators': [1000]},
                                                            scoring='roc_auc', verbose=1)

### Full dataset

In [None]:
gs_rf.fit(X_train, y_train)

In [None]:
gs_rf.best_params_

In [None]:
gs_rf.score(X_train, y_train), gs_rf.score(X_test, y_test)

In [None]:
model_file = 'rf_george.pkl'
with open(f'../models/{model_file}', 'wb') as file:
    pickle.dump(gs_rf, file)

In [None]:
submission = pd.DataFrame(gs_rf.predict_proba(scaled_test))

submission['id'] = test.index
submission = submission.rename(columns={1: 'WnvPresent'})
submission.drop(0, axis=1, inplace=True)
submission.columns = ['WnvPresent', 'id']
order = ['id', 'WnvPresent']
submission = submission[order]
submission['id'] = submission['id']+1

submission.to_csv('../submissions/rf_full.csv', index=False)

### Limited Dataset

In [None]:
gs_rf.fit(X_trl, y_trl)
gs_rf.score(X_trl, y_trl), gs_rf.score(X_tsl, y_tsl)

In [None]:
model_file = 'rf_george_lim.pkl'
with open(f'../models/{model_file}', 'wb') as file:
    pickle.dump(gs_rf, file)

## RF with PCA

In [178]:
gs_rf.fit(X_train_pca, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  3.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [3], 'min_samples_leaf': range(5, 11), 'class_weight': ['balanced'], 'n_estimators': [1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [179]:
gs_rf.score(X_train_pca, y_train), gs_rf.score(X_test_pca, y_test)

(0.9619094134554926, 0.8035778570841401)

In [180]:
model_file = 'rf_pca.pkl'
with open(f'../models/{model_file}', 'wb') as file:
    pickle.dump(gs_rf, file)

In [204]:
submission = pd.DataFrame(gs_rf.predict_proba(test_scaled_pca))
submission['id'] = test.index
submission = submission.rename(columns={1: 'WnvPresent'})
submission.drop(0, axis=1, inplace=True)
submission.columns = ['WnvPresent', 'id']
order = ['id', 'WnvPresent']
submission = submission[order]
submission['id'] = submission['id']+1

submission.to_csv('../submissions/rf_pca.csv', index=False)

## Logistic Regression

In [None]:
logreg = LogisticRegression()
gs_logreg = GridSearchCV(logreg, param_grid = {'penalty': ['l1'],
                                               'C': np.linspace(1.5, 2, 20),
                                               'class_weight':['balanced']}, scoring='roc_auc', verbose=1)

In [None]:
gs_logreg.fit(X_trl, y_trl)

In [None]:
gs_logreg.score(X_trl, y_trl), gs_logreg.score(X_tsl, y_tsl)

In [None]:
gs_logreg.best_params_

In [203]:
submission = pd.DataFrame(gs_logreg.predict_proba(test_sc))
submission['id'] = test.index
submission = submission.rename(columns={1: 'WnvPresent'})
submission.drop(0, axis=1, inplace=True)
submission.columns = ['WnvPresent', 'id']
order = ['id', 'WnvPresent']
submission = submission[order]
submission['id'] = submission['id']+1

NameError: name 'gs_logreg' is not defined

In [None]:
submission.to_csv('../submissions/4features.csv', index=False)