# Modeling

## Importing modules and data

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

%matplotlib inline
sns.set_style('darkgrid')

In [3]:
np.random.seed(32)

In [4]:
df = pd.read_csv('../data/train_weather_spray_merged.csv')

In [5]:
df_dummied = pd.get_dummies(df, columns=['species'])

In [6]:
test = pd.read_csv('../data/test_merged.csv')

## Creating Validation Set, Scaling

In [7]:
df_dummied.drop(columns=['date', 'address', 'block', 'street', 'addressnumberandstreet', 'nummosquitos', 'sunrise', 'sunset', 'trap'], axis=1, inplace=True)

In [8]:
features = [col for col in df_dummied if col != 'wnvpresent']

In [9]:
df_dummied.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'wnvpresent',
       'spray_nearby', 'station', 'tmax', 'tmin', 'tavg', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'preciptotal', 'stnpressure', 'sealevel',
       'resultspeed', 'resultdir', 'avgspeed', 'tsra', 'sn', 'br', 'vcfg',
       'bcfg', 'hz', 'ra', 'dz', 'gr', 'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts',
       'fu', 'species_CULEX OTHER', 'species_CULEX PIPIENS',
       'species_CULEX PIPIENS/RESTUANS', 'species_CULEX RESTUANS'],
      dtype='object')

In [10]:
X = df_dummied[features]
y = df_dummied.wnvpresent

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

scaled_test = ss.transform(test)

In [13]:
pca = PCA(n_components=7)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

scaled_test = pca.transform(scaled_test)

## Making dataset without using all features

In [140]:
limited_cols = ['latitude', 'longitude', 'tmax', 'tmin', 'species', 'wnvpresent']
lim_features = [col for col in df[limited_cols] if col != 'wnvpresent']
X_limited = pd.get_dummies(df[lim_features])
X_limited.drop(columns=['species_CULEX OTHER','species_CULEX PIPIENS/RESTUANS','species_CULEX RESTUANS'], inplace=True)
y_limited = df[limited_cols]['wnvpresent']

In [141]:
X_trl, X_tsl, y_trl, y_tsl = train_test_split(X_limited, y_limited)

In [142]:
lim_features_test = ['latitude', 'longitude', 'tmax', 'tmin', 'species_CULEX PIPIENS']

In [143]:
test_limited = test[lim_features_test]

In [144]:
test_limited.columns

Index(['latitude', 'longitude', 'tmax', 'tmin', 'species_CULEX PIPIENS'], dtype='object')

In [145]:
X_limited.columns

Index(['latitude', 'longitude', 'tmax', 'tmin', 'species_CULEX PIPIENS'], dtype='object')

In [146]:
ss_limited = StandardScaler()
X_trl = ss_limited.fit_transform(X_trl)
X_tsl = ss_limited.transform(X_tsl)
test_scl = ss_limited.transform(test_limited)

## Logistic Regression

In [98]:
logreg = LogisticRegression()
gs_logreg = GridSearchCV(logreg, param_grid = {'penalty': ['l1'],
                                               'C': np.linspace(1.5, 2, 20),
                                               'class_weight':['balanced']}, scoring='roc_auc', verbose=1)

In [147]:
gs_logreg.fit(X_trl, y_trl)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    0.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1'], 'C': array([1.5    , 1.52632, 1.55263, 1.57895, 1.60526, 1.63158, 1.65789,
       1.68421, 1.71053, 1.73684, 1.76316, 1.78947, 1.81579, 1.84211,
       1.86842, 1.89474, 1.92105, 1.94737, 1.97368, 2.     ]), 'class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [148]:
gs_logreg.score(X_trl, y_trl), gs_logreg.score(X_tsl, y_tsl)

(0.6636218657234889, 0.6984507176123942)

In [149]:
gs_logreg.best_params_

{'C': 1.6052631578947367, 'class_weight': 'balanced', 'penalty': 'l1'}

In [150]:
submission = pd.DataFrame(gs_logreg.predict_proba(test_scl))
submission['id'] = test.index
submission = submission.rename(columns={1: 'WnvPresent'})
submission.drop(0, axis=1, inplace=True)
submission.columns = ['WnvPresent', 'id']
order = ['id', 'WnvPresent']
submission = submission[order]
submission['id'] = submission['id']+1

In [151]:
submission.to_csv('../submissions/4features.csv', index=False)

## Decision Tree

In [30]:
gs_dt = GridSearchCV(DecisionTreeClassifier(), param_grid={'min_samples_split' : [2, 3, 4],
                                                           'min_samples_leaf': [2, 3, 4],
                                                           'max_depth': [8],
                                                           'class_weight' : ['balanced'],
                                                           'max_features': ['auto', None],
                                                           'random_state': [32]}, scoring='roc_auc')

In [31]:
gs_dt.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2, 3, 4], 'max_depth': [8], 'class_weight': ['balanced'], 'max_features': ['auto', None], 'random_state': [32]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [32]:
gs_dt.score(X_train, y_train), gs_dt.score(X_test, y_test)

(0.8824714943913432, 0.7698076143556484)

In [33]:
gs_dt.best_params_

{'class_weight': 'balanced',
 'max_depth': 8,
 'max_features': None,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'random_state': 32}

In [35]:
test_df = pd.DataFrame(X.columns, gs_dt.best_estimator_.feature_importances_,)

In [36]:
test_df.sort_index(ascending=False).head(10)

Unnamed: 0,0
0.170169,cool
0.12219,tmax
0.112395,latitude
0.090702,sealevel
0.083701,longitude
0.072739,species_CULEX RESTUANS
0.071403,avgspeed
0.070551,resultspeed
0.067269,wetbulb
0.040518,br


In [45]:
submission = pd.DataFrame(gs_dt.predict_proba(scaled_test))

submission['id'] = test.index
submission = submission.rename(columns={1: 'WnvPresent'})
submission.drop(0, axis=1, inplace=True)
submission.columns = ['WnvPresent', 'id']
order = ['id', 'WnvPresent']
submission = submission[order]
submission['id'] = submission['id']+1

In [46]:
submission.to_csv('../submissions/try3.csv', index=False)

In [154]:
gs_dt.fit(X_trl, y_trl)
gs_dt.score(X_trl, y_trl), gs_dt.score(X_tsl, y_tsl)

(0.824736796078298, 0.7486284573709723)

In [155]:
submission = pd.DataFrame(gs_dt.predict_proba(test_scl))

submission['id'] = test.index
submission = submission.rename(columns={1: 'WnvPresent'})
submission.drop(0, axis=1, inplace=True)
submission.columns = ['WnvPresent', 'id']
order = ['id', 'WnvPresent']
submission = submission[order]
submission['id'] = submission['id']+1

submission.to_csv('../submissions/dt_limited.csv', index=False)

## Random Forest

In [158]:
gs_rf = GridSearchCV(RandomForestClassifier(), param_grid={'min_samples_split': [2,3,4],
                                                           'min_samples_leaf': [2, 5],
                                                           'class_weight': ['balanced'],
                                                           'n_estimators': [50, 100, 200, 500]},
                                                            scoring='roc_auc', verbose=1)

In [None]:
gs_rf.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [None]:
gs_rf.score(X_train, y_train), gs_rf.score(X_test, y_test)

In [None]:
gs_rf.best_params_

In [None]:
rf_df = pd.DataFrame(gs_rf.best_estimator_.feature_importances_, X.columns)

In [None]:
rf_df.sort_values(by=0, ascending=False)

In [None]:
submission_1 = pd.DataFrame(gs_rf.predict_proba(test_df))

In [None]:
len(submission_1[0])

In [None]:
submission_1['id'] = test_df.index

submission_1 = submission_1.rename(columns={1: 'WnvPresent'})

submission_1.drop(0, axis=1, inplace=True)

submission_1.columns = ['WnvPresent', 'id']

order = ['id', 'WnvPresent']

submission_1 = submission_1[order]

submission_1.head()

In [None]:
submission_1.to_csv('../submissions/testsubmission1.csv', index=False)