# Modeling

We're going to try a few different tactics to see what gets us the best results.

1. Models based on a limited feature set, with and without PCA
1. Models based on the full feature set, with and without PCA (for dimensionality reduction)

## Importing modules and data

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

%matplotlib inline
sns.set_style('darkgrid')

In [3]:
# Seed every RNG source this notebook imports so stochastic steps are repeatable
# under Restart & Run All. NumPy's global RNG is what scikit-learn falls back to
# when no `random_state` is given; Python's built-in `random` module keeps its
# own independent state and must be seeded separately.
np.random.seed(32)
random.seed(32)

In [4]:
# Load the training table. The file name suggests trap, weather and spray data
# were merged upstream — TODO confirm against the notebook that writes this CSV.
df = pd.read_csv('../data/train_weather_spray_merged.csv')

In [5]:
# One-hot encode the `species` column; all other columns of `df` are carried over as-is.
df_dummied = pd.get_dummies(df, columns=['species'])

In [6]:
# Load the hold-out table that is transformed alongside the training data below.
# It is not passed through `pd.get_dummies` in this notebook; the limited-feature
# section selects 'species_CULEX PIPIENS' from it directly, so it presumably
# already carries one-hot species columns — TODO confirm upstream.
test = pd.read_csv('../data/test_merged.csv')

## Creating Validation Set, Scaling

In [7]:
# Columns removed before modelling; whatever remains (minus the target
# `wnvpresent`) is used as a predictor further down.
cols_to_drop = [
    'date', 'address', 'block', 'street', 'addressnumberandstreet',
    'nummosquitos', 'sunrise', 'sunset', 'trap',
]

# Rebind instead of mutating in place, and tolerate already-removed columns, so
# re-running this cell is a no-op rather than a KeyError. `columns=` already
# identifies the axis, so the redundant `axis=1` is omitted.
df_dummied = df_dummied.drop(columns=cols_to_drop, errors='ignore')

In [8]:
# Predictor names: every column still present in `df_dummied` other than the target.
features = [name for name in df_dummied.columns if name != 'wnvpresent']

In [9]:
# Inspect the columns left after the drop; all of them except `wnvpresent` are in `features`.
df_dummied.columns

Index(['latitude', 'longitude', 'addressaccuracy', 'wnvpresent',
       'spray_nearby', 'station', 'tmax', 'tmin', 'tavg', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'preciptotal', 'stnpressure', 'sealevel',
       'resultspeed', 'resultdir', 'avgspeed', 'tsra', 'sn', 'br', 'vcfg',
       'bcfg', 'hz', 'ra', 'dz', 'gr', 'mifg', 'sq', 'fg', 'ts', 'fg+', 'vcts',
       'fu', 'species_CULEX OTHER', 'species_CULEX PIPIENS',
       'species_CULEX PIPIENS/RESTUANS', 'species_CULEX RESTUANS'],
      dtype='object')

In [10]:
# Target vector is the `wnvpresent` column; design matrix is the predictor columns
# listed in `features`, in that order.
y = df_dummied['wnvpresent']
X = df_dummied.loc[:, features]

In [11]:
# Hold out a validation split (scikit-learn's default test size is used).
# - `random_state` pins the shuffle to this cell, so the split no longer depends
#   on how much of the global NumPy RNG was consumed before it ran (re-running
#   the cell alone used to produce a different split). 32 matches the global
#   seed set at the top of the notebook.
# - `stratify=y` keeps the `wnvpresent` class proportions equal in both parts;
#   presumably the positive class is rare, which makes this matter — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=32, stratify=y
)

In [12]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the validation split and the hold-out table.
# NOTE: X_train / X_test are overwritten with scaled arrays, so this cell must be
# run exactly once after the split; re-running it would refit `ss` on values
# that are already scaled.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Select the fitted feature columns explicitly, in training order, rather than
# passing the raw hold-out frame: any extra or re-ordered column in `test` would
# otherwise be scaled with another column's statistics or rejected by the scaler.
# Assumes `test` contains every name in `features` — a KeyError here means the
# hold-out table was prepared differently from the training table.
scaled_test = ss.transform(test[features])

In [13]:
# Reduce the standardised features to 7 principal components. The projection is
# fitted on the training split only, then applied to the validation split and
# the hold-out table.
# `random_state` is set because `svd_solver='auto'` may select the randomized
# solver (depending on data shape and scikit-learn version), which would
# otherwise draw from the global RNG; it has no effect on the exact solvers.
# NOTE: X_train / X_test / scaled_test are overwritten with their projections,
# so run this cell exactly once, directly after the scaling cell.
pca = PCA(n_components=7, random_state=32)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

scaled_test = pca.transform(scaled_test)

## Making dataset without using all features

In [140]:
# Reduced dataset: location, daily temperature extremes and species, plus the target.
limited_cols = ['latitude', 'longitude', 'tmax', 'tmin', 'species', 'wnvpresent']
limited_df = df[limited_cols]

# Predictors are the limited columns without the target.
lim_features = [name for name in limited_df.columns if name != 'wnvpresent']

# One-hot encode the predictors, then discard these three species indicators so
# that 'species_CULEX PIPIENS' is the only species column kept.
X_limited = pd.get_dummies(df[lim_features]).drop(
    columns=[
        'species_CULEX OTHER',
        'species_CULEX PIPIENS/RESTUANS',
        'species_CULEX RESTUANS',
    ]
)
y_limited = limited_df['wnvpresent']

In [141]:
# Validation split for the limited feature set (scikit-learn's default test size).
# - `random_state` makes the split independent of the global NumPy RNG position,
#   so re-running this cell alone reproduces the same partition. 32 matches the
#   global seed set at the top of the notebook.
# - `stratify=y_limited` keeps the `wnvpresent` class proportions equal in both
#   parts; presumably the positive class is rare — TODO confirm.
X_trl, X_tsl, y_trl, y_tsl = train_test_split(
    X_limited, y_limited, random_state=32, stratify=y_limited
)

In [142]:
# Columns to pull from the hold-out frame for the limited model; of the species
# indicators only the CULEX PIPIENS one is used.
lim_features_test = [
    'latitude',
    'longitude',
    'tmax',
    'tmin',
    'species_CULEX PIPIENS',
]

In [143]:
# Restrict the hold-out table to the limited feature columns, in the order given by `lim_features_test`.
test_limited = test[lim_features_test]

In [144]:
# Columns of the limited hold-out frame; compare with `X_limited.columns` in the next cell.
test_limited.columns

Index(['latitude', 'longitude', 'tmax', 'tmin', 'species_CULEX PIPIENS'], dtype='object')

In [145]:
# Columns of the limited training frame. The scaler fitted on this layout is also
# applied to `test_limited`, so the two listings need to agree in name and order.
X_limited.columns

Index(['latitude', 'longitude', 'tmax', 'tmin', 'species_CULEX PIPIENS'], dtype='object')

In [146]:
# Standardise the limited feature set: learn mean/variance from the training
# split, then apply that same transform to all three frames.
ss_limited = StandardScaler()
ss_limited.fit(X_trl)
X_trl, X_tsl, test_scl = (
    ss_limited.transform(frame) for frame in (X_trl, X_tsl, test_limited)
)