# Project 4: Predict West Nile Virus
### Section 5. Model Exploration

## Problem Statement

1. As an employee of Disease And Treatment Agency, division of Societal Cures In Epidemiology and New Creative Engineering (DATA-SCIENCE), we are tasked to better understand the mosquito population and advise on appropriate interventions which are beneficial and cost-effective for the city.


2. Through this project, we hope to:
- Identify features which are most important to predict presence of West Nile Virus (which can be done by ranking the coefficients of each feature in a logistic regression model)
- Predict the probability of West Nile Virus by location to provide decision makers an effective plan to deploy pesticides throughout the city, which consequently can help to reduce cost.

## Import Libraries

In [1]:
#!pip install shapely
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# from shapely import geometry
# from shapely.geometry import Point, Polygon
# import geopandas as gpd
# from datetime import timedelta
# import math

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (confusion_matrix, plot_confusion_matrix, classification_report, 
                             plot_roc_curve, roc_auc_score, accuracy_score, precision_score, 
                             recall_score, f1_score)

## Load Data

In [2]:
# Load datasets
df = pd.read_csv('../data/final_df.csv', index_col='Unnamed: 0')

In [3]:
# Split into train and test (kaggle) data 
train = df[df['dataset']=='train'].copy()
test = df[df['dataset']=='test'].copy()
print(train.shape)
print(test.shape)

(8304, 252)
(43035, 252)


In [4]:
train.drop(columns='dataset', inplace=True)
test.drop(columns='dataset', inplace=True)

In [5]:
train.describe()

Unnamed: 0,latitude,longitude,nummosquitos,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,...,codesum_TSRA BR HZ VCTS,codesum_TSRA FG+ BR HZ,codesum_TSRA RA,codesum_TSRA RA BR,codesum_TSRA RA BR HZ,codesum_TSRA RA BR HZ VCTS,codesum_TSRA RA BR VCTS,codesum_TSRA RA VCTS,codesum_VCTS,wnvpresent
count,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,...,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0,8304.0
mean,41.8458,-87.696229,16.095255,81.248434,62.443401,72.093931,2.591402,59.334056,64.267943,1.050819,...,0.006142,0.0,0.029383,0.037211,0.0,0.0,0.010597,0.0,0.003974,0.055034
std,0.106658,0.08444,69.585928,8.402787,7.802554,7.63033,6.624498,7.977426,6.911066,2.960102,...,0.078132,0.0,0.168889,0.18929,0.0,0.0,0.102402,0.0,0.062918,0.22806
min,41.644612,-87.930995,1.0,57.0,41.0,50.0,-12.0,38.0,47.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,41.750498,-87.752411,2.0,78.0,58.0,69.0,-2.0,54.0,60.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,41.862292,-87.696269,4.0,83.0,64.0,73.0,4.0,59.0,65.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,41.947227,-87.648064,12.0,87.0,69.0,78.0,7.0,67.0,70.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,42.01743,-87.531635,2206.0,97.0,79.0,87.0,20.0,73.0,76.0,15.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8304 entries, 0 to 8303
Columns: 251 entries, latitude to wnvpresent
dtypes: bool(10), float64(14), int64(227)
memory usage: 15.4 MB


## Preparing Train-Test (Kaggle) Data and Further Split Train Data into Train and Holdout

In [7]:
# Split train data into X (all features except wnvpresent) and y (wnvpresent)
features = [col for col in train.columns if col != 'wnvpresent']
X = train[features]
y = train['wnvpresent']

In [None]:
X.columns[X.isna().any()].tolist()

In [8]:
y.value_counts(normalize = True)

0.0    0.944966
1.0    0.055034
Name: wnvpresent, dtype: float64

y is highly inbalance, with only about 6% of the data points having West Nile Virus. Hence, it is important to stratify proportionally to ensure that our train and holdout dataset have about the same proportion of presence and absence of West Nile Virus.

In [14]:
# Further split train data into train and holdout data
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, 
    y,
    stratify = y,
    random_state=42
)

## Model Exploration

### Logistic Regression

In [15]:
# Instantiate model
logreg = LogisticRegression()

# Fit model
logreg.fit(X_train, y_train)

print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [-0.00142603]
Logistic Regression Coefficient: [[-4.74656039e-02  1.02145312e-01  1.12836912e-03  1.09385507e-01
   1.64556480e-01  1.32718804e-01 -4.23415589e-01 -6.17052303e-02
   2.25172064e-03 -1.25473363e-01  9.98878403e-02  2.65034593e-02
  -1.37801633e-02 -4.20597531e-02 -3.40538189e-02 -4.10518589e-02
   3.68005060e-02 -4.30413370e-03 -8.99090461e-03  1.41627565e-02
  -2.85197925e-02 -3.79139801e-02 -3.62143017e-02 -1.03563761e-02
  -1.33495541e-02 -1.09956210e-02  1.81836063e-02 -5.58110563e-04
   0.00000000e+00 -3.58208376e-04  4.08746443e-02 -3.58208376e-04
  -3.28952184e-02 -2.38229214e-02  0.00000000e+00 -8.06548955e-02
   0.00000000e+00 -6.41885967e-02  0.00000000e+00  1.67640117e-01
   0.00000000e+00 -4.46828680e-03 -6.22073377e-03 -1.06136547e-02
  -1.47186390e-02 -2.01694998e-03 -2.86085930e-02  2.12069235e-03
  -2.18986250e-02  1.48970138e-02  3.80656229e-02  2.95780545e-02
   2.94386712e-02  6.02468619e-03  3.13109169e-02  1.90748174e-0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
# Display Coefficients
coefs = pd.DataFrame({'variable':X.columns,
                            'coef':logreg.coef_[0],
                            'abs_coef':np.abs(logreg.coef_[0])
                     })

coefs.sort_values('abs_coef', inplace=True, ascending=False)
coefs.head(20)

Unnamed: 0,variable,coef,abs_coef
6,depart,-0.423416,0.423416
39,year_2013,0.16764,0.16764
4,tmin,0.164556,0.164556
5,tavg,0.132719,0.132719
9,heat,-0.125473,0.125473
3,tmax,0.109386,0.109386
1,longitude,0.102145,0.102145
10,cool,0.099888,0.099888
65,species_CULEX RESTUANS,-0.088209,0.088209
35,year_2009,-0.080655,0.080655


In [13]:
# Summary scores of CountVectorizer and LogisticRegression.
print("LogisticRegression summary of accuracy scores:")
print(f"GridSearchCV best accuracy = {round(gs_pipe.best_score_, 3)}")
print("\nUsing GridSearchCV best params suggested,")
print(f"Training corpus accuracy = {round(gs_pipe.score(X_train, y_train), 3)}")
print(f"Testing corpus accuracy = {round(gs_pipe.score(X_test, y_test), 3)}")

LogisticRegression summary of accuracy scores:


NameError: name 'gs_pipe' is not defined

In [None]:
# Creacting a summary dataframe.
summary_df = pd.DataFrame(columns=[
    'transformer_estimator', 
    'best_score', 
    'train_score',
    'test_score',
    'accuracy', 
    'sensitivity',
    'specificity',
    'best_params', 
])

In [None]:
def model_metrics(gs_pipe, X_test, y_test):
    '''Generates confusion matrix and adds scores to summary_df'''
    #Generate confusion matrix
    y_pred = gs_pipe.predict(X_test)
    confusion_matrix(y_test, # True values.
                     y_pred)  # Predicted values.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel();

    # Adding the scores into summary_df
    summary_df.loc[1] = [
        'CountVec | LogisticRegression',
        round(gs_pipe.best_score_, 3),
        round(metrics.accuracy_score(y_test, y_pred),3),
        round(metrics.recall_score(y_test, y_pred),3),
        round(tn/(tn+fp),3),
        str(gs_pipe.best_params_),
    ]

    # Plot Confusion Matrix
    plot_confusion_matrix(gs_pipe, X_test, y_test, cmap='Blues', 
                          display_labels=['WNV Present', 'WNV Not Present'],
                          normalize='true');  
    plt.title(label="Model 1: Logistic Regression", fontsize=14)
    plt.grid(False)

In [None]:
summary_df

### AUC-ROC Curve

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12,10))
plot_roc_curve(cvec_lr_gs, X_test, y_test, ax=ax, name='LogisticRegression-CVEC(GS)', color='lightgrey')
plot_roc_curve(cvec_nb_gs, X_test, y_test, ax=ax, name='MultinomialNB-CVEC(GS)', color='lightgrey')
plot_roc_curve(tvec_svc_gs, X_test, y_test, ax=ax, name='SupportVectorClassifier-TVEC(GS)', color='lightgrey')
plot_roc_curve(tvec_lr_gs, X_test, y_test, ax=ax, name='LogisticRegression-TVEC(GS)', color='blue')
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random Guess')
plt.legend()