<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 4: Predicting West Nile Virus in Mosquitoes

In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import time
import re
import datetime
import random
import pickle
import geopandas as gpd
from shapely.geometry import Point, Polygon
from matplotlib import dates, cm
from geopy.distance import geodesic

# Show up to 50 columns when a wide DataFrame is displayed
pd.set_option('display.max_columns', 50)
# Turn off pandas' chained-assignment warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'

# Read in data files

In [2]:
# Load the raw training data and preview the first two rows
train = pd.read_csv('../assets/train.csv/train.csv')
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [3]:
# Load the processed spray locations
spray = pd.read_csv('../assets/processed_data/spray_processed.csv')

# Pair every latitude with its longitude; the tuples are used to compute distances later
spray['lat_long'] = [
    (latitude, longitude)
    for latitude, longitude in zip(spray['Latitude'], spray['Longitude'])
]
spray.head()

Unnamed: 0,Latitude,Longitude,lat_long
0,41.981433,-87.787777,"(41.9814333333333, -87.7877766666667)"
1,41.980998,-87.787778,"(41.9809983333333, -87.7877783333333)"
2,41.98056,-87.787762,"(41.98056, -87.7877616666667)"
3,41.980198,-87.787758,"(41.9801983333333, -87.7877583333333)"
4,41.979752,-87.787765,"(41.9797516666667, -87.787765)"


In [4]:
# Summarise column dtypes and non-null counts of the spray data
spray.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14199 entries, 0 to 14198
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Latitude   14199 non-null  float64
 1   Longitude  14199 non-null  float64
 2   lat_long   14199 non-null  object 
dtypes: float64(2), object(1)
memory usage: 332.9+ KB


In [5]:
# Load the processed weather data and convert the Date column to datetime in one chain
weather = (
    pd.read_csv('../assets/processed_data/weather_processed.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
weather.head()

Unnamed: 0,Date,Tavg,DewPoint,WetBulb,PrecipTotal,AvgSpeed
0,2007-05-01,67.5,51.0,56.5,0.0,9.4
1,2007-05-02,51.5,42.0,47.0,0.0,13.4
2,2007-05-03,57.0,40.0,49.0,0.0,12.55
3,2007-05-04,61.25,41.5,50.0,0.0,10.6
4,2007-05-05,60.0,38.5,49.5,0.0,11.75


In [6]:
# Summarise column dtypes and non-null counts of the weather data
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         1472 non-null   datetime64[ns]
 1   Tavg         1472 non-null   float64       
 2   DewPoint     1472 non-null   float64       
 3   WetBulb      1472 non-null   float64       
 4   PrecipTotal  1472 non-null   float64       
 5   AvgSpeed     1472 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 69.1 KB


In [7]:
# Load the processed WNV density table and relabel its three columns positionally
wnv_density = (
    pd.read_csv('../assets/processed_data/wnv_density_processed.csv')
    .set_axis(['year', 'Address', 'wnv_density'], axis='columns')
)
wnv_density.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343 entries, 0 to 342
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   year         343 non-null    int64 
 1   Address      343 non-null    object
 2   wnv_density  343 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 8.2+ KB


# Engineer Features

In [8]:
# Parse the dates once, then derive the calendar month and year from the parsed values
parsed_dates = pd.to_datetime(train['Date'])
train['Date'] = parsed_dates
train['month'] = parsed_dates.dt.month
train['year'] = parsed_dates.dt.year

In [9]:
# Merge in wnv density data on year and trap address.
# validate='many_to_one' makes pandas raise a MergeError if wnv_density ever holds more than
# one row per (year, Address), instead of silently duplicating training rows in the left join.
train_wnv = train.merge(wnv_density, how='left', on=['year', 'Address'], validate='many_to_one')
train_wnv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10506 entries, 0 to 10505
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    10506 non-null  datetime64[ns]
 1   Address                 10506 non-null  object        
 2   Species                 10506 non-null  object        
 3   Block                   10506 non-null  int64         
 4   Street                  10506 non-null  object        
 5   Trap                    10506 non-null  object        
 6   AddressNumberAndStreet  10506 non-null  object        
 7   Latitude                10506 non-null  float64       
 8   Longitude               10506 non-null  float64       
 9   AddressAccuracy         10506 non-null  int64         
 10  NumMosquitos            10506 non-null  int64         
 11  WnvPresent              10506 non-null  int64         
 12  month                   10506 non-null  int64 

In [10]:
# Define a function that checks whether a lat-long coordinate lies within 50 metres of any recorded spray location. Output 1 if so, else 0

def check_within_50m(coord, spray_coords=None, radius_m=50):
    """Return 1 if ``coord`` is within ``radius_m`` metres of any spray point, else 0.

    The check is purely spatial: it compares ``coord`` against every spray coordinate
    and stops at the first one that is close enough.

    Parameters
    ----------
    coord : tuple
        (latitude, longitude) of the location to test.
    spray_coords : iterable of tuple, optional
        (latitude, longitude) pairs of spray points. Defaults to the notebook-level
        ``spray['lat_long']`` column, which keeps ``.apply(check_within_50m)`` working.
    radius_m : float, default 50
        Distance threshold in metres (inclusive).

    Returns
    -------
    int
        1 when at least one spray point lies within ``radius_m`` metres, otherwise 0.
    """
    if spray_coords is None:
        # Fall back to the spray table loaded earlier in the notebook
        spray_coords = spray['lat_long']

    # any() short-circuits on the first spray point inside the radius
    return int(any(
        geodesic(coord, spray_coord).meters <= radius_m
        for spray_coord in spray_coords
    ))

In [11]:
# Many rows in the train dataset share a location, so distances are computed only once
# per distinct lat-long combination to keep the run time down

train_latlong = (
    train[['Latitude', 'Longitude']]
    .drop_duplicates()
    .reset_index(drop=True)
)
train_latlong['lat_long'] = [
    (latitude, longitude)
    for latitude, longitude in zip(train_latlong['Latitude'], train_latlong['Longitude'])
]

In [12]:
# Preview the first distinct location together with its (lat, long) tuple
train_latlong.head(1)

Unnamed: 0,Latitude,Longitude,lat_long
0,41.95469,-87.800991,"(41.95469, -87.800991)"


In [13]:
# Takes a while to run: every distinct location is compared against the spray coordinates
train_latlong['spray_indicator'] = [
    check_within_50m(location) for location in train_latlong['lat_long']
]

In [14]:
# Count the distinct locations in each spray_indicator class (1 = within 50 m of a spray point)
train_latlong.spray_indicator.value_counts()

0    120
1     18
Name: spray_indicator, dtype: int64

Of the 138 distinct locations in the train dataset, 18 lie within 50 metres of at least one recorded spray point.

In [15]:
# Bring the per-location lat_long tuple and spray indicator back onto every training row
train_wnv_spray = train_wnv.merge(
    train_latlong,
    how='left',
    on=['Latitude', 'Longitude'],
)
train_wnv_spray.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10506 entries, 0 to 10505
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    10506 non-null  datetime64[ns]
 1   Address                 10506 non-null  object        
 2   Species                 10506 non-null  object        
 3   Block                   10506 non-null  int64         
 4   Street                  10506 non-null  object        
 5   Trap                    10506 non-null  object        
 6   AddressNumberAndStreet  10506 non-null  object        
 7   Latitude                10506 non-null  float64       
 8   Longitude               10506 non-null  float64       
 9   AddressAccuracy         10506 non-null  int64         
 10  NumMosquitos            10506 non-null  int64         
 11  WnvPresent              10506 non-null  int64         
 12  month                   10506 non-null  int64 

In [16]:
def get_weather(ls_date, weather_df=None):
    '''
    Summarise the weather over the window preceding each date in ``ls_date``.

    For every date the window runs from 31 days before the date up to and including the
    day before it, i.e. 31 calendar days with the date itself excluded. Within that window
    the function computes the mean of Tavg, DewPoint, WetBulb and AvgSpeed, and the sum
    (total instead of average) of PrecipTotal.

    Parameters
    ----------
    ls_date : iterable of dates
        Anything ``pd.to_datetime`` accepts element by element. Repeated dates are
        summarised only once and the result reused.
    weather_df : DataFrame, optional
        Table with columns Date, Tavg, DewPoint, WetBulb, PrecipTotal and AvgSpeed.
        Defaults to the notebook-level ``weather`` DataFrame.

    Returns
    -------
    Five lists (temperature, dew point, wetbulb, precipitation, wind speed), each with one
    entry per input date, in input order. When no weather rows fall in a window the means
    are NaN and the precipitation total is 0.
    '''
    source = weather if weather_df is None else weather_df
    window_end_offset = pd.Timedelta(days=-1)
    window_start_offset = pd.Timedelta(days=-31)

    # Window summaries keyed by date, so each distinct date filters the weather table once
    summaries = {}

    temp_list = []
    dewpt_list = []
    wetbulb_list = []
    precip_list = []
    windspeed_list = []

    for date in ls_date:
        date = pd.to_datetime(date)

        if date not in summaries:
            in_window = (
                (source['Date'] <= date + window_end_offset)
                & (source['Date'] >= date + window_start_offset)
            )
            weather_subset = source[in_window]
            summaries[date] = (
                weather_subset['Tavg'].mean(),
                weather_subset['DewPoint'].mean(),
                weather_subset['WetBulb'].mean(),
                weather_subset['PrecipTotal'].sum(),
                weather_subset['AvgSpeed'].mean(),
            )

        temp, dewpt, wetbulb, precip, windspeed = summaries[date]
        temp_list.append(temp)
        dewpt_list.append(dewpt)
        wetbulb_list.append(wetbulb)
        precip_list.append(precip)
        windspeed_list.append(windspeed)

    return temp_list, dewpt_list, wetbulb_list, precip_list, windspeed_list

In [17]:
# Compute the preceding-window weather summaries for every row's Date (one list per indicator)
avg_temp, avg_dewpt, avg_wetbulb, tot_precip, avg_windspeed = get_weather(train_wnv_spray['Date'])

In [18]:
# Attach the five weather summaries returned by get_weather as new feature columns
weather_features = {
    'avg_temp': avg_temp,
    'avg_dewpt': avg_dewpt,
    'avg_wetbulb': avg_wetbulb,
    'tot_precip': tot_precip,
    'avg_windspeed': avg_windspeed,
}
for feature_name, feature_values in weather_features.items():
    train_wnv_spray[feature_name] = feature_values

In [19]:
# Confirm the weather columns were added and check for missing values
train_wnv_spray.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10506 entries, 0 to 10505
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    10506 non-null  datetime64[ns]
 1   Address                 10506 non-null  object        
 2   Species                 10506 non-null  object        
 3   Block                   10506 non-null  int64         
 4   Street                  10506 non-null  object        
 5   Trap                    10506 non-null  object        
 6   AddressNumberAndStreet  10506 non-null  object        
 7   Latitude                10506 non-null  float64       
 8   Longitude               10506 non-null  float64       
 9   AddressAccuracy         10506 non-null  int64         
 10  NumMosquitos            10506 non-null  int64         
 11  WnvPresent              10506 non-null  int64         
 12  month                   10506 non-null  int64 

In [20]:
# Preview the first three rows with all engineered features
train_wnv_spray.head(3)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,month,year,wnv_density,lat_long,spray_indicator,avg_temp,avg_dewpt,avg_wetbulb,tot_precip,avg_windspeed
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,5,2007,8,"(41.95469, -87.800991)",0,63.4375,44.375,53.410714,1.52,10.2875
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,5,2007,8,"(41.95469, -87.800991)",0,63.4375,44.375,53.410714,1.52,10.2875
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,5,2007,0,"(41.994991, -87.769279)",0,63.4375,44.375,53.410714,1.52,10.2875


In [21]:
# Keep only the modelling features, with the WnvPresent target as the last column

final_columns = [
    'Species',
    'month',
    'wnv_density',
    'spray_indicator',
    'avg_temp',
    'avg_dewpt',
    'avg_wetbulb',
    'tot_precip',
    'avg_windspeed',
    'WnvPresent',
]
train_final = train_wnv_spray[final_columns]
train_final.shape

(10506, 10)

In [22]:
# Preview the final modelling table
train_final.head()

Unnamed: 0,Species,month,wnv_density,spray_indicator,avg_temp,avg_dewpt,avg_wetbulb,tot_precip,avg_windspeed,WnvPresent
0,CULEX PIPIENS/RESTUANS,5,8,0,63.4375,44.375,53.410714,1.52,10.2875,0
1,CULEX RESTUANS,5,8,0,63.4375,44.375,53.410714,1.52,10.2875,0
2,CULEX RESTUANS,5,0,0,63.4375,44.375,53.410714,1.52,10.2875,0
3,CULEX PIPIENS/RESTUANS,5,4,1,63.4375,44.375,53.410714,1.52,10.2875,0
4,CULEX RESTUANS,5,4,1,63.4375,44.375,53.410714,1.52,10.2875,0


In [23]:
# Write the processed training set to disk without the DataFrame index
train_final.to_csv('../assets/processed_data/train_processed.csv', index=False)