# Import Statements

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

In [2]:
# Mapping packages:
import folium

In [3]:
# import geopandas as gpd

To install geopandas and folium:

`conda install -c conda-forge geopandas`

`conda install -c conda-forge folium `

then:

`jupyter nbextension enable vega --py --sys-prefix`

to enable the necessary notebook extension to use Folium.

# Importing Data

In [4]:
# Don't edit raw dataframes after they have been imported:
train_raw = pd.read_csv('assets/train.csv')
test_raw = pd.read_csv('assets/test.csv', index_col=0)
weather_raw = pd.read_csv('assets/weather.csv')
spray_raw = pd.read_csv('assets/spray.csv')

# These can be the cleaned versions:
train = train_raw.copy().drop(['Address','Block','Street','AddressNumberAndStreet','AddressAccuracy'], axis=1)
test = test_raw.copy().drop(['Address','Block','Street','AddressNumberAndStreet', 'AddressAccuracy'], axis=1)
# If we're using latitude and longitude we can drop out the address info.
weather = weather_raw.copy()
spray = spray_raw.copy()

# Cleaning Data and EDA, train and test

## Simple Map showing the Weather Stations (airports)

In [5]:
m = folium.Map(
    location=[41.883844, -87.632162],
    tiles='openstreetmap',
    zoom_start=10,
    max_zoom = 10,
    min_zoom = 10
)

#folium.tooltip = 'Click Me!' #WARNING: Tooltips are not in this release; use the dev one if you want it

folium.Marker([41.973312, -87.910576], popup='OHare Iternational Airport', icon=folium.Icon(icon='plane', color = 'red')).add_to(m)
folium.Marker([41.786, -87.752], popup='Midway International Airport', icon=folium.Icon(icon='plane', color = 'green')).add_to(m)

m

## Examining the training set:

In [5]:
test['Date'] = pd.to_datetime(test['Date'], format='%Y/%m/%d')
train['Date'] = pd.to_datetime(train['Date'], format='%Y/%m/%d')
train.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,NumMosquitos,WnvPresent
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,1,0
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,1,0
2,2007-05-29,CULEX RESTUANS,T007,41.994991,-87.769279,1,0
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,1,0
4,2007-05-29,CULEX RESTUANS,T015,41.974089,-87.824812,4,0


### Exploring Species Column -- Ben

In [6]:
train['Species'].value_counts() # WNV is transmitted by all mosquitos in the Culex genus.

CULEX PIPIENS/RESTUANS    4752
CULEX RESTUANS            2740
CULEX PIPIENS             2699
CULEX TERRITANS            222
CULEX SALINARIUS            86
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: Species, dtype: int64

[Which mosquitos spread WNV?](https://wwwnc.cdc.gov/eid/article/7/6/01-0617_article)

In [7]:
# In the training dataset, only Culex Pipiens and Culex Restuans carried WNV.
train.groupby(by ='Species')['NumMosquitos','WnvPresent'].mean()

Unnamed: 0_level_0,NumMosquitos,WnvPresent
Species,Unnamed: 1_level_1,Unnamed: 2_level_1
CULEX ERRATICUS,7.0,0.0
CULEX PIPIENS,16.550945,0.088922
CULEX PIPIENS/RESTUANS,13.945286,0.055135
CULEX RESTUANS,8.55146,0.017883
CULEX SALINARIUS,1.686047,0.0
CULEX TARSALIS,1.166667,0.0
CULEX TERRITANS,2.297297,0.0


### Exploring the Trap Column

In [8]:
print(str(train['Trap'].nunique()) + ' traps in the training set \n')
print(str(test['Trap'].nunique()) + ' traps in the test set')

136 traps in the training set 

149 traps in the test set


In [9]:
set([x for x in test['Trap'] if x not in train['Trap']]) # ????

{'T001',
 'T002',
 'T002A',
 'T002B',
 'T003',
 'T004',
 'T005',
 'T006',
 'T007',
 'T008',
 'T009',
 'T011',
 'T012',
 'T013',
 'T014',
 'T015',
 'T016',
 'T017',
 'T018',
 'T019',
 'T025',
 'T027',
 'T028',
 'T030',
 'T031',
 'T033',
 'T034',
 'T035',
 'T036',
 'T037',
 'T039',
 'T040',
 'T043',
 'T044',
 'T045',
 'T046',
 'T047',
 'T048',
 'T049',
 'T050',
 'T051',
 'T054',
 'T054C',
 'T060',
 'T061',
 'T062',
 'T063',
 'T065',
 'T065A',
 'T066',
 'T067',
 'T069',
 'T070',
 'T071',
 'T072',
 'T073',
 'T074',
 'T075',
 'T076',
 'T077',
 'T078',
 'T079',
 'T080',
 'T081',
 'T082',
 'T083',
 'T084',
 'T085',
 'T086',
 'T088',
 'T089',
 'T090',
 'T090A',
 'T090B',
 'T090C',
 'T091',
 'T092',
 'T094',
 'T094B',
 'T095',
 'T096',
 'T097',
 'T099',
 'T100',
 'T102',
 'T103',
 'T107',
 'T114',
 'T115',
 'T128',
 'T128A',
 'T129',
 'T135',
 'T138',
 'T141',
 'T142',
 'T143',
 'T144',
 'T145',
 'T146',
 'T147',
 'T148',
 'T149',
 'T150',
 'T151',
 'T152',
 'T153',
 'T154',
 'T155',
 'T156',
 

# Cleaning Data and EDA, weather -- Steve

NB: Weather data is from May 2007 to the end of October 2014

## Checking for Missing Data and Non-numeric Values

In [10]:
# Formatting datetime
weather['Date'] = pd.to_datetime(weather['Date'], format='%Y/%m/%d')

In [11]:
# Replacing missing with Null and only keep certain columns:
weather = weather[['Station', 'Date', 'Tmax', 'Tmin', 'Tavg',
                   'Depart', 'DewPoint', 'WetBulb', 'PrecipTotal','Sunrise','Sunset']].replace('M', np.NaN)
# Sunrise and sunset data is only available from station. That is fine. Replace '-' from that station with Null.
weather = weather[['Station', 'Date', 'Tmax', 'Tmin', 'Tavg',
                   'Depart', 'DewPoint', 'WetBulb', 'PrecipTotal','Sunrise','Sunset']].replace('-', np.NaN)
# Sunrise and sunset data is only available from station. That is fine. Replace '-' from that station with Null.
weather = weather[['Station', 'Date', 'Tmax', 'Tmin', 'Tavg',
                   'Depart', 'DewPoint', 'WetBulb', 'PrecipTotal','Sunrise','Sunset']].replace('  T', .01)

In [16]:
weather_consolidated = weather.groupby(by='Date').agg(lambda x: np.nanmean(pd.to_numeric(x)))
weather_consolidated.drop(['Station','Sunrise','Sunset'], axis = 1, inplace=True)

In [17]:
# No null values in our averaged and consolidated data set!
weather_consolidated.isnull().sum()

Tmax           0
Tmin           0
Tavg           0
Depart         0
DewPoint       0
WetBulb        0
PrecipTotal    0
dtype: int64

## Labeling Weather Stations -- Steve

In [14]:
weather['Station'] = weather['Station'].astype(str)

#Converting to string so I can run the Map function below.

weather["Station Location"] = weather["Station"].map(lambda x: "O'HARE" if '1' in x else "MIDWAY" if '2' in x else "")

In [15]:
weather['Station'] = weather['Station'].astype(int)
#And back to integer now for math usage

# O'Hare: 41.970748, -87.908336
# Midway: 41.787957, -87.752359

weather["Latitude"] = weather["Station Location"].map(lambda x: "41.970748" if "O'HARE" in x else "41.787957" if "MIDWAY" in x else "")
weather["Longitude"] = weather["Station Location"].map(lambda x: "-87.908336" if "O'HARE" in x else "-87.752359" if "MIDWAY" in x else "")

In [16]:
#Mapping latitude and longitude for the airports

weather["Latitude"] = weather["Latitude"].astype(float)
weather["Longitude"] = weather["Longitude"].astype(float)

#Converting the string entries of location back into floats for math usage

In [17]:
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,PrecipTotal,Sunrise,Sunset,Station Location,Latitude,Longitude
0,1,2007-05-01,83,50,67,14.0,51,56,0.0,448.0,1849.0,O'HARE,41.970748,-87.908336
1,2,2007-05-01,84,52,68,,51,57,0.0,,,MIDWAY,41.787957,-87.752359
2,1,2007-05-02,59,42,51,-3.0,42,47,0.0,447.0,1850.0,O'HARE,41.970748,-87.908336
3,2,2007-05-02,60,43,52,,42,47,0.0,,,MIDWAY,41.787957,-87.752359
4,1,2007-05-03,66,46,56,2.0,40,48,0.0,446.0,1851.0,O'HARE,41.970748,-87.908336


In [18]:
weather2011 = weather.loc[(weather['Date'] > '2011-05-01') & (weather['Date'] <= '2011-10-31')]
weather2013 = weather.loc[(weather['Date'] > '2013-05-01') & (weather['Date'] <= '2013-10-31')]

#Making a subset of the weather data over just these years as thats all the spray data we have.

weather2011 = weather2011.reset_index(drop=True)
weather2013 = weather2013.reset_index(drop=True)

# Cleaning Data and EDA, spray

## Missing data and the like:

In [18]:
spray.drop('Time',axis = 1, inplace=True)

In [19]:
spray['Date'] = pd.to_datetime(spray['Date'], format='%Y/%m/%d')

## Steve's work mapping the spray data:

In [20]:
spray.head()

Unnamed: 0,Date,Latitude,Longitude
0,2011-08-29,42.391623,-88.089163
1,2011-08-29,42.391348,-88.089163
2,2011-08-29,42.391022,-88.089157
3,2011-08-29,42.390637,-88.089158
4,2011-08-29,42.39041,-88.088858


In [22]:
spray['CombinedLoc'] = spray.Latitude.astype(str).str.cat(spray.Longitude.astype(str), sep=', ')

spray.Date.dtype

spray['Date'] = pd.to_datetime(spray['Date'], format='%Y/%m/%d')

#2011: 08-29 to 09/07
#2013: 07-17 to 09/05

spray2011 = spray.loc[(spray['Date'] > '2011-08-29') & (spray['Date'] <= '2011-09-07')]
spray2013 = spray.loc[(spray['Date'] > '2013-07-17') & (spray['Date'] <= '2013-09-05')]

spray2011 = spray2011.reset_index(drop=True)
spray2013 = spray2013.reset_index(drop=True)

In [23]:
spray2011.head()

Unnamed: 0,Date,Latitude,Longitude,CombinedLoc
0,2011-09-07,41.981433,-87.787777,"41.9814333333, -87.7877766667"
1,2011-09-07,41.980998,-87.787778,"41.9809983333, -87.7877783333"
2,2011-09-07,41.98056,-87.787762,"41.98056, -87.7877616667"
3,2011-09-07,41.980198,-87.787758,"41.9801983333, -87.7877583333"
4,2011-09-07,41.979752,-87.787765,"41.9797516667, -87.787765"


### Spraying locations in 2011:

In [24]:
from folium import plugins
from folium.plugins import MarkerCluster

location2011 = spray2011['Latitude'].mean(), spray2011['Longitude'].mean()
# locationlist2011 = spray2011[["Latitude","Longitude"]].values.tolist()

map2011 = (folium.Map(location=location2011,zoom_start=13))

for index,row in spray2011.iterrows():
    folium.CircleMarker([row['Latitude'], row['Longitude']],
                        radius=3,
                        fill = True,
                        color = "#4286f4", 
                        fill_color="#4286f4"
                       ).add_to(map2011)

map2011.save('2011 Spray.html') 

### Spraying locations in 2013:

In [25]:
map2013 = (folium.Map(location=[41.977049, -87.768738],zoom_start=11))

for index,row2 in spray2013.iterrows():
    folium.CircleMarker([row2['Latitude'], row2['Longitude']],
                       radius=3,
                        fill = True,
                        color = "#b22323", 
                        fill_color="#b22323", # divvy color
                       ).add_to(map2013)

map2013.save('2013 Spray.html')

### All Spraying Locations (2011 & 2013), clustered together

In [26]:
masterspraycluster = (folium.Map(location=[41.843667, -87.803933],zoom_start=10))

marker_cluster = MarkerCluster().add_to(masterspraycluster)

for index,row in spray2011.iterrows():
    folium.CircleMarker([row['Latitude'], row['Longitude']], 
                        radius=3,
                        fill = True,
                        color = "#38ff55", 
                        fill_color="#38ff55").add_to(marker_cluster)
                       

for index, row2 in spray2013.iterrows():
    folium.CircleMarker([row2['Latitude'], row2['Longitude']], 
                        radius=3,
                        fill=True, 
                        color="#ff0000", 
                        fill_color= "#ff0000").add_to(marker_cluster)

masterspraycluster.add_child(folium.LatLngPopup())
masterspraycluster.save('Master Spray (with Clusters).html')

### All Spraying Locations (2011 & 2013), Raw

In [28]:
masterspray = (folium.Map(location=[41.843667, -87.803933],zoom_start=10))

for index,row in spray2011.iterrows():
    folium.CircleMarker([row['Latitude'], row['Longitude']], 
                        radius=3,
                        fill = True,
                        color = "#38ff55", 
                        fill_color="#38ff55").add_to(masterspray)
                       

for index, row2 in spray2013.iterrows():
    folium.CircleMarker([row2['Latitude'], row2['Longitude']], 
                        radius=3,
                        fill=True, 
                        color="#ff0000", 
                        fill_color= "#ff0000").add_to(masterspray)

masterspray.add_child(folium.LatLngPopup())
masterspray.save('Master Spray.html')

In [None]:
# markerspray = (folium.Map(location=[41.843667, -87.803933],zoom_start=10))

# locationlist2011 = spray2011[["Latitude","Longitude"]].values.tolist()
# locationlist2013 = spray2013[["Latitude","Longitude"]].values.tolist()
# icon2011 = folium.Icon(color='red',icon='ok-sign')
# icon2013 = folium.Icon(color='green',icon='ok-sign')

# for index,row in spray2011.iterrows():
#     folium.Marker([row['Latitude'], row['Longitude']], icon=icon2011,
#                   popup=str(row['Date'], 
#                   )).add_to(markerspray)
                       

# for index, row2 in spray2013.iterrows():
#     folium.Marker([row2['Latitude'], row2['Longitude']], icon=icon2013, 
#                   popup=str(row2['Date'] 
#                   )).add_to(markerspray)
    
# markerspray.save('Marker Spray.html')
   

    
#This Currently does not work.  I'll try and figure it out.

# Merging Train/Test Data and Weather Data

In [21]:
two_week_rolling_average = weather_consolidated.rolling(window=14, min_periods=1).mean().reset_index()
three_days_rolling_average = weather_consolidated.rolling(window=7, min_periods=1).mean().reset_index()

In [22]:
two_week_rolling_average.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,PrecipTotal
0,2007-05-01,83.5,51.0,67.5,14.0,51.0,56.5,0.0
1,2007-05-02,71.5,46.75,59.5,5.5,46.5,51.75,0.0
2,2007-05-03,69.833333,46.833333,58.666667,4.333333,44.333333,50.833333,0.0
3,2007-05-04,70.375,47.625,58.5,4.25,43.625,50.625,0.00125
4,2007-05-05,69.5,48.8,58.8,4.4,42.6,50.4,0.003


In [23]:
train = train.merge(two_week_rolling_average, on='Date')
test = test.merge(two_week_rolling_average, on='Date')
train = train.merge(three_days_rolling_average, on='Date', suffixes = ('_2wks','_1wk'))
test = train.merge(three_days_rolling_average, on='Date', suffixes = ('_2wks','_1wk'))

In [24]:
train.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,NumMosquitos,WnvPresent,Tmax_2wks,Tmin_2wks,Tavg_2wks,...,DewPoint_2wks,WetBulb_2wks,PrecipTotal_2wks,Tmax_1wk,Tmin_1wk,Tavg_1wk,Depart_1wk,DewPoint_1wk,WetBulb_1wk,PrecipTotal_1wk
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,1,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,1,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714
2,2007-05-29,CULEX RESTUANS,T007,41.994991,-87.769279,1,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,1,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714
4,2007-05-29,CULEX RESTUANS,T015,41.974089,-87.824812,4,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714


# Merging Train/Test Data with Spray Data

In [25]:
spray.head()

Unnamed: 0,Date,Latitude,Longitude
0,2011-08-29,42.391623,-88.089163
1,2011-08-29,42.391348,-88.089163
2,2011-08-29,42.391022,-88.089157
3,2011-08-29,42.390637,-88.089158
4,2011-08-29,42.39041,-88.088858


In [26]:
train.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,NumMosquitos,WnvPresent,Tmax_2wks,Tmin_2wks,Tavg_2wks,...,DewPoint_2wks,WetBulb_2wks,PrecipTotal_2wks,Tmax_1wk,Tmin_1wk,Tavg_1wk,Depart_1wk,DewPoint_1wk,WetBulb_1wk,PrecipTotal_1wk
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,1,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,1,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714
2,2007-05-29,CULEX RESTUANS,T007,41.994991,-87.769279,1,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,1,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714
4,2007-05-29,CULEX RESTUANS,T015,41.974089,-87.824812,4,0,76.107143,52.75,64.678571,...,45.357143,54.357143,0.069286,79.0,58.0,68.714286,5.714286,50.928571,58.857143,0.110714


In [27]:
def recent_spray(row, lat_long_dist, days):
    '''Function to determine if, for a given row in training or test set, there has been a spray within
    a certain lat_long_dist and specified number of days'''
    # Using difference in latitude and longitude as a metric of distance is not quite right  given
    # how map projections work but in the interest of time we may want to stick with this. We can circle back later.
    
    # define our spray conditionals
    conDate = spray['Date'] < row['Date']
    conRange = spray['Date'] > (row['Date'] - datetime.timedelta(days=days))
    conLat = np.abs(row['Latitude'] - spray['Latitude']) < lat_long_dist
    conLon = np.abs(row['Longitude'] - spray['Longitude']) < lat_long_dist
    conditions = conDate & conRange & conLat & conLon
    
    recent_sprays = spray[conditions]
    # Returns the number of rows in the spray dataframe where each condition is true for the row provided
        
    # test prints to see what each condition passed in the first recent_sprays definition
    #print(sum(row['Date'] > spray['Date']))
    #print(str(row['Date'] - spray['Date'] < datetime.timedelta(days=days)))
    #print(str(np.abs(row['Latitude'] - spray['Latitude'])))
    #print(str(np.abs(row['Longitude'] - spray['Longitude'])))
    #return recent_sprays
    
    return recent_sprays.shape[0]

In [28]:
train['Sprayed_lastweek_close'] = [recent_spray(row[1], .005, 7) for row in train.iterrows()]
train['Sprayed_lastweek_far'] = [recent_spray(row[1], .2, 7) for row in train.iterrows()]

In [29]:
train['Sprayed_last2weeks_close'] = [recent_spray(row[1], .005, 14) for row in train.iterrows()]
train['Sprayed_last2weeks_far'] = [recent_spray(row[1], .2, 14) for row in train.iterrows()]

In [30]:
test['Sprayed_lastweek_close'] = [recent_spray(row[1], .005, 7) for row in test.iterrows()]
test['Sprayed_lastweek_far'] = [recent_spray(row[1], .2, 7) for row in test.iterrows()]

In [31]:
test['Sprayed_last2weeks_close'] = [recent_spray(row[1], .005, 14) for row in test.iterrows()]
test['Sprayed_last2weeks_far'] = [recent_spray(row[1], .2, 14) for row in test.iterrows()]

# Merging All Years Together: Turning Date Column in to 'Week Of' Column

In [32]:
train['Week'] = train['Date'].dt.week
test['Week'] = test['Date'].dt.week

# Elevation Data

In [None]:
train_elev = pd.read_csv('assets/         ')
test_elev = pd.read_csv('assets/         ')

In [152]:
train = train.merge(train_elev, on='Trap')
test = test.merge(test_elev, on='Trap')

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,NumMosquitos,WnvPresent,Tmax_2wks,Tmin_2wks,Tavg_2wks,...,Tavg_1wk,Depart_1wk,DewPoint_1wk,WetBulb_1wk,PrecipTotal_1wk,Sprayed_lastweek_close,Sprayed_lastweek_far,Sprayed_last2weeks_close,Sprayed_last2weeks_far,Week
0,2007-05-29,CULEX PIPIENS/RESTUANS,T002,41.95469,-87.800991,1,0,76.107143,52.75,64.678571,...,68.714286,5.714286,50.928571,58.857143,0.110714,0,0,0,0,22
1,2007-05-29,CULEX RESTUANS,T002,41.95469,-87.800991,1,0,76.107143,52.75,64.678571,...,68.714286,5.714286,50.928571,58.857143,0.110714,0,0,0,0,22
2,2007-05-29,CULEX RESTUANS,T007,41.994991,-87.769279,1,0,76.107143,52.75,64.678571,...,68.714286,5.714286,50.928571,58.857143,0.110714,0,0,0,0,22
3,2007-05-29,CULEX PIPIENS/RESTUANS,T015,41.974089,-87.824812,1,0,76.107143,52.75,64.678571,...,68.714286,5.714286,50.928571,58.857143,0.110714,0,0,0,0,22
4,2007-05-29,CULEX RESTUANS,T015,41.974089,-87.824812,4,0,76.107143,52.75,64.678571,...,68.714286,5.714286,50.928571,58.857143,0.110714,0,0,0,0,22


# Begin modeling

In [33]:
print([x for x in test.columns if x not in train.columns])
print([x for x in train.columns if x not in test.columns])

['Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'PrecipTotal']
[]


In [34]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn import metrics


Using Theano backend.


In [35]:
from sklearn.model_selection import train_test_split

In [141]:
def build_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(32, input_dim=input_dim))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))

    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    
    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))

    model.add(Dense(output_dim))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model

In [142]:
from sklearn.preprocessing import StandardScaler

In [143]:
from sklearn.metrics import confusion_matrix

In [144]:
scaler = StandardScaler()

In [145]:
X = train.drop(['Species','Trap','Date','NumMosquitos','WnvPresent'], axis = 1)
# X = X.drop(['Sprayed_lastweek_close','Sprayed_lastweek_far', 'Sprayed_last2weeks_close',
#        'Sprayed_last2weeks_far'], axis = 1)
# Keep or drop spray data?
X = scaler.fit_transform(X)
y = train['WnvPresent']
y = np_utils.to_categorical(y)


In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [147]:
X_train = np.array(X_train)
X_train.shape

(7879, 21)

In [148]:
y_train = np.array(y_train)
y_train.shape

(7879, 2)

In [149]:
model = build_model(X_train.shape[1], 2)

In [150]:
model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x27a66714cf8>

In [151]:
y_pred = model.predict(np.array(X_test))

y_pred = [ob[0] for ob in y_pred]

y_test2 = [ob[0] for ob in y_test]

thresh = .9

y_pred2 = [1 if ob > thresh else 0 for ob in y_pred]

y_test2[:5]

confusion_matrix(y_test2, y_pred2)

metrics.roc_auc_score(y_test2, y_pred2)

0.72932811307224321

In [56]:
train.columns

Index(['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 'NumMosquitos',
       'WnvPresent', 'Tmax_2wks', 'Tmin_2wks', 'Tavg_2wks', 'Depart_2wks',
       'DewPoint_2wks', 'WetBulb_2wks', 'PrecipTotal_2wks', 'Tmax_1wk',
       'Tmin_1wk', 'Tavg_1wk', 'Depart_1wk', 'DewPoint_1wk', 'WetBulb_1wk',
       'PrecipTotal_1wk', 'Sprayed_lastweek_close', 'Sprayed_lastweek_far',
       'Sprayed_last2weeks_close', 'Sprayed_last2weeks_far', 'Week'],
      dtype='object')

In [223]:
from sklearn.ensemble import RandomForestClassifier

In [253]:
X = train.drop(['Species','Trap','Date','NumMosquitos','WnvPresent','Station'], axis = 1)
y = train['WnvPresent']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [241]:
rf = RandomForestClassifier(10, max_depth = 30)

rf = rf.fit(X_train, y_train)

rf.score(X_test, y_test)

0.94328130947849254

In [242]:
metrics.roc_auc_score(y_test, rf.predict(X_test))

0.5493227096807709

In [None]:
from sklearn.metrics import fbeta_score, make_scorer
>>> ftwo_scorer = make_scorer(fbeta_score, beta=2)
# >>> from sklearn.model_selection import GridSearchCV
>>> from sklearn.svm import LinearSVC
>>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=ftwo_scorer)

In [247]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer

In [257]:
rf_grid = GridSearchCV(rf, param_grid={'n_estimators': [10, 50], 'max_depth': [30, 50]},
                       scoring=make_scorer(metrics.roc_auc_score))

In [258]:
rf_grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50], 'max_depth': [30, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(roc_auc_score), verbose=0)

In [259]:
metrics.roc_auc_score(y_test, rf_grid.predict(X_test))

0.56448651368294422