# West Nile Virus

## Imports

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.tools as tls

init_notebook_mode(connected=True)

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
spray = pd.read_csv('../input/spray.csv')
weather = pd.read_csv('../input/weather.csv')

## Data transformation

In [3]:
spray_datetime = spray.copy()
spray_datetime['DateTime'] = pd.to_datetime(spray_datetime['Date'] + ' ' + spray_datetime['Time'], utc=True)

In [4]:
spray_datetime['Year'] = spray_datetime['Date'].map(lambda string: string[0:4])
spray_datetime['Month'] = spray_datetime['Date'].map(lambda string: string[5:7])
spray_datetime['Day'] = spray_datetime['Date'].map(lambda string: string[8:10])
spray_datetime = spray_datetime[spray_datetime['Year'] == '2013']
spray_datetime = spray_datetime[spray_datetime['Month'] == '08']
spray_datetime = spray_datetime[spray_datetime['Day'] == '15']
spray_datetime.drop(columns=['Year', 'Month', 'Day'], inplace=True)

In [5]:
spray_datetime.to_csv('./../input/transformed_input/spray_datetime.csv')

### Train lite

In [6]:
train_lite = train[['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 
                    'AddressAccuracy', 'NumMosquitos', 'WnvPresent']]

In [7]:
train_lite['Species'].value_counts()

CULEX PIPIENS/RESTUANS    4752
CULEX RESTUANS            2740
CULEX PIPIENS             2699
CULEX TERRITANS            222
CULEX SALINARIUS            86
CULEX TARSALIS               6
CULEX ERRATICUS              1
Name: Species, dtype: int64

Mosquito trap observations are capped at 50 mosquitoes. We need to find multiple trap observations for the same date and group them together. When aggregating species, we'll prioritize `CULEX PIPIENS/RESTUANS`, then `CULEX PIPIENS`, then `CULEX RESTUANS`, and finally other species.

In [8]:
def consolidate_species(series):
    elements = set(series)
    if 'CULEX PIPIENS/RESTUANS' in elements:
        return 'CULEX PIPIENS/RESTUANS'
    elif ('CULEX PIPIENS' in elements) and ('CULEX RESTUANS' in elements):
        return 'CULEX PIPIENS/RESTUANS'
    elif 'CULEX PIPIENS' in elements:
        return 'CULEX PIPIENS'
    elif 'CULEX RESTUANS' in elements:
        return 'CULEX RESTUANS'
    else:
        return 'OTHER'

In [9]:
train_lite = train_lite.groupby(['Date', 'Trap']).agg(
    {
        'Species': consolidate_species,
        'Latitude': np.mean, 
        'Longitude': np.mean,
        'AddressAccuracy': np.mean,
        'NumMosquitos': np.sum, 
        'WnvPresent': np.max,
        'NumMosquitos': np.sum
    }).reset_index()

In [10]:
train_lite.head()

Unnamed: 0,Date,Trap,Species,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,T002,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,2,0
1,2007-05-29,T007,CULEX RESTUANS,41.994991,-87.769279,9,1,0
2,2007-05-29,T015,CULEX PIPIENS/RESTUANS,41.974089,-87.824812,8,5,0
3,2007-05-29,T045,CULEX RESTUANS,41.9216,-87.666455,8,2,0
4,2007-05-29,T046,CULEX RESTUANS,41.891118,-87.654491,8,1,0


In [11]:
train.shape

(10506, 12)

In [12]:
train_lite.shape

(4616, 8)

By merging traps for the same date, we've been able to significantly reduce the number of "superfluous" observations, as well as slightly reduce class imbalance.

In [13]:
train['WnvPresent'].value_counts(normalize=True)

0    0.947554
1    0.052446
Name: WnvPresent, dtype: float64

In [14]:
train_lite['WnvPresent'].value_counts(normalize=True)

0    0.916594
1    0.083406
Name: WnvPresent, dtype: float64

We'll also remove `OTHER` species when creating dummy variables.

In [15]:
train_lite['Species'].value_counts()

CULEX PIPIENS/RESTUANS    3876
CULEX RESTUANS             456
CULEX PIPIENS              258
OTHER                       26
Name: Species, dtype: int64

In [16]:
train_lite['CULEX PIPIENS'] = train_lite['Species'].map(
    lambda species: 1 if species in ['CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS'] else 0)

train_lite['CULEX RESTUANS'] = train_lite['Species'].map(
    lambda species: 1 if species in ['CULEX RESTUANS', 'CULEX PIPIENS/RESTUANS'] else 0)

In [17]:
train_lite.head()

Unnamed: 0,Date,Trap,Species,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,CULEX PIPIENS,CULEX RESTUANS
0,2007-05-29,T002,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,2,0,1,1
1,2007-05-29,T007,CULEX RESTUANS,41.994991,-87.769279,9,1,0,0,1
2,2007-05-29,T015,CULEX PIPIENS/RESTUANS,41.974089,-87.824812,8,5,0,1,1
3,2007-05-29,T045,CULEX RESTUANS,41.9216,-87.666455,8,2,0,0,1
4,2007-05-29,T046,CULEX RESTUANS,41.891118,-87.654491,8,1,0,0,1


Finally, we drop redundant features.

In [18]:
train_lite = train_lite[['Date', 'CULEX PIPIENS', 'CULEX RESTUANS', 'Latitude', 'Longitude',
                         'NumMosquitos', 'WnvPresent']]

In [19]:
train_lite.head()

Unnamed: 0,Date,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,NumMosquitos,WnvPresent
0,2007-05-29,1,1,41.95469,-87.800991,2,0
1,2007-05-29,0,1,41.994991,-87.769279,1,0
2,2007-05-29,1,1,41.974089,-87.824812,5,0
3,2007-05-29,0,1,41.9216,-87.666455,2,0
4,2007-05-29,0,1,41.891118,-87.654491,1,0


### Adding spray effects

Spraying effectiveness decreases with the inverse square law. We will sum the effects of sprays which are up to 400 meters away, and will only consider a spray's effect for up to 3 days.

In [20]:
spray.head()

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858


In [21]:
spray['Date'] = pd.to_datetime(spray['Date'])
train_lite['Date'] = pd.to_datetime(train_lite['Date'])

In [22]:
train_lite['Date-1'] = pd.DatetimeIndex(train_lite['Date']) - pd.DateOffset(1)
train_lite['Date-2'] = pd.DatetimeIndex(train_lite['Date']) - pd.DateOffset(2)
train_lite['Date-3'] = pd.DatetimeIndex(train_lite['Date']) - pd.DateOffset(3)

In [23]:
train_lite.head()

Unnamed: 0,Date,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,NumMosquitos,WnvPresent,Date-1,Date-2,Date-3
0,2007-05-29,1,1,41.95469,-87.800991,2,0,2007-05-28,2007-05-27,2007-05-26
1,2007-05-29,0,1,41.994991,-87.769279,1,0,2007-05-28,2007-05-27,2007-05-26
2,2007-05-29,1,1,41.974089,-87.824812,5,0,2007-05-28,2007-05-27,2007-05-26
3,2007-05-29,0,1,41.9216,-87.666455,2,0,2007-05-28,2007-05-27,2007-05-26
4,2007-05-29,0,1,41.891118,-87.654491,1,0,2007-05-28,2007-05-27,2007-05-26


In [24]:
from geopy.distance import geodesic

In [25]:
# Distance in meters from the center of Chicago to 0.005 degrees West
geodesic((41.87, -87.62), (41.87, -87.625)).m

415.09589763727826

In [26]:
train_lite['SprayPowerDay0'] = 0 # sprays on the day of sample collection
train_lite['SprayPowerDay1'] = 0 # sprays on the day before sample collection
train_lite['SprayPowerDay2'] = 0 # sprays two days before sample collection
train_lite['SprayPowerDay3'] = 0 # sprays three days before sample collection

In [27]:
train_lite.head()

Unnamed: 0,Date,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,NumMosquitos,WnvPresent,Date-1,Date-2,Date-3,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3
0,2007-05-29,1,1,41.95469,-87.800991,2,0,2007-05-28,2007-05-27,2007-05-26,0,0,0,0
1,2007-05-29,0,1,41.994991,-87.769279,1,0,2007-05-28,2007-05-27,2007-05-26,0,0,0,0
2,2007-05-29,1,1,41.974089,-87.824812,5,0,2007-05-28,2007-05-27,2007-05-26,0,0,0,0
3,2007-05-29,0,1,41.9216,-87.666455,2,0,2007-05-28,2007-05-27,2007-05-26,0,0,0,0
4,2007-05-29,0,1,41.891118,-87.654491,1,0,2007-05-28,2007-05-27,2007-05-26,0,0,0,0


In [28]:
# calculating spray power for the day of the sample collection
for i in range(train_lite.shape[0]):
    day = train_lite.loc[i, 'Date']
    lat = train_lite.loc[i, 'Latitude']
    long = train_lite.loc[i, 'Longitude']
    
    # create a df for nearby sprays on the same day
    near_sprays_today = spray[
        (spray['Date'] == day)
        # shortcut to reduce comparisons: 0.005 degrees is >= 415 m in Chicago
        & (abs(spray['Latitude'] - lat) < 0.005) 
        & (abs(spray['Longitude'] - long) < 0.005)
    ]
    spray_power = 0
    for j in range(near_sprays_today.shape[0]):
        distance = geodesic(
            (near_sprays_today['Latitude'].iloc[j], near_sprays_today['Longitude'].iloc[j]), 
            (lat, long)
        ).m
        if distance <= 400:
            spray_power += 1/(distance ** 2)
    train_lite.loc[i, 'SprayPowerDay0'] = spray_power

In [29]:
# calculating spray power for the day before the sample collection
for i in range(train_lite.shape[0]):
    day = train_lite.loc[i, 'Date-1']
    lat = train_lite.loc[i, 'Latitude']
    long = train_lite.loc[i, 'Longitude']
    
    # create a df for nearby sprays on the same day
    near_sprays_today = spray[
        (spray['Date'] == day)
        # shortcut to reduce comparisons: 0.005 degrees is >= 415 m in Chicago
        & (abs(spray['Latitude'] - lat) < 0.005) 
        & (abs(spray['Longitude'] - long) < 0.005)
    ]
    spray_power = 0
    for j in range(near_sprays_today.shape[0]):
        distance = geodesic(
            (near_sprays_today['Latitude'].iloc[j], near_sprays_today['Longitude'].iloc[j]), 
            (lat, long)
        ).m
        if distance <= 400:
            spray_power += 1/(distance ** 2)
    train_lite.loc[i, 'SprayPowerDay1'] = spray_power

In [30]:
# calculating spray power for two days before the sample collection
for i in range(train_lite.shape[0]):
    day = train_lite.loc[i, 'Date-2']
    lat = train_lite.loc[i, 'Latitude']
    long = train_lite.loc[i, 'Longitude']
    
    # create a df for nearby sprays on the same day
    near_sprays_today = spray[
        (spray['Date'] == day)
        # shortcut to reduce comparisons: 0.005 degrees is >= 415 m in Chicago
        & (abs(spray['Latitude'] - lat) < 0.005) 
        & (abs(spray['Longitude'] - long) < 0.005)
    ]
    spray_power = 0
    for j in range(near_sprays_today.shape[0]):
        distance = geodesic(
            (near_sprays_today['Latitude'].iloc[j], near_sprays_today['Longitude'].iloc[j]), 
            (lat, long)
        ).m
        if distance <= 400:
            spray_power += 1/(distance ** 2)
    train_lite.loc[i, 'SprayPowerDay2'] = spray_power

In [31]:
# calculating spray power for three days before the sample collection
for i in range(train_lite.shape[0]):
    day = train_lite.loc[i, 'Date-3']
    lat = train_lite.loc[i, 'Latitude']
    long = train_lite.loc[i, 'Longitude']
    
    # create a df for nearby sprays on the same day
    near_sprays_today = spray[
        (spray['Date'] == day)
        # shortcut to reduce comparisons: 0.005 degrees is >= 415 m in Chicago
        & (abs(spray['Latitude'] - lat) < 0.005) 
        & (abs(spray['Longitude'] - long) < 0.005)
    ]
    spray_power = 0
    for j in range(near_sprays_today.shape[0]):
        distance = geodesic(
            (near_sprays_today['Latitude'].iloc[j], near_sprays_today['Longitude'].iloc[j]), 
            (lat, long)
        ).m
        if distance <= 400:
            spray_power += 1/(distance ** 2)
    train_lite.loc[i, 'SprayPowerDay3'] = spray_power

In [32]:
train_lite.head()

Unnamed: 0,Date,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,NumMosquitos,WnvPresent,Date-1,Date-2,Date-3,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3
0,2007-05-29,1,1,41.95469,-87.800991,2,0,2007-05-28,2007-05-27,2007-05-26,0.0,0.0,0.0,0
1,2007-05-29,0,1,41.994991,-87.769279,1,0,2007-05-28,2007-05-27,2007-05-26,0.0,0.0,0.0,0
2,2007-05-29,1,1,41.974089,-87.824812,5,0,2007-05-28,2007-05-27,2007-05-26,0.0,0.0,0.0,0
3,2007-05-29,0,1,41.9216,-87.666455,2,0,2007-05-28,2007-05-27,2007-05-26,0.0,0.0,0.0,0
4,2007-05-29,0,1,41.891118,-87.654491,1,0,2007-05-28,2007-05-27,2007-05-26,0.0,0.0,0.0,0


In [33]:
train_lite = train_lite[['Date', 'CULEX PIPIENS', 'CULEX RESTUANS', 'Latitude', 'Longitude',
                         'SprayPowerDay0', 'SprayPowerDay1', 'SprayPowerDay2', 'SprayPowerDay3',
                         'NumMosquitos', 'WnvPresent']]

In [34]:
train_lite.head()

Unnamed: 0,Date,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3,NumMosquitos,WnvPresent
0,2007-05-29,1,1,41.95469,-87.800991,0.0,0.0,0.0,0,2,0
1,2007-05-29,0,1,41.994991,-87.769279,0.0,0.0,0.0,0,1,0
2,2007-05-29,1,1,41.974089,-87.824812,0.0,0.0,0.0,0,5,0
3,2007-05-29,0,1,41.9216,-87.666455,0.0,0.0,0.0,0,2,0
4,2007-05-29,0,1,41.891118,-87.654491,0.0,0.0,0.0,0,1,0


### Weather transform

In [35]:
weather['Date'] = pd.to_datetime(weather['Date'])

In [36]:
weather_lite = pd.DataFrame([], columns=weather.columns)

for i in range(int(weather.shape[0]/2)):
    row = []
    for col in weather_lite.columns:
        if col in ['Station', 'Date', 'Sunrise', 'Sunset']:
            row.append(weather[col].loc[(2*i)])
        else:
            try:
                val = (float(weather[col].loc[(2*i)]) + float(weather[col].loc[(2*i+1)]))/2
            except ValueError:
                try:
                    val = float(weather[col].loc[(2*i)])
                except ValueError:
                    try:
                        val = float(weather[col].loc[(2*i+1)])
                    except ValueError:
                        val = str(weather[col].loc[(2*i)]) + ' ' + weather[col].loc[(2*i+1)]
            row.append(val)
    weather_lite.loc[i] = row

In [37]:
def string_to_hours(string):
    return int(string[0:2]) + int(string[3:4])/60

weather_lite['SunshineHours'] = weather_lite['Sunset'].map(string_to_hours) - weather_lite['Sunrise'].map(string_to_hours)

In [38]:
weather_lite = weather_lite[['Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 
                             'Heat', 'Cool', 'SunshineHours', 'Depth', 'SeaLevel', 'AvgSpeed']]

In [39]:
weather_lite.dtypes

Date             datetime64[ns]
Tmax                    float64
Tmin                    float64
Tavg                    float64
Depart                  float64
DewPoint                float64
WetBulb                 float64
Heat                    float64
Cool                    float64
SunshineHours           float64
Depth                   float64
SeaLevel                float64
AvgSpeed                float64
dtype: object

### Train and weather merge

In [40]:
train_weather = pd.merge(train_lite, weather_lite, how='left', left_on='Date', right_on='Date')

In [41]:
with pd.option_context('display.max_columns', None):
    display(train_weather.head())

Unnamed: 0,Date,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3,NumMosquitos,WnvPresent,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,SunshineHours,Depth,SeaLevel,AvgSpeed
0,2007-05-29,1,1,41.95469,-87.800991,0.0,0.0,0.0,0,2,0,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,15.1,0.0,30.1,6.95
1,2007-05-29,0,1,41.994991,-87.769279,0.0,0.0,0.0,0,1,0,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,15.1,0.0,30.1,6.95
2,2007-05-29,1,1,41.974089,-87.824812,0.0,0.0,0.0,0,5,0,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,15.1,0.0,30.1,6.95
3,2007-05-29,0,1,41.9216,-87.666455,0.0,0.0,0.0,0,2,0,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,15.1,0.0,30.1,6.95
4,2007-05-29,0,1,41.891118,-87.654491,0.0,0.0,0.0,0,1,0,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,15.1,0.0,30.1,6.95


In [42]:
train_weather['Date'] = pd.to_datetime(train_weather['Date'], utc=True)

In [43]:
train_weather.to_csv('./../input/transformed_input/train_weather.csv')

### Test transform

In [44]:
test_lite = test[['Date', 'Species', 'Latitude', 'Longitude']]

test_lite['CULEX PIPIENS'] = test_lite['Species'].map(
    lambda species: 1 if species in ['CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS'] else 0)

test_lite['CULEX RESTUANS'] = test_lite['Species'].map(
    lambda species: 1 if species in ['CULEX RESTUANS', 'CULEX PIPIENS/RESTUANS'] else 0)

test_lite = test_lite[['Date', 'CULEX PIPIENS', 'CULEX RESTUANS', 'Latitude', 'Longitude']]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [45]:
test_lite['Date'] = pd.to_datetime(test_lite['Date'])

In [46]:
test_lite['Date-1'] = pd.DatetimeIndex(test_lite['Date']) - pd.DateOffset(1)
test_lite['Date-2'] = pd.DatetimeIndex(test_lite['Date']) - pd.DateOffset(2)
test_lite['Date-3'] = pd.DatetimeIndex(test_lite['Date']) - pd.DateOffset(3)

In [47]:
test_lite['SprayPowerDay0'] = 0 # sprays on the day of sample collection
test_lite['SprayPowerDay1'] = 0 # sprays on the day before sample collection
test_lite['SprayPowerDay2'] = 0 # sprays two days before sample collection
test_lite['SprayPowerDay3'] = 0 # sprays three days before sample collection

Since there is no spray data for the years in our test data, we do not need to calculate the `SprayPower` features, since they are all zero.

In [48]:
test_lite = test_lite[['Date', 'CULEX PIPIENS', 'CULEX RESTUANS', 'Latitude', 'Longitude',
                       'SprayPowerDay0', 'SprayPowerDay1', 'SprayPowerDay2', 'SprayPowerDay3']]

In [49]:
test_weather = pd.merge(test_lite, weather_lite, how='left', left_on='Date', right_on='Date')

In [50]:
with pd.option_context('display.max_columns', None):
    display(test_weather.head())

Unnamed: 0,Date,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,SunshineHours,Depth,SeaLevel,AvgSpeed
0,2008-06-11,1,1,41.95469,-87.800991,0,0,0,0,86.0,63.5,75.0,7.0,55.5,64.0,0.0,10.0,15.0,0.0,29.98,10.2
1,2008-06-11,0,1,41.95469,-87.800991,0,0,0,0,86.0,63.5,75.0,7.0,55.5,64.0,0.0,10.0,15.0,0.0,29.98,10.2
2,2008-06-11,1,0,41.95469,-87.800991,0,0,0,0,86.0,63.5,75.0,7.0,55.5,64.0,0.0,10.0,15.0,0.0,29.98,10.2
3,2008-06-11,0,0,41.95469,-87.800991,0,0,0,0,86.0,63.5,75.0,7.0,55.5,64.0,0.0,10.0,15.0,0.0,29.98,10.2
4,2008-06-11,0,0,41.95469,-87.800991,0,0,0,0,86.0,63.5,75.0,7.0,55.5,64.0,0.0,10.0,15.0,0.0,29.98,10.2


In [51]:
test_weather['Date'] = pd.to_datetime(test_weather['Date'], utc=True)

In [52]:
test_weather.to_csv('./../input/transformed_input/test_weather.csv')

# Train test split

In [53]:
from sklearn.model_selection import train_test_split

We remove `NumMosquitos`, since it is not available in the test data.

In [54]:
X = train_weather.drop(columns='WnvPresent')
y = train_weather['WnvPresent']

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=437708)

In [56]:
X_train.head()

Unnamed: 0,Date,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3,NumMosquitos,...,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,SunshineHours,Depth,SeaLevel,AvgSpeed
3236,2011-09-12 00:00:00+00:00,1,1,41.763733,-87.742302,0.0,0.0,0.0,0,5,...,75.0,10.0,59.0,65.0,0.0,10.0,13.016667,0.0,29.905,10.1
2732,2011-07-15 00:00:00+00:00,0,1,41.986921,-87.689778,0.0,0.0,0.0,0,1,...,79.0,5.0,60.0,66.5,0.0,14.0,14.95,0.0,29.97,6.75
444,2007-08-01 00:00:00+00:00,1,1,41.74785,-87.702716,0.0,0.0,0.0,0,7,...,80.5,8.0,62.5,69.5,0.0,15.5,14.933333,0.0,29.99,3.9
448,2007-08-01 00:00:00+00:00,1,1,41.778748,-87.586427,0.0,0.0,0.0,0,12,...,80.5,8.0,62.5,69.5,0.0,15.5,14.933333,0.0,29.99,3.9
2603,2011-06-24 00:00:00+00:00,1,1,41.974689,-87.890615,0.0,0.0,0.0,0,130,...,64.0,-7.0,55.5,59.0,1.0,0.0,14.9,0.0,29.76,9.15


In [57]:
y_train_mosq = X_train['NumMosquitos']
y_test_mosq = X_test['NumMosquitos']
X_train.drop(columns='NumMosquitos', inplace=True)
X_test.drop(columns='NumMosquitos', inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



## Logistic regression

In [58]:
from sklearn.linear_model import LogisticRegression

In [59]:
logr = LogisticRegression()

In [60]:
X_train_dates = X_train.copy()
X_train_dates.loc[:, 'Year'] = X_train_dates['Date'].dt.year
X_train_dates.loc[:, 'Month'] = X_train_dates['Date'].dt.month
X_train_dates.loc[:, 'Day'] = X_train_dates['Date'].dt.day
X_train_dates.drop(columns='Date', inplace=True)

In [61]:
from sklearn.preprocessing import StandardScaler

In [62]:
ss = StandardScaler()

In [63]:
ss.fit(X_train_dates)

X_train_dates = pd.DataFrame(ss.transform(X_train_dates), columns=X_train_dates.columns, index=X_train_dates.index)

In [64]:
logr.fit(X_train_dates, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [65]:
logr.score(X_train_dates, y_train)

0.91709994222992486

In [66]:
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_train, logr.predict_proba(X_train_dates)[:,1], pos_label=1)
auc(fpr, tpr)

0.82355340311909409

In [67]:
X_test_dates = X_test.copy()
X_test_dates.loc[:, 'Year'] = X_test_dates['Date'].dt.year
X_test_dates.loc[:, 'Month'] = X_test_dates['Date'].dt.month
X_test_dates.loc[:, 'Day'] = X_test_dates['Date'].dt.day
X_test_dates.drop(columns='Date', inplace=True)

In [68]:
X_test_dates = pd.DataFrame(ss.transform(X_test_dates), columns=X_test_dates.columns, index=X_test_dates.index)

In [69]:
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, logr.predict_proba(X_test_dates)[:,1], pos_label=1)
auc(fpr, tpr)

0.78795486767485823

In [70]:
logr.score(X_test_dates, y_test)

0.91507798960138653

In [71]:
list(zip(X_test_dates.columns, logr.coef_[0]))

[('CULEX PIPIENS', 0.93253698783781591),
 ('CULEX RESTUANS', 0.18163493280941945),
 ('Latitude', -0.12736645590731782),
 ('Longitude', -0.49519244779323723),
 ('SprayPowerDay0', 0.020864925792770977),
 ('SprayPowerDay1', 0.12895426508987423),
 ('SprayPowerDay2', 0.0077373644470478172),
 ('SprayPowerDay3', 0.0),
 ('Tmax', 0.72916134454385417),
 ('Tmin', 0.79949239169691944),
 ('Tavg', 0.6150529050025495),
 ('Depart', -2.3398271561805313),
 ('DewPoint', 0.40248831420303521),
 ('WetBulb', -0.18453855556380724),
 ('Heat', -0.26080655215697768),
 ('Cool', 0.69207115401920982),
 ('SunshineHours', -0.52763700913251232),
 ('Depth', 0.0),
 ('SeaLevel', 0.042882562685262526),
 ('AvgSpeed', 0.014169289725779178),
 ('Year', 0.31458288137789758),
 ('Month', 1.8829550419361292),
 ('Day', 0.2059457739382905)]

In [72]:
test_weather_dates = test_weather.copy()
test_weather_dates.loc[:, 'Year'] = test_weather_dates['Date'].dt.year
test_weather_dates.loc[:, 'Month'] = test_weather_dates['Date'].dt.month
test_weather_dates.loc[:, 'Day'] = test_weather_dates['Date'].dt.day
test_weather_dates.drop(columns='Date', inplace=True)

In [73]:
test_weather_dates = pd.DataFrame(ss.transform(test_weather_dates),
                                  columns=test_weather_dates.columns,
                                  index=test_weather_dates.index)

In [74]:
test_weather_dates.head()

Unnamed: 0,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3,Tmax,Tmin,...,WetBulb,Heat,Cool,SunshineHours,Depth,SeaLevel,AvgSpeed,Year,Month,Day
0,0.327309,0.26239,1.010615,-1.311931,-0.020522,-0.016998,-0.02392,0.0,0.582272,0.106251,...,-0.009875,-0.371448,0.317182,0.884687,0.0,0.138948,1.046489,-0.770101,-1.479914,-0.549415
1,-3.055213,0.26239,1.010615,-1.311931,-0.020522,-0.016998,-0.02392,0.0,0.582272,0.106251,...,-0.009875,-0.371448,0.317182,0.884687,0.0,0.138948,1.046489,-0.770101,-1.479914,-0.549415
2,0.327309,-3.811124,1.010615,-1.311931,-0.020522,-0.016998,-0.02392,0.0,0.582272,0.106251,...,-0.009875,-0.371448,0.317182,0.884687,0.0,0.138948,1.046489,-0.770101,-1.479914,-0.549415
3,-3.055213,-3.811124,1.010615,-1.311931,-0.020522,-0.016998,-0.02392,0.0,0.582272,0.106251,...,-0.009875,-0.371448,0.317182,0.884687,0.0,0.138948,1.046489,-0.770101,-1.479914,-0.549415
4,-3.055213,-3.811124,1.010615,-1.311931,-0.020522,-0.016998,-0.02392,0.0,0.582272,0.106251,...,-0.009875,-0.371448,0.317182,0.884687,0.0,0.138948,1.046489,-0.770101,-1.479914,-0.549415


In [75]:
predictions = pd.DataFrame(logr.predict_proba(test_weather_dates)[:,1], 
                           index=test_weather_dates.index,
                           columns=['WnvPresent'])

In [76]:
predictions['Id'] = test['Id']
predictions = predictions[['Id', 'WnvPresent']]
predictions.head()

Unnamed: 0,Id,WnvPresent
0,1,0.000859
1,2,3.7e-05
2,3,0.00041
3,4,1.8e-05
4,5,1.8e-05


In [77]:
predictions.to_csv('./../submissions/submission01-martim.csv', index=False)

## Looking only at the data from 2013

In [78]:
X_train_2013 = X_train.copy()
X_train_2013.loc[:, 'Year'] = X_train_2013['Date'].dt.year
X_train_2013.loc[:, 'Month'] = X_train_2013['Date'].dt.month
X_train_2013.loc[:, 'Day'] = X_train_2013['Date'].dt.day
X_train_2013.drop(columns='Date', inplace=True)
X_train_2013 = X_train_2013[X_train_2013['Year'] == 2013]
X_train_2013.drop(columns='Year', inplace=True)
X_train_2013.head()

Unnamed: 0,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3,Tmax,Tmin,...,DewPoint,WetBulb,Heat,Cool,SunshineHours,Depth,SeaLevel,AvgSpeed,Month,Day
3822,1,1,41.732984,-87.649642,0.0,0.0,0.0,0,82.5,59.0,...,53.5,61.5,0.0,6.0,15.016667,0.0,30.095,5.1,7,12
3872,0,1,41.778748,-87.586427,0.0,0.0,0.0,0,96.5,73.0,...,69.5,75.0,0.0,20.0,15.0,0.0,29.825,13.95,7,19
3991,1,1,41.991429,-87.747113,0.0,0.0,0.0,0,81.5,61.0,...,55.5,62.0,0.0,6.5,14.933333,0.0,29.95,7.45,8,1
4276,1,1,41.991429,-87.747113,0.0,0.0,0.0,0,88.0,67.5,...,64.5,69.0,0.0,13.0,12.966667,0.0,30.005,4.65,8,29
4206,1,1,41.960616,-87.777189,0.0,0.0,0.0,0,78.0,67.5,...,66.5,68.5,0.0,8.0,12.95,0.0,30.05,7.0,8,22


In [79]:
ss = StandardScaler()

In [80]:
ss.fit(X_train_2013)

X_train_2013 = pd.DataFrame(ss.transform(X_train_2013), columns=X_train_2013.columns, index=X_train_2013.index)
X_train_2013.head()

Unnamed: 0,CULEX PIPIENS,CULEX RESTUANS,Latitude,Longitude,SprayPowerDay0,SprayPowerDay1,SprayPowerDay2,SprayPowerDay3,Tmax,Tmin,...,DewPoint,WetBulb,Heat,Cool,SunshineHours,Depth,SeaLevel,AvgSpeed,Month,Day
3822,0.355353,0.291147,-1.071357,0.539837,-0.04066,-0.033672,-0.047403,0.0,0.092614,-0.636232,...,-0.734568,-0.469014,-0.278121,-0.362066,0.842975,0.0,1.001553,-1.007052,-0.546272,-0.507802
3872,-2.814106,0.291147,-0.636974,1.311686,-0.04066,-0.033672,-0.047403,0.0,2.3353,1.624178,...,1.469293,1.851818,-0.278121,2.27234,0.828248,0.0,-1.450997,2.328044,-0.546272,0.355671
3991,0.355353,0.291147,1.381754,-0.650273,-0.04066,-0.033672,-0.047403,0.0,-0.067578,-0.313316,...,-0.459086,-0.383057,-0.278121,-0.26798,0.769337,0.0,-0.315557,-0.121462,0.388531,-1.864687
4276,0.355353,0.291147,1.381754,-0.650273,-0.04066,-0.033672,-0.047403,0.0,0.973669,0.73616,...,0.780586,0.820337,-0.278121,0.955137,-0.968518,0.0,0.184036,-1.176633,0.388531,1.589203
4206,0.355353,0.291147,1.089283,-1.017498,-0.04066,-0.033672,-0.047403,0.0,-0.62825,0.73616,...,1.056069,0.73438,-0.278121,0.014278,-0.983246,0.0,0.592795,-0.291043,0.388531,0.72573


In [81]:
y_2013 = y_train[y_train.index.isin(X_train_2013.index)]
y_2013.head()

3822    1
3872    0
3991    0
4276    0
4206    0
Name: WnvPresent, dtype: int64

In [82]:
logr.fit(X_train_2013, y_2013)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [83]:
logr.score(X_train_2013, y_2013)

0.86749716874292182

In [84]:
list(zip(X_train_2013.columns, logr.coef_[0]))

[('CULEX PIPIENS', 0.84422822385845664),
 ('CULEX RESTUANS', 0.21483555837717655),
 ('Latitude', 0.067574484098311599),
 ('Longitude', -0.45038538301968589),
 ('SprayPowerDay0', 0.020345870749228796),
 ('SprayPowerDay1', 0.18522783249282734),
 ('SprayPowerDay2', 0.010571623666036596),
 ('SprayPowerDay3', 0.0),
 ('Tmax', 0.37987271028591019),
 ('Tmin', 0.90637559952430613),
 ('Tavg', 0.65066867354353219),
 ('Depart', -1.5915797266177341),
 ('DewPoint', -0.67163814331591654),
 ('WetBulb', -0.063107386883475725),
 ('Heat', -0.14319819800994787),
 ('Cool', 0.68179712118447755),
 ('SunshineHours', -0.85611574181334038),
 ('Depth', 0.0),
 ('SeaLevel', 0.34627478464563083),
 ('AvgSpeed', 0.15192051709823592),
 ('Month', 1.0991079550007625),
 ('Day', -0.045842535866731343)]

## Linear regression on the number of mosquitoes

In [85]:
from sklearn.linear_model import LinearRegression

In [86]:
lm = LinearRegression()

In [87]:
lm.fit(X_train_dates, y_train_mosq)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [88]:
lm.score(X_train_dates, y_train_mosq)

0.027265523368415145

In [89]:
lm.score(X_test_dates, y_test_mosq)

0.021577346079638238

In [90]:
list(zip(X_test_dates.columns, logr.coef_[0]))

[('CULEX PIPIENS', 0.84422822385845664),
 ('CULEX RESTUANS', 0.21483555837717655),
 ('Latitude', 0.067574484098311599),
 ('Longitude', -0.45038538301968589),
 ('SprayPowerDay0', 0.020345870749228796),
 ('SprayPowerDay1', 0.18522783249282734),
 ('SprayPowerDay2', 0.010571623666036596),
 ('SprayPowerDay3', 0.0),
 ('Tmax', 0.37987271028591019),
 ('Tmin', 0.90637559952430613),
 ('Tavg', 0.65066867354353219),
 ('Depart', -1.5915797266177341),
 ('DewPoint', -0.67163814331591654),
 ('WetBulb', -0.063107386883475725),
 ('Heat', -0.14319819800994787),
 ('Cool', 0.68179712118447755),
 ('SunshineHours', -0.85611574181334038),
 ('Depth', 0.0),
 ('SeaLevel', 0.34627478464563083),
 ('AvgSpeed', 0.15192051709823592),
 ('Year', 1.0991079550007625),
 ('Month', -0.045842535866731343)]

## Mosquito population in 2013

In [91]:
y_2013_mosq = y_train_mosq[y_train_mosq.index.isin(X_train_2013.index)]

In [92]:
lm.fit(X_train_2013, y_2013_mosq)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [93]:
lm.score(X_train_2013, y_2013_mosq)

0.13669489372534493

In [94]:
list(zip(X_train_2013.columns, logr.coef_[0]))

[('CULEX PIPIENS', 0.84422822385845664),
 ('CULEX RESTUANS', 0.21483555837717655),
 ('Latitude', 0.067574484098311599),
 ('Longitude', -0.45038538301968589),
 ('SprayPowerDay0', 0.020345870749228796),
 ('SprayPowerDay1', 0.18522783249282734),
 ('SprayPowerDay2', 0.010571623666036596),
 ('SprayPowerDay3', 0.0),
 ('Tmax', 0.37987271028591019),
 ('Tmin', 0.90637559952430613),
 ('Tavg', 0.65066867354353219),
 ('Depart', -1.5915797266177341),
 ('DewPoint', -0.67163814331591654),
 ('WetBulb', -0.063107386883475725),
 ('Heat', -0.14319819800994787),
 ('Cool', 0.68179712118447755),
 ('SunshineHours', -0.85611574181334038),
 ('Depth', 0.0),
 ('SeaLevel', 0.34627478464563083),
 ('AvgSpeed', 0.15192051709823592),
 ('Month', 1.0991079550007625),
 ('Day', -0.045842535866731343)]

### Gradient boosting classifier

In [95]:
from sklearn.ensemble import GradientBoostingClassifier

In [96]:
gbc = GradientBoostingClassifier()

In [97]:
gbc.fit(X_train_dates, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [98]:
gbc.score(X_train_dates, y_train)

0.93240901213171579

In [99]:
gbc.predict(X_train_dates)

array([0, 0, 0, ..., 0, 0, 0])

In [100]:
gbc.score(X_test_dates, y_test)

0.91854419410745236

### Random forest classifier

In [101]:
from sklearn.ensemble import RandomForestClassifier

In [102]:
rfc = RandomForestClassifier(random_state=437708)

In [103]:
rfc.fit(X_train_dates, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=437708, verbose=0,
            warm_start=False)

In [104]:
rfc.score(X_train_dates, y_train)

0.98671288272674751

In [105]:
rfc.score(X_test_dates, y_test)

0.90294627383015602

In [106]:
rfc_predictions = pd.DataFrame(rfc.predict_proba(test_weather_dates)[:,1],
                               index=test_weather_dates.index,
                               columns=['WnvPresent'])

In [107]:
rfc_predictions['Id'] = test['Id']
rfc_predictions = rfc_predictions[['Id', 'WnvPresent']]
rfc_predictions.head()

Unnamed: 0,Id,WnvPresent
0,1,0.1
1,2,0.1
2,3,0.1
3,4,0.1
4,5,0.1


In [108]:
rfc_predictions.to_csv('./../submissions/submission02-martim.csv', index=False)

## Bunch of 💩 below here

In [109]:
rferferferge

NameError: name 'rferferferge' is not defined

In [None]:
test_lite.head()

### Transforming `Date` to `datetime` objects with time

In [None]:
train['Date'] = pd.to_datetime(train['Date'], utc=True)

In [None]:
spray['Date'] = pd.to_datetime(spray['Date'] + ' ' + spray['Time'], utc=True)

In [None]:
weather['Date'] = pd.to_datetime(weather['Date'], utc=True)

In [None]:
weather.head()

In [None]:
def weather_lat_long_elev(station):
    if station == 1:
        return (41.995, -87.933, 662)
    elif station == 2:
        return (41.786, -87.752, 612)
    
def weather_lat(station):
    if station == 1:
        return 41.995
    elif station == 2:
        return 41.786
    
def weather_long(station):
    if station == 1:
        return -87.933
    elif station == 2:
        return -87.752
    
def weather_elev(station):
    if station == 1:
        return 662
    elif station == 2:
        return 612

In [None]:
weather['Latitude'] = weather['Station'].apply(weather_lat)
weather['Longitude'] = weather['Station'].apply(weather_long)

In [None]:
weather.head()

In [None]:
spray.head()

In [None]:
train.head()

In [None]:
weather.head()

### Train lite

In [None]:
train_lite = train[['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 
                    'AddressAccuracy', 'NumMosquitos', 'WnvPresent']]

In [None]:
train_lite['Species'].value_counts()

Mosquito trap observations are capped at 50 mosquitoes. We need to find multiple trap observations for the same date and group them together. When aggregating species, we'll prioritize `CULEX PIPIENS/RESTUANS`, then `CULEX PIPIENS`, then `CULEX RESTUANS`, and finally other species.

In [None]:
def consolidate_species(series):
    elements = set(series)
    if 'CULEX PIPIENS/RESTUANS' in elements:
        return 'CULEX PIPIENS/RESTUANS'
    elif ('CULEX PIPIENS' in elements) and ('CULEX RESTUANS' in elements):
        return 'CULEX PIPIENS/RESTUANS'
    elif 'CULEX PIPIENS' in elements:
        return 'CULEX PIPIENS'
    elif 'CULEX RESTUANS' in elements:
        return 'CULEX RESTUANS'
    else:
        return 'OTHER'

In [None]:
train_lite = train_lite.groupby(['Date', 'Trap']).agg(
    {
        'Species': consolidate_species,
        'Latitude': np.mean, 
        'Longitude': np.mean,
        'AddressAccuracy': np.mean,
        'NumMosquitos': np.sum, 
        'WnvPresent': np.max,
        'NumMosquitos': np.sum
    }).reset_index()

In [None]:
train_lite.head()

In [None]:
train.shape

In [None]:
train_lite.shape

By merging traps for the same date, we've been able to significantly reduce the number of "superfluous" observations, as well as slightly reduce class imbalance.

In [None]:
train['WnvPresent'].value_counts(normalize=True)

In [None]:
train_lite['WnvPresent'].value_counts(normalize=True)

We'll also remove `OTHER` species when creating dummy variables.

In [None]:
train_lite['Species'].value_counts()

In [None]:
train_lite['CULEX PIPIENS'] = train_lite['Species'].map(
    lambda species: 1 if species in ['CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS'] else 0)

train_lite['CULEX RESTUANS'] = train_lite['Species'].map(
    lambda species: 1 if species in ['CULEX RESTUANS', 'CULEX PIPIENS/RESTUANS'] else 0)

In [None]:
train_lite.head()

Finally, we drop redundant features, as well as remove `NumMosquitos`, since it is not available in the test data.

In [None]:
train_lite = train_lite[['Date', 'CULEX PIPIENS', 'CULEX RESTUANS', 'Latitude', 'Longitude', 'WnvPresent']]

In [None]:
train_lite.head()

### Adding spray effects

Spraying effectiveness decreases with the inverse square law. We will sum the effects of sprays which are up to 400 meters away, and will only consider a spray's effect for up to 3 days.

In [None]:
spray['Date'] = pd.to_datetime(spray['Date'])
train_lite['Date'] = pd.to_datetime(train_lite['Date'])

In [None]:
train_lite['Date-1'] = pd.DatetimeIndex(train_lite['Date']) - pd.DateOffset(1)
train_lite['Date-2'] = pd.DatetimeIndex(train_lite['Date']) - pd.DateOffset(2)
train_lite['Date-3'] = pd.DatetimeIndex(train_lite['Date']) - pd.DateOffset(3)

In [None]:
train_lite.head()

In [None]:
from geopy.distance import geodesic

In [None]:
spray['Date'].value_counts()

In [None]:
geodesic((41.87, -87.62), (41.87, -87.625)).m

In [None]:
geopy.distance.geodesic((41.87, -87.62), (41.87, -87.625)).m

In [None]:
for 

In [None]:
spray[(spray['Date'] == '2013-08-15') 
      & (abs(spray['Latitude'] - 41.944718) < 0.005)
      & (abs(spray['Longitude'] - (-87.808187)) < 0.005)].shape

In [None]:
near_sprays_today = spray[
    (spray['Date'] == '2013-08-15')
    & (abs(spray['Latitude'] - 41.944718) < 0.005)
    & (abs(spray['Longitude'] - (-87.808187)) < 0.005)
]

In [None]:
train_lite['SprayPowerToday'] = 0

In [None]:
train_lite.loc['SprayPowerToday',0] = 3

In [None]:
train_lite.drop('SprayPowerToday', inplace=True)

In [None]:
train_lite.head()

In [None]:
for i in range(train_lite.head().shape[0]):
    day = train_lite['Date'].loc[i]
    lat = train_lite['Latitude'].loc[i]
    long = train_lite['Longitude'].loc[i]
    near_sprays_today = spray[
        (spray['Date'] == day)
        & (abs(spray['Latitude'] - lat) < 0.005)
        & (abs(spray['Longitude'] - long) < 0.005)
    ]
    spray_power = 0
    for j in range(near_sprays_today.shape[0]):
        distance = geodesic(
            (near_sprays_today['Latitude'].loc[j], near_sprays_today['Longitude'].loc[j]), 
            (lat, long)
        ).m
        if distance <= 400:
            spray_power += 1/(distance ** 2)
    train

In [None]:
train_lite['Date'] = pd.to_datetime(train_lite['Date'])

In [None]:
spray['Date'].dt.mktime()

In [None]:
dt.timedelta(spray['Date'].loc[0], 
             dt.date(2007,5,28))

In [None]:
spray[dt.timedelta(spray['Date'], dt.date(2007,5,28)) == 1]

In [None]:
spray[spray['Latitude'] - 41 > 0]

In [None]:
geopy.distance.geodesic((41.87, -87.62), (41.87, -87.63)).m

In [None]:
train_lite.head()

In [None]:
41.8781, -87.629841

In [None]:
spray.head()

In [None]:
import geopy.distance

coords_1 = (52.2296756, 21.0122287)
coords_2 = (52.406374, 16.9251681)

geopy.distance.vincenty(coords_1, coords_2).km

geopy.distance.geodesic(coords_1, coords_2).m

In [None]:
geopy.distance.geodesic((spray['Latitude'].loc[0], spray['Longitude'].loc[0]), (spray['Latitude'].loc[1], spray['Longitude'].loc[1])).m

In [None]:
spray['Date'].value_counts()

In [None]:
spray.head()

### Weather transform

In [None]:
weather_lite = pd.DataFrame([], columns=weather.columns)

for i in range(int(weather.shape[0]/2)):
    row = []
    for col in weather_lite.columns:
        if col in ['Station', 'Date', 'Sunrise', 'Sunset']:
            row.append(weather[col].loc[(2*i)])
        else:
            try:
                val = (float(weather[col].loc[(2*i)]) + float(weather[col].loc[(2*i+1)]))/2
            except ValueError:
                try:
                    val = float(weather[col].loc[(2*i)])
                except ValueError:
                    try:
                        val = float(weather[col].loc[(2*i+1)])
                    except ValueError:
                        val = str(weather[col].loc[(2*i)]) + ' ' + weather[col].loc[(2*i+1)]
            row.append(val)
    weather_lite.loc[i] = row

In [None]:
def string_to_hours(string):
    return int(string[0:2]) + int(string[3:4])/60

weather_lite['SunshineHours'] = weather_lite['Sunset'].map(string_to_hours) - weather_lite['Sunrise'].map(string_to_hours)

In [None]:
weather_lite = weather_lite[['Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 
                             'Heat', 'Cool', 'SunshineHours', 'Depth', 'SeaLevel', 'AvgSpeed']]

In [None]:
weather_lite.dtypes

### Train and weather merge

In [None]:
train_weather = pd.merge(train_lite, weather_lite, how='left', left_on='Date', right_on='Date')

In [None]:
train_weather.head()

In [None]:
train_weather.to_csv('./../input/transformed_input/train_weather.csv')

### Test transform

In [None]:
test_lite = test[['Date', 'Species', 'Latitude', 'Longitude']]

test_lite['CULEX PIPIENS'] = test_lite['Species'].map(
    lambda species: 1 if species in ['CULEX PIPIENS', 'CULEX PIPIENS/RESTUANS'] else 0)

test_lite['CULEX RESTUANS'] = test_lite['Species'].map(
    lambda species: 1 if species in ['CULEX RESTUANS', 'CULEX PIPIENS/RESTUANS'] else 0)

test_lite = test_lite[['Date', 'CULEX PIPIENS', 'CULEX RESTUANS', 'Latitude', 'Longitude']]

In [None]:
test_weather = pd.merge(test_lite, weather_lite, how='left', left_on='Date', right_on='Date')

In [None]:
test_weather.head()

In [None]:
train[['Trap', 'Latitude', 'Longitude']].groupby('Trap').agg(
    {'Latitude': pd.Series.nunique, 
     'Longitude': pd.Series.nunique}).sort_values('Latitude', ascending=False)

In [None]:
train_lite = train_lite[['Date', 'CULEX PIPIENS', 'CULEX RESTUANS', 'Latitude', 'Longitude',
                         'NumMosquitos', 'WnvPresent']]

In [None]:
train_lite.head()

In [None]:
with pd.option_context('display.max_columns', None):
    display(weather.head())

In [None]:
weather.iloc[0]

In [None]:
weather.shape

In [None]:
weather['Date'].loc[3]

In [None]:
weather_lite.loc[0]['Date']

In [None]:
val = int(weather['Depart'].loc[1])

In [None]:
try:
    val = (float(weather['Depart'].loc[1]) + float(weather['Depart'].loc[3]))/2
except ValueError:
    try:
        val = float(weather['Depart'].loc[1])
    except ValueError:
        try:
            val = float(weather['Depart'].loc[3])
        except ValueError:
            val = str(weather['Depart'].loc[1]) + ' ' + weather['Depart'].loc[3]
    
val

In [None]:
val

In [None]:
spray.head()

### Weather transform

In [None]:
weather_lite = pd.DataFrame([], columns=weather.columns)

for i in range(int(weather.shape[0]/2)):
    row = []
    for col in weather_lite.columns:
        if col in ['Station', 'Date', 'Sunrise', 'Sunset']:
            row.append(weather[col].loc[(2*i)])
        else:
            try:
                val = (float(weather[col].loc[(2*i)]) + float(weather[col].loc[(2*i+1)]))/2
            except ValueError:
                try:
                    val = float(weather[col].loc[(2*i)])
                except ValueError:
                    try:
                        val = float(weather[col].loc[(2*i+1)])
                    except ValueError:
                        val = str(weather[col].loc[(2*i)]) + ' ' + weather[col].loc[(2*i+1)]
            row.append(val)
    weather_lite.loc[i] = row

In [None]:
def string_to_hours(string):
    return int(string[0:2]) + int(string[3:4])/60

weather_lite['SunshineHours'] = weather_lite['Sunset'].map(string_to_hours) - weather_lite['Sunrise'].map(string_to_hours)

In [None]:
weather_lite = weather_lite[['Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 
                             'Heat', 'Cool', 'SunshineHours', 'Depth', 'SeaLevel', 'ResultSpeed',
                             'ResultDir', 'AvgSpeed', 'Latitude', 'Longitude']]

In [None]:
weather_lite.dtypes

In [None]:
train_lite.dtypes

In [None]:
train_lite['Species'].value_counts()

In [None]:
weather.head()

In [None]:
with pd.option_context('display.max_columns', None):
    display(weather_lite.head())

In [None]:
weather_lite.dtypes

In [None]:
pd.merge(train, weather, how='left', left_on='Date', right_on='Date')[['Station']]

In [None]:
train['AddressAccuracy'].value_counts()

In [None]:
spray.to_csv('./../input/transformed_input/spray_clean.csv')

In [None]:
train.to_csv('./../input/transformed_input/train_clean.csv')

In [None]:
weather.to_csv('./../input/transformed_input/weather_clean.csv')

In [None]:
train[train['WnvPresent'] == 1].shape

In [None]:
train.shape

In [None]:
from datetime import date
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def year_fraction(birthday):
    start = date(birthday.year, 1, 1).toordinal()
    year_length = date(birthday.year+1, 1, 1).toordinal() - start
    fractional_year = birthday.year + float(birthday.toordinal() - start) / year_length
    return fractional_year

data = [19900806, 19780517, 19900215, 19841102, 19800709, 19870421, 19780905, 19900111, 19950702, 19921009, 19930612, 19930131, 19870109]
birthdates = list()

for birthdate in data:
    datestr = str(birthdate)
    datestr = int(datestr[0:4]), int(datestr[4:6]), int(datestr[6:8])
    birthdates.append(date(datestr[0], datestr[1], datestr[2]))

birthdates = pd.Series(birthdates, name='birthdates')
birthdates = birthdates.apply(year_fraction)
sns.stripplot(x=birthdates);

In [None]:
train.shape[0]

In [None]:
plt.plot(range(1, train.shape[0]+1), pd.to_datetime(train['Date']), 'ro')

In [None]:
plt.figure(figsize=(15,8))
plt.plot(pd.to_datetime(train['Date']), [1] * train.shape[0], 'rx')
plt.plot(pd.to_datetime(train['Date']), train['WnvPresent'], 'bx');

In [None]:
plt.figure(figsize=(15,8))
plt.plot(pd.to_datetime(test['Date']), [1] * test.shape[0], 'rx');
plt.plot(pd.to_datetime(spray['Date']), [1] * spray.shape[0], 'bx');

In [None]:
train[train['WnvPresent'] == 1]['Date'].map(lambda date_str: int(date_str[5:7] + date_str[8:10])).min()

In [None]:
train['Date'] = pd.to_datetime(train['Date'])

In [None]:
train[(train['Date'].dt.month > 6) | ((train['Date'].dt.month == 6) & (train['Date'].dt.day > 27))].shape

In [None]:
weather.head()

In [None]:
weather['Sunrise'] + weather['Sunset']

In [None]:
weather[weather['Station'] == 1]['Sunset'].map(lambda string: int(string[0:2]) + int(string[3:4])/60) - weather[weather['Station'] == 1]['Sunrise'].map(lambda string: int(string[0:2]) + int(string[3:4])/60)

In [None]:
data

In [None]:
train_weather.head()

In [None]:
import folium
import folium.plugins as plugins
import numpy as np

np.random.seed(3141592)
initial_data = (
    np.random.normal(scale=0.1, size=(100, 2)) * np.array([[1, 1]]) +
    np.array([[41.8781, -87.629841]])
)

move_data = np.random.normal(size=(100, 2)) * 0.01

data = [(initial_data + move_data * i).tolist() for i in range(100)]

# weight = 0 # default value
# for time_entry in data:
#     for row in time_entry:
#         row.append(weight)

In [None]:
m = folium.Map([41.8781, -87.629841], tiles='cartodbdark_matter', zoom_start=9) # stamentoner

hm = plugins.HeatMapWithTime(data)

hm.add_to(m)

m

In [None]:
from folium import plugins

map_hooray = folium.Map(location=[41.8781, -87.629841],
                        tiles='cartodbdark_matter',
                        zoom_start = 10) 

# Ensure you're handing it floats
train_weather['Latitude'] = train_weather['Latitude'].astype(float)
train_weather['Longitude'] = train_weather['Longitude'].astype(float)

# Filter the DF for rows, then columns, then remove NaNs
# heat_df = df_acc[df_acc['Speed_limit']=='40'] # Reducing data size so it runs faster
heat_df = train_weather#[train_weather['Date'].dt.year==2013] # Reducing data size so it runs faster
heat_df = heat_df[['Latitude', 'Longitude']]

# Create weight column, using date
heat_df['Weight'] = train_weather['Date'].dt.day
heat_df['Weight'] = heat_df['Weight'].astype(float)
heat_df = heat_df.dropna(axis=0, subset=['Latitude','Longitude', 'Weight'])

# List comprehension to make out list of lists
heat_data = [[[row['Latitude'],row['Longitude']] for index, row in heat_df[heat_df['Weight'] == i].iterrows()] for i in range(0,13)]

# Plot it on the map
hm = plugins.HeatMapWithTime(heat_data,auto_play=True,max_opacity=0.8)
hm.add_to(map_hooray)
# Display the map
map_hooray

In [None]:
import os
import time
from selenium import webdriver

delay=5
fn='testmap.html'
tmpurl='file://{path}/{mapfile}'.format(path=os.getcwd(),mapfile=fn)
m.save(fn)

browser = webdriver.Chrome()
browser.get(tmpurl)
#Give the map tiles some time to load
time.sleep(delay)
browser.save_screenshot('map.png')
browser.quit()

In [None]:
"""
=========================
Simple animation examples
=========================

This example contains two animations. The first is a random walk plot. The
second is an image animation.
"""

from matplotlib import animation, rc
from IPython.display import HTML

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation


def update_line(num, data, line):
    line.set_data(data[..., :num])
    return line,

fig1 = plt.figure()

data = np.random.rand(2, 25)
l, = plt.plot([], [], 'r-')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('x')
plt.title('test')
line_ani = animation.FuncAnimation(fig1, update_line, 25, fargs=(data, l),
                                   interval=50, blit=True)

# HTML(line_ani.to_html5_video())

# To save the animation, use the command: line_ani.save('lines.mp4')

fig2 = plt.figure()

x = np.arange(-9, 10)
y = np.arange(-9, 10).reshape(-1, 1)
base = np.hypot(x, y)
ims = []
for add in np.arange(15):
    ims.append((plt.pcolor(x, y, base + add, norm=plt.Normalize(0, 30)),))

im_ani = animation.ArtistAnimation(fig2, ims, interval=50, repeat_delay=3000,
                                   blit=True)
# To save this second animation with some metadata, use the following command:
# im_ani.save('im.mp4', metadata={'artist':'Guido'})

# plt.show()
HTML(line_ani.to_html5_video())

In [None]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
# setup Lambert Conformal basemap.
# set resolution=None to skip processing of boundary datasets.
m = Basemap(width=12000000,height=9000000,projection='lcc',
            resolution=None,lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
m.shadedrelief()
plt.show()

In [None]:
HTML(im_ani.to_jshtml())

In [None]:
HTML(line_ani.to_html5_video())

In [None]:
from matplotlib import animation, rc
from IPython.display import HTML

In [None]:
fig, ax = plt.subplots()

ax.set_xlim(( 0, 2))
ax.set_ylim((-2, 2))

line, = ax.plot([], [], lw=2)

In [None]:
# initialization function: plot the background of each frame
def init():
    line.set_data([], [])
    return (line,)

In [None]:
def animate(i):
    x = np.linspace(0, 2, 1000)
    y = np.sin(2 * np.pi * (x - 0.01 * i))
    line.set_data(x, y)
    return (line,)

In [None]:
anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=100, interval=20, blit=True)

In [None]:
HTML(anim.to_html5_video())

In [None]:
anim.save('im.mp4', metadata={'artist':'Guido'})

In [None]:
((np.random.random(size=(2*19))-0.5)*20000).sum()/19

In [None]:
np.random.normal(loc=0, scale=20000, size=(19)).sum()/19

In [None]:
randos.sum()

In [None]:
weather[weather['Station'] == 1]['Sunrise'].map(lambda string: int(string[0:2]) + int(string[3:4])/60)# + weather[weather['Station'] == 1]['Sunset'].map(lambda string: int(string))

In [None]:
train.shape

In [None]:
sns.pointplot(pd.to_datetime(train['Date']))

In [None]:
iplot([{
    'x': spray.index,
    'y': spray[col],
    'name': col
}  for col in spray.columns])

In [None]:
import plotly.figure_factory as ff

fips = ['06021', '06023', '06027',
        '06029', '06033', '06059',
        '06047', '06049', '06051',
        '06055', '06061']
values = range(len(fips))

fig = ff.create_choropleth(fips=fips, values=values)
iplot(fig)#, filename='choropleth of some cali counties - full usa scope')

In [None]:
import plotly.figure_factory as ff

import numpy as np
import pandas as pd

df_sample = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/minoritymajority.csv')
df_sample_r = df_sample[df_sample['STNAME'] == 'Florida']

values = df_sample_r['TOT_POP'].tolist()
fips = df_sample_r['FIPS'].tolist()

endpts = list(np.mgrid[min(values):max(values):4j])
colorscale = ["#030512","#1d1d3b","#323268","#3d4b94","#3e6ab0",
              "#4989bc","#60a7c7","#85c5d3","#b7e0e4","#eafcfd"]
fig = ff.create_choropleth(
    fips=fips, values=values, scope=['Florida'], show_state_data=True,
    colorscale=colorscale, binning_endpoints=endpts, round_legend_values=True,
    plot_bgcolor='rgb(229,229,229)',
    paper_bgcolor='rgb(229,229,229)',
    legend_title='Population by County',
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.5},
    exponent_format=True,
)
iplot(fig)#, filename='choropleth_florida')

In [None]:
# import plotly.plotly as py
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_us_cities.csv')
df.head()

df['text'] = df['name'] + '<br>Population ' + (df['pop']/1e6).astype(str)+' million'
limits = [(0,2),(3,10),(11,20),(21,50),(50,3000)]
colors = ["rgb(0,116,217)","rgb(255,65,54)","rgb(133,20,75)","rgb(255,133,27)","lightgrey"]
cities = []
scale = 5000

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df[lim[0]:lim[1]]
    city = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['lon'],
        lat = df_sub['lat'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['pop']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    cities.append(city)

layout = dict(
        title = '2014 US city populations<br>(Click legend to toggle traces)',
        showlegend = True,
        geo = dict(
            scope='usa',
#             projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict( data=cities, layout=layout )
iplot( fig, validate=False)#, filename='d3-bubble-map-populations' )

In [None]:
df.head()

In [None]:
iplot(dict(data=cities))

In [None]:
"""
=========================
Simple animation examples
=========================

This example contains two animations. The first is a random walk plot. The
second is an image animation.
"""

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation


def update_line(num, data, line):
    line.set_data(data[..., :num])
    return line,

fig1 = plt.figure()

data = np.random.rand(2, 25)
l, = plt.plot([], [], 'r-')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('x')
plt.title('test')
line_ani = animation.FuncAnimation(fig1, update_line, 25, fargs=(data, l),
                                   interval=50, blit=True)

# To save the animation, use the command: line_ani.save('lines.mp4')

fig2 = plt.figure()

x = np.arange(-9, 10)
y = np.arange(-9, 10).reshape(-1, 1)
base = np.hypot(x, y)
ims = []
for add in np.arange(15):
    ims.append((plt.pcolor(x, y, base + add, norm=plt.Normalize(0, 30)),))

im_ani = animation.ArtistAnimation(fig2, ims, interval=50, repeat_delay=3000,
                                   blit=True)
# To save this second animation with some metadata, use the following command:
# im_ani.save('im.mp4', metadata={'artist':'Guido'})

plt.show()

In [None]:
cities[0]

https://plot.ly/~jackluo/2181/us-wind-turbine-dataset-animation-using/#/

In [None]:
iplot([{"x": [1, 2, 3], "y": [3, 1, 6]}])

In [None]:
weather[weather['Date'] == '2014-10-02']

In [None]:
test.tail()