# Data Merging

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn look for any plots drawn later: 'darkgrid' style, 'viridis' palette.
sns.set_style('darkgrid')
sns.set_palette('viridis')

In [3]:
# Load the three cleaned CSVs (paths are relative to the notebook's directory).
train, weather, spray = map(
    pd.read_csv,
    [
        '../data/train_cleaned.csv',
        '../data/weather_cleaned.csv',
        '../data/spray_cleaned.csv',
    ],
)

In [5]:
# Normalise the weather and spray column names to lower case.
weather.columns = weather.columns.str.lower()
spray.columns = spray.columns.str.lower()

In [6]:
# First `date` value (row label 0) from each dataset, to compare formats.
tuple(frame.at[0, 'date'] for frame in (train, weather, spray))

('2007-05-29', '2007-05-01', '2011-08-29')

All three datasets store dates as `YYYY-MM-DD` strings (checked on the first row of each), so we can merge on the `date` columns.

### Establishing `station` column in train dataset

We split Chicago along an east–west line to divide the city into north and south halves. Traps in the north half are tied to weather recorded at O'Hare Airport (`station` = 1), and traps in the south half are tied to weather recorded at Midway Airport (`station` = 2).

In [7]:
# Midpoint of the two reference latitudes
# (presumably the two weather stations' latitudes - TODO confirm the source of these values).
0.5 * (41.998 + 41.786)

41.891999999999996

In [8]:
# Latitude at or above the 41.892 split -> station 1, everything else -> station 2.
# A missing latitude compares False and therefore also lands in station 2.
train['station'] = np.where(train['latitude'].ge(41.892), 1, 2)

In [9]:
# How many train rows were assigned to each station
train['station'].value_counts()

2    6072
1    4434
Name: station, dtype: int64

In [10]:
# (rows, columns) of each input frame before merging
tuple(frame.shape for frame in (train, weather))

((10506, 13), (2918, 19))

In [11]:
# Attach to every trap record the weather row for the same date and station.
# This is an inner join: train rows whose (date, station) pair has no row in
# `weather` are dropped. `validate` raises pandas.errors.MergeError if `weather`
# ever holds more than one row per (date, station), which would otherwise
# silently duplicate train rows.
train_weather = pd.merge(
    train,
    weather,
    how='inner',
    on=['date', 'station'],
    validate='many_to_one',
)

# Surface the rows lost in the join instead of discarding them silently.
# With a validated many-to-one join, the length difference is exactly the
# number of unmatched train rows.
n_unmatched = len(train) - len(train_weather)
if n_unmatched:
    print('{} of {} train rows had no matching weather record and were dropped'
          .format(n_unmatched, len(train)))

In [12]:
# (rows, columns) of the merged frame
(len(train_weather.index), len(train_weather.columns))

(10440, 30)

In [13]:
# Column labels of the merged frame (DataFrame.keys() returns the columns Index)
train_weather.keys()

Index(['date', 'address', 'species', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent', 'station', 'tmax', 'tmin', 'tavg',
       'depart', 'dewpoint', 'wetbulb', 'heat', 'cool', 'sunrise', 'sunset',
       'codesum', 'preciptotal', 'stnpressure', 'sealevel', 'resultspeed',
       'resultdir', 'avgspeed'],
      dtype='object')

### Species

In [14]:
# Row count per species label
train_weather['species'].value_counts()

CULEX PIPIENS/RESTUANS    4723
CULEX RESTUANS            2713
CULEX PIPIENS             2691
CULEX OTHER                313
Name: species, dtype: int64

In [15]:
# Mean of `wnvpresent` for rows labelled CULEX PIPIENS/RESTUANS
train_weather.loc[train_weather['species'] == 'CULEX PIPIENS/RESTUANS', 'wnvpresent'].mean()

0.05547321617615922

In [16]:
# Mean of `wnvpresent` for rows labelled CULEX PIPIENS
train_weather.loc[train_weather['species'] == 'CULEX PIPIENS', 'wnvpresent'].mean()

0.08918617614269789

In [17]:
# Mean of `wnvpresent` for rows labelled CULEX RESTUANS
train_weather.loc[train_weather['species'] == 'CULEX RESTUANS', 'wnvpresent'].mean()

0.01806118687799484

In [18]:
# Mean of `wnvpresent` for rows labelled CULEX OTHER
train_weather.loc[train_weather['species'] == 'CULEX OTHER', 'wnvpresent'].mean()

0.0

Traps labelled *Culex pipiens* are the most likely to have WNV present (almost 9%), followed by the mixed pipiens/restuans group (~5.5%) and *C. restuans* (~2%); the other species had no positive records (0%).

### Latitude

In [20]:
# Total `wnvpresent` among rows at or above the median latitude
train_weather.loc[
    train_weather['latitude'] >= train_weather['latitude'].median(),
    'wnvpresent',
].sum()

305

In [21]:
# Total `wnvpresent` among rows below the median latitude
train_weather.loc[
    train_weather['latitude'] < train_weather['latitude'].median(),
    'wnvpresent',
].sum()

246

More cases were on the northern side of the city (305 at or above the median latitude vs. 246 below it).

### Longitude

In [22]:
# Total `wnvpresent` among rows at or above the median longitude
train_weather.loc[
    train_weather['longitude'] >= train_weather['longitude'].median(),
    'wnvpresent',
].sum()

223

In [23]:
# Total `wnvpresent` among rows below the median longitude
train_weather.loc[
    train_weather['longitude'] < train_weather['longitude'].median(),
    'wnvpresent',
].sum()

328

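More cases were on the western side of the city (328 below the median longitude vs. 223 at or above it). Chicago's longitudes are negative, so smaller values lie farther west.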

### Address Accuracy

In [51]:
# Row count per address-accuracy code
train_weather['addressaccuracy'].value_counts()

8    4591
9    3953
5    1806
3      90
Name: addressaccuracy, dtype: int64

In [50]:
# Mean of `wnvpresent` where addressaccuracy is 8
train_weather.loc[train_weather['addressaccuracy'] == 8, 'wnvpresent'].mean()

0.05249401001960357

In [52]:
# Mean of `wnvpresent` where addressaccuracy is 9
train_weather.loc[train_weather['addressaccuracy'] == 9, 'wnvpresent'].mean()

0.05438907159119656

In [53]:
# Mean of `wnvpresent` where addressaccuracy is 5
train_weather.loc[train_weather['addressaccuracy'] == 5, 'wnvpresent'].mean()

0.05204872646733112

In [54]:
# Mean of `wnvpresent` where addressaccuracy is 3
train_weather.loc[train_weather['addressaccuracy'] == 3, 'wnvpresent'].mean()

0.011111111111111112

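It looks like fewer of the low-accuracy sites (`addressaccuracy` = 3) reported West Nile Virus present. However, that group has only 90 records (about 1 positive), so the rate is not reliable. Could be a data collection issue?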

### Station

In [61]:
# Mean of `wnvpresent` for rows assigned to station 1
train_weather.loc[train_weather['station'] == 1, 'wnvpresent'].mean()

0.06542056074766354

In [62]:
# Mean of `wnvpresent` for rows assigned to station 2
train_weather.loc[train_weather['station'] == 2, 'wnvpresent'].mean()

0.043614736494300345

Weather columns available from the merge: `tmax`, `tmin`, `tavg`, `depart`, `dewpoint`, `wetbulb`, `heat`, `cool`, `sunrise`, `sunset`, `codesum`, `preciptotal`, `stnpressure`, `sealevel`, `resultspeed`, `resultdir`, `avgspeed`

### Temperature

In [27]:
# Mean of `wnvpresent` on rows whose tmax is at or above the mean tmax
train_weather.loc[
    train_weather['tmax'].ge(train_weather['tmax'].mean()),
    'wnvpresent',
].mean()

0.05108264714852089

In [28]:
# Mean of `wnvpresent` on rows whose tmax is below the mean tmax
train_weather.loc[
    train_weather['tmax'].lt(train_weather['tmax'].mean()),
    'wnvpresent',
].mean()

0.05564142194744977

In [29]:
# Mean of `wnvpresent` on rows whose tmin is at or above the mean tmin
train_weather.loc[
    train_weather['tmin'].ge(train_weather['tmin'].mean()),
    'wnvpresent',
].mean()

0.06573403000168548

In [31]:
# Mean of `wnvpresent` on rows whose tmin is below the mean tmin
train_weather.loc[
    train_weather['tmin'].lt(train_weather['tmin'].mean()),
    'wnvpresent',
].mean()

0.035722209895717776

In [32]:
# Mean of `wnvpresent` on rows whose tavg is at or above the mean tavg
train_weather.loc[
    train_weather['tavg'].ge(train_weather['tavg'].mean()),
    'wnvpresent',
].mean()

0.05837875857218217

In [33]:
# Mean of `wnvpresent` on rows whose tavg is below the mean tavg
train_weather.loc[
    train_weather['tavg'].lt(train_weather['tavg'].mean()),
    'wnvpresent',
].mean()

0.04607616242373238

### Dewpoint

In [36]:
# Mean of `wnvpresent` on rows whose dewpoint is at or above the mean dewpoint
train_weather.loc[
    train_weather['dewpoint'].ge(train_weather['dewpoint'].mean()),
    'wnvpresent',
].mean()

0.06333453626849513

In [37]:
# Mean of `wnvpresent` on rows whose dewpoint is below the mean dewpoint
train_weather.loc[
    train_weather['dewpoint'].lt(train_weather['dewpoint'].mean()),
    'wnvpresent',
].mean()

0.04083299305839118