In [1]:
import pandas as pd
import numpy as np

In [2]:
# The test set uses its first CSV column as the row index.
test = pd.read_csv('../data/test.csv', index_col=0)

# The weather and spray tables are read with pandas' default integer index.
weather, spray = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/weather_cleaned.csv', '../data/spray_cleaned.csv')
)

In [3]:
# Lower-case every column label; the later merge keys on the lower-case
# names 'date' and 'station'.
test.columns = test.columns.str.lower()
weather.columns = weather.columns.str.lower()

In [4]:
# Three species labels are kept unchanged; the four listed rarer labels, and any
# value not listed at all (including missing values), become 'CULEX OTHER'.
species_kept = ('CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX PIPIENS')
species_folded = ('CULEX TERRITANS', 'CULEX SALINARIUS',
                  'CULEX TARSALIS', 'CULEX ERRATICUS')

species_lookup = dict(zip(species_kept, species_kept))
species_lookup.update(dict.fromkeys(species_folded, 'CULEX OTHER'))

# Values absent from the lookup map to NaN, which fillna then folds into the
# same 'CULEX OTHER' bucket.
test['species'] = test['species'].map(species_lookup).fillna('CULEX OTHER')

In [5]:
# Display the test frame's column labels to see which fields are available.
test.columns

Index(['date', 'address', 'species', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy'],
      dtype='object')

In [7]:
# Rows at or above the latitude cut-off are assigned station 1, all others
# (including rows with a missing latitude) station 2.
# NOTE(review): the 41.892 cut-off is presumably chosen to pick the closer
# weather station -- confirm against how the training data was labelled.
station_split_latitude = 41.892
at_or_above_split = test['latitude'].ge(station_split_latitude)
test['station'] = np.where(at_or_above_split, 1, 2)

In [8]:
# Attach the weather readings for the matching (date, station) to every test row.
#
# how='left' keeps every test row in its original order even when no weather
# record matches (the default inner join would silently drop such rows).
# validate='many_to_one' raises pandas.errors.MergeError if `weather` holds more
# than one row per (date, station), which would otherwise silently duplicate
# test rows. Row count and order matter here because the merge discards the
# index that test.csv was loaded with.
test_weather = pd.merge(
    test,
    weather,
    how='left',
    on=['date', 'station'],
    validate='many_to_one',
)

# A left many-to-one join must return exactly one output row per test row.
assert len(test_weather) == len(test), "merge changed the number of test rows"

In [9]:
# Load the already-merged training table so that its columns can be compared
# with the columns of the test table.
train = pd.read_csv('../data/train_weather_spray_merged.csv')

In [10]:
# Column names present in the training table but absent from the merged test table.
set(train.columns) - set(test_weather.columns)

{'nummosquitos', 'spray_nearby', 'wnvpresent'}

Of these, the only feature left to engineer for the test data is `spray_nearby`.

In [11]:
from math import sin, cos, radians, asin, sqrt
def global_distance(lon1, lat1, lon2, lat2, radius=3956):
    """Return the great-circle (haversine) distance between two points.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere. The result is expressed in the same unit.
        Defaults to 3956 (approximately the Earth's radius in miles), so the
        default result is in miles.

    Returns
    -------
    float
        Distance along the surface of the sphere between the two points.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))

    return c * radius

In [12]:
# Map each trap id to its (longitude, latitude) pair.
#
# Only one row per trap is needed, so de-duplicate first instead of walking
# every test row with iterrows(). keep='last' matches the previous behaviour:
# if a trap id appears with more than one coordinate pair, the pair from its
# last row wins.
trap_rows = test_weather.drop_duplicates(subset='trap', keep='last')
traps = dict(
    zip(trap_rows['trap'], zip(trap_rows['longitude'], trap_rows['latitude']))
)


In [14]:
# For every trap, find the distance to the closest recorded spray location.
#
# The spray coordinates do not change between traps, so pull them out of the
# DataFrame once rather than re-running spray.iterrows() for each trap.
# Rows with a missing coordinate are dropped: a NaN distance cannot be ordered,
# and would make the minimum depend on row order.
spray_points = list(
    spray[['Longitude', 'Latitude']].dropna().itertuples(index=False, name=None)
)

trap_distances = {}

# With no spray records there is no distance to report; the dict stays empty,
# exactly as the row-by-row loop would have left it.
if spray_points:
    for trap, (lon, lat) in traps.items():
        trap_distances[trap] = min(
            global_distance(lon, lat, spray_lon, spray_lat)
            for spray_lon, spray_lat in spray_points
        )

These are the only features we actually use in our models, so we'll store the test data in a form where they can be easily accessed by these names.


In [15]:
# Feature columns used by the models, assembled from smaller groups.
# The concatenation order below is the column order of the final selection.

# Trap-level columns (position, address accuracy, spray flag, station id).
site_cols = ['latitude', 'longitude', 'addressaccuracy', 'spray_nearby', 'station']

# Columns taken from the weather table -- presumably numeric measurements.
weather_measure_cols = [
    'tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
    'preciptotal', 'stnpressure', 'sealevel',
    'resultspeed', 'resultdir', 'avgspeed',
]

# Columns taken from the weather table -- presumably weather-code indicators.
weather_code_cols = [
    'ts', 'sq', 'fg+', 'gr', 'br', 'tsra', 'dz', 'bcfg',
    'hz', 'fu', 'sn', 'fg', 'vcts', 'ra', 'mifg', 'vcfg',
]

# One-hot columns produced by get_dummies on 'species'.
species_dummy_cols = [
    'species_CULEX OTHER',
    'species_CULEX PIPIENS',
    'species_CULEX PIPIENS/RESTUANS',
    'species_CULEX RESTUANS',
]

cols = site_cols + weather_measure_cols + weather_code_cols + species_dummy_cols

In [16]:
# Flag rows whose trap lies closer than 0.125 distance units to any spray point
# (global_distance defaults to a radius of 3956, so the unit is miles).
# Traps without an entry in trap_distances map to NaN, which compares as False
# and therefore yields 0.0.
nearest_spray_distance = test_weather['trap'].map(trap_distances)
test_weather['spray_nearby'] = nearest_spray_distance.lt(.125).astype(float)

In [17]:
# One-hot encode 'species'. The new columns are named 'species_<value>'
# (for example 'species_CULEX OTHER'), which are the names listed in `cols`.
test_weather_dummies = pd.get_dummies(test_weather, columns=['species'])

As a brief sanity check on the shape: we expect 38 columns.

In [18]:
# Selecting `cols` raises a KeyError if any expected feature column is missing;
# the second element of the displayed shape is the number of selected columns.
test_weather_dummies[cols].shape

(116293, 38)

In [19]:
# Write the merged test table to disk without the row index.
# NOTE(review): this saves `test_weather` (raw 'species' column, every merged
# column), not `test_weather_dummies[cols]`, so the 'species_*' columns named in
# `cols` are not in the file -- confirm that downstream code re-applies
# get_dummies before selecting `cols`.
test_weather.to_csv('../data/test_merged.csv', index=False)