In [1]:
%pylab inline

import pandas as pd
import numpy as np

# for calculating accurate distances between lat/long points
#from geopy.distance import vincenty

from calendar import monthrange
from datetime import datetime, date, timedelta

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
#from itertools import chain

#from astral import Astral

Populating the interactive namespace from numpy and matplotlib


In [2]:
# import given data
def date_parser(text):
    """Parse a 'YYYY-MM-DD' string into a datetime (handed to the pandas readers below)."""
    return datetime.strptime(text, "%Y-%m-%d")

# columns kept from each file, in the order they appear in the resulting frames
train_columns = ['Date', 'Address', 'Species', 'Block', 'Street', 'Trap', 'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy', 'NumMosquitos', 'WnvPresent']
test_columns = ['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap', 'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy']
weather_columns = ['Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Cool', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed']

# train/test: every non-date column is read as a string
train = pd.read_csv('../input/train.csv', dtype=str, parse_dates=['Date'], date_parser=date_parser)[train_columns]
test = pd.read_csv('../input/test.csv', dtype=str, parse_dates=['Date'], date_parser=date_parser)[test_columns]

# weather: file column 1 becomes the index
weather = pd.read_csv('../input/weather.csv', index_col=1, parse_dates=['Date'], date_parser=date_parser)[weather_columns]

#elevation = pd.read_csv('../input/elevation.csv', dtype=str, index_col=0)

# import usgs data
def _read_usgs(path, value_column):
    """Read one USGS text export and keep only `value_column`.

    Lines starting with '#' are skipped, file column 2 becomes the index and
    the 'datetime' column is parsed with the notebook's `date_parser`.
    Returns a single-column DataFrame.
    """
    table = pd.read_table(path, comment='#', index_col=2, parse_dates=['datetime'], date_parser=date_parser)
    return table[[value_column]]

# one frame per gauge; note the value column id differs for the last two files
usgs_04087440 = _read_usgs('../input/usgs_04087440.txt', '01_00065_00003')
usgs_05536105 = _read_usgs('../input/usgs_05536105.txt', '01_00065_00003')
usgs_05536118 = _read_usgs('../input/usgs_05536118.txt', '01_00065_00003')
usgs_05536121 = _read_usgs('../input/usgs_05536121.txt', '02_00065_00003')
usgs_05536123 = _read_usgs('../input/usgs_05536123.txt', '35_00065_00003')

In [3]:
# weather processing
#weather_codes = np.unique(list(chain(*weather.CodeSum.str.split().tolist())))

# replace letters, combine stations
# string cells matching the letter pattern are set to 0
weather = weather.replace('[a-zA-Z]', 0, regex=True)

# put station 1 and station 2 readings side by side, one row per index entry
station_1 = weather[weather.Station == 1]
station_2 = weather[weather.Station == 2]
weather_merged = pd.merge(station_1, station_2, left_index=True, right_index=True, suffixes=['_1', '_2'])
station_columns = weather_merged.filter(regex='Station').columns
weather_merged = weather_merged.drop(station_columns, axis=1)

# add weather by days
# lag 0 keeps the plain column names, lag i > 0 gets the suffix '_d<i>'
# NOTE(review): shift(i) moves by i rows, not i calendar days - if the weather
# index has date gaps the lag is not exactly i days; verify against the data
days = range(21)
lagged_weather = []
for i in days:
    lagged = weather_merged.shift(i)
    if i > 0:
        lagged = lagged.add_suffix('_d' + str(i))
    lagged_weather.append(lagged)
weather_days = pd.concat(lagged_weather, axis=1)

# usgs processing
# (output column, source series) pairs; each series is aligned to the weather index on assignment
usgs_sources = [
    ('USGS04087440', usgs_04087440['01_00065_00003']),
    ('USGS05536105', usgs_05536105['01_00065_00003']),
    ('USGS05536118', usgs_05536118['01_00065_00003']),
    ('USGS05536121', usgs_05536121['02_00065_00003']),
    ('USGS05536123', usgs_05536123['35_00065_00003']),
]
usgs_df = pd.DataFrame(index=weather_merged.index, columns=[column for column, _ in usgs_sources])
for column, readings in usgs_sources:
    usgs_df[column] = readings
# index entries with no reading become 0
usgs_df = usgs_df.fillna(0)

# add usgs data by days
# lag 0 keeps the plain column names, lag i > 0 gets the suffix '_d<i>'
days = range(21)
lagged_usgs = []
for i in days:
    lagged = usgs_df.shift(i)
    if i > 0:
        lagged = lagged.add_suffix('_d' + str(i))
    lagged_usgs.append(lagged)
usgs_days = pd.concat(lagged_usgs, axis=1)

# astral
# the top-of-notebook `from astral import Astral` is commented out, so the name
# is resolved here; when the package (or this API) is unavailable the cell
# still runs and astral_df is left unfilled instead of raising NameError
try:
    from astral import Astral
except ImportError:
    Astral = None

city_name = 'Chicago'

#astral_df = pd.DataFrame(index=weather_merged.index, columns=['MoonPhase', 'Dawn', 'Sunrise', 'Noon', 'Sunset', 'Dusk'])
astral_df = pd.DataFrame(index=weather_merged.index, columns=['MoonPhase', 'Sunrise', 'Sunset'])

if Astral is None:
    print('astral is not available; astral_df is left unfilled')
else:
    a = Astral()
    a.solar_depression = 'civil'
    city = a[city_name]

    # loop variable is `day`, not `date`, so the `date` class imported from
    # datetime at the top of the notebook is not overwritten
    for day in astral_df.index:
        sun = city.sun(date=day, local=True)
        # reference instant: the index timestamp localized to the city's timezone
        day_start = city.tz.localize(day)
        # sunrise/sunset expressed as hours elapsed since that reference instant
        sunrise = (sun['sunrise'] - day_start).total_seconds()/3600.0
        sunset = (sun['sunset'] - day_start).total_seconds()/3600.0
        # .loc is the label-based replacement for the deprecated .ix
        astral_df.loc[day] = [city.moon_phase(date=day), sunrise, sunset]

In [4]:
# species
#species = pd.get_dummies(train.Species)
#train[species.columns] = species
#train.drop('Species', axis=1, inplace=True)

# fit the binarizer on train and test together so both share one set of species columns
all_species = train['Species'].tolist() + test['Species'].tolist()
species_lb = LabelBinarizer()
species_lb.fit(all_species)
# class labels, used as the indicator column names
species_list = species_lb.classes_.tolist()
#print species_lb.transform(train['Species'].values)

#species_le = LabelEncoder()
#species_le.fit(list(train['Species'].values) + list(test['Species'].values))

In [5]:
# encode trap values
#trap_le = LabelEncoder()
#trap_le.fit(list(train['Trap'].values) + list(test['Trap'].values))

In [6]:
# parenthesized form prints the same text on Python 2 and is valid on Python 3
print(train.shape)

(10506, 12)


In [9]:
# function for generating features
def gen_features(data):
    """Build the feature frame for a train or test table.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a datetime-typed 'Date' column plus 'Latitude', 'Longitude'
        and 'Species'.

    Returns
    -------
    pandas.DataFrame
        DayOfWeek, DayOfMonth, DayOfYear, Latitude, Longitude, one column per
        entry of ``species_list`` and every column of ``weather_days`` for
        the row's date. 'Date' itself is dropped.

    Notes
    -----
    Reads the notebook-level ``species_lb``, ``species_list`` and
    ``weather_days``. The weather merge uses pandas' default inner join, so
    rows whose 'Date' is absent from ``weather_days`` are silently dropped.
    """
    # start with empty dataframe
    features = pd.DataFrame(index=data.index)

    # add date (vectorized accessors instead of one Python call per row)
    dates = data['Date']
    features['Date'] = dates
    features['DayOfWeek'] = dates.dt.weekday
    features['DayOfMonth'] = dates.dt.day
    features['DayOfYear'] = dates.dt.dayofyear

    # add location
    features[['Latitude', 'Longitude']] = data[['Latitude', 'Longitude']]
    #features['AddressAccuracy'] = data['AddressAccuracy']
    #features['Trap'] = trap_le.transform(data['Trap'].values)

    # add species
    species_matrix = species_lb.transform(data['Species'].values)
    features[species_list] = pd.DataFrame(species_matrix, index=data.index, columns=species_list)
    #features['Species'] = species_le.transform(data['Species'].values)

    # disabled feature sources, kept for reference:
    # merge astral
    #merged = pd.merge(features, astral_df, left_on='Date', right_index=True)
    # merge elevation
    #merged = pd.merge(merged, elevation, how='left', left_on=['Latitude', 'Longitude'], right_on=['Latitude', 'Longitude'])
    # merge usgs
    #merged = pd.merge(merged, usgs_days, left_on='Date', right_index=True)

    # merge weather
    merged = pd.merge(features, weather_days, left_on='Date', right_index=True)

    # 'Date' was only needed as the join key
    return merged.drop('Date', axis=1)

In [10]:
# get train and test data
# the two train-only columns are attached by index alignment with `train`
train_only_columns = ['NumMosquitos', 'WnvPresent']
X_train = gen_features(train).join(train[train_only_columns])
X_test = gen_features(test)

In [11]:
# add extra data
#X_train['NumMosquitos'] = train['NumMosquitos']
#print train['NumMosquitos']

In [12]:
# output to csv
# train first, then test; each frame is written with its index
for frame, path in [(X_train, '../working/train_f.csv'), (X_test, '../working/test_f.csv')]:
    frame.to_csv(path)