In [388]:
%pylab inline

import pandas as pd
import numpy as np

# for calculating accurate distances between lat/long points
#from geopy.distance import vincenty

from calendar import monthrange
from datetime import datetime, date, timedelta

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
#from itertools import chain

from astral import Astral

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [389]:
def date_parser(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime``."""
    return datetime.strptime(x, "%Y-%m-%d")


# Trap observations: parse the Date column and keep only the listed columns.
train = pd.read_csv(
    '../input/train.csv',
    parse_dates=['Date'],
    date_parser=date_parser
)[[
    'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
    'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
    'NumMosquitos', 'WnvPresent'
]]
test = pd.read_csv(
    '../input/test.csv',
    parse_dates=['Date'],
    date_parser=date_parser
)[[
    'Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
    'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy'
]]

# Weather readings: the second CSV column (position 1) becomes the index.
weather = pd.read_csv(
    '../input/weather.csv',
    index_col=1,
    parse_dates=['Date'],
    date_parser=date_parser
)[[
    'Station', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb',
    'Heat', 'Cool', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal',
    'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed'
]]

In [391]:
# weather processing

# Values matching the letter pattern are replaced with 0 (in place).
weather.replace('[a-zA-Z]', 0, regex=True, inplace=True)

# Put the two stations side by side, one row per index value; columns get
# a _1 / _2 suffix according to the station they came from.
station_1 = weather[weather.Station == 1]
station_2 = weather[weather.Station == 2]
weather_merged = pd.merge(station_1, station_2, left_index=True, right_index=True, suffixes=['_1', '_2'])

# The station identifier columns are no longer needed after the merge.
station_columns = weather_merged.filter(regex='Station').columns
weather_merged.drop(station_columns, axis=1, inplace=True)

# add merged weather

In [475]:
# astral: moon phase and sun event times for every day in weather_merged
city_name = 'Chicago'
a = Astral()
a.solar_depression = 'civil'
city = a[city_name]
print(city.tz)

# Keys of the dict returned by city.sun(), in the same order as the
# corresponding output columns.
sun_events = ['dawn', 'sunrise', 'noon', 'sunset', 'dusk']
astral_columns = ['MoonPhase', 'Dawn', 'Sunrise', 'Noon', 'Sunset', 'Dusk']

astral_rows = []
# The loop variable is called `day` so that it does not shadow the `date`
# class imported from datetime at the top of the notebook.
for day in weather_merged.index:
    sun = city.sun(date=day, local=True)
    # Local midnight of this day; every event is stored as hours elapsed
    # since this instant. Computed once per day instead of once per event.
    midnight = city.tz.localize(day)
    hours = [(sun[event] - midnight).total_seconds() / 3600.0 for event in sun_events]
    astral_rows.append([city.moon_phase(date=day)] + hours)

# Build the frame in a single call: filling an empty frame row by row (via
# the deprecated .ix indexer) left every column with object dtype.
astral_df = pd.DataFrame(astral_rows, index=weather_merged.index, columns=astral_columns)

moon_phase = city.moon_phase(date=train.Date[0])
print(moon_phase)

print(astral_df)

US/Central
11
           MoonPhase      Dawn   Sunrise      Noon    Sunset      Dusk
Date                                                                  
2007-05-01        13  5.271389  5.777778  12.79722  19.81556  20.32167
2007-05-02        14  5.247778  5.755833  12.79528  19.83361  20.34139
2007-05-03        14  5.224444  5.734167  12.79333  19.85167  20.36139
2007-05-04        15  5.201667  5.713056  12.79194  19.86972  20.38111
2007-05-05        16  5.179167  5.692222  12.79028  19.88778  20.40083
2007-05-06        17  5.156944  5.671944  12.78917  19.90556  20.42056
2007-05-07        18     5.135  5.651667  12.78778  19.92333     20.44
2007-05-08        19  5.113611  5.632222  12.78694  19.94111  20.45972
2007-05-09        20  5.092778  5.612778  12.78611  19.95889  20.47917
2007-05-10        21  5.071944  5.594167  12.78528  19.97639  20.49833
2007-05-11        22  5.051944  5.575556    12.785  19.99389  20.51778
2007-05-12        23  5.031944  5.557778  12.78444  20.01111  2

In [476]:
# species
# Both encoders are fitted on the species labels of train and test combined.
all_species = train['Species'].tolist() + test['Species'].tolist()

# Binarizer: one indicator column per class; species_list holds the class
# labels in the binarizer's own order.
species_lb = LabelBinarizer()
species_lb.fit(all_species)
species_list = species_lb.classes_.tolist()

# Integer label encoder fitted on the same labels.
species_le = LabelEncoder()
species_le.fit(all_species)

LabelEncoder()

In [477]:
# encode trap values
# Fit on the trap identifiers of train and test combined.
all_traps = train['Trap'].tolist() + test['Trap'].tolist()
trap_le = LabelEncoder()
trap_le.fit(all_traps)

LabelEncoder()

In [478]:
# function for generating features
def gen_features(data):
    """Build the feature frame for a train or test DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Date`` (datetime-like), ``Latitude``,
        ``Longitude``, ``AddressAccuracy``, ``Trap`` and ``Species``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data`` holding the day-of-week / day-of-month /
        day-of-year of ``Date``, the location columns, the encoded trap, one
        indicator column per entry of ``species_list``, and the columns of
        ``astral_df`` and ``weather_merged`` looked up by ``Date``. The
        ``Date`` column itself is dropped before returning.

    Notes
    -----
    Relies on the notebook-level objects ``trap_le``, ``species_lb``,
    ``species_list``, ``astral_df`` and ``weather_merged``.
    """
    # start with empty dataframe
    features = pd.DataFrame(index=data.index)

    # add date
    features['Date'] = data['Date']
    features['DayOfWeek'] = data['Date'].dt.dayofweek
    features['DayOfMonth'] = data['Date'].dt.day
    features['DayOfYear'] = data['Date'].dt.dayofyear

    # add location
    features[['Latitude', 'Longitude']] = data[['Latitude', 'Longitude']]
    features['AddressAccuracy'] = data['AddressAccuracy']
    features['Trap'] = trap_le.transform(data['Trap'].values)

    # add species
    features[species_list] = pd.DataFrame(species_lb.transform(data['Species'].values), index=data.index, columns=species_list)

    # Left joins keep every input row: with the default inner join a row
    # whose date has no astral/weather entry would silently disappear,
    # which would break row-for-row alignment with the input frame.
    merged = pd.merge(features, astral_df, how='left', left_on='Date', right_index=True)
    merged = pd.merge(merged, weather_merged, how='left', left_on='Date', right_index=True)

    return merged.drop('Date', axis=1)

In [479]:
# get train and test data
X_test = gen_features(test)
X_train = gen_features(train)
# Only the training features carry the target column.
X_train['WnvPresent'] = train['WnvPresent']

In [480]:
# output to csv
# Write each feature frame (index included) to the working directory.
for frame, csv_path in ((X_train, '../working/train_f.csv'),
                        (X_test, '../working/test_f.csv')):
    frame.to_csv(csv_path)