## Imports

In [62]:
import numpy as np
import pandas as pd
import datetime

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [99]:
# Read the five pre-cleaned inputs in one pass; the order of the paths matches
# the order of the names on the left-hand side.
train_df, ag_train, test_df, spray_df, weather_df = map(
    pd.read_csv,
    (
        '../input/transformed_input/train_clean_owen',
        '../input/transformed_input/aggregated_train_clean_owen',
        '../input/transformed_input/test_clean_owen',
        '../input/transformed_input/spray_clean_owen',
        '../input/transformed_input/weather_clean_owen',
    ),
)

In [100]:
# Parse the 'Date' column of every loaded frame into datetimes so the
# date comparisons further down work on real timestamps.
for frame in (train_df, ag_train, test_df, spray_df, weather_df):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [101]:
def weather_stat(date, start_days_back, end_days_back, col, stat, weather=None):
    '''
    Aggregate one weather column over a window of days counted back from `date`.

    The window is (date - start_days_back, date - end_days_back]: the opening
    day is excluded and the closing day is included, so start_days_back=7 and
    end_days_back=0 select the seven days that end on `date` itself.
    No station filter is applied: every row whose 'Date' falls in the window
    contributes to the result.

    date = reference date the window is counted back from
    start_days_back = how many days before `date` the window opens (exclusive)
    end_days_back = how many days before `date` the window closes (inclusive,
                    0 is the day of); should not exceed start_days_back,
                    otherwise no rows are selected
    col = the column the stat is computed from
    stat = callable applied to the selected values, e.g. np.mean, np.max, np.min
    weather = optional DataFrame holding a 'Date' column and `col`;
              defaults to the notebook-level `weather_df`

    Returns whatever `stat` returns for the selected values.
    '''
    if weather is None:
        # Resolved at call time so the function keeps working with the
        # notebook's global frame when no explicit frame is passed.
        weather = weather_df
    window_open = date - datetime.timedelta(start_days_back)
    window_close = date - datetime.timedelta(end_days_back)
    in_window = (weather['Date'] > window_open) & (weather['Date'] <= window_close)
    return stat(weather.loc[in_window, col])

In [102]:
# Sanity check: mean Tmax over the seven days ending on 2013-06-01.
date = datetime.datetime(year=2013, month=6, day=1)
weather_stat(date, start_days_back=7, end_days_back=0, col='Tmax', stat=np.mean)

75.14285714285714

## 2013 Train

In [103]:
# Keep the aggregated rows dated strictly between 2013-01-01 and 2014-01-01.
year_start = datetime.datetime(2013, 1, 1)
year_end = datetime.datetime(2014, 1, 1)
in_2013 = (ag_train['Date'] > year_start) & (ag_train['Date'] < year_end)
train_2013 = ag_train[in_2013]

In [108]:
# Trap IDs grouped under a date string (keys are 'YYYY-MM-DD').
date_traps = {
    '2013-07-17': ['T159', 'T089', 'T008', 'T094'],
    '2013-07-25': ['T228'],
    '2013-08-08': ['T147'],
    '2013-08-15': ['T230', 'T013', 'T151', 'T002', 'T017'],
    '2013-08-22': ['T138', 'T227', 'T102', 'T030'],
    '2013-08-29': ['T066', 'T028', 'T231'],
    '2013-09-05': ['T233'],
}
# Flat list of every trap above, in date order.
good_traps = [trap for traps in date_traps.values() for trap in traps]

In [111]:
# Show the date strings that have traps registered in date_traps.
date_traps.keys()
    

dict_keys(['2013-07-17', '2013-07-25', '2013-08-08', '2013-08-15', '2013-08-22', '2013-08-29', '2013-09-05'])

In [85]:
from sklearn.model_selection import train_test_split

In [86]:
# List the columns available in the 2013 subset.
train_2013.columns

Index(['Date', 'Trap', 'Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent',
       'pipien', 'restuans', 'DateTrap'],
      dtype='object')

In [None]:
# Feature matrix and target taken from the 2013 subset. The original cell
# indexed an undefined notebook-level name `df`; the selected columns are
# columns of train_2013, so that frame is used here.
# NOTE(review): 'NumMosquitos' is selected both as a feature and as the target,
# which leaks the target into X. Confirm the intended target (the class
# balancing in this notebook is done on 'WnvPresent') before fitting a model.
X = train_2013[['Date', 'NumMosquitos',
                'pipien', 'restuans', 'DateTrap']]
y = train_2013['NumMosquitos']

In [82]:
# Two separate selections of the WnvPresent == 1 rows, used to oversample
# the positive class.
wnv_positive = train_2013.WnvPresent == 1
copy1 = train_2013[wnv_positive]
copy2 = train_2013[wnv_positive]

In [83]:
# Append both copies of the positive rows to the 2013 subset.
# NOTE: this rebinds train_2013 to its own enlarged version, so running the
# cell again appends copy1 and copy2 a second time. Original index labels are
# kept, so the appended rows share index labels with the rows they duplicate.
train_2013 = pd.concat([train_2013,copy1,copy2])

In [84]:
# Class balance of WnvPresent in train_2013 at the time this cell is run.
train_2013.WnvPresent.value_counts()

0    1766
1     717
Name: WnvPresent, dtype: int64

In [69]:
def add_weather(df, days=7):
    '''
    Add trailing-window weather features to `df` and return it.

    For every row, each statistic is obtained from
    weather_stat(row date, days, 0, col, stat), i.e. over the `days` days
    ending on the row's 'Date'. New columns are named
    '<days>Day<Stat><col>', for example '7DayMaxTmax'.

    df = DataFrame with a 'Date' column; the new columns are written onto
         this same object (it is modified in place)
    days = length of the look-back window in days (default 7)

    Returns the same `df` object with the feature columns attached.
    '''
    # (column-name label, aggregation, weather columns it applies to)
    feature_specs = [
        ('Max', np.max, ['Tmax']),
        ('Min', np.min, ['Tmin']),
        ('Mean', np.mean, ['Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool',
                           'PrecipTotal', 'SunLight', 'Rain', 'Haze', 'Mist',
                           'Drizzle', 'Fog']),
    ]
    for label, stat, cols in feature_specs:
        for col in cols:
            # col/stat are bound as defaults so each lambda keeps its own values.
            df[str(days) + 'Day' + label + col] = df['Date'].map(
                lambda d, col=col, stat=stat: weather_stat(d, days, 0, col, stat)
            )

    return df

In [70]:
# Preview the first rows of the row-level training frame.
train_df.head()

Unnamed: 0,Date,Trap,Latitude,Longitude,NumMosquitos,WnvPresent,pipien,restuans,DateTrap
0,2007-06-29,T002,41.95469,-87.800991,2,0,1,1,2007-06-29T002
1,2007-06-29,T015,41.974089,-87.824812,1,0,1,1,2007-06-29T015
2,2007-06-29,T015,41.974089,-87.824812,2,0,0,1,2007-06-29T015
3,2007-06-29,T046,41.891118,-87.654491,2,0,0,1,2007-06-29T046
4,2007-06-29,T054,41.921965,-87.632085,3,0,1,1,2007-06-29T054


In [61]:
# Number of spray records per date.
spray_df.Date.value_counts()

2013-08-15    2668
2013-08-29    2302
2013-07-17    2202
2011-09-07    2114
2013-07-25    1607
2013-08-22    1587
2013-08-08    1195
2013-09-05     924
2013-08-16     141
2011-08-29      95
Name: Date, dtype: int64