In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances

%matplotlib inline

In [3]:
# load data: the spray records keep their saved index column; the two location
# tables (train / test) are read with a default integer index
spray = pd.read_csv('./assets/spray_clean.csv', index_col='Unnamed: 0')
loc_train, loc_test = (
    pd.read_csv(csv_path) for csv_path in ('train_merge_w.csv', 'test_merge_w.csv')
)

In [4]:
# convert dates to datetimes: every table carries a 'Date' column to parse
for frame in (loc_train, loc_test, spray):
    frame['Date'] = pd.to_datetime(frame['Date'])

# only the spray table also has a 'Time' column
spray['Time'] = pd.to_datetime(spray['Time'])

In [5]:
# define function that will transform days since spray into something measuring lasting effectiveness of spray
def spray_transform(num_days_array, decay_width=10, decay_end=20):
    """Map days-since-spray onto a score that falls smoothly from ~1 to ~0.

    The curve is a reversed logistic, ``1 - sigmoid(scale * (days - midpoint))``:
    it equals exactly 0.5 at ``decay_end - decay_width / 2``, is about 0.993 at
    ``decay_end - decay_width`` and about 0.007 at ``decay_end``.

    Parameters
    ----------
    num_days_array : scalar or array-like of numbers
        Days elapsed since the last spray (numpy broadcasting applies).
    decay_width : number, default 10
        Length, in days, of the window over which the score drops.
    decay_end : number, default 20
        Day by which the score has essentially reached zero.

    Returns
    -------
    Same shape as ``num_days_array``; values lie between 0 and 1.
    """
    # steepness is chosen so the logistic argument spans [-5, 5] across the window
    steepness = 10/decay_width
    centre = decay_end - decay_width/2

    exponent = -steepness*(num_days_array-centre)
    return -1/(1+np.exp(exponent))+1

In [6]:
# feature engineer 'days from last spray' at each trap location

def _sprayed_days_near(trap_lat, trap_long, spray_coords, spray_days, dist_tol, min_sprays):
    """Return the sorted, distinct days on which one trap counts as sprayed.

    A day qualifies when strictly more than ``min_sprays`` spray records lie
    within ``dist_tol`` (Euclidean distance in coordinate units) of the trap.
    """
    if spray_days.size == 0:
        # nothing to measure against: the trap was never sprayed
        return spray_days

    nearby = pairwise_distances([[trap_lat, trap_long]], spray_coords)[0] < dist_tol
    days, counts = np.unique(spray_days[nearby], return_counts=True)
    return days[counts > min_sprays]


def add_days_from_last_spray(loc_df, spray_df, dist_tol=0.005, min_sprays=5):
    """Add spray-recency features to every row of a trap/location table.

    For each distinct ``Trap`` (located at the coordinates of its first row in
    ``loc_df``) the days on which it was sprayed are determined: a day counts
    when strictly more than ``min_sprays`` records of ``spray_df`` fall within
    ``dist_tol`` of the trap.  Every row then receives the number of whole days
    between its ``Date`` and the most recent such spray day on or before it.

    Parameters
    ----------
    loc_df : pandas.DataFrame
        Needs the columns 'Trap', 'Latitude', 'Longitude' and 'Date'.
        It is not modified.
    spray_df : pandas.DataFrame
        Needs the columns 'Latitude', 'Longitude' and 'Date'.
        It is not modified.
    dist_tol : float, default 0.005
        Maximum trap-to-spray distance, in the units of the coordinates.
    min_sprays : int, default 5
        A day counts only if MORE than this many sprays were nearby.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows and columns of ``loc_df`` (index reset to
        0..n-1) plus three float columns:

        * 'DaysFromSpray' - days since the last spray; rows with no earlier
          spray get twice the largest observed value.
        * 'LogDays' - ``log(DaysFromSpray + 1)``.
        * 'SprayEffect' - ``spray_transform(DaysFromSpray)``.

        If no row at all has an earlier spray there is no largest value to
        scale from, and the three columns are left as NaN.
    """
    # one representative location per trap (its first occurrence)
    trap_df = loc_df.drop_duplicates(subset='Trap')[['Trap', 'Latitude', 'Longitude']]

    # loop-invariant spray data as plain arrays; spray_df itself is only read
    spray_days = pd.to_datetime(spray_df['Date']).to_numpy(dtype='datetime64[ns]')
    dated = ~np.isnat(spray_days)  # an undated spray can never mark a spray day
    spray_coords = spray_df[['Latitude', 'Longitude']].to_numpy()[dated]
    spray_days = spray_days[dated]

    # work on a fresh frame with a 0..n-1 index so the caller's table is untouched
    result = loc_df.reset_index(drop=True)
    collection_days = pd.to_datetime(result['Date']).to_numpy(dtype='datetime64[ns]')
    days_from_spray = np.full(len(result), np.nan)

    for trap, trap_lat, trap_long in trap_df.itertuples(index=False, name=None):
        sprayed_days = _sprayed_days_near(
            trap_lat, trap_long, spray_coords, spray_days, dist_tol, min_sprays
        )
        if sprayed_days.size == 0:
            continue

        rows = np.flatnonzero((result['Trap'] == trap).to_numpy())
        # position of the latest spray day that is <= each collection date
        # (-1 means every spray at this trap happened after the collection)
        latest = np.searchsorted(sprayed_days, collection_days[rows], side='right') - 1
        has_prior = latest >= 0
        elapsed = collection_days[rows[has_prior]] - sprayed_days[latest[has_prior]]
        # truncate to whole days and express as float
        days_from_spray[rows[has_prior]] = (
            elapsed.astype('timedelta64[D]') / np.timedelta64(1, 'D')
        )

    # rows without an earlier spray are pushed well past the largest observed gap
    never_sprayed = np.isnan(days_from_spray)
    if not never_sprayed.all():
        days_from_spray[never_sprayed] = 2 * np.nanmax(days_from_spray)

    result['DaysFromSpray'] = days_from_spray

    # engineer other features based on days since last spray

    # 1. log(days + 1) flattens out large gaps; the +1 keeps the value >= 0
    result['LogDays'] = np.log(result['DaysFromSpray'] + 1)

    # 2. spray_transform turns the gap into a smooth 1 -> 0 effectiveness score
    result['SprayEffect'] = result['DaysFromSpray'].map(spray_transform)

    return result

In [7]:
# engineer the spray features for the training and the test locations alike
loc_train, loc_test = (
    add_days_from_last_spray(frame, spray) for frame in (loc_train, loc_test)
)

In [9]:
# save to csv (the index is written as the first, unnamed column)
for frame, csv_path in ((loc_train, 'train_merge_all.csv'), (loc_test, 'test_merge_all.csv')):
    frame.to_csv(csv_path)