In [1]:
import numpy as np
import pandas as pd

import geohash
import gc
from geopy import distance

In [11]:
# Load the train and evaluation frames that the rest of this notebook extends
# with features in place.
# NOTE(review): unpickling can execute arbitrary code - only load pickles that
# were produced by a trusted step of this project.
train_df = pd.read_pickle('./data/train_df.pkl')
eval_df = pd.read_pickle('./data/eval_df.pkl')

In [12]:
def timestamp_to_hour_minute(timestamp):
    """Split a colon-separated timestamp string into integer hour and minute.

    Only the first two ``:``-separated fields are used; any further fields
    are ignored.

    Returns:
        tuple: ``(hour, minute)`` as ints.

    Raises:
        ValueError: if either field is not an integer literal.
        IndexError: if the string contains no ``:``.
    """
    fields = timestamp.split(':')
    hour, minute = int(fields[0]), int(fields[1])
    return hour, minute

In [13]:
# hour - captures peak hours / midnight, plus a cyclical [sin/cos] encoding.
# The angle must cover a full circle (2*pi over the 24-hour period) so that
# hour 23 lands next to hour 0; with pi/24 only half a circle was covered and
# 23:00 ended up at the opposite end of the cosine from 00:00.
HOUR_ANGLE = 2 * np.pi / 24
for frame in (train_df, eval_df):
    frame['hour'] = frame.timestamp.apply(lambda ts: timestamp_to_hour_minute(ts)[0])
    frame['hour_sin'] = np.sin(frame.hour * HOUR_ANGLE)
    frame['hour_cos'] = np.cos(frame.hour * HOUR_ANGLE)

In [14]:
# day_mod_seven - captures the 7-day pattern (seasonality), plus a cyclical
# [sin/cos] encoding. The angle spans a full circle (2*pi over 7 days) so that
# slot 6 is adjacent to slot 0; pi/7 only covered half a circle.
DAY_ANGLE = 2 * np.pi / 7
for frame in (train_df, eval_df):
    frame['day_mod_seven'] = frame.day.values % 7
    frame['day_of_week_sin'] = np.sin(frame.day_mod_seven * DAY_ANGLE)
    frame['day_of_week_cos'] = np.cos(frame.day_mod_seven * DAY_ANGLE)

In [15]:
# per_fifteen_minutes - more granular slot index (order modulo 96), plus a
# cyclical [sin/cos] encoding. The angle spans a full circle (2*pi over the 96
# slots) so that slot 95 is adjacent to slot 0; pi/96 only covered half a circle.
SLOT_ANGLE = 2 * np.pi / 96
for frame in (train_df, eval_df):
    frame['per_fifteen_minutes'] = frame.order.values % 96
    frame['fifteen_minute_sin'] = np.sin(frame.per_fifteen_minutes * SLOT_ANGLE)
    frame['fifteen_minute_cos'] = np.cos(frame.per_fifteen_minutes * SLOT_ANGLE)

In [17]:
# Per-geohash demand profiles computed from the training frame only.
# agg_dict[geohash]['<key>_median' | '<key>_mean'] is an array indexed by the
# slot value of <key> (per_fifteen_minutes: 0-95, hour: 0-23, day_mod_seven: 0-6).
#
# Fixes over the previous version:
#   * 'hour_mean' is now really the mean (it was computed with .median()).
#   * Only the 'demand' column is aggregated, so the non-numeric columns
#     (geohash6, timestamp) are never fed to median()/mean().
#   * Each profile is reindexed onto the full slot range, so position i always
#     corresponds to slot i (NaN when the geohash has no training row for that
#     slot) - the lookups below index these arrays by slot value.
#   * The frame is grouped once instead of being re-filtered per geohash.
AGG_SLOT_COUNTS = (('per_fifteen_minutes', 96), ('hour', 24), ('day_mod_seven', 7))

agg_dict = {}
for gh, gh_df in train_df.groupby('geohash6', sort=False):
    profiles = {}
    for key, n_slots in AGG_SLOT_COUNTS:
        demand_by_slot = gh_df.groupby(key)['demand']
        slots = np.arange(n_slots)
        profiles[key + '_median'] = demand_by_slot.median().reindex(slots).values
        profiles[key + '_mean'] = demand_by_slot.mean().reindex(slots).values
    agg_dict[gh] = profiles

In [18]:
def populate_per_fifteen_minutes_median(x):
    """Return the median-demand entry for row ``x``'s geohash and 15-minute slot.

    Reads ``x.geohash6`` and ``x.per_fifteen_minutes`` and looks the value up
    in the module-level ``agg_dict``.
    """
    slot_medians = agg_dict[x.geohash6]['per_fifteen_minutes_median']
    return slot_medians[x.per_fifteen_minutes]

def populate_per_fifteen_minutes_mean(x):
    """Return the mean-demand entry for row ``x``'s geohash and 15-minute slot.

    Reads ``x.geohash6`` and ``x.per_fifteen_minutes`` and looks the value up
    in the module-level ``agg_dict``.
    """
    slot_means = agg_dict[x.geohash6]['per_fifteen_minutes_mean']
    return slot_means[x.per_fifteen_minutes]

In [19]:
# Attach the per-geohash 15-minute-slot median/mean (from agg_dict) to every
# row of both frames, train first and then eval.
per_slot_fillers = (
    ('per_fifteen_minutes_median', populate_per_fifteen_minutes_median),
    ('per_fifteen_minutes_mean', populate_per_fifteen_minutes_mean),
)
for frame in (train_df, eval_df):
    for column, filler in per_slot_fillers:
        frame[column] = frame.apply(filler, axis=1)

In [20]:
# Attach the per-geohash hourly median/mean (from agg_dict) to every row of
# both frames; the agg_dict key and the new column share the same name.
for frame in (train_df, eval_df):
    for stat in ('hour_median', 'hour_mean'):
        frame[stat] = frame.apply(
            lambda row, stat=stat: agg_dict[row.geohash6][stat][row.hour],
            axis=1,
        )

In [21]:
# Attach the per-geohash day_mod_seven median/mean (from agg_dict) to every
# row of both frames. Each pair is (new column name, agg_dict key).
day_of_week_sources = (
    ('day_of_week_median', 'day_mod_seven_median'),
    ('day_of_week_mean', 'day_mod_seven_mean'),
)
for frame in (train_df, eval_df):
    for column, agg_key in day_of_week_sources:
        frame[column] = frame.apply(
            lambda row, agg_key=agg_key: agg_dict[row.geohash6][agg_key][row.day_mod_seven],
            axis=1,
        )

In [23]:
# Downcast the small integer calendar columns to int8 in both frames.
for frame in (train_df, eval_df):
    for column in ('day', 'hour', 'day_mod_seven', 'per_fifteen_minutes'):
        frame[column] = frame[column].astype('int8')

In [24]:
# Decode every geohash into (lat, long). Each distinct geohash is decoded once
# and the result reused for all of its rows, instead of calling
# geohash.decode twice for every row of both frames.
for frame in (train_df, eval_df):
    decoded = {gh: geohash.decode(gh) for gh in frame.geohash6.unique()}
    frame['lat'] = frame.geohash6.apply(lambda gh: decoded[gh][0])
    frame['long'] = frame.geohash6.apply(lambda gh: decoded[gh][1])

In [25]:
def get_distance_by_gh(gh1, gh2):
    """Return the distance in kilometres between two geohashes.

    Both geohash strings are decoded with ``geohash.decode`` and the decoded
    coordinates are handed to geopy's ``distance.distance``; the ``.km``
    value of that result is returned.
    """
    start, end = (geohash.decode(code) for code in (gh1, gh2))
    return distance.distance(start, end).km

In [26]:
# Reference geohashes used as points of interest.
POI_GEOHASH = ['qp09d8', 'qp03xx', 'qp03wf']

# poi_distance_dict[poi][geohash] -> distance in km, for every geohash that
# occurs in the training frame.
poi_distance_dict = {}
for poi in POI_GEOHASH:
    poi_distance_dict[poi] = {
        gh: get_distance_by_gh(poi, gh) for gh in train_df.geohash6.unique()
    }

In [27]:
# One 'distance_from_<poi>' column per point of interest, looked up from the
# precomputed table for both frames.
for gh in POI_GEOHASH:
    column = 'distance_from_{0}'.format(gh)
    km_by_geohash = poi_distance_dict[gh]
    train_df[column] = train_df.geohash6.apply(lambda cell: km_by_geohash[cell])
    eval_df[column] = eval_df.geohash6.apply(lambda cell: km_by_geohash[cell])

In [28]:
# NORMALIZE
# Standardize each distance column with the mean/std of the training frame and
# apply the same statistics to the eval frame. The columns are overwritten in
# place, so running this cell twice normalizes already-normalized values.
for gh in POI_GEOHASH:
    column = 'distance_from_{0}'.format(gh)
    center, scale = train_df[column].mean(), train_df[column].std()
    for frame in (train_df, eval_df):
        frame[column] = (frame[column] - center) / scale

In [29]:
# Checkpoint both feature-augmented frames to disk.
checkpoints = (
    (train_df, './preprocessed_train_df_cp_0.pkl'),
    (eval_df, './preprocessed_eval_df_cp_0.pkl'),
)
for frame, path in checkpoints:
    frame.to_pickle(path)

In [30]:
# Stack the train rows followed by the eval rows, keeping each frame's own
# index labels. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
df = pd.concat([train_df, eval_df])

In [31]:
# Sanity check: (rows, columns) of the combined train + eval frame.
df.shape

(7781295, 25)

In [32]:
# List the columns available to the window-building cells below.
df.columns

Index(['order', 'geohash6', 'timestamp', 'day', 'demand', 'hour', 'hour_sin',
       'hour_cos', 'day_mod_seven', 'day_of_week_sin', 'day_of_week_cos',
       'per_fifteen_minutes', 'fifteen_minute_sin', 'fifteen_minute_cos',
       'per_fifteen_minutes_median', 'per_fifteen_minutes_mean', 'hour_median',
       'hour_mean', 'day_of_week_median', 'day_of_week_mean', 'lat', 'long',
       'distance_from_qp09d8', 'distance_from_qp03xx', 'distance_from_qp03wf'],
      dtype='object')

In [None]:
# Build one supervised-learning row per (geohash, time step).
#
# For every geohash the frame is indexed by `order`, and for each step the row
# collects: the step's own feature columns, the next 5 demands (targets), the
# current and previous 7 demands, and a 13-step window around the same step one
# week earlier (offsets -3 .. +9).
# Assumes `order` labels run 0 .. num_rows - 1 within each geohash - TODO confirm.
#
# Per-geohash frames are collected in a list and concatenated once at the end;
# growing `preprocessed_df` with DataFrame.append inside the loop re-copied all
# accumulated rows on every iteration, and that method no longer exists in
# pandas 2.0.
WEEK_STEPS = 96 * 7            # 96 steps per day
FIRST_STEP = WEEK_STEPS + 3    # earliest step whose week-ago window (offset -3) exists
HORIZON = 5                    # number of future steps used as targets

# Columns copied from the step's own row, in output order.
LEADING_COLUMNS = [
    'day', 'lat', 'long',
    'distance_from_qp09d8', 'distance_from_qp03xx', 'distance_from_qp03wf',
]
TRAILING_COLUMNS = [
    'day_mod_seven', 'day_of_week_sin', 'day_of_week_cos',
    'hour', 'hour_sin', 'hour_cos',
    'per_fifteen_minutes', 'fifteen_minute_sin', 'fifteen_minute_cos',
    # could have used target median/mean but this is easier with very little sacrifice
    'day_of_week_median', 'day_of_week_mean',
    'hour_median', 'hour_mean',
    'per_fifteen_minutes_median', 'per_fifteen_minutes_mean',
]

geohashes = df.geohash6.unique()
frames = []
for gh_idx, gh in enumerate(geohashes):
    print(gh_idx, len(geohashes))
    # set_index returns a new frame, so the filtered slice is never mutated.
    temp_df = df[df.geohash6 == gh].set_index('order')
    demand = temp_df['demand']
    num_rows = temp_df.shape[0]

    records = []
    for step in range(FIRST_STEP, num_rows - HORIZON):
        record = {'order': step, 'geohash6': gh}
        for column in LEADING_COLUMNS:
            record[column] = temp_df.at[step, column]

        # LABELS: demand 1..5 steps ahead
        for ahead in range(1, HORIZON + 1):
            record['target_{0}'.format(ahead)] = demand.at[step + ahead]

        # Last two hours of demand: current step and the 7 before it
        for back in range(8):
            record['demand_t{0}'.format(back)] = demand.at[step - back]

        # Seven days ago: 3 steps before to 9 steps after the same time slot
        for j in range(13):
            record['demand_7d_t{0}'.format(j)] = demand.at[step + j - 3 - WEEK_STEPS]

        for column in TRAILING_COLUMNS:
            record[column] = temp_df.at[step, column]
        records.append(record)

    if records:
        frames.append(pd.DataFrame(records))
    gc.collect()

preprocessed_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [65]:
# Sanity check: (rows, columns) of the windowed frame.
preprocessed_df.shape

(6877575, 49)

In [66]:
# Persist the full windowed frame (train and eval days together).
preprocessed_df.to_pickle('./preprocessed_df.pkl')

In [67]:
# Split by day: days up to and including 50 go to the train pickle, later days
# to the eval pickle.
is_train_day = preprocessed_df.day <= 50
is_eval_day = preprocessed_df.day > 50
preprocessed_df[is_train_day].to_pickle('./preprocessed_train_df.pkl')
preprocessed_df[is_eval_day].to_pickle('./preprocessed_eval_df.pkl')

In [68]:
# Drop the name bound to the large windowed frame and run a garbage-collection
# pass (presumably to release its memory before the next cells rebuild it).
del preprocessed_df
gc.collect()

837

In [None]:
# Vectorized window builder: keeps every original column of `df` and adds
#   target_1..5        demand 1..5 steps ahead
#   demand_t0..8       current demand and the 8 steps before it
#   demand_{d}d_t0..12 13-step window (offsets -3 .. +9) around the same step
#                      d days earlier, for d = 1..7
#
# Fixes over the previous version:
#   * Rows were "accumulated" with `preprocessed_df.loc[a:b] = proc_df` on an
#     empty frame, which never adds rows; the per-geohash frames are now
#     collected and concatenated once.
#   * Positional slicing is only meaningful when each geohash's rows are in
#     chronological order, so the slice is sorted by `order` first.
#   * Slice ends are computed from the row count instead of negative literals.
#   * Geohashes with too few rows for a single window are skipped explicitly.
STEPS_PER_DAY = 96
HEAD = STEPS_PER_DAY * 7 + 3   # rows dropped at the start (one week + 3 steps of history)
TAIL = 5                       # rows dropped at the end (targets reach 5 steps ahead)

geohashes = df.geohash6.unique()
frames = []
for gh_idx, gh in enumerate(geohashes):
    print(gh_idx, len(geohashes))
    temp_df = df[df.geohash6 == gh].sort_values('order')
    demand = temp_df.demand.values
    stop = len(demand) - TAIL
    if stop <= HEAD:
        continue  # not enough rows for even one window

    def shifted(offset):
        # Demand `offset` steps away from each kept row (negative = earlier).
        return demand[HEAD + offset:stop + offset]

    proc_df = temp_df.iloc[HEAD:stop].copy()

    for ahead in range(1, TAIL + 1):
        proc_df['target_{0}'.format(ahead)] = shifted(ahead)

    proc_df['demand_t0'] = proc_df['demand']
    for back in range(1, 9):
        proc_df['demand_t{0}'.format(back)] = shifted(-back)

    for d in range(1, 8):
        for j in range(13):
            proc_df['demand_{0}d_t{1}'.format(d, j)] = shifted(j - 3 - STEPS_PER_DAY * d)

    frames.append(proc_df)
    gc.collect()

preprocessed_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

preprocessed_df.shape

In [None]:
# Resume cell: processes geohash positions 1172..1328 only and adds their rows
# to the `preprocessed_df` that already exists in the kernel (it is deliberately
# NOT reset here).
#
# The new per-geohash frames are collected in `frames` and concatenated once at
# the end instead of calling DataFrame.append per geohash (removed in pandas
# 2.0, and it re-copied the whole accumulated frame every iteration). If the
# loop is interrupted, the finished pieces are still available in `frames`.
# Each geohash slice is sorted by `order` because the positional slicing below
# is only meaningful for chronologically ordered rows.
STEPS_PER_DAY = 96
HEAD = STEPS_PER_DAY * 7 + 3   # rows dropped at the start (one week + 3 steps of history)
TAIL = 5                       # rows dropped at the end (targets reach 5 steps ahead)

GEOHASH6_LIST = df.geohash6.unique()
frames = [preprocessed_df]
for gh_idx in range(1172, 1329):
    print(gh_idx)
    gh = GEOHASH6_LIST[gh_idx]
    temp_df = df[df.geohash6 == gh].sort_values('order')
    demand = temp_df.demand.values
    stop = len(demand) - TAIL
    if stop <= HEAD:
        continue  # not enough rows for even one window

    def shifted(offset):
        # Demand `offset` steps away from each kept row (negative = earlier).
        return demand[HEAD + offset:stop + offset]

    proc_df = temp_df.iloc[HEAD:stop].copy()

    for ahead in range(1, TAIL + 1):
        proc_df['target_{0}'.format(ahead)] = shifted(ahead)

    proc_df['demand_t0'] = proc_df['demand']
    for back in range(1, 9):
        proc_df['demand_t{0}'.format(back)] = shifted(-back)

    for d in range(1, 8):
        for j in range(13):
            proc_df['demand_{0}d_t{1}'.format(d, j)] = shifted(j - 3 - STEPS_PER_DAY * d)

    frames.append(proc_df)
    gc.collect()

preprocessed_df = pd.concat(frames, ignore_index=True)

preprocessed_df.shape

In [33]:
# Stream the windowed rows straight to a CSV file, one line per
# (geohash, time step), so the full table never has to be held in memory.
#
# Changes over the previous version: the file is opened with a `with` block so
# the handle is closed even if the loop is interrupted or raises; the inner
# loop no longer reuses the outer loop's variable name; the list of unique
# geohashes is computed once; and the filtered slice is indexed via the
# non-mutating set_index().
# Assumes `order` labels run 0 .. num_rows - 1 within each geohash - TODO confirm.
headers = [
    'order', 'geohash6', 'day', 'lat', 'long',
    'distance_from_qp09d8', 'distance_from_qp03xx', 'distance_from_qp03wf',
    'target_1', 'target_2', 'target_3', 'target_4', 'target_5',
    'demand_t0', 'demand_t1', 'demand_t2', 'demand_t3', 'demand_t4', 'demand_t5', 'demand_t6', 'demand_t7',
    'day_mod_seven', 'day_of_week_sin', 'day_of_week_cos',
    'hour', 'hour_sin', 'hour_cos',
    'per_fifteen_minutes', 'fifteen_minute_sin', 'fifteen_minute_cos',
    'day_of_week_median', 'day_of_week_mean',
    'hour_median', 'hour_mean',
    'per_fifteen_minutes_median', 'per_fifteen_minutes_mean'
]

# 13-step window (offsets -3 .. +9) around the same step d days earlier, d = 1..7
for d in range(1, 8):
    for j in range(13):
        headers.append('demand_{0}d_t{1}'.format(d, j))

# Columns copied unchanged from the step's own row.
ROW_COLUMNS = [
    'day', 'lat', 'long',
    'distance_from_qp09d8', 'distance_from_qp03xx', 'distance_from_qp03wf',
    'day_mod_seven', 'day_of_week_sin', 'day_of_week_cos',
    'hour', 'hour_sin', 'hour_cos',
    'per_fifteen_minutes', 'fifteen_minute_sin', 'fifteen_minute_cos',
    # could have used target median/mean but this is easier with very little sacrifice
    'day_of_week_median', 'day_of_week_mean',
    'hour_median', 'hour_mean',
    'per_fifteen_minutes_median', 'per_fifteen_minutes_mean',
]

geohashes = df.geohash6.unique()
with open('./data/preprocessed_formatted_data.csv', 'w') as csv_file:
    csv_file.write(','.join(headers))
    csv_file.write('\n')

    for gh_idx, gh in enumerate(geohashes):
        print(gh_idx, len(geohashes))
        temp_df = df[df.geohash6 == gh].set_index('order')
        demand = temp_df['demand']
        num_rows = temp_df.shape[0]

        for step in range(96*7 + 3, num_rows - 5):
            row = {'order': step, 'geohash6': gh}
            for column in ROW_COLUMNS:
                row[column] = temp_df.at[step, column]

            # LABELS: demand 1..5 steps ahead
            for ahead in range(1, 6):
                row['target_{0}'.format(ahead)] = demand.at[step + ahead]

            # Last two hours of demand: current step and the 7 before it
            for back in range(8):
                row['demand_t{0}'.format(back)] = demand.at[step - back]

            for d in range(1, 8):
                for j in range(13):
                    row['demand_{0}d_t{1}'.format(d, j)] = demand.at[step + j - 96*d - 3]

            # Emit the values in header order.
            csv_file.write(','.join(str(row[h]) for h in headers))
            csv_file.write('\n')
        gc.collect()

0 1329
1 1329
2 1329
3 1329
4 1329
5 1329
6 1329
7 1329
8 1329
9 1329
10 1329
11 1329
12 1329
13 1329
14 1329
15 1329
16 1329
17 1329
18 1329
19 1329
20 1329
21 1329
22 1329
23 1329
24 1329
25 1329
26 1329
27 1329
28 1329
29 1329
30 1329
31 1329
32 1329
33 1329
34 1329
35 1329
36 1329
37 1329
38 1329
39 1329
40 1329
41 1329
42 1329
43 1329
44 1329
45 1329
46 1329
47 1329
48 1329
49 1329
50 1329
51 1329
52 1329
53 1329
54 1329
55 1329
56 1329
57 1329
58 1329
59 1329
60 1329
61 1329
62 1329
63 1329
64 1329
65 1329
66 1329
67 1329
68 1329
69 1329
70 1329
71 1329
72 1329
73 1329
74 1329
75 1329
76 1329
77 1329
78 1329
79 1329
80 1329
81 1329
82 1329
83 1329
84 1329
85 1329
86 1329
87 1329
88 1329
89 1329
90 1329
91 1329
92 1329
93 1329
94 1329
95 1329
96 1329
97 1329
98 1329
99 1329
100 1329
101 1329
102 1329
103 1329
104 1329
105 1329
106 1329
107 1329
108 1329
109 1329
110 1329
111 1329
112 1329
113 1329
114 1329
115 1329
116 1329
117 1329
118 1329
119 1329
120 1329
121 1329
122 1329
123

923 1329
924 1329
925 1329
926 1329
927 1329
928 1329
929 1329
930 1329
931 1329
932 1329
933 1329
934 1329
935 1329
936 1329
937 1329
938 1329
939 1329
940 1329
941 1329
942 1329
943 1329
944 1329
945 1329
946 1329
947 1329
948 1329
949 1329
950 1329
951 1329
952 1329
953 1329
954 1329
955 1329
956 1329
957 1329
958 1329
959 1329
960 1329
961 1329
962 1329
963 1329
964 1329
965 1329
966 1329
967 1329
968 1329
969 1329
970 1329
971 1329
972 1329
973 1329
974 1329
975 1329
976 1329
977 1329
978 1329
979 1329
980 1329
981 1329
982 1329
983 1329
984 1329
985 1329
986 1329
987 1329
988 1329
989 1329
990 1329
991 1329
992 1329
993 1329
994 1329
995 1329
996 1329
997 1329
998 1329
999 1329
1000 1329
1001 1329
1002 1329
1003 1329
1004 1329
1005 1329
1006 1329
1007 1329
1008 1329
1009 1329
1010 1329
1011 1329
1012 1329
1013 1329
1014 1329
1015 1329
1016 1329
1017 1329
1018 1329
1019 1329
1020 1329
1021 1329
1022 1329
1023 1329
1024 1329
1025 1329
1026 1329
1027 1329
1028 1329
1029 1329
1030 13

In [34]:
# Load the CSV produced by the writer cell above and show its shape.
# NOTE(review): this rebinds `df`, which the earlier cells use for the combined
# train + eval frame. The CSV header has no `demand` column, so the
# window-building cells cannot be re-run against this `df` without first
# re-running the cell that builds the combined frame.
df = pd.read_csv('./data/preprocessed_formatted_data.csv')
df.shape

(6877575, 127)

In [35]:
# Persist the CSV-derived frame: once in full, then split by day
# (days <= 50 -> train, days > 50 -> eval).
df.to_pickle('./preprocessed_df_with_daily.pkl')
is_train_day = df.day <= 50
is_eval_day = df.day > 50
df[is_train_day].to_pickle('./preprocessed_train_df_with_daily.pkl')
df[is_eval_day].to_pickle('./preprocessed_eval_df_with_daily.pkl')

In [None]:
# Same three outputs as the cell above, but taken from the in-memory
# `preprocessed_df`. Both cells write to identical paths, so whichever one is
# run last determines the pickles' contents.
preprocessed_df.to_pickle('./preprocessed_df_with_daily.pkl')
is_train_day = preprocessed_df.day <= 50
is_eval_day = preprocessed_df.day > 50
preprocessed_df[is_train_day].to_pickle('./preprocessed_train_df_with_daily.pkl')
preprocessed_df[is_eval_day].to_pickle('./preprocessed_eval_df_with_daily.pkl')

In [None]:
# csv_file = open('./data/preprocessed_formatted_data.csv', 'w')

# headers = [
#     'order', 'geohash6', 'day', 'lat', 'long',
#     'distance_from_qp09d8', 'distance_from_qp03xx', 'distance_from_qp03wf',
#     'target_1', 'target_2', 'target_3', 'target_4', 'target_5',
#     'demand_t0', 'demand_t1', 'demand_t2', 'demand_t3', 'demand_t4', 'demand_t5', 'demand_t6', 'demand_t7',
#     'demand_7d_t0', 'demand_7d_t1', 'demand_7d_t2', 'demand_7d_t3',
#     'demand_7d_t4', 'demand_7d_t5', 'demand_7d_t6', 'demand_7d_t7', 'demand_7d_t8',
#     'demand_7d_t9', 'demand_7d_t10', 'demand_7d_t11', 'demand_7d_t12',
#     'day_mod_seven', 'day_of_week_sin', 'day_of_week_cos',
#     'hour', 'hour_sin', 'hour_cos',
#     'per_fifteen_minutes', 'fifteen_minute_sin', 'fifteen_minute_cos',
#     'day_of_week_median', 'day_of_week_mean',
#     'hour_median', 'hour_mean',
#     'per_fifteen_minutes_median', 'per_fifteen_minutes_mean'
# ]

# csv_file.write(','.join(headers))
# csv_file.write('\n')

# for i, gh in enumerate(df.geohash6.unique()):
#     print(i, len(df.geohash6.unique()))
#     temp_df = df[df.geohash6 == gh]
#     temp_df.set_index('order', inplace=True)
#     num_rows = temp_df.shape[0]
#     for i in range(96*7 + 3, num_rows - 5):
#         # put in whatever data needed into here
#         arr = {
#             'order': i,
#             'geohash6': gh,
#             'day': temp_df.at[i, 'day'],
#             'lat': temp_df.at[i, 'lat'],
#             'long': temp_df.at[i, 'long'],
            
#             'distance_from_qp09d8': temp_df.at[i, 'distance_from_qp09d8'],
#             'distance_from_qp03xx': temp_df.at[i, 'distance_from_qp03xx'],
#             'distance_from_qp03wf': temp_df.at[i, 'distance_from_qp03wf'],
            
#             # LABELS
#             'target_1': temp_df.at[i+1, 'demand'],
#             'target_2': temp_df.at[i+2, 'demand'],
#             'target_3': temp_df.at[i+3, 'demand'],
#             'target_4': temp_df.at[i+4, 'demand'],
#             'target_5': temp_df.at[i+5, 'demand'],
            
#             # Last two hours demands
#             'demand_t0': temp_df.at[i, 'demand'],
#             'demand_t1': temp_df.at[i-1, 'demand'],
#             'demand_t2': temp_df.at[i-2, 'demand'],
#             'demand_t3': temp_df.at[i-3, 'demand'],
#             'demand_t4': temp_df.at[i-4, 'demand'],
#             'demand_t5': temp_df.at[i-5, 'demand'],
#             'demand_t6': temp_df.at[i-6, 'demand'],
#             'demand_t7': temp_df.at[i-7, 'demand'],
            
#             # Seven days ago +/- 1 hour
#             'demand_7d_t0': temp_df.at[i-3-96*7, 'demand'],
#             'demand_7d_t1': temp_df.at[i-2-96*7, 'demand'],
#             'demand_7d_t2': temp_df.at[i-1-96*7, 'demand'],
#             'demand_7d_t3': temp_df.at[i-96*7, 'demand'],
            
#             'demand_7d_t4': temp_df.at[i+1-96*7, 'demand'],
#             'demand_7d_t5': temp_df.at[i+2-96*7, 'demand'],
#             'demand_7d_t6': temp_df.at[i+3-96*7, 'demand'],
#             'demand_7d_t7': temp_df.at[i+4-96*7, 'demand'],
#             'demand_7d_t8': temp_df.at[i+5-96*7, 'demand'],
            
#             'demand_7d_t9': temp_df.at[i+6-96*7, 'demand'],
#             'demand_7d_t10': temp_df.at[i+7-96*7, 'demand'],
#             'demand_7d_t11': temp_df.at[i+8-96*7, 'demand'],
#             'demand_7d_t12': temp_df.at[i+9-96*7, 'demand'],
            
#             'day_mod_seven': temp_df.at[i, 'day_mod_seven'],
#             'day_of_week_sin': temp_df.at[i, 'day_of_week_sin'],
#             'day_of_week_cos': temp_df.at[i, 'day_of_week_cos'],
            
#             'hour': temp_df.at[i, 'hour'],
#             'hour_sin': temp_df.at[i, 'hour_sin'],
#             'hour_cos': temp_df.at[i, 'hour_cos'],
            
#             'per_fifteen_minutes': temp_df.at[i, 'per_fifteen_minutes'],
#             'fifteen_minute_sin': temp_df.at[i, 'fifteen_minute_sin'],
#             'fifteen_minute_cos': temp_df.at[i, 'fifteen_minute_cos'],
            
#             # could have used target median/mean but this is easier with very little sacrifice
#             'day_of_week_median': temp_df.at[i, 'day_of_week_median'],
#             'day_of_week_mean': temp_df.at[i, 'day_of_week_mean'],
            
#             'hour_median': temp_df.at[i, 'hour_median'],
#             'hour_mean': temp_df.at[i, 'hour_mean'],
            
#             'per_fifteen_minutes_median': temp_df.at[i, 'per_fifteen_minutes_median'],
#             'per_fifteen_minutes_mean': temp_df.at[i, 'per_fifteen_minutes_mean'],
#         }
#         content = []
#         for h in headers:
#             content.append(str(arr[h]))
#         csv_file.write(','.join(content))
#     gc.collect()
    
# csv_file.close()

In [None]:
# REFERENCE
# https://medium.com/ai%C2%B3-theory-practice-business/top-6-errors-novice-machine-learning-engineers-make-e82273d394db
# http://blog.davidkaleko.com/feature-engineering-cyclical-features.html