In [154]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta, date
from ast import literal_eval

'''This script produces a file containing the book-arrive delta (in weeks), averaged over the transactions of each look date.'''

def daterange(start_date, end_date):
    """Yield consecutive dates from start_date up to, but excluding, end_date.

    Nothing is yielded when end_date is not later than start_date.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        # timedelta's first positional argument is a number of days
        yield start_date + timedelta(offset)
        offset += 1

# Read the raw transaction file; LOOK_DATE is parsed into datetimes so the
# .dt accessor can be used on it later.
trans_df = pd.read_csv(
    '../../../data/cabot_data/sprint_3/trans_s3_raw.csv',
    parse_dates=['LOOK_DATE'],
)

In [155]:
# Parse the PRODUCT strings into Python tuples; missing products (NaN) become
# the sentinel 0. literal_eval only evaluates literals, but it raises on
# malformed rows.
trans_df['PRODUCT'] = [0 if str(x) == 'nan' else literal_eval(x) for x in trans_df['PRODUCT']]

# Day index of the look date built from the ISO week number and the weekday
# (week 1, Monday -> 0). Series.dt.week was removed in pandas 2.0;
# dt.isocalendar().week returns the same ISO week numbers. The cast to float
# turns a missing LOOK_DATE into NaN instead of a nullable-integer NA.
# NOTE(review): around New Year the ISO week can be 52/53 or 1 of the
# adjacent year, which distorts this index -- confirm the data avoids that.
look_week = trans_df['LOOK_DATE'].dt.isocalendar().week.astype('float64')
trans_df['LOOK_DAY'] = look_week * 7 - 7 + trans_df['LOOK_DATE'].dt.dayofweek

# Day index of the arrival, from elements 1 and 2 of the product tuple
# (presumably week number and weekday -- verify against the data schema).
# Rows without a product keep the string sentinel 'nan' so that this column's
# dtype, and therefore whether it survives the numeric aggregation below,
# stays exactly as it was.
get_day = lambda x: x[1] * 7 - 7 + x[2] if x != 0 else 'nan'
trans_df['ARRIVAL_DAY'] = trans_df['PRODUCT'].apply(get_day)

# Difference between arrival and booking in days, computed column-wise rather
# than with a per-row iterrows()/.loc loop. Rows without a product get the
# default of 364.
arrival_day = pd.to_numeric(trans_df['ARRIVAL_DAY'], errors='coerce')
trans_df['BA_DIFF'] = (arrival_day - trans_df['LOOK_DAY']).where(arrival_day.notna(), 364)

# Average every numeric column per look date. numeric_only=True skips the
# object columns (tuples, sentinel strings); older pandas dropped them
# silently, pandas >= 2.0 raises a TypeError without this flag.
trans_df = trans_df.groupby('LOOK_DATE').mean(numeric_only=True)

# Convert the difference from days to weeks.
trans_df['BA_DIFF'] = trans_df['BA_DIFF'] / 7

# Replace the date index with a 1-based running number and remove the
# LOOK_DATE, INTRADAY and LOOK_DAY columns from the result.
trans_df = trans_df.reset_index()
trans_df.index += 1
trans_df = trans_df.drop(['LOOK_DATE', 'INTRADAY', 'LOOK_DAY'], axis=1)

In [157]:
# Write the result to disk; with to_csv's defaults the index is written as the
# first column.
trans_df.to_csv(
    '../../../data/cabot_data/sprint_4/ba_diffs.csv',
)