In [None]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta, date
from ast import literal_eval

'''This script produces a file of transactions. Each row is one booking, encoded as a
product tuple: (UNIT, arrival week, arrival day of week, stay length capped at 4).'''

def daterange(start_date, end_date):
    """Yield consecutive days from start_date up to, but not including, end_date.

    Nothing is yielded when end_date is on or before start_date.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(days=offset)
        offset += 1

# Load the cleaned 2018 purchase data; the three date columns are parsed to datetimes.
purchases_df = pd.read_csv('../../../data/cabot_data/clean_base/2018_clean.csv',
                           parse_dates=['ARRIVAL', 'DEPART', 'LOOK_DATE'])
purchases_df = (purchases_df[purchases_df['CANCEL_INDICATOR'] == 0]  # keep bookings only
                .drop(['RESNO', 'CANCEL_INDICATOR'], axis=1)
                .groupby('group_id').first()  # collapse each group to a single row
                .sort_values('LOOK_DATE'))

# Add a stay-length column (whole days between ARRIVAL and DEPART) to purchases_df.
# Anything longer than 4 days is recorded as 4; clip() is the vectorised form of that cap.
purchases_df['stay_len'] = ((purchases_df['DEPART'] - purchases_df['ARRIVAL'])
                            .dt.days
                            .clip(upper=4))

# Helper variables.
# Per-LOOK_DATE non-null counts of every column.
df_grouped = purchases_df.groupby(by=['LOOK_DATE']).count()
# Number of intraday slots needed = the largest number of rows sharing one LOOK_DATE.
# size() counts every row, whereas count()['DEPART'] would skip rows whose DEPART is
# null and could leave too few slots for the busiest look date.
periods_per_day = purchases_df.groupby('LOOK_DATE').size().max()

# First and last look date present in the data.
look_start = purchases_df['LOOK_DATE'].min()
look_end = purchases_df['LOOK_DATE'].max()

# First and last arrival date present in the data ("ssn" presumably means season).
ssn_start = purchases_df['ARRIVAL'].min()
ssn_end = purchases_df['ARRIVAL'].max()

In [None]:
# Build the empty transaction frame: one PRODUCT cell per (LOOK_DATE, INTRADAY) pair.

# daily calendars spanning the observed look dates and arrival dates
look_range = pd.date_range(start=look_start, end=look_end)
ssn_range = pd.date_range(start=ssn_start, end=ssn_end)

# intraday slots are numbered 1..periods_per_day
intraday_range = range(1, periods_per_day + 1)

slot_index = pd.MultiIndex.from_product(
    [look_range, intraday_range],
    names=['LOOK_DATE', 'INTRADAY'],
)
trans_df = pd.DataFrame(index=slot_index, columns=['PRODUCT'])

In [None]:
# Fill in purchases.
# Each booking is written to the next free INTRADAY slot of its LOOK_DATE as the tuple
# (UNIT, arrival week, arrival day of week 1-7, stay_len).
# A per-date counter tracks the next free slot, so there is no need to scan the frame
# for an empty cell on every row. This assumes trans_df starts out empty, as built in
# the cell above; re-running this cell rewrites the same cells with the same values.
next_slot = {}  # LOOK_DATE -> next unused INTRADAY slot number
for _, row in purchases_df.iterrows():
    look_date = row['LOOK_DATE']
    arrival = row['ARRIVAL']

    slot = next_slot.get(look_date, 1)
    cur_cell = (look_date, slot)
    # .loc assignment with an unknown key would silently append a new row, so fail
    # loudly when the slot grid has no cell for this booking.
    if cur_cell not in trans_df.index:
        raise KeyError(cur_cell)

    product = (row['UNIT'], arrival.week, arrival.dayofweek + 1, row['stay_len'])
    # wrap the tuple in a list so it is stored whole in the single PRODUCT column
    trans_df.loc[cur_cell] = [product]
    next_slot[look_date] = slot + 1

In [None]:
# write the raw (LOOK_DATE, INTRADAY) -> PRODUCT table to disk
trans_df.to_csv(
    path_or_buf='../../../data/cabot_data/sprint_2/trans_s2_raw.csv',
)

In [None]:
# Reload the raw transactions; PRODUCT comes back from the CSV as text.
trans_df_t = pd.read_csv(
    '../../../data/cabot_data/sprint_2/trans_s2_raw.csv',
    index_col=['LOOK_DATE', 'INTRADAY'],
)

# Product number mapping: the first four CSV columns form the key tuple and the
# remaining first data column holds the product number.
prod_num_map = (
    pd.read_csv('../../../data/cabot_data/sprint_2/prod_num_map.csv',
                header=None,
                index_col=[0, 1, 2, 3])
    .iloc[:, 0]
    .to_dict()
)

# Empty slots become the literal '0'; every cell is then parsed back into a Python
# object (a tuple for a purchase, 0 for an empty slot) and looked up in the mapping.
trans_df_t = trans_df_t.fillna('0')
trans_df_t['PRODUCT'] = list(map(literal_eval, trans_df_t['PRODUCT']))
trans_df_t['PRODUCT'] = trans_df_t['PRODUCT'].map(prod_num_map)

# Anything without a mapping entry (including empty slots) ends up as product 0.
# The date/slot index is then dropped in favour of a plain 1-based row number.
trans_df_t = (
    trans_df_t
    .fillna(0)
    .astype(int)
    .reset_index()
    .drop(columns=['LOOK_DATE', 'INTRADAY'])
)
trans_df_t.index = trans_df_t.index + 1

# save
trans_df_t.to_csv('../../../data/cabot_data/sprint_2/trans_s2.csv')

In [153]:
# Display the filled transaction frame (indexed by LOOK_DATE and INTRADAY) for a visual check.
trans_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRODUCT
LOOK_DATE,INTRADAY,Unnamed: 2_level_1
2018-01-02,1,"(CD, 32, 4, 2)"
2018-01-02,2,"(DD, 25, 4, 2)"
2018-01-02,3,"(DD, 25, 4, 2)"
2018-01-02,4,"(DKB, 39, 4, 3)"
2018-01-02,5,"(DK, 36, 4, 1)"
2018-01-02,6,"(DKB, 39, 4, 3)"
2018-01-02,7,"(4BV, 36, 4, 3)"
2018-01-02,8,"(4BV, 21, 5, 4)"
2018-01-02,9,"(DD, 32, 4, 2)"
2018-01-02,10,"(CK, 25, 4, 2)"
