In [16]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta, date

'''This script produces a file which shows transactions. Each row is a transaction
with arrival, depart, and room type.'''

def daterange(start_date, end_date):
    """Yield consecutive days from start_date up to, but not including, end_date.

    The number of days produced is the whole-day difference
    ``(end_date - start_date).days``; nothing is yielded when that
    difference is zero or negative.
    """
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(days=offset)
        offset += 1

# Load the preprocessed 2018 hotel extract (the three date columns are parsed
# as datetimes), order it by look date and reservation number, and keep only
# the rows whose CANCEL_INDICATOR is 0, i.e. bookings.
df = (
    pd.read_csv('data/2018_preprocessed.csv', parse_dates=['ARRIVAL', 'DEPART', 'LOOK_DATE'])
    .sort_values(['LOOK_DATE', 'RESNO'])
    .loc[lambda frame: frame['CANCEL_INDICATOR'] == 0]
)

# purchases: the same rows without the reservation number and the cancel flag
df_purchases = df.drop(columns=['RESNO', 'CANCEL_INDICATOR'])

# Helper values derived from the per-LOOK_DATE counts of non-null entries.
df_grouped = df.groupby('LOOK_DATE').count()
start_date = df_grouped.index.min()
end_date = df_grouped.index.max()
# largest number of (non-null DEPART) rows recorded on a single look date
periods_per_day = df_grouped['DEPART'].max()
# whole days between the first and the last look date
num_days = (end_date - start_date).days

In [18]:
# Build the empty transaction frame: periods_per_day rows for every day
# produced by daterange(start_date, end_date).
#
# The look dates are collected in a plain list and turned into a DataFrame in
# one step. Growing the frame with DataFrame.append inside the loop copies the
# whole frame on every iteration, and DataFrame.append was removed in
# pandas 2.0. The loop variable is called look_day so that it does not shadow
# the imported datetime.date.
# NOTE(review): daterange stops before end_date, so the last LOOK_DATE gets no
# rows here -- confirm that this is intended.
look_dates = [
    look_day
    for look_day in daterange(start_date, end_date)
    for _ in range(periods_per_day)
]
trans_df = pd.DataFrame({'LOOK_DATE': look_dates}, columns=['LOOK_DATE'])

# 1-based transaction counter T as the index
trans_df.index = pd.RangeIndex(start=1, stop=len(trans_df) + 1, name='T')

# placeholder columns; they are overwritten for the slots that receive a booking
trans_df['ARRIVAL'] = 0
trans_df['DEPART'] = 0
trans_df['UNIT'] = 0

In [20]:
# Main transaction vector: copy every booking of df_purchases into its slot
# of trans_df, then write the result to data/transactions_pre.csv.

# The placeholder columns hold integer zeros but are about to receive
# timestamps and unit codes. Cast them to object up front: writing a value of
# an incompatible dtype into an integer column is a silent upcast that pandas
# has deprecated (since 2.1).
trans_df = trans_df.astype({'ARRIVAL': object, 'DEPART': object, 'UNIT': object})

# storing col indices for convenience
date_loc_t = trans_df.columns.get_loc('LOOK_DATE')
arrival_loc_t = trans_df.columns.get_loc('ARRIVAL')
depart_loc_t = trans_df.columns.get_loc('DEPART')
rm_loc_t = trans_df.columns.get_loc('UNIT')
date_loc_p = df_purchases.columns.get_loc('LOOK_DATE')
arrival_loc_p = df_purchases.columns.get_loc('ARRIVAL')
depart_loc_p = df_purchases.columns.get_loc('DEPART')
rm_loc_p = df_purchases.columns.get_loc('UNIT')

# counter vars for convenience
row_counter = 0          # position of the next unread row of df_purchases
intra_day_counter = 0    # number of bookings already placed for the current day
num_purchases = len(df_purchases)

# fill in purchases
# df_purchases is walked once from top to bottom; each day consumes the run of
# rows whose LOOK_DATE equals that day's date. This relies on df_purchases
# being ordered by LOOK_DATE.
for day in range(0, num_days):
    start_row = day * periods_per_day  # first slot of this day's block in trans_df
    intra_day_counter = 0
    print(start_row)  # progress indicator
    # every slot of the block carries the same LOOK_DATE, so read it only once
    day_date = trans_df.iloc[start_row, date_loc_t]
    # the bounds check keeps the loop from indexing past the last purchase row
    while (row_counter < num_purchases
           and day_date == df_purchases.iloc[row_counter, date_loc_p]):
        cur_row = start_row + intra_day_counter
        trans_df.iloc[cur_row, arrival_loc_t] = df_purchases.iloc[row_counter, arrival_loc_p]
        trans_df.iloc[cur_row, depart_loc_t] = df_purchases.iloc[row_counter, depart_loc_p]
        trans_df.iloc[cur_row, rm_loc_t] = df_purchases.iloc[row_counter, rm_loc_p]
        intra_day_counter += 1
        row_counter += 1

# truncate string dates
# Dropping the last 9 characters removes a trailing ' 00:00:00' from the
# timestamp strings; the '0' placeholders of unfilled slots become ''.
trans_df['ARRIVAL'] = trans_df['ARRIVAL'].astype(str).str[:-9]
trans_df['DEPART'] = trans_df['DEPART'].astype(str).str[:-9]

# replace blanks with 0 (restores the placeholder emptied by the truncation)
trans_df = trans_df.replace('', 0)

# output
trans_df.to_csv('data/transactions_pre.csv')

0
143
286
429
572
715
858
1001
1144
1287
1430
1573
1716
1859
2002
2145
2288
2431
2574
2717
2860
3003
3146
3289
3432
3575
3718
3861
4004
4147
4290
4433
4576
4719
4862
5005
5148
5291
5434
5577
5720
5863
6006
6149
6292
6435
6578
6721
6864
7007
7150
7293
7436
7579
7722
7865
8008
8151
8294
8437
8580
8723
8866
9009
9152
9295
9438
9581
9724
9867
10010
10153
10296
10439
10582
10725
10868
11011
11154
11297
11440
11583
11726
11869
12012
12155
12298
12441
12584
12727
12870
13013
13156
13299
13442
13585
13728
13871
14014
14157
14300
14443
14586
14729
14872
15015
15158
15301
15444
15587
15730
15873
16016
16159
16302
16445
16588
16731
16874
17017
17160
17303
17446
17589
17732
17875
18018
18161
18304
18447
18590
18733
18876
19019
19162
19305
19448
19591
19734
19877
20020
20163
20306
20449
20592
20735
20878
21021
21164
21307
21450
21593
21736
21879
22022
22165
22308
22451
22594
22737
22880
23023
23166
23309
23452
23595
23738
23881
24024
24167
24310
24453
24596
24739
24882
25025
25168
25311
25454
25597

In [118]:
# simplify sprint 1 trans df: reduce data/transactions_pre.csv to the T index
# and a numeric UNIT column, written to data/transactions_pre_simpl.csv.

# numeric code for every unit type; the string '0' maps to 0
UNIT_CODES = {'CD': 1, 'DD': 2, 'CK': 3, 'DK': 4,
              'DKB': 5, '2BV': 6, '4BV': 7,
              '0': 0}

# cut all cols but unit
# Only T and UNIT are loaded. UNIT is read as text so that type inference
# cannot turn numeric-looking values such as '0' into integers, which the
# string keys of UNIT_CODES would not match.
simpl_trans_df = pd.read_csv('data/transactions_pre.csv', index_col='T',
                             usecols=['T', 'UNIT'], dtype={'UNIT': str})

# remap to numbers (values that are not a key of UNIT_CODES become NaN)
simpl_trans_df['UNIT'] = simpl_trans_df['UNIT'].map(UNIT_CODES)

# output
simpl_trans_df.to_csv('data/transactions_pre_simpl.csv')