In [112]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta, date
import sys

'''This script produces a file which shows availability. Each row is a booking
date, and each col is a (room type, arrival week, price) tuple'''

df_data = pd.read_csv('../../../data/cabot_data/clean_base/2018_clean.csv',
                       parse_dates = ['LOOK_DATE', 'ARRIVAL', 'DEPART'],
                       infer_datetime_format = True,
                       index_col = 'group_id')
df_caps = pd.read_csv('../../../data/cabot_data/clean_base/capacities.csv', index_col='UNIT')

In [120]:
# add stay len col to df_data
df_data['stay_len'] = (df_data['DEPART'] - df_data['ARRIVAL']).dt.days
# anything greater than 4 is 4
df_data['stay_len_clean'] = df_data['stay_len'].apply(lambda x: 4 if x > 4 else x)` 

In [123]:
df_data

Unnamed: 0_level_0,UNIT,RESNO,ARRIVAL,DEPART,LOOK_DATE,CANCEL_INDICATOR,grp_size,stay_len,stay_len_clean
group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
269,4BV,10398O,2018-09-06,2018-09-09,2018-01-02,0,5,3,3
863,4BV,10398O,2018-09-06,2018-09-09,2018-01-02,0,4,3,3
2681,DKB,10398P,2018-09-27,2018-09-30,2018-01-02,0,2,3,3
269,4BV,10398Q,2018-09-06,2018-09-09,2018-01-02,0,5,3,3
2143,CK,10398R,2018-05-18,2018-05-21,2018-01-02,0,2,3,3
2143,CK,10398T,2018-05-18,2018-05-21,2018-01-02,0,2,3,3
70,4BV,10398U,2018-05-25,2018-05-29,2018-01-02,0,4,4,4
2710,DD,10398V,2018-06-21,2018-06-23,2018-01-02,0,2,2,2
2850,DD,10398V,2018-06-21,2018-06-23,2018-01-02,0,3,2,2
4477,DD,10398V,2018-06-21,2018-06-23,2018-01-02,0,3,2,2


In [108]:
# helper vars
look_start = df_data['LOOK_DATE'].min()
look_end = df_data['LOOK_DATE'].max()
look_range = pd.date_range(look_start, look_end)

ssn_start = df_data['ARRIVAL'].min()
ssn_end = df_data['ARRIVAL'].max()
ssn_range = pd.date_range(ssn_start, ssn_end)

# get season week range
ssn_weeks = ssn_range.week.drop_duplicates().tolist()

# get max periods per day from transactions script
periods_per_day = (df_data[df_data['CANCEL_INDICATOR'] == 0] # look at bookings only
                   .drop(['CANCEL_INDICATOR'], axis=1)
                   .groupby('group_id').first() # collapse groups
                   .sort_values('LOOK_DATE')
                   .groupby(by=['LOOK_DATE', 'ARRIVAL']).count()
                   .max()[0])
intraday_range = range(0, periods_per_day)

# list of days in week
week_days = [1, 2, 3, 4, 5, 6, 7]

# list of stay lengths (4 refers to 4 days or more)
stay_lens = [0, 1, 2, 3, 4]

# room type list
df_grouped_types = df_data.groupby(by='UNIT').count()
unit_list = df_grouped_types.index.tolist()

In [109]:
# initializing blank avail df
# create blank df
df_avail = pd.DataFrame(index=look_range,
                        columns=pd.MultiIndex.from_product([unit_list, ssn_weeks, week_days, stay_lens], 
                                                           names=['UNIT', 'week', 'dow', 'stay_len']))
df_avail = df_avail.fillna(0.0)

In [110]:
df_avail

UNIT,2BV,2BV,2BV,2BV,2BV,2BV,2BV,2BV,2BV,2BV,...,DKB,DKB,DKB,DKB,DKB,DKB,DKB,DKB,DKB,DKB
week,19,19,19,19,19,19,19,19,19,19,...,43,43,43,43,43,43,43,43,43,43
dow,1,1,1,1,1,2,2,2,2,2,...,6,6,6,6,6,7,7,7,7,7
stay_len,0,1,2,3,4,0,1,2,3,4,...,0,1,2,3,4,0,1,2,3,4
2018-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# add default capacities
for index, row in df_caps.iterrows():
    df_avail[index] = df_avail[index] + row['CAPACITY']

In [45]:
df_avail

UNIT,2BV,2BV,2BV,2BV,2BV,2BV,2BV,2BV,2BV,2BV,...,DKB,DKB,DKB,DKB,DKB,DKB,DKB,DKB,DKB,DKB
doy,1,1,1,1,1,1,1,1,1,1,...,7,7,7,7,7,7,7,7,7,7
stay_len,1,1,1,1,1,1,1,1,1,1,...,4,4,4,4,4,4,4,4,4,4
week,19,20,21,22,23,24,25,26,27,28,...,34,35,36,37,38,39,40,41,42,43
2018-01-02,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-03,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-04,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-05,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-06,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-07,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-08,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-09,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-10,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2
2018-01-11,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2,2,2,2,2,2,2,2,2,2


In [58]:
df_data.iloc[1,:]['ARRIVAL'].dayofweek+1

4

In [105]:
for index, row in df_data.head(1).iterrows():
    progress(index, total, status='Filling in availability')
    
    cap_change = 1/row['grp_size']
    
    # book arrive delta, must subtract capacity for all these dates
    ba_delta = pd.date_range(row['LOOK_DATE'], row['ARRIVAL'])
    
    # helper vars
    unit = row['UNIT']
    dow = row['ARRIVAL'].dayofweek + 1
    stay_len = row['STAY_LEN']
    

    # subtract or add capacity for each day b/w look and arrive
    if row['CANCEL_INDICATOR'] == 0:
        # loop subtract (i know this is bad practice lol)
        for look_date in ba_delta:
            for i in intraday_range:
                cur_idx = (date, row['ARRIVAL'], i)
                df_avail.loc[cur_idx, row['UNIT']] -= cap_change
    else:
        for look_date in ba_delta:
            for i in intraday_range:
                cur_idx = (date, row['ARRIVAL'], i)
                df_avail.loc[cur_idx, row['UNIT']] += cap_change

[=-----------------------------------------------------------] 2.5% ...Filling in availability

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_nested_tuple(tup)


KeyError: "[<class 'datetime.date'> 0] not in index"

In [104]:
# helper load bar function
def progress(count, total, status=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()
total = len(df_data)

In [677]:
df_avail.to_csv('data/availability_general.csv')

In [16]:
# collapse index, map to 1's and 0's and prod nums
df_avail_t = pd.read_csv('../../../data/availability_general.csv')
df_avail_t

FileNotFoundError: File b'../../../data/availability_general.csv' does not exist

In [714]:
# collapse index, map to 1's and 0's and prod nums
df_avail_t = pd.read_csv('data/availability_general.csv')

df_avail_t = df_avail_t.drop(['LOOK_DATE', 'ARRIVAL', 'INTRADAY'], axis=1)
df_avail_t.index = df_avail_t.index.rename('T')
df_avail_t = df_avail_t.applymap(lambda x: 0 if x <= 0 else 1)

df_avail_t = df_avail_t.rename(index=str, columns={"CD": "prod_1",
                                                   "DD": "prod_2",
                                                   "CK": "prod_3",
                                                   "DK": "prod_4",
                                                   "DKB": "prod_5",
                                                   "2BV": "prod_6",
                                                   "4BV": "prod_7"})

df_avail_t = df_avail_t.reindex(sorted(df_avail_t.columns), axis=1)

In [719]:
df_avail_t.to_csv('data/availability_sprint1.csv')

In [720]:
df_avail

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2BV,4BV,CD,CK,DD,DK,DKB
LOOK_DATE,ARRIVAL,INTRADAY,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-02,2018-05-11,0,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,1,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,2,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,3,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,4,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,5,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,6,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,7,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,8,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
2018-01-02,2018-05-11,9,8.00,4.0,13.000000,17.0,24.0,6.0,12.0
