In [None]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta, date
from ast import literal_eval
from matplotlib import pyplot as plt

# Input locations. Each path can be overridden with an environment variable so
# the notebook is not tied to one machine's directory layout; the defaults are
# the paths the notebook was originally written against.
TRANS_CSV_PATH = os.environ.get(
    'TRANS_S3_RAW_CSV',
    '../../../data/cabot_data/sprint_3/trans_s3_raw.csv')
LAMBDA_CSV_PATH = os.environ.get(
    'SPRINT4_RESULTS_CSV',
    '/Users/johngao/work/thesis/em-hotel-demand-estimation/EM_extension/model/sprint4/sprint4_results.csv')

# load transaction data (LOOK_DATE is parsed into datetimes at read time)
trans_df = pd.read_csv(TRANS_CSV_PATH, parse_dates=['LOOK_DATE'])

# load predicted lambdas
lamb_df = pd.read_csv(LAMBDA_CSV_PATH)

In [None]:
# get binary value from purchases: 1 when PRODUCT is present, 0 when it is missing.
# notna() tests for missing values directly rather than comparing the string
# form of every value against 'nan'.
trans_df['PURCHASE_COUNT'] = trans_df['PRODUCT'].notna().astype('int64')

# All date-derived columns are computed on the LOOK_DATE column itself instead
# of through row-wise DataFrame.apply(axis=1).
look_date = trans_df['LOOK_DATE']

# get look weeks and look_dow
# Timestamp.week is read element-wise so the same expression works whether or
# not the installed pandas offers .dt.week / .dt.isocalendar().
trans_df['LOOK_WEEK'] = look_date.map(lambda ts: ts.week)
trans_df['LOOK_DOW'] = look_date.dt.dayofweek

# transform look day into numbers (zero-based: the first day of the year is 0)
trans_df['LOOK_DOY'] = look_date.dt.dayofyear - 1

# process predicted lambdas: keep only the trailing rows of the results file,
# number them from 1 in LOOK_DOY, and expose the " value" column as pred_lambda
num_days = 299
cutoff = len(lamb_df) - (num_days + 1)  # rows whose index is above this are kept
lamb_df = (
    lamb_df.iloc[lamb_df.index > cutoff, :]
    .reset_index()
    .drop(columns=['index', 'var'])
    .assign(LOOK_DOY=lambda kept: kept.index + 1)
    .rename(index=str, columns={" value": "pred_lambda"})
)

# merge predicted lambdas into main df (inner join: unmatched LOOK_DOY rows are dropped)
# NOTE(review): trans_df['LOOK_DOY'] is dayofyear - 1 while the one built here
# is index + 1 -- confirm the two numberings are meant to line up.
trans_df = pd.merge(trans_df, lamb_df, how='inner', on='LOOK_DOY')

def purchase_vs_pred_diff(frame, group_key, drop_cols):
    """Sum `frame` per `group_key` and return purchases minus predicted lambda.

    Parameters
    ----------
    frame : DataFrame containing 'PURCHASE_COUNT', 'pred_lambda' and `group_key`.
    group_key : name of the column to group by.
    drop_cols : columns removed before grouping so they are not summed.

    Returns
    -------
    DataFrame indexed by `group_key` holding a 'diff' column
    (summed PURCHASE_COUNT - summed pred_lambda) plus the sums of any other
    numeric columns that were not dropped.
    """
    # numeric_only=True keeps the aggregation to numeric columns on every
    # pandas version; from pandas 2.0 the default also sums object columns
    # such as PRODUCT.
    totals = frame.drop(columns=drop_cols).groupby(group_key).sum(numeric_only=True)
    totals['diff'] = totals['PURCHASE_COUNT'] - totals['pred_lambda']
    return totals.drop(columns=['PURCHASE_COUNT', 'pred_lambda'])


# get daily counts and diff
daily_count = purchase_vs_pred_diff(
    trans_df, 'LOOK_DOY', ['INTRADAY', 'LOOK_WEEK', 'LOOK_DATE'])

# get weekly counts and diff
weekly_count = purchase_vs_pred_diff(
    trans_df, 'LOOK_WEEK', ['INTRADAY', 'LOOK_DATE', 'LOOK_DOY'])

# get dow counts and diff
dow_count = purchase_vs_pred_diff(
    trans_df, 'LOOK_DOW', ['INTRADAY', 'LOOK_WEEK', 'LOOK_DATE', 'LOOK_DOY'])

In [None]:
# Daily differences: one horizontal bar per LOOK_DOY showing the 'diff' column
# of daily_count; the figure is written to daily_count_diff.png and then shown.
fig, ax = plt.subplots(figsize=(20, 60))
ax.barh(daily_count.index, daily_count['diff'], align='center', alpha=0.5)
ax.set_yticks(daily_count.index)
ax.set_ylabel('Look_Day')
ax.set_xlabel('Count vs Predicted count difference')
ax.set_title('Look_day vs predicted count difference')
fig.savefig('daily_count_diff.png', format='png')
plt.show()

In [None]:
# Weekly differences: one horizontal green bar per LOOK_WEEK showing the 'diff'
# column of weekly_count; the figure is written to weekly_count_diff.png and then shown.
fig, ax = plt.subplots(figsize=(20, 20))
ax.barh(weekly_count.index, weekly_count['diff'], align='center', alpha=0.5, color='g')
ax.set_yticks(weekly_count.index)
ax.set_ylabel('Look_week')
ax.set_xlabel('Count vs Predicted count difference')
ax.set_title('Look_week vs predicted count difference')
fig.savefig('weekly_count_diff.png', format='png')
plt.show()

In [None]:
# Day-of-week differences: one horizontal red bar per LOOK_DOW showing the 'diff'
# column of dow_count; the figure is written to dow_count_diff.png and then shown.
fig, ax = plt.subplots(figsize=(20, 10))
ax.barh(dow_count.index, dow_count['diff'], align='center', alpha=0.5, color='r')
ax.set_yticks(dow_count.index)
ax.set_ylabel('Look_dow')
ax.set_xlabel('Count vs Predicted count difference')
ax.set_title('Look_dow vs predicted count difference')
fig.savefig('dow_count_diff.png', format='png')
plt.show()

In [None]:
# Pick the columns used for the per-date roll-up.
# NOTE(review): no cell in this notebook creates 'PURCHASE_TF'; the purchase
# indicator built earlier is 'PURCHASE_COUNT'. Unless the raw CSV already ships
# a PURCHASE_TF column, selecting it raises KeyError on a fresh kernel, so fall
# back to PURCHASE_COUNT and expose it under the expected column name.
if 'PURCHASE_TF' in trans_df.columns:
    counts_df_daily = trans_df[['LOOK_DATE', 'PURCHASE_TF', 'LOOK_WEEK']]
else:
    counts_df_daily = (
        trans_df[['LOOK_DATE', 'PURCHASE_COUNT', 'LOOK_WEEK']]
        .rename(columns={'PURCHASE_COUNT': 'PURCHASE_TF'})
    )

In [None]:
# collapse counts_df_daily to one row per LOOK_DATE by summing its other columns
rows_by_look_date = counts_df_daily.groupby(by='LOOK_DATE')
counts_df_daily = rows_by_look_date.sum()