## Leveraging World Events to Predict E-Commerce Consumer Demand under Anomaly

In [1]:
import sys
sys.path.append('.')
sys.path.append('../')
import os 
import os.path as path
import datetime
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm    
import darts
from darts import TimeSeries
import cufflinks as cf
cf.go_offline()
from plotly.offline import plot, download_plotlyjs, init_notebook_mode, plot, iplot
from IPython.display import display, Math, Markdown
from IPython.display import display, Markdown, clear_output
import ipywidgets as widgets
%load_ext autoreload
%autoreload 2

## Import Functions 

In [2]:
import config as proj_config
# Project-level directories, all taken from the shared config module.
data_path = proj_config.DATA_DIR
events_data_path = proj_config.EVENTS_DATASET_DIR
cache_path = proj_config.CACHE_DIR

# Sub-directory of the cache directory (path is built by plain string formatting).
categories_path = f"{cache_path}/categories_events/"

In [3]:
from demand_prediction.general_functions import get_file_path, get_df_table, load_table_cache, save_table_cache, get_pred_dates
from demand_prediction.dataset_functions import split_data, create_events_df
from demand_prediction.ts_models import train_models, test_models, save_model, load_model
from demand_prediction.events_models import load_events_model, save_events_model, calc_events_ts
from demand_prediction.neural_prophet_model import NeuralProphetEvents, reformat_events_name, get_events_for_neural_prophet, get_neural_prophet_results
from demand_prediction.lstm_models import get_lstm_results
from demand_prediction.tcn_models import get_tcn_results
from demand_prediction.results_functions import get_all_k_metrics

Global seed set to 0


# Datasets

## Events

In [4]:
# Load the world-events table through the project helper and preview its first rows.
world_events = get_df_table("events/world_events_dataset_from_1980") 
world_events.head()

Total data size:  16766


Unnamed: 0,wiki_name,date,country,Category,High-Category,ref_num,avg_num_views,embedding
0,1980 Avon Championships of Cincinnati,1980-01-07,United States of America,TennisTournament,SportsEvent,1,1.923165,"[-0.7737418, 0.44452286, 0.44949022, -0.293493..."
1,1980 Avon Championships of Kansas,1980-01-14,United States of America,TennisTournament,SportsEvent,3,2.240642,"[-0.7737418, 0.44452286, 0.44949022, -0.293493..."
2,1980 Birmingham Open,1980-01-14,United States of America,TennisTournament,SportsEvent,4,2.239492,"[-0.3976065, -0.59886205, 1.0677863, 0.0126197..."
3,1980 Avon Championships of Chicago,1980-01-21,United States of America,TennisTournament,SportsEvent,2,2.626353,"[-0.7737418, 0.44452286, 0.44949022, -0.293493..."
4,1980 CONMEBOL Pre-Olympic Tournament,1980-01-23,Colombia,SoccerTournament,SportsEvent,15,3.678718,"[-0.84798235, -0.18931623, -0.30384776, -0.392..."


## Ecommerce 

Use the following random time series as an example, or provide your own time series.
Please make sure that the time series is a DataFrame with a single column containing the product sales, indexed by date.

In [5]:
# Synthetic daily sales used as a stand-in for a real product time series.
dates_example = pd.date_range("2018-06-01", "2020-12-31", freq='d')
values_example = np.random.randint(100, 2000, size=len(dates_example))

# One 'Quantity' column, indexed by the 'date' values.
categ_data = pd.DataFrame(
    {'date': dates_example, 'Quantity': values_example}
).set_index('date')

## Time Series

In [6]:
# Label of the series being modelled; reused in plot titles and in the
# file name of the cached prediction results.
leaf_name = 'Football Cards'

In [7]:
# Interactive line chart of the raw sales series.
categ_data.iplot(
    title=leaf_name,
    xTitle='Date',
    yTitle='Sales',
    theme='white',
    colors=['steelblue'],
)

### Events Dataset 

In [8]:
# Combine the sales series with the world-events table using the project helper.
data = create_events_df(categ_data, world_events, emb_only=True)
# Distinct values of the 'date' column.
# NOTE(review): going through set() does not promise a stable ordering of the
# resulting list — confirm that downstream consumers do not rely on the order.
events_dates = list(set(data['date']))

### Hyper-Parameters

In [10]:
# --- Windowing ---------------------------------------------------------------
n_in = 365             # forwarded to the LSTM and TCN result helpers
window_size = 2        # forwarded to the LSTM and TCN result helpers
prediction_time = 30   # number of leading test steps kept for each forecast

# --- Compute device ----------------------------------------------------------
device = 'cpu'  # use 'cuda:2' if you have GPUs

# --- Cache switches, one per model family ------------------------------------
ts_cache = True
neural_cache = True
lstm_cache = True
lstm_df_cache = True
tcn_cache = True
tcn_df_cache = True
results_cache = True

In [11]:
# `total_pred` accumulates the predictions of every forecast start date.
total_pred = pd.DataFrame()
start_pred_list = get_pred_dates('2020-01-01', '2021-01-01')

# The full sales series is identical for every forecast start date, so it is
# built once here rather than on each loop iteration.
time_series = TimeSeries.from_dataframe(categ_data, value_cols='Quantity')

for start_pred_time in tqdm(start_pred_list):
    pred_path = cache_path + "/saved_results/final_results_" + leaf_name + "_" + str(start_pred_time) + "_predictions"

    if os.path.isfile(pred_path):
        # The file written for a start date holds the whole accumulated frame
        # as of that date (see the to_pickle call below), so a cache hit
        # replaces `total_pred` instead of appending to it.
        # NOTE(review): unpickling can execute arbitrary code — only load
        # cache files produced by this notebook.
        total_pred = pd.read_pickle(pred_path)
        continue

    # Split the events frame and the sales series at the forecast start date.
    X_train, X_test = split_data(data, start_pred_time)
    train, test_ts = time_series.split_before(pd.Timestamp(start_pred_time))
    test = test_ts[:prediction_time]  # keep only the first `prediction_time` steps
    events_all = pd.concat([X_train, X_test])
    train_df, test_df = train.pd_dataframe(), test.pd_dataframe()
    train_dates, test_dates = train_df.index.values, test_df.index.values

    # Run every model family for this forecast start date.
    res_prediction = test_models(test, test_name=leaf_name, start_pred_time=start_pred_time, train=train, use_cache=ts_cache)
    lstm_predictions = get_lstm_results(train, test, train_df, test_df, events_all, start_pred_time, leaf_name, n_in, window_size, categ_data, device, lstm_df_cache, lstm_cache)
    tcn_predictions = get_tcn_results(train, test, train_df, test_df, events_all, start_pred_time, leaf_name, n_in, window_size, categ_data, device, tcn_df_cache, tcn_cache)
    neural_predictions = get_neural_prophet_results(train, test, events_all, leaf_name, events_dates, start_pred_time, neural_cache)

    # Place the model outputs side by side (axis=1), then append the result
    # below what has been accumulated so far.
    total_pred = pd.concat([total_pred, pd.concat([res_prediction, lstm_predictions, tcn_predictions, neural_predictions], axis=1)])

    # Persist the accumulated frame so later runs can skip this start date.
    os.makedirs(os.path.dirname(pred_path), exist_ok=True)
    total_pred.to_pickle(pred_path)

100%|██████████| 1/1 [00:00<00:00, 193.05it/s]


# Prediction Plot

In [12]:
# Keep only the rows dated on or after the first forecast start date.
on_or_after_first_start = total_pred.index >= start_pred_list[0]
total_pred = total_pred[on_or_after_first_start]

In [13]:
# Columns drawn in the comparison chart: 'Real Quantity' plus the selected model columns.
plot_columns = [
    'Real Quantity',
    'LSTM',
    'GAN - Event LSTM',
    'Event LSTM',
    'Weighted Event LSTM',
    'ARIMA',
    'Prophet',
    'NeuralProphet',
    'GAN - Event CNN',
]
pred_df = total_pred[plot_columns]
pred_df.iplot(
    title=leaf_name + " - All Models",
    xTitle='Date',
    yTitle='Sales',
    theme='white',
)

# Metrics@K

In [14]:
# Build the per-model metrics table (MAE@K and wMAPE@K rows in the output below)
# from the accumulated predictions.
get_all_k_metrics(total_pred)

Unnamed: 0,ARIMA,Prophet,NeuralProphet,LSTM,Event LSTM,GAN - Event LSTM,Weighted Event LSTM,GAN - Event CNN,Event NeuralProphet
MAE@5,731.0,998.0,2347.0,1646.0,909.0,1025.0,872.0,1395.0,2313.0
MAE@10,524.0,832.0,1862.0,1438.0,722.0,833.0,712.0,1191.0,1774.0
MAE@20,374.0,504.0,1655.0,1081.0,464.0,511.0,473.0,852.0,1682.0
wMAPE@5,0.42,0.573,1.347,0.945,0.522,0.589,0.501,0.801,1.328
wMAPE@10,0.341,0.542,1.213,0.937,0.47,0.543,0.464,0.776,1.156
wMAPE@20,0.318,0.428,1.407,0.919,0.394,0.434,0.402,0.725,1.43
