# MTL-Embedding

In [None]:
import libs.config as config
import libs.hyperparam as hyperparam
import libs.util as util 
import libs.customized_dataset as customized_dataset
import libs.models as models 
import libs.forecasting as forecasting 
import libs.plots as plots 

import os  # for interacting with the operating system
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # for debugging cuda errors
import glob  # for finding files in directories
import warnings 
warnings.filterwarnings('ignore') # for ignoring all warnings
# import argparse  # for parsing command line arguments (for running from the terminal)

import random
import math # for math operations
import time  # for time-related functionalities
import holidays # for checking if a date is a holiday
import matplotlib.pyplot as plt  # for plotting
plt.set_cmap('cividis') # color map for the plots to 'cividis'
## Enable the display of matplotlib plots inline in a Jupyter notebook
%matplotlib inline 
import matplotlib.ticker as ticker # for customizing the plots' tick locations and labels
import numpy as np  # for numerical computations
import pandas as pd  # for data manipulation and analysis
pd.set_option('display.max_columns', None) # to display all columns
from datetime import date, datetime, timedelta  # for working with dates and times

from sklearn.preprocessing import MinMaxScaler, RobustScaler, Normalizer  # for scaling data
from tqdm import tqdm  # for creating progress bars

import torch  # for building and training neural networks
from torch.utils.data import Dataset, DataLoader # for loading and managing datasets
import torch.nn as nn  # for building neural networks
import torch.nn.functional as F  # for implementing various activation functions
import torch.optim as optim  # for defining optimizers

# Use the first CUDA device when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# Project helper; called here with the selected device.
config.get_environ_info(device)

### Check NVIDIA GPU

In [None]:
# !nvidia-smi

### Set data path

In [None]:
## Google cluster
# Project root on the cluster; the data directory is derived from it.
WORK_DIR = r"/home/kkim476/dl-cbcv"
# READ_DIR = r"/home/kkim476/dl-cbcv/data/weekly_cohort_data_1000_missing_filled_final"
READ_DIR = f"{WORK_DIR}/data/selected_ten"

### Set prediction target, covariates, and save mode

In [None]:
# set prediction target
PREDICTION_GOAL = 'mtl_4tasks' # mtl_3tasks or mtl_4tasks which includes sales binding 
TARGET_TASKS = config.get_target_variable_name(PREDICTION_GOAL)

# Loss weights, one entry per task of the selected goal, placed on the compute device.
if PREDICTION_GOAL == 'mtl_4tasks':
    weights = torch.tensor([hyperparam.w_acq, hyperparam.w_ropc, hyperparam.w_aov, hyperparam.w_spend]).to(device)
elif PREDICTION_GOAL == 'mtl_3tasks':
    weights = torch.tensor([hyperparam.w_acq, hyperparam.w_ropc, hyperparam.w_aov]).to(device)
else:
    # Fail here instead of leaving `weights` undefined and crashing later in training.
    raise ValueError('PREDICTION_GOAL is not tracked.')

### set covariate features
USE_EMBEDDING = True
COHORT_EMBEDDING = True
DUMMY_VAR = False # entity embedding, rather than one hot encoding

### folder path for saving results
SAVE_MODE = False
# The result folder name embeds the prediction goal and today's date.
SAVE_DIR = f'{WORK_DIR}/results/{PREDICTION_GOAL}_embedding10_{datetime.today().strftime("%Y-%m-%d")}'
SAVE_MODEL, SAVE_EPOCH, SAVE_PLOT, SAVE_PREDICT, SAVE_ACTUAL = config.create_save_folders(SAVE_MODE, SAVE_DIR)

## Data Preparation

### Load data

In [None]:
## The input is cohort-week level aggregated panel data (after ETL of the raw earnest transaction DB).
# Column-name identifiers forwarded to the reader as keyword arguments.
_column_identifiers = {
    'group_identifier': 'acq_week',
    'time_identifier': 'week',
    'acquisition_identifier': 'N_week_cohort',
    'order_identifier': 'orders',
    'spend_identifier': 'spend',
}
raw_df, MERCHANT_NAMES_EMB_INT = util.read_files_generate_behaviorfeatures_get_embed_dict(
    READ_DIR,
    hyperparam.TRAIN_START,
    hyperparam.TEST_START,
    hyperparam.TEST_END,
    **_column_identifiers,
)

FREQ, week_start = util.get_week_start(
    raw_df,
    hyperparam.TRAIN_START,
    hyperparam.TEST_START,
    hyperparam.TEST_END,
)

### Zero padding

This is a technical but important piece regarding how to handle the 'cohort' triangle data.

In [None]:
# Columns to keep: static (per-merchant) and dynamic (per cohort-week) ones.
company_static = ['merchant_index', 'merchant', 'parent_merchant', 'category', 'subcategory']
if USE_EMBEDDING:
    # The embedding-related columns are only kept when merchant embeddings are used.
    company_static = company_static + ['merchant_emb_int', 'merchant_name']
company_dynamic = ['cohort_size', 'orders', 'spend',
                #    'active_users', 
                #    'rpt_orders','initial_order',
                #    'rpt_spend','initial_spend',
                #    'initial_aov','rpt_aov',
                   ]
# company_dynamic = []

# Name the missing columns so a schema problem is immediately actionable.
missing_columns = [column for column in company_static + company_dynamic if column not in raw_df.columns]
assert not missing_columns, \
    f'some columns are missing in the raw_df: {missing_columns}'

# Follow the USE_EMBEDDING flag instead of a hard-coded True, so this call stays
# consistent with the column selection above and with the later df_to_numpy calls.
df_padded = util.zero_padding(raw_df, TARGET_TASKS, company_static, company_dynamic, hyperparam.INPUT_CHUNK_LENGTH, FREQ,
                               use_merchant_embedding=USE_EMBEDDING, merchant_name='merchant_name')


### Generate calendar time covariates

Our covariates include:
- week of year_t+1 (1 week ahead) : (categorical) one-hot encoding. 53 variables.
- holidays_t+1 (1 week ahead) : (binary) dummy variable. 1 variable.
- global trend_t : (continuous) quadratic. 2 variables.
- cohort numbering_i : (continuous) quadratic. 2 variables.
- cohort tenure_it : (continuous) quadratic. 2 variables.

In [None]:
# Step 1: add calendar-time covariates; the helper also returns the holiday calendar it used.
df_padded_w_cov, country_holidays = util.generate_calendartime_features(
    df_padded,
    FREQ,
    week_start,
    DUMMY_VAR,
)
# util.check_holidays(df_padded_w_cov, country_holidays)

# Step 2: add cohort-level covariates on top of the calendar ones.
df_padded_w_cov = util.generate_cohort_features(
    df_padded_w_cov,
    TARGET_TASKS,
    COHORT_EMBEDDING,
    DUMMY_VAR,
    hyperparam.COHORT_EMB_NUM,
)

In [None]:
## limit data range
# Build all conditions on the same frame and apply them in one step. Chaining
# `frame[mask_a][mask_b]` applies a mask indexed on the unfiltered frame to the
# filtered one, which pandas can only resolve by silently reindexing.
_cohort_in_range = df_padded_w_cov['group'] <= hyperparam.TEST_END # limit cohort
_time_in_range = (
    (df_padded_w_cov['time'] >= hyperparam.TRAIN_START_with_offset)
    & (df_padded_w_cov['time'] <= hyperparam.TEST_END)
) # limit time window
df = df_padded_w_cov[_cohort_in_range & _time_in_range]

## get covariate feature names
COVARIATE_FEATURE_NAMES, covariate_name_to_index = config.get_covariate_variable_name(df, USE_EMBEDDING, COHORT_EMBEDDING, DUMMY_VAR)
print(COVARIATE_FEATURE_NAMES)
df

In [None]:
# Drop the intermediate frames and the holiday calendar; only `df` is used from here on.
del df_padded, df_padded_w_cov, country_holidays

### Split into train:val:test (time-wise) X censored:uncensored (group-wise)

<img src="../img/cohort_triangle.png" alt="cohort_triangle" width="600" height="400">

For censored (cohort 0) group of cohorts (who acquired in 2016)
- (A) train: ~ 2018
- (B) validation : '2019-01-01' ~ '2019-03-31'
- (C) test : '2019-04-01' ~

For uncensored (cohort 1~) group of cohorts (who acquired in 2017~)
- (D) train: ~ 2018
- (E) validation : '2019-01-01' ~ '2019-03-31'
- (F) test : '2019-04-01' ~

We do not distinguish between censored and uncensored cohorts in training. We found empirically that it is better to train them together: pooling yields more samples, at the cost of giving up the heterogeneity between the two groups.



In [None]:
# Split into main and censored cohort frames, each with train/valid/test parts.
(
    main_df,
    main_df_train,
    main_df_valid,
    main_df_test,
    censored_df,
    censored_df_train,
    censored_df_valid,
    censored_df_test,
) = util.split_dataframe(
    df,
    hyperparam.TRAIN_START,
    hyperparam.VAL_START,
    hyperparam.TEST_START,
    hyperparam.VAL_START_with_offset,
    hyperparam.TEST_START_with_offset,
    hyperparam.VAL_LOSS,
)

### Transform data frame to scaled numpy array

1. Because NumPy arrays are more efficient to handle, we transform the pandas data frame into NumPy arrays.

2. For each task, we scale the data with its own scaler. This is important for multi-task learning, as each task has a different scale.

In [None]:
def _to_scaled_numpy(frame, *fitted_scaler):
    """Call util.df_to_scaled_numpy with the shared task/covariate/embedding settings.

    An optional already-fitted scaler is forwarded positionally; without it the
    helper is called exactly as for the training split.
    """
    return util.df_to_scaled_numpy(
        frame, TARGET_TASKS, COVARIATE_FEATURE_NAMES, *fitted_scaler,
        use_merchant_embedding=USE_EMBEDDING, merchant_name='merchant_name',
    )


## entire sequences (unscaled)
whole_dict = util.df_to_numpy(
    df,
    TASKS=TARGET_TASKS,
    COVARIATES=COVARIATE_FEATURE_NAMES,
    use_merchant_embedding=USE_EMBEDDING,
    merchant_name='merchant_name',
)

## main cohorts' sequences: every later split reuses the scaler from the main training split
main_train_dict = _to_scaled_numpy(main_df_train)
main_test_dict = _to_scaled_numpy(main_df_test, main_train_dict['scaler'])
if hyperparam.VAL_LOSS:
    main_val_dict = _to_scaled_numpy(main_df_valid, main_train_dict['scaler'])

## censored cohorts' sequences
censored_train_dict = _to_scaled_numpy(censored_df_train, main_train_dict['scaler'])
censored_test_dict = _to_scaled_numpy(censored_df_test, main_train_dict['scaler'])
if hyperparam.VAL_LOSS:
    censored_val_dict = _to_scaled_numpy(censored_df_valid, main_train_dict['scaler'])


In [None]:
import pickle

if SAVE_MODE:
    # The validation dicts only exist when hyperparam.VAL_LOSS is enabled. Store
    # None in their slots otherwise, so saving does not raise NameError and the
    # pickle keeps its 7-entry layout for the loader below.
    _main_val_to_save = main_val_dict if hyperparam.VAL_LOSS else None
    _censored_val_to_save = censored_val_dict if hyperparam.VAL_LOSS else None

    # To save
    with open(f'{SAVE_MODEL}/np_dict_data.pkl', 'wb') as file:
        pickle.dump([whole_dict, 
                    main_train_dict, main_test_dict, _main_val_to_save,
                    censored_train_dict, censored_test_dict, _censored_val_to_save], file)

    df.to_hdf(f'{SAVE_MODEL}/df.h5', key='df', mode='w')
    
    # # To load
    # with open(f'{SAVE_MODEL}/np_dict_data.pkl', 'rb') as file:
    #     whole_dict, main_train_dict, main_test_dict, main_val_dict, \
    #         censored_train_dict, censored_test_dict, censored_val_dict = pickle.load(file)
            
    # df = pd.read_hdf(f'{SAVE_MODEL}/df.h5', key='df')


## Dataset and DataLoader
This section covers the creation of custom datasets and the initialization of the DataLoader.
- custom dataset class will group the data by the group column
- collate function will handle the padding and attention masking

<img src="../img/data%20period%20and%20samples.png" alt="data_period" width="600" height="400">

<img src="../img/input%20output%20format.png" alt="input_output" width="600" height="400">



### Generate train loader and validation loader

In [None]:
from libs.customized_dataset import CrossSectionalTimeSeriesDataset, collate_fn, value_dict_to_np
import multiprocessing


def _concat_seq_dicts(main_seqs, censored_seqs):
    """Concatenate, key by key, the main and censored sequence lists (missing key -> empty list)."""
    all_keys = set(main_seqs) | set(censored_seqs)
    return {key: main_seqs.get(key, []) + censored_seqs.get(key, []) for key in all_keys}


## Training data: main and censored cohorts are pooled into one dataset
value_train = _concat_seq_dicts(main_train_dict['scaled_value_seq_dict'],
                                censored_train_dict['scaled_value_seq_dict'])
cov_train = main_train_dict['cov_seq'] + censored_train_dict['cov_seq']
value_train_np = value_dict_to_np(value_train, TARGET_TASKS) # shape: (num_groups, seq_len, num_tasks)
train_dataset = CrossSectionalTimeSeriesDataset(value_train_np, cov_train, hyperparam.INPUT_CHUNK_LENGTH)

## Loader iterated batch by batch during training
train_loader = DataLoader(
    train_dataset,
    batch_size=hyperparam.BATCH_SIZE,  # samples per batch
    shuffle=True,  # reshuffle every epoch to reduce overfitting
    drop_last=False,  # keep the (possibly smaller) final batch
    collate_fn=collate_fn,
    pin_memory=True,  # faster host-to-GPU transfer; set False if out of memory
    num_workers=multiprocessing.cpu_count()//2,
    # num_workers=hyperparam.NUM_WORKERS,  # 0 loads in the main process
)

if hyperparam.VAL_LOSS:
    ## Validation data: pooled the same way, but never shuffled
    value_valid = _concat_seq_dicts(main_val_dict['scaled_value_seq_dict'],
                                    censored_val_dict['scaled_value_seq_dict'])
    cov_valid = main_val_dict['cov_seq'] + censored_val_dict['cov_seq']
    value_valid_np = value_dict_to_np(value_valid, TARGET_TASKS) # shape: (num_groups, seq_len, num_tasks)
    val_dataset = CrossSectionalTimeSeriesDataset(value_valid_np, cov_valid, hyperparam.INPUT_CHUNK_LENGTH)
    val_loader = DataLoader(
        val_dataset,
        batch_size=hyperparam.BATCH_SIZE,
        shuffle=False,
        drop_last=False,
        collate_fn=collate_fn,
    )

## Peek at one sample and one batch to read off the feature dimensions
first_sample = next(iter(train_dataset)) # or train_dataset[0]
first_batch = next(iter(train_loader))

## number of targets, number of covariate features
tgt_dim = first_sample["target"].shape[1]
cov_dim = first_sample["covariate"].shape[1] # 3 , 60
if PREDICTION_GOAL == 'mtl_4tasks':
    # The 4-task goal still uses a target dimension of 3.
    tgt_dim = 3

In [None]:
if SAVE_MODE:
    # `val_loader` is only created when hyperparam.VAL_LOSS is enabled. Store None
    # in its slot otherwise, so saving does not raise NameError and the pickle
    # keeps its two-entry layout for the loader below.
    _val_loader_to_save = val_loader if hyperparam.VAL_LOSS else None

    # To save the data loaders
    with open(f'{SAVE_MODEL}/data_loader.pkl', 'wb') as file:
        pickle.dump([train_loader, _val_loader_to_save], file)
        
    # with open(f'{SAVE_MODEL}/data_loader.pkl', 'rb') as file:
    #     train_loader, val_loader = pickle.load(file)


## Model

<div style="background-color:white;">
    <img src="../img/MTL.png" alt="mtl" width="550" height="700">
</div>

If an attention mask is needed later, modify the model as follows:
- `def forward(self, src, attention_mask):`
- `x = self.transformer(src=src, tgt=tgt, src_key_padding_mask=attention_mask)`

## Training (Estimation)

In [None]:
## For memory monitoring ==
# !pip install memory_profiler
# %load_ext memory_profiler
# %memit my_function()

In [None]:
from libs.models import MTL_Transformer, calculate_MTLloss_3tasks, calculate_MTLloss_4tasks

## Build the multi-task transformer from the *_MTL_EMB1 hyper-parameters
_model_settings = {
    'input_dim': tgt_dim + cov_dim,
    'feature_dict': covariate_name_to_index,
    'd_model': hyperparam.D_MODEL_MTL_EMB1,
    'num_encoder_layers': hyperparam.N_ENCODER_LAYERS_MTL_EMB1,
    'num_decoder_layers': hyperparam.N_DECODER_LAYERS_MTL_EMB1,
    'd_feedforward': hyperparam.D_FEEDFORWARD_MTL_EMB1,
    'd_feedforward_task': hyperparam.D_FEEDFORWARD_TASK_MTL_EMB1,
    'dropout': hyperparam.DROPOUT_MTL_EMB1,
    'num_merchant': len(MERCHANT_NAMES_EMB_INT),
}
model = MTL_Transformer(**_model_settings).to(device)

## Optimizer and element-wise (unreduced) MSE criterion
optimizer = optim.Adam(model.parameters(), lr=hyperparam.LEARNING_RATE_MTL_EMB1)
individual_loss_criterion = nn.MSELoss(reduction='none')

## Loss histories and early-stopping state
train_losses = []
valid_losses = []
pre_valid_loss = np.inf
cnt_no_improve = 0


In [None]:
if PREDICTION_GOAL == 'mtl_3tasks':

    ## training loop with optional validation and early stopping
    for epoch in tqdm(range(hyperparam.N_EPOCHS), desc="training", unit="epoch"):
        # Validation below switches the model to eval mode, so re-enable
        # training mode (dropout active) at the start of every epoch.
        model.train()
        train_loss = 0.0 # within each epoch, initialize train loss to 0

        for target_input, cov_input, gt in train_loader:
            optimizer.zero_grad() # reset optimizer gradients to zero
            loss = calculate_MTLloss_3tasks(model, target_input, cov_input, gt, 
                                    individual_loss_criterion, weights, device)
            train_loss += loss.item() # accumulate batch loss within each epoch
            loss.backward() # backpropagation
            _ = nn.utils.clip_grad_norm_(model.parameters(), hyperparam.GRADCLIP) # clip gradients to prevent exploding gradients
            optimizer.step() # update parameters based on gradients

        train_losses.append(train_loss/len(train_loader)) # mean batch loss of this epoch

        if hyperparam.VAL_LOSS:
            valid_loss = 0.0
            # Evaluate without dropout and without building autograd graphs, so
            # the validation loss is deterministic and memory is not wasted.
            model.eval()
            with torch.no_grad():
                for target_input, cov_input, gt in val_loader:
                    loss = calculate_MTLloss_3tasks(model, target_input, cov_input, gt, 
                                                    individual_loss_criterion, weights, device)
                    valid_loss += loss.item()

            valid_losses.append(valid_loss/len(val_loader))
            
            # Early stop: count epochs whose summed validation loss fails to beat
            # the best one so far by at least MINDELTA.
            if pre_valid_loss - valid_loss  < hyperparam.MINDELTA:
                cnt_no_improve += 1
                if cnt_no_improve > hyperparam.PATIENCE:
                    break
            else:
                cnt_no_improve = 0
                pre_valid_loss = valid_loss
        
        # Progress report every 10 epochs (losses are summed over batches).
        if epoch % 10 == 0:
            print("train_loss:{:.4f}".format(train_loss))
            if hyperparam.VAL_LOSS:
                print("val loss: {:.4f}".format(valid_loss))
            


In [None]:
if PREDICTION_GOAL == 'mtl_4tasks':
    
    ## get min and max values for each target from the scalers fitted on the main training split
    min_scaler_values = {
        target: torch.tensor(main_train_dict['scaler'][target].data_min_, dtype=torch.float32).to(device)
        for target in TARGET_TASKS
    }
    max_scaler_values = {
        target: torch.tensor(main_train_dict['scaler'][target].data_max_, dtype=torch.float32).to(device)
        for target in TARGET_TASKS
    }

    ## training loop with optional validation and early stopping
    for epoch in tqdm(range(hyperparam.N_EPOCHS), desc="training", unit="epoch"):
        # Validation below switches the model to eval mode, so re-enable
        # training mode (dropout active) at the start of every epoch.
        model.train()
        train_loss = 0.0 # within each epoch, initialize train loss to 0

        for target_input, cov_input, gt in train_loader:   
            optimizer.zero_grad() # reset optimizer gradients to zero
            loss = calculate_MTLloss_4tasks(model, target_input, cov_input, gt, 
                                            max_scaler_values, min_scaler_values, TARGET_TASKS,
                                            individual_loss_criterion, weights, device)
            train_loss += loss.item() # accumulate batch loss within each epoch
            loss.backward() # backpropagation
            _ = nn.utils.clip_grad_norm_(model.parameters(), hyperparam.GRADCLIP) # clip gradients to prevent exploding gradients
            optimizer.step() # update parameters based on gradients

        train_losses.append(train_loss/len(train_loader)) # mean batch loss of this epoch

        if hyperparam.VAL_LOSS:
            valid_loss = 0.0
            # Evaluate without dropout and without building autograd graphs, so
            # the validation loss is deterministic and memory is not wasted.
            model.eval()
            with torch.no_grad():
                for target_input, cov_input, gt in val_loader:
                    loss = calculate_MTLloss_4tasks(model, target_input, cov_input, gt, 
                                                    max_scaler_values, min_scaler_values, TARGET_TASKS,
                                                    individual_loss_criterion, weights, device)
                    valid_loss += loss.item()

            valid_losses.append(valid_loss/len(val_loader))
            
            # Early stop: compare against the best validation loss so far, as the
            # 3-task loop does. Overwriting the reference every epoch would
            # measure improvement against the previous epoch only.
            if pre_valid_loss - valid_loss  < hyperparam.MINDELTA:
                cnt_no_improve += 1
                if cnt_no_improve > hyperparam.PATIENCE:
                    break
            else:
                cnt_no_improve = 0
                pre_valid_loss = valid_loss
        
        # Progress report every 10 epochs (losses are summed over batches).
        if epoch % 10 == 0:
            print("train_loss:{:.4f}".format(train_loss))
            if hyperparam.VAL_LOSS:
                print("val loss: {:.4f}".format(valid_loss))

In [None]:
def plot_losses():
    """Plot the recorded training and validation loss histories on a new figure."""
    plt.figure(figsize=(10, 6))
    loss_curves = (
        (train_losses, "train loss"),
        (valid_losses, "valid loss"),
    )
    for history, curve_label in loss_curves:
        plt.plot(history, label=curve_label)
    plt.xlabel("# of epoch")
    plt.title(f"{PREDICTION_GOAL} Embedding {len(MERCHANT_NAMES_EMB_INT)} model Losses")
    plt.legend()


plot_losses()

In [None]:
import pickle

if SAVE_MODE:
    # Pickle the whole trained model object; the file name records the goal and
    # the number of merchants.
    _model_path = f'{SAVE_MODEL}/{PREDICTION_GOAL}_embedding_{len(MERCHANT_NAMES_EMB_INT)}.pkl'
    with open(_model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # # To load the model
    # with open(f'{SAVE_MODEL}/{PREDICTION_GOAL}_embedding_{len(MERCHANT_NAMES_EMB_INT)}.pkl', 'rb') as file:
    #     model = pickle.load(file)


## Prediction (Inference)

- Rolling forecast origin or walk-forward validation (which means generating predictions one step at a time and conditioning upon the predicted values)

For each rolling window:
- Use the last `INPUT_CHUNK_LENGTH` weeks of data as input to forecast the next week.
- Append the forecasted value to the actual data.
- Move the window one week forward and repeat.

*NOTE:* Errors can accumulate in a **triple** way, because acq_hat, repeat order per customer_hat, and aov_hat are all fed back together as inputs for the next prediction (for example, the next acquisition prediction).

### Cohort 1 (acquired after the beginning of train period)

In [None]:
from libs.forecasting import prepare_TimeSeriesDataset, rolling_forecast, get_sales_recovered_data

# Sets the module in evaluation mode
model.eval()

# rolling forecast (scaled)
main_value_test_np = value_dict_to_np(main_test_dict['scaled_value_seq_dict'], TARGET_TASKS)
test_datasets = prepare_TimeSeriesDataset(main_value_test_np, main_test_dict['cov_seq'], CrossSectionalTimeSeriesDataset, hyperparam.INPUT_CHUNK_LENGTH)
main_value_test_pred = rolling_forecast(test_datasets, model, device, TARGET_TASKS, verbose=True)

# get ground truth in the test period
# Combine both conditions into one mask on the same frame. Chaining
# `frame[mask_a][mask_b]` applies a mask indexed on the unfiltered frame to the
# filtered one, which pandas can only resolve by silently reindexing.
_main_test_mask = (main_df_test['time'] >= hyperparam.TEST_START) & (main_df_test['tenure'] >= 0)
main_df_test_net = main_df_test[_main_test_mask]
actual_main = main_df_test_net[['merchant_name','group', 'time', 'cohort_size'] + TARGET_TASKS]

# scale back the forecast
predicted_main = util.inverse_scale_np_to_dataframe_embedding(main_test_dict['scaler'], main_test_dict['group_seq'], main_test_dict['time_seq'], 
                                                                  main_value_test_pred, hyperparam.INPUT_CHUNK_LENGTH)
predicted_main.sort_values(['merchant_name', 'group', 'time'], inplace=True)

#### Cohort 1's Sales recovery

For sales prediction over the holdout period, we need acquisition over both the calibration period and the holdout period (because we need to scale up the repeat orders per customer of the cohorts that were acquired prior to the holdout period).

We only have predicted acquisition values for the holdout period. We impute the acquisition values for the calibration period with the actual acquisition values. This makes sense, as our goal is to maximize prediction performance given the observed calibration data.

In [None]:
# Resolve the task column names; the 4-task goal has a fourth entry that is not needed here.
if PREDICTION_GOAL not in ('mtl_4tasks', 'mtl_3tasks'):
    raise ValueError('PREDICTION_GOAL is not tracked.')
if PREDICTION_GOAL == 'mtl_4tasks':
    acq_variable, ropc_variable, aov_variable, _ = TARGET_TASKS
else:
    acq_variable, ropc_variable, aov_variable = TARGET_TASKS

## get the average of pseudo acquisition
_acq_columns = ['merchant_name','time', acq_variable]
# Actuals: one row per distinct (merchant, time, acquisition) combination.
actual_main_acq = actual_main[_acq_columns].drop_duplicates().reset_index(drop=True)
# Predictions: mean of the predicted acquisition per (merchant, time).
predicted_main_acq = (
    predicted_main[_acq_columns]
    .groupby(['merchant_name','time'])
    .mean()
    .reset_index()
)

actual_main, predicted_main = get_sales_recovered_data(
    predicted_main_acq, actual_main, predicted_main, hyperparam.TEST_START,
    USE_EMBEDDING, MERCHANT_NAMES_EMB_INT,
)


#### Acquisition plot

In [None]:
from libs.plots import plot_time_series, plot_time_series_multiple
def plot_acq_cohort1(merchant_name):
    """Plot actual vs. predicted acquisition of the uncensored cohorts for one merchant.

    Args:
        merchant_name: value of the 'merchant_name' column to select.
    """
    # Filter each frame with a mask built from that same frame. Using the
    # actual frame's mask on the predicted frame selects rows by an unrelated index.
    plot_time_series(actual_main_acq[actual_main_acq['merchant_name']==merchant_name], 
                     predicted_main_acq[predicted_main_acq['merchant_name']==merchant_name],
                     'time', acq_variable,
        title=f'{merchant_name} [Uncensored Cohort 1\'s Acquisition]',
        )

plot_acq_cohort1(list(MERCHANT_NAMES_EMB_INT.keys())[0])

#### Repeat order per customer plot

In [None]:
def plot_ropc_cohort1(merchant_name):
    """Plot repeat orders per customer for up to three randomly chosen main-cohort groups.

    Args:
        merchant_name: merchant whose groups are sampled from main_test_dict['group_seq'].

    Raises:
        ValueError: if the merchant has no group in the test data.
    """
    selected_tuples = [t for t in main_test_dict['group_seq'] if t[0] == merchant_name]
    if not selected_tuples:
        raise ValueError(f'no test groups found for merchant {merchant_name!r}')
    # Plot at most 3 groups; merchants with fewer groups get fewer panels
    # instead of a "sample larger than population" error.
    n_panels = min(3, len(selected_tuples))
    # squeeze=False keeps `axs` two-dimensional even when there is a single panel.
    fig, axs = plt.subplots(n_panels, 1, figsize=(20, 20), squeeze=False)  # n_panels rows, 1 column
    selected_group_indices = random.sample(range(len(selected_tuples)), n_panels)
    for i, group_index in enumerate(selected_group_indices):
        group = selected_tuples[group_index][1]
        plot_time_series_multiple(
            main_df[(main_df['group'] == group) & (main_df['merchant_name'] == merchant_name)], 
            predicted_main[(predicted_main['group'] == group) & (predicted_main['merchant_name'] == merchant_name)],
            'time', ropc_variable, 
            title=f'{merchant_name} [Group {group}\'s Repeat Order per Customer]',
            ax=axs[i, 0]
        )
    plt.tight_layout()

plot_ropc_cohort1(list(MERCHANT_NAMES_EMB_INT.keys())[0])

#### AOV plot

In [None]:
def plot_aov_cohort1(merchant_name):
    """Plot AOV for up to three randomly chosen main-cohort groups of one merchant.

    Args:
        merchant_name: merchant whose groups are sampled from main_test_dict['group_seq'].

    Raises:
        ValueError: if the merchant has no group in the test data.
    """
    selected_tuples = [t for t in main_test_dict['group_seq'] if t[0] == merchant_name]
    if not selected_tuples:
        raise ValueError(f'no test groups found for merchant {merchant_name!r}')
    # Plot at most 3 groups; merchants with fewer groups get fewer panels
    # instead of a "sample larger than population" error.
    n_panels = min(3, len(selected_tuples))
    # squeeze=False keeps `axs` two-dimensional even when there is a single panel.
    fig, axs = plt.subplots(n_panels, 1, figsize=(20, 20), squeeze=False)  # n_panels rows, 1 column
    selected_group_indices = random.sample(range(len(selected_tuples)), n_panels)
    for i, group_index in enumerate(selected_group_indices):
        group = selected_tuples[group_index][1]
        plot_time_series_multiple(
            main_df[(main_df['group'] == group) & (main_df['merchant_name'] == merchant_name)], 
            predicted_main[(predicted_main['group'] == group) & (predicted_main['merchant_name'] == merchant_name)],
            'time', aov_variable, 
            title=f'{merchant_name} [Group {group}\'s AOV]',
            ax=axs[i, 0]
        )
    plt.tight_layout()

plot_aov_cohort1(list(MERCHANT_NAMES_EMB_INT.keys())[0])

#### Cohort 1's Sales plot

In [None]:
# Sum sales per merchant and week over the test period, for actuals and predictions.
actual_main_sales = (
    actual_main[actual_main['time']>=hyperparam.TEST_START][['merchant_name','time','sales']]
    .groupby(['merchant_name','time'])
    .sum()
    .reset_index()
)
predicted_main_sales = (
    predicted_main[predicted_main['time']>=hyperparam.TEST_START][['merchant_name','time','sales']]
    .groupby(['merchant_name','time'])
    .sum()
    .reset_index()
)


def plot_sales_cohort1(merchant_name):
    """Plot actual vs. predicted test-period sales of the uncensored cohorts for one merchant."""
    is_actual_row = actual_main_sales['merchant_name']==merchant_name
    is_predicted_row = predicted_main_sales['merchant_name']==merchant_name
    plot_time_series(
        actual_main_sales[is_actual_row],
        predicted_main_sales[is_predicted_row],
        'time', 'sales',
        title=f'{merchant_name} [Cohort 1\'s Sales (Uncensored cohorts)]',
        formatter=True,
        ylabel='Sales',
    )


plot_sales_cohort1(list(MERCHANT_NAMES_EMB_INT.keys())[0])

### Cohort 0 (acquired before the beginning of train period)

In [None]:
# Sets the module in evaluation mode
model.eval()

# rolling forecast (scaled)
censored_value_test_np = value_dict_to_np(censored_test_dict['scaled_value_seq_dict'], TARGET_TASKS)
censored_test_datasets = prepare_TimeSeriesDataset(censored_value_test_np, censored_test_dict['cov_seq'], CrossSectionalTimeSeriesDataset, hyperparam.INPUT_CHUNK_LENGTH)
censored_value_test_pred = rolling_forecast(censored_test_datasets, model, device, TARGET_TASKS, verbose=True)

# get ground truth in the test period
# Combine both conditions into one mask on the same frame. Chaining
# `frame[mask_a][mask_b]` applies a mask indexed on the unfiltered frame to the
# filtered one, which pandas can only resolve by silently reindexing.
_censored_test_mask = (censored_df_test['time'] >= hyperparam.TEST_START) & (censored_df_test['tenure'] >= 0)
censored_df_test_net = censored_df_test[_censored_test_mask]
actual_censored = censored_df_test_net[['merchant_name','group', 'time', 'cohort_size'] + TARGET_TASKS]

# scale back the forecast
predicted_censored = util.inverse_scale_np_to_dataframe_embedding(censored_test_dict['scaler'], censored_test_dict['group_seq'], censored_test_dict['time_seq'], 
                                                                  censored_value_test_pred, hyperparam.INPUT_CHUNK_LENGTH)
predicted_censored.sort_values(['merchant_name', 'group', 'time'], inplace=True)

#### Cohort 0's sales recovery

In [None]:
# Resolve the task column names; the 4-task goal has a fourth entry that is not needed here.
if PREDICTION_GOAL not in ('mtl_4tasks', 'mtl_3tasks'):
    raise ValueError('PREDICTION_GOAL is not tracked.')
if PREDICTION_GOAL == 'mtl_4tasks':
    acq_variable, ropc_variable, aov_variable, _ = TARGET_TASKS
else:
    acq_variable, ropc_variable, aov_variable = TARGET_TASKS

## get the average of pseudo acquisition
_acq_columns = ['merchant_name','time', acq_variable]
# Actuals: one row per distinct (merchant, time, acquisition) combination.
actual_censored_acq = actual_censored[_acq_columns].drop_duplicates().reset_index(drop=True)
# Predictions: mean of the predicted acquisition per (merchant, time).
predicted_censored_acq = (
    predicted_censored[_acq_columns]
    .groupby(['merchant_name','time'])
    .mean()
    .reset_index()
)

actual_censored, predicted_censored = get_sales_recovered_data(
    predicted_censored_acq, actual_censored, predicted_censored, hyperparam.TEST_START,
    USE_EMBEDDING, MERCHANT_NAMES_EMB_INT,
)

#### Acquisition plot

In [None]:
from libs.plots import plot_time_series, plot_time_series_multiple
def plot_acq_cohort0(merchant_name):
    """Plot actual vs. predicted acquisition of the censored cohorts (cohort 0) for one merchant.

    Args:
        merchant_name: value of the 'merchant_name' column to select.
    """
    # Cohort 0 is the censored group, so the title must not say "Uncensored".
    plot_time_series(actual_censored_acq[actual_censored_acq['merchant_name']==merchant_name], 
                     predicted_censored_acq[predicted_censored_acq['merchant_name']==merchant_name],
                     'time', acq_variable,
        title=f'{merchant_name} [Censored Cohort 0\'s Acquisition]',
        )

plot_acq_cohort0(list(MERCHANT_NAMES_EMB_INT.keys())[0])

#### Repeat order per customer plot

In [None]:
def plot_ropc_cohort0(merchant_name):
    """Plot repeat orders per customer for up to three randomly chosen censored-cohort groups.

    Args:
        merchant_name: merchant whose groups are sampled from censored_test_dict['group_seq'].

    Raises:
        ValueError: if the merchant has no group in the censored test data.
    """
    selected_tuples = [t for t in censored_test_dict['group_seq'] if t[0] == merchant_name]
    if not selected_tuples:
        raise ValueError(f'no censored test groups found for merchant {merchant_name!r}')
    # Plot at most 3 groups; merchants with fewer groups get fewer panels
    # instead of a "sample larger than population" error.
    n_panels = min(3, len(selected_tuples))
    # squeeze=False keeps `axs` two-dimensional even when there is a single panel.
    fig, axs = plt.subplots(n_panels, 1, figsize=(20, 20), squeeze=False)  # n_panels rows, 1 column
    selected_group_indices = random.sample(range(len(selected_tuples)), n_panels)
    for i, group_index in enumerate(selected_group_indices):
        group = selected_tuples[group_index][1]
        plot_time_series_multiple(
            censored_df[(censored_df['group'] == group) & (censored_df['merchant_name'] == merchant_name)], 
            predicted_censored[(predicted_censored['group'] == group) & (predicted_censored['merchant_name'] == merchant_name)],
            'time', ropc_variable, 
            title=f'{merchant_name} [Group {group}\'s Repeat Order per Customer]',
            ax=axs[i, 0]
        )
    plt.tight_layout()

plot_ropc_cohort0(list(MERCHANT_NAMES_EMB_INT.keys())[0])



#### AOV plot

In [None]:
def plot_aov_cohort0(merchant_name):
    """Plot AOV for up to three randomly chosen censored-cohort groups of one merchant.

    Args:
        merchant_name: merchant whose groups are sampled from censored_test_dict['group_seq'].

    Raises:
        ValueError: if the merchant has no group in the censored test data.
    """
    selected_tuples = [t for t in censored_test_dict['group_seq'] if t[0] == merchant_name]
    if not selected_tuples:
        raise ValueError(f'no censored test groups found for merchant {merchant_name!r}')
    # Plot at most 3 groups; merchants with fewer groups get fewer panels
    # instead of a "sample larger than population" error.
    n_panels = min(3, len(selected_tuples))
    # squeeze=False keeps `axs` two-dimensional even when there is a single panel.
    fig, axs = plt.subplots(n_panels, 1, figsize=(20, 20), squeeze=False)  # n_panels rows, 1 column
    selected_group_indices = random.sample(range(len(selected_tuples)), n_panels)
    for i, group_index in enumerate(selected_group_indices):
        group = selected_tuples[group_index][1]
        plot_time_series_multiple(
            censored_df[(censored_df['group'] == group) & (censored_df['merchant_name'] == merchant_name)], 
            predicted_censored[(predicted_censored['group'] == group) & (predicted_censored['merchant_name'] == merchant_name)],
            'time', aov_variable, 
            title=f'{merchant_name} [Group {group}\'s AOV]',
            ax=axs[i, 0]
        )
    plt.tight_layout()

plot_aov_cohort0(list(MERCHANT_NAMES_EMB_INT.keys())[0])



#### Cohort 0's Sales plot

In [None]:
# Sum sales per merchant and week over the test period, for actuals and predictions.
actual_censored_sales = actual_censored[actual_censored['time']>=hyperparam.TEST_START][['merchant_name','time','sales']].groupby(['merchant_name','time']).sum().reset_index()
predicted_censored_sales = predicted_censored[predicted_censored['time']>=hyperparam.TEST_START][['merchant_name','time','sales']].groupby(['merchant_name','time']).sum().reset_index()

def plot_sales_cohort0(merchant_name):
    """Plot actual vs. predicted test-period sales of the censored cohorts (cohort 0) for one merchant.

    Args:
        merchant_name: value of the 'merchant_name' column to select.
    """
    actual_censored_sales0 = actual_censored_sales[actual_censored_sales['merchant_name']==merchant_name]
    predicted_censored_sales0 = predicted_censored_sales[predicted_censored_sales['merchant_name']==merchant_name]
    # Cohort 0 holds the censored cohorts, so the title must not say "Uncensored".
    plot_time_series(actual_censored_sales0, predicted_censored_sales0, 'time', 'sales',
        title=f'{merchant_name} [Cohort 0\'s Sales (Censored cohorts)]', formatter=True, ylabel='Sales')

plot_sales_cohort0(list(MERCHANT_NAMES_EMB_INT.keys())[0])

### Total Sales

In [None]:
# Outer-merge main and censored sales per (merchant, week). A key present on
# only one side has NaN in the other column, so add with fill_value=0 to keep
# the available sales instead of turning the total into NaN.
actual_total_sales = pd.merge(actual_main_sales, actual_censored_sales, on=['merchant_name','time'], how='outer')
actual_total_sales['total_sales'] = actual_total_sales['sales_x'].add(actual_total_sales['sales_y'], fill_value=0)

pred_total_sales = pd.merge(predicted_main_sales, predicted_censored_sales, on=['merchant_name','time'], how='outer')
pred_total_sales['total_sales'] = pred_total_sales['sales_x'].add(pred_total_sales['sales_y'], fill_value=0)

def plot_total_sales(merchant_name):
    """Plot actual vs. predicted total sales (main + censored cohorts) over the test period.

    Args:
        merchant_name: value of the 'merchant_name' column to select.
    """
    plot_time_series(actual_total_sales[actual_total_sales['merchant_name']==merchant_name], 
                     pred_total_sales[pred_total_sales['merchant_name']==merchant_name], 'time', 'total_sales',
        title=f'{merchant_name} [Total Sales] over test period', formatter=True, ylabel='Sales')
    
plot_total_sales(list(MERCHANT_NAMES_EMB_INT.keys())[0])

In [None]:
# Actual spend per merchant and week over the whole data window, exposed as 'total_sales'.
actual_whole = (
    df[['merchant_name','time','spend']]
    .groupby(['merchant_name','time'])
    .sum()
    .reset_index()
)
actual_whole['time'] = pd.to_datetime(actual_whole['time'], format='%Y-%m-%d')
actual_whole['total_sales'] = actual_whole['spend']


def plot_total_sales_whole(merchant_name):    
    """Plot actual total sales over the entire period against the predicted total sales."""
    actual_rows = actual_whole[actual_whole['merchant_name']==merchant_name]
    predicted_rows = pred_total_sales[pred_total_sales['merchant_name']==merchant_name]
    plot_time_series(
        actual_rows,
        predicted_rows,
        'time', 'total_sales',
        title=f'{merchant_name} [Total Sales] over entire period',
        formatter=True,
        ylabel='Sales',
    )


plot_total_sales_whole(list(MERCHANT_NAMES_EMB_INT.keys())[0])

In [None]:
from libs.plots import save_plots_to_pdf

if SAVE_MODE:
    # Training / validation loss curves -> PDF
    _loss_plot_jobs = [(plot_losses, {})]
    save_plots_to_pdf(_loss_plot_jobs, f'{SAVE_EPOCH}/train_loss.pdf')

    # Predictions and actuals -> CSV (censored cohorts first, then main cohorts)
    _all_predicted = pd.concat([predicted_censored, predicted_main])
    _all_predicted.to_csv(f'{SAVE_PREDICT}/all_pred.csv', index=False)
    _all_actual = pd.concat([actual_censored, actual_main])
    _all_actual.to_csv(f'{SAVE_ACTUAL}/all_actual.csv', index=False)
