In [1]:
import warnings
warnings.filterwarnings('ignore')
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
%%time
# Load the pickled training events into `df_train`; the `%%time` cell magic above
# reports how long the whole cell takes to run.
#competition_dataset_directory = pathlib.Path('../../allData/pickleFiles/')
# Directory holding the pickled datasets (a relative path, so it is resolved
# against the current working directory of the kernel).
pickled_dataset_directory = pathlib.Path('../../allData/pickleFiles')

# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
df_train = pd.read_pickle(pickled_dataset_directory / 'train.pkl')
#df_test = pd.read_pickle(pickled_dataset_directory / 'test.pkl')

# Print the frame's shape and its memory usage; the per-column byte counts are
# summed and divided by 1024 ** 2 to express the total in MB.
print(f'Training Shape: {df_train.shape} - Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
#print(f'Test Shape: {df_test.shape} - Memory Usage: {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')

Training Shape: (216716096, 4) - Memory Usage: 3513.50 MB
CPU times: user 6.56 ms, sys: 1.39 s, total: 1.39 s
Wall time: 2.12 s


In [6]:
def get_labels(aids, event_types):

    """
    Create ground-truth labels from given session aids and event types

    The label of an event describes what happens after it in the input order
    (the inputs are assumed to be sorted chronologically): the aid of the next
    click and the sets of every aid that is carted / ordered later on.

    Parameters
    ----------
    aids: pandas.Series (or any sequence) of shape (n_events)
        Session aids

    event_types: pandas.Series (or any sequence) of shape (n_events)
        Session event types (0: click, 1: cart, 2: order).
        Any other value leaves the running state unchanged.

    Returns
    -------
    labels: list of shape (n_events)
        Ground-truth labels, one dict per event with the keys

        - 0: aid of the next click after the event
        - 1: set of aids added to the cart after the event
        - 2: set of aids ordered after the event

        A key holds np.nan when no such future event exists, so the label of
        the last event is always {0: np.nan, 1: np.nan, 2: np.nan}.
        An empty input yields an empty list.

    Raises
    ------
    ValueError
        If aids and event_types do not have the same length.
    """

    aid_values = np.asarray(aids)
    type_values = np.asarray(event_types)

    # Walking both arrays from the end would silently misalign the pairs when
    # their lengths differ, so reject that case explicitly.
    if len(aid_values) != len(type_values):
        raise ValueError(
            f'aids and event_types must have the same length, '
            f'got {len(aid_values)} and {len(type_values)}'
        )

    n_events = len(aid_values)
    if n_events == 0:
        return []

    # np.nan is the single "nothing follows" marker for all three keys.
    next_click = np.nan
    future_carts = set()
    future_orders = set()

    labels = [None] * n_events
    # Nothing follows the last event.
    labels[-1] = {0: np.nan, 1: np.nan, 2: np.nan}

    # Walk backwards: after folding event `position` into the running state, the
    # state describes everything that follows event `position - 1`.
    for position in range(n_events - 1, 0, -1):

        aid = aid_values[position]
        event_type = type_values[position]

        if event_type == 0:
            next_click = aid
        elif event_type == 1:
            future_carts.add(aid)
        elif event_type == 2:
            future_orders.add(aid)

        labels[position - 1] = {
            0: next_click,
            # Snapshot the sets; they keep growing while moving towards the start.
            1: future_carts.copy() if future_carts else np.nan,
            2: future_orders.copy() if future_orders else np.nan,
        }

    return labels


In [7]:
# Build and preview the ground-truth labels of one example session.
# NOTE: despite the `747` in the variable names, the filter selects session 0.
# .copy() turns the slice into an independent frame, so adding the label column
# is unambiguous (no SettingWithCopyWarning) and df_train is left untouched.
df_session747 = df_train.loc[df_train['session'] == 0, :].copy()
session747_labels = get_labels(aids=df_session747['aid'], event_types=df_session747['type'])
df_session747['label'] = session747_labels

# Lift the column-width limit so the long label dicts are printed untruncated
# (add 'display.max_rows', None to the context to lift the row limit as well).
with pd.option_context('display.max_colwidth', None):
    print(df_session747.head(10))

   session      aid                      ts  type  \
0        0  1517085 2022-07-31 22:00:00.025     0   
1        0  1563459 2022-07-31 22:01:44.511     0   
2        0  1309446 2022-08-01 15:23:59.426     0   
3        0    16246 2022-08-01 15:28:39.997     0   
4        0  1781822 2022-08-01 15:31:11.344     0   
5        0  1152674 2022-08-01 15:31:25.796     0   
6        0  1649869 2022-08-01 16:04:53.840     1   
7        0   461689 2022-08-01 16:04:58.050     1   
8        0   305831 2022-08-01 16:07:07.105     2   
9        0   461689 2022-08-01 16:07:07.105     2   

                                                                                                                                                                  label  
0  {0: 1563459, 1: {1521766, 315914, 543308, 1649869, 1760145, 1549618, 1199474, 280978, 442293, 275288, 461689, 974651, 789245}, 2: {461689, 1199474, 543308, 305831}}  
1  {0: 1309446, 1: {1521766, 315914, 543308, 1649869, 1760145, 1549618, 1199