# Baseline Model

In [1]:
import numpy as np
import pandas as pd
import yaml
import glob
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from surprise import SVD
from surprise.model_selection import cross_validate

# Load Config


In [3]:
# Show the kernel's current working directory.
%pwd

'/Users/yingkang/4thBrain/GNN-eCommerce/notebooks'

In [4]:
# Move up one directory so that the relative path "params.yaml" opened below
# resolves. Guarded on that file's presence so re-running this cell does not
# climb one directory further each time.
import os

if not os.path.exists("params.yaml"):
    os.chdir("..")

os.getcwd()

/Users/yingkang/4thBrain/GNN-eCommerce


In [5]:
# Read the project parameters from params.yaml in the current working directory.
# The encoding is pinned so the file is decoded the same way on every platform.
with open("params.yaml", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

config

{'base': {'random_seed': 42},
 'data': {'cosmetic_shop': 'data/raw/cosmetic-shop-ecommerce-events/'},
 'training': {'event_type_weights': {'view': 0.01,
   'cart': 0.1,
   'remove_from_cart': -0.09,
   'purchase': 1.0}},
 'reports': None}

# Load Dataset and EDA

In [8]:
# Get CSV files list of cosmetic shop dataset.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them:
# the concatenated row order - and therefore any later `head(...)` subset -
# must be the same on every machine and every run.
path = config['data']['cosmetic_shop']
csv_files = sorted(glob.glob(path.rstrip("/") + "/20*.csv"))
if not csv_files:
    raise FileNotFoundError(f"No '20*.csv' files found in {path!r}")

# Lazily read each CSV file into a DataFrame (a generator, so the per-file
# frames are produced one at a time as pd.concat consumes them)
df_list = (pd.read_csv(file) for file in csv_files)

# Concatenate all DataFrames
all_events = pd.concat(df_list, ignore_index=True)

In [9]:
print(all_events.shape)
# Seed the random preview from the project config so the displayed rows are
# the same on every run.
all_events.sample(10, random_state=config['base']['random_seed'])

(20692840, 9)


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
20197793,2019-10-28 07:27:24 UTC,purchase,5809880,1602943681873052386,,grattol,4.76,462463264,46fcfeca-752b-4916-b7f1-544285a39b4c
32815,2020-01-01 14:16:06 UTC,view,5561052,1487580005595612013,,,3.0,524863975,42f70603-19b2-43e9-b109-5cdbff4e65c6
11175868,2020-02-24 16:53:38 UTC,view,5864314,1487580011585077370,,dermal,1.03,483143687,cd2aacd6-65a8-43a7-b067-07fb5d462954
10582591,2020-02-19 19:45:06 UTC,cart,5838932,1487580005713052531,,ingarden,4.6,617510949,921dc3a1-2e78-46f0-94b4-c40effa7fb51
19284170,2019-10-20 21:48:47 UTC,remove_from_cart,5796976,1487580005671109489,,masura,1.73,560254797,88d41123-c13f-4e3b-9958-1f983547cb57
8129907,2020-02-03 08:01:03 UTC,cart,5772309,1602943681873052386,,grattol,4.76,609658150,1787f510-c534-2d35-7583-b0e9f344ae05
9042312,2020-02-09 09:12:07 UTC,cart,5861300,1487580008145748965,,,1.75,497997612,0f07c928-445c-41ac-9b2e-89ced249c2dc
8029013,2020-02-02 14:44:43 UTC,view,5808663,2089259162625114209,,ardell,6.37,509996784,73e5c9f7-468f-4489-a799-7123e5448f08
1680402,2020-01-14 11:33:29 UTC,view,5700855,1487580013950664926,,irisk,63.41,468463839,2fe8f6db-59ba-4a7e-9653-d777b6e78893
737828,2020-01-07 18:42:22 UTC,view,5899907,2164688961165852944,,opi,7.41,598065884,4e114e43-e650-4a3f-a54d-b7534e0417fc


In [10]:
# Distinct users in the full event log (missing values, if any, count as one)
users_count = all_events['user_id'].nunique(dropna=False)
print("Unique users: ", users_count)

# Distinct products in the full event log
products_count = all_events['product_id'].nunique(dropna=False)
print("Unique products: ", products_count)

# Every distinct event type, in order of first appearance
print("Event types: ", all_events['event_type'].unique().tolist())


Unique users:  1639358
Unique products:  54571
Event types:  ['view', 'cart', 'remove_from_cart', 'purchase']


In [11]:
# Development-only subset: the first 100000 events, restricted to the three
# columns the interaction matrix needs.
all_events_mini = all_events.iloc[:100000][['user_id', 'product_id', 'event_type']]


In [13]:
def interaction_matrix(events, event_type_weights, negative_weight=0.01, max_weight=1.0):
    """Aggregate raw events into one weighted row per (user, product) pair.

    Each event is scored with ``event_type_weights[event_type]``; the scores
    of all events sharing a (user_id, product_id) pair are summed, then:

    * a negative total is replaced by ``negative_weight``;
    * a total above ``max_weight`` is capped at ``max_weight``;
    * any other total is kept as is.

    Parameters
    ----------
    events : pandas.DataFrame
        Must contain the columns ``user_id``, ``product_id`` and
        ``event_type``. It is not modified.
    event_type_weights : dict
        Maps every event type present in ``events`` to a numeric weight.
    negative_weight : float, default 0.01
        Weight assigned to pairs whose summed score is negative.
    max_weight : float, default 1.0
        Upper bound applied to the summed score.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``product_id``, ``event_type`` (list of the
        pair's event types, in input order) and ``weight``.

    Raises
    ------
    KeyError
        If ``events`` contains an event type missing from
        ``event_type_weights``.
    """
    events = events[['user_id', 'product_id', 'event_type']]

    # Fail loudly on unmapped event types: Series.map would otherwise turn
    # them into NaN weights that silently vanish in the sum.
    unknown_types = set(events['event_type'].unique()) - set(event_type_weights)
    if unknown_types:
        raise KeyError(f"No weight configured for event type(s): {sorted(map(str, unknown_types))}")

    # `assign` builds a new frame, so no column is written onto a slice of the
    # caller's DataFrame (the source of SettingWithCopyWarning).
    weighted_events = events.assign(weight=events['event_type'].map(event_type_weights))

    i_matrix = weighted_events.groupby(['user_id', 'product_id'], as_index=False).agg(
        event_type=('event_type', list),
        weight=('weight', 'sum'),
    )

    # Vectorised form of: negative_weight if s < 0 else max_weight if s > max_weight else s
    total_weight = i_matrix['weight']
    i_matrix['weight'] = total_weight.mask(total_weight < 0, negative_weight).clip(upper=max_weight)

    return i_matrix

In [14]:
# Build the interaction matrix for the development subset, using the
# per-event-type weights from the training section of the config.
im = interaction_matrix(
    events=all_events_mini,
    event_type_weights=config['training']['event_type_weights'],
)
im

Unnamed: 0,user_id,product_id,event_type,weight
0,38560385,5861734,[view],0.01
1,43695999,5686278,[view],0.01
2,43695999,5712736,[view],0.01
3,43695999,5730212,[view],0.01
4,43695999,5730214,[view],0.01
...,...,...,...,...
61751,595853462,5804820,[view],0.01
61752,595853555,5714119,[view],0.01
61753,595853918,5863525,[view],0.01
61754,595853961,5677422,[view],0.01


##### Reference code for relabeling user_nodes, item_nodes

In [None]:
# Reference code for relabeling user_nodes, item_nodes
from sklearn.model_selection import train_test_split

# Split the interaction matrix `im` (not `df`, which does not exist yet at this
# point of the notebook and has no named columns). `product_id` is exposed as
# `item_id` because the relabeling cells below address the column by that name.
interactions = im.rename(columns={'product_id': 'item_id'})

# Splitting the DataFrame directly (rather than `.values`) keeps column dtypes.
# The seed comes from the project config so the split is reproducible.
train_df, test_df = train_test_split(
    interactions, test_size=0.2, random_state=config['base']['random_seed']
)
# Fresh 0..n-1 indexes; this also yields independent frames that later cells
# can add columns to.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Array views of the two splits, kept under the original names for later cells.
train, test = train_df.values, test_df.values

In [None]:
from sklearn import preprocessing as pp

# One encoder per id space; each is fitted on the training split and writes
# its encoded labels into a new `*_idx` column of train_df.
le_user, le_item = pp.LabelEncoder(), pp.LabelEncoder()
for encoder, raw_col, idx_col in (
    (le_user, 'user_id', 'user_id_idx'),
    (le_item, 'item_id', 'item_id_idx'),
):
    train_df[idx_col] = encoder.fit_transform(train_df[raw_col].values)

In [None]:
train_user_ids = train_df['user_id'].unique()
train_item_ids = train_df['item_id'].unique()

print('Unique train set user_ids/ item_ids:', len(train_user_ids), len(train_item_ids))

# Keep only test rows whose user AND item both occur in the training split.
# `.copy()` makes the filtered frame independent of the unfiltered one, so the
# next cell can add columns to it without triggering SettingWithCopyWarning.
test_df = test_df[
  (test_df['user_id'].isin(train_user_ids)) &
  (test_df['item_id'].isin(train_item_ids))
].copy()
print('Size of test set before/ after(remove user/item nodes not in train set):', len(test), len(test_df))

In [None]:
# Encode the test split with the encoders already fitted on the training split.
test_user_raw = test_df['user_id'].values
test_item_raw = test_df['item_id'].values
test_df['user_id_idx'] = le_user.transform(test_user_raw)
test_df['item_id_idx'] = le_item.transform(test_item_raw)

In [15]:
from surprise import Dataset, Reader

# Hand the (user, product, weight) triples to Surprise, declaring the weight
# as a rating on a 0-1 scale.
reader = Reader(rating_scale=(0, 1))
rating_columns = ['user_id', 'product_id', 'weight']
surprise_dataset = Dataset.load_from_df(im[rating_columns], reader)

In [16]:
# Inspect the raw rating tuples held by the Surprise dataset object.
df = pd.DataFrame(surprise_dataset.raw_ratings)
df

Unnamed: 0,0,1,2,3
0,38560385,5861734,0.01,
1,43695999,5686278,0.01,
2,43695999,5712736,0.01,
3,43695999,5730212,0.01,
4,43695999,5730214,0.01,
...,...,...,...,...
61751,595853462,5804820,0.01,
61752,595853555,5714119,0.01,
61753,595853918,5863525,0.01,
61754,595853961,5677422,0.01,


In [17]:
# Use the famous SVD algorithm, seeded from the project config so repeated
# runs start from the same factor initialisation.
algo = SVD(random_state=config['base']['random_seed'])

# Run 5-fold cross-validation and print results.
# `cross_validate` is the call that produces the per-fold RMSE/MAE report;
# Surprise algorithms do not provide a `fit_and_evaluate` method.
cross_validate(algo, surprise_dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2035  0.2107  0.2078  0.2105  0.2053  0.2076  0.0028  
MAE (testset)     0.1211  0.1238  0.1223  0.1222  0.1208  0.1221  0.0010  
Fit time          0.71    0.70    0.71    0.68    0.69    0.70    0.01    
Test time         0.14    0.08    0.08    0.08    0.08    0.09    0.03    


{'test_rmse': array([0.20352061, 0.2107254 , 0.20783519, 0.210534  , 0.20528449]),
 'test_mae': array([0.1211419 , 0.12380897, 0.1222868 , 0.12224697, 0.12083883]),
 'fit_time': (0.7057900428771973,
  0.695408821105957,
  0.7082438468933105,
  0.6787819862365723,
  0.6893918514251709),
 'test_time': (0.14333605766296387,
  0.07866287231445312,
  0.0767359733581543,
  0.07605695724487305,
  0.07603096961975098)}

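Precision@K and Recall@K. Inputs to `get_metrics`:
- `user_Embed_wts`: all user nodes' embeddings
- `item_Embed_wts`: all item nodes' embeddings
- `n_users`: number of unique users
- `n_items`: number of unique items (pos + neg)
- `train_data`, `test_data`: train and test interactions, with `user_id_idx` and `item_id_idx` columns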

In [None]:
import torch

def get_metrics(user_Embed_wts, item_Embed_wts, n_users, n_items, train_data, test_data, K):
  """Compute mean Recall@K and Precision@K over the users present in test_data.

  Every user-item pair is scored as the dot product of the user and item
  embeddings. Pairs already seen in ``train_data`` are excluded from the
  ranking, the K best remaining items per user are taken as recommendations,
  and these are compared with the user's items in ``test_data``.

  Parameters
  ----------
  user_Embed_wts : torch.Tensor
      User embeddings; row ``u`` belongs to ``user_id_idx == u``. Must have
      ``n_users`` rows.
  item_Embed_wts : torch.Tensor
      Item embeddings; row ``i`` belongs to ``item_id_idx == i``. Must have
      ``n_items`` rows.
  n_users, n_items : int
      Number of rows / columns of the user-item score matrix.
  train_data, test_data : pandas.DataFrame
      Interactions with integer ``user_id_idx`` and ``item_id_idx`` columns.
  K : int
      Number of recommendations per user.

  Returns
  -------
  (recall, precision)
      Mean recall@K and mean precision@K across test users, in that order.
  """
  # compute the score of all user-item pairs
  relevance_score = torch.matmul(user_Embed_wts, torch.transpose(item_Embed_wts, 0, 1))
  score_device = relevance_score.device

  # boolean (n_users, n_items) mask of the training interactions, built from
  # the `train_data` argument on the same device as the scores. Index
  # assignment is idempotent, so duplicated training pairs are harmless.
  train_user_idx = torch.as_tensor(
    train_data['user_id_idx'].to_numpy(dtype='int64'), device=score_device)
  train_item_idx = torch.as_tensor(
    train_data['item_id_idx'].to_numpy(dtype='int64'), device=score_device)
  seen_in_train = torch.zeros((n_users, n_items), dtype=torch.bool, device=score_device)
  seen_in_train[train_user_idx, train_item_idx] = True

  # mask out training user-item interactions from metric computation.
  # -inf (rather than 0) is required: a zeroed score would still outrank every
  # unseen item whose score is negative.
  relevance_score = relevance_score.masked_fill(seen_in_train, float('-inf'))

  # compute top scoring items for each user (row position == user_id_idx)
  topk_relevance_indices = torch.topk(relevance_score, K).indices
  top_items_per_user = topk_relevance_indices.detach().cpu().numpy().tolist()
  topk_relevance_indices_df = pd.DataFrame({
    'user_ID': range(len(top_items_per_user)),
    'top_rlvnt_itm': top_items_per_user,
  })

  # measure overlap between recommended (top-scoring) and held-out user-item
  # interactions
  test_interacted_items = test_data.groupby('user_id_idx')['item_id_idx'].apply(list).reset_index()
  metrics_df = pd.merge(test_interacted_items, topk_relevance_indices_df,
                        how='left', left_on='user_id_idx', right_on='user_ID')
  metrics_df['intrsctn_itm'] = [
    list(set(held_out).intersection(recommended))
    for held_out, recommended in zip(metrics_df.item_id_idx, metrics_df.top_rlvnt_itm)
  ]  # true positives

  # recall: hits / number of held-out items; precision: hits / K
  metrics_df['recall'] = [
    len(hits) / len(held_out)
    for hits, held_out in zip(metrics_df.intrsctn_itm, metrics_df.item_id_idx)
  ]
  metrics_df['precision'] = [len(hits) / K for hits in metrics_df.intrsctn_itm]

  return metrics_df['recall'].mean(), metrics_df['precision'].mean()

# Further Work
Use event time to influence the weight of each event, i.e. more recent events carry more weight.