# LightGCN Model for a Recommender System (RecSys)

In [1]:
# import matplotlib.pyplot as plt
# import networkx as nx
# import numpy as np
# import pandas as pd
import yaml
# import glob
# import torch
# import torch.nn
from lightgcn import LightGCN
# from torch_geometric.nn import LightGCN
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
# from collections import defaultdict
from utils import *
# pandas and torch are used throughout this notebook (pd.read_csv, torch.device, ...),
# but their explicit imports above are commented out, so they were presumably only
# reachable through `from utils import *`. Import them explicitly so the notebook
# keeps working if utils stops re-exporting them.
import pandas as pd
import torch

# Do not truncate wide cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [2]:
# Use the first CUDA GPU when one is visible to torch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device  # echo the selection; rebind to "cpu" here to force CPU execution

device(type='cuda', index=0)

# Load Config

In [3]:
%cd ..
with open("params.yaml") as config_file:
    config = yaml.safe_load(config_file)
config

/home/ying/GNN-eCommerce


{'base': {'random_seed': 42},
 'data': {'cosmetic_shop': 'data/raw/cosmetic-shop-ecommerce-events/',
  'preprocessed': 'data/preprocessed/'},
 'training': {'event_type_weights': {'view': 0.01,
   'cart': 0.1,
   'remove_from_cart': -0.09,
   'purchase': 1.0}},
 'reports': None}

# Load Interaction Matrix from CSV

In [4]:
# Read the preprocessed interactions and rename product_id to item_id in one chain,
# so the cell yields the same frame however often it is re-run.
interaction_matrix = (
    pd.read_csv(config['data']['preprocessed'] + "interaction_matrix.csv")
    .rename(columns={"product_id": "item_id"})
)

In [5]:
# Keep only the three columns used from here on; copy so later edits to `im`
# cannot touch interaction_matrix.
im = interaction_matrix.loc[:, ['user_id', 'item_id', 'weight']].copy()

In [6]:
# Row count plus the number of distinct users and items in the full interaction table.
print(
    'Total data size: ', len(im),
    ', unique user: ', im['user_id'].nunique(),
    ', unique items: ', im['item_id'].nunique(),
)

Total data size:  10157408 , unique user:  1639358 , unique items:  54571


In [7]:
# Open question: should users who never purchased be dropped up front? Not for now.
# mini_im = purchase_users(im)
# Work on a fixed-seed random subset of the interactions instead.
mini_im = im.sample(n=200000, random_state=1)  # the original trailing note reads "100000"

In [8]:
# Size of the sample, and the length of whatever purchase_users() returns for it
# (purchase_users presumably comes from utils - its exact output is not visible here).
print(
    'Mini dataset size: ', len(mini_im),
    ', Users at least purchased once: ', len(purchase_users(mini_im)),
)
# print('Valid data percentage: ', f'{len(mini_im)/len(im):.2%}')

Mini dataset size:  200000 , Users at least purchased once:  46191


### Prepare Train / Val / Test datasets

In [9]:
# Step 1: hold out 30% of the sample; the remaining 70% is the training set.
train_df, test_df = train_test_split(
    mini_im,
    test_size=0.3,
    random_state=16,
)
# Step 2: cut the held-out 30% in half, giving 15% test and 15% validation.
test_df, val_df = train_test_split(
    test_df,
    test_size=0.5,
    random_state=16,
)

In [10]:
# Distinct users/items in the sample, then the row count of each split.
print(
    'Mini set unique user: ', mini_im['user_id'].nunique(),
    ', unique items: ', mini_im['item_id'].nunique(),
)
print("Train Size  : ", len(train_df))
print("Val Size : ", len(val_df))
print("Test Size : ", len(test_df))

Mini set unique user:  128555 , unique items:  32907
Train Size  :  140000
Val Size :  30000
Test Size :  30000


In [11]:
# prepare_val_test is a project helper whose body is not visible here. Judging by the
# names it returns user/item counts, a reworked train_df and per-user positive-item
# lists for each split - TODO confirm against utils.
# NOTE(review): train_df is rebound here, so re-running this cell alone feeds the
# already-processed frame back into the helper.
(
    n_users,
    n_items,
    train_df,
    train_pos_list_df,
    val_pos_list_df,
    test_pos_list_df,
) = prepare_val_test(train_df, val_df, test_df)  # , val_u_i_matrix, test_u_i_matrix

In [12]:
# Counts returned by prepare_val_test, followed by the length of each prepared frame.
print("Users : ", n_users)
print("Items : ", n_items)
print("Train Size  : ", len(train_df))
print("Val Size : ", len(val_pos_list_df))
print("Test Size : ", len(test_pos_list_df))

Users :  97855
Items :  29177
Train Size  :  140000
Val Size :  1763
Test Size :  1742


In [13]:
# train_df

In [14]:
# train_pos_list_df

In [15]:
# val_pos_list_df

In [16]:
# test_pos_list_df

### Instantiate the model, then train and validate it

In [17]:
# Hyperparameters for this run.
latent_dim = 64   # embedding width, passed as embedding_dim
n_layers = 3      # passed as num_layers
LR = 0.005        # Adam learning rate

# Seed torch from the shared config so any torch-RNG-driven initialisation is the
# same on every Restart & Run All; the config already provides base.random_seed.
# NOTE(review): this seeds torch only; helpers that draw from `random` or numpy
# need their own seeding - confirm in utils.
torch.manual_seed(config['base']['random_seed'])

# Users and items share a single embedding table, hence num_nodes = users + items.
model = LightGCN(
    num_nodes=n_users + n_items,
    embedding_dim=latent_dim,
    num_layers=n_layers,
)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
print("Size of Learnable Embedding : ", [p.shape for p in model.parameters()])

Size of Learnable Embedding :  [torch.Size([127032, 64])]


In [18]:
# Train and validate through the project helper train_and_evl (body not visible here).
# NOTE(review): the helper's progress header prints "precision | recall", while the
# result is unpacked as (recall, precision) - confirm the return order.
bpr_loss, reg_loss, final_loss, recall, precision = train_and_evl(
    n_users,
    n_items,
    100,  # NOTE(review): meaning of this positional argument is not visible here
    train_df,
    train_pos_list_df,
    val_pos_list_df,
    model,
    optimizer,
    device=device,
    EPOCHS=50,
    BATCH_SIZE=512,
    K=20,
    DECAY=0.0001,
)  # val_u_i_matrix,

bpr_loss | reg_loss | final_loss | precision | recall


  0%|          | 0/50 [00:00<?, ?it/s]

0.0225 0.0007 0.0231 0.0006 0.0116
0.0002 0.0008 0.001 0.0005 0.0102
0.0002 0.0007 0.0009 0.0008 0.015
0.0002 0.0006 0.0008 0.0007 0.0135
0.0002 0.0005 0.0007 0.0007 0.0128
0.0001 0.0005 0.0006 0.0006 0.0116
0.0001 0.0004 0.0006 0.0006 0.0108
0.0001 0.0004 0.0005 0.0006 0.0122
0.0001 0.0004 0.0005 0.0006 0.0119
0.0001 0.0003 0.0005 0.0007 0.0128
0.0001 0.0003 0.0005 0.0006 0.0111
0.0001 0.0003 0.0004 0.0007 0.0139
0.0001 0.0003 0.0004 0.0006 0.0119
0.0001 0.0003 0.0004 0.0005 0.0102
0.0001 0.0003 0.0004 0.0005 0.0094
0.0001 0.0003 0.0004 0.0005 0.0088
0.0001 0.0003 0.0004 0.0006 0.0111
0.0001 0.0003 0.0004 0.0004 0.0073
0.0001 0.0003 0.0004 0.0005 0.0094
0.0001 0.0002 0.0004 0.0005 0.009
0.0001 0.0002 0.0004 0.0005 0.0101
0.0001 0.0002 0.0003 0.0005 0.0105
0.0001 0.0002 0.0003 0.0005 0.0085
0.0001 0.0002 0.0003 0.0005 0.0091
0.0001 0.0002 0.0004 0.0005 0.0102
0.0001 0.0002 0.0004 0.0004 0.0076
0.0001 0.0002 0.0003 0.0005 0.0091
0.0001 0.0002 0.0003 0.0005 0.0087
0.0001 0.0002 0.0003 0.

In [None]:
# Build the graph inputs (edge_index, edge_weight) from the training interactions.
# df_to_graph is a project helper; the meaning of the positional `True` flag is not
# visible here - TODO confirm in utils.
edge_index, edge_weight = df_to_graph(train_df, True)
edge_index

In [None]:
# Build a loader over positional indices 0..len(users)-1 in batches of 100.
# NOTE(review): `users` is not defined in any cell shown here (same for pos_items /
# neg_items used below); this cell relies on leftover kernel state.
# NOTE(review): `random` and `DataLoader` are not imported explicitly in this
# notebook - presumably they arrive through `from utils import *`.
idx = list(range(len(users)))
# The indices are shuffled here and again by the loader (shuffle=True).
random.shuffle(idx)
loader = DataLoader(idx, batch_size=100, shuffle=True)
len(idx)

In [None]:
# Debug walk over the loader: each iteration overwrites the batch_* variables, so
# after the loop they hold the values of the LAST batch only. The loop contains no
# backward() or optimizer.step(), so no parameters are updated here.
model.train()
for batch in loader:
    optimizer.zero_grad()

    # Pick the users / positive items / negative items at this batch's indices.
    batch_usr = users[batch]
    batch_pos_items = pos_items[batch]
    batch_neg_items = neg_items[batch]

    # Project helper; presumably builds the (user, item) pairs to score - TODO confirm.
    batch_pos_neg_labels = batch_pos_neg_edges(batch_usr, batch_pos_items, batch_neg_items)

In [None]:
# Inspect the pairs built for the last batch of the loop above.
batch_pos_neg_labels

In [None]:
# batch_pos_neg_labels[1].min()

In [None]:
# Forward pass for the last batch's pairs over the training graph.
# The loss cell below slices `out` as out[:size] / out[size:].
out = model(edge_index, batch_pos_neg_labels, edge_weight)
out

In [None]:
# Number of indices in the last batch drawn from the loader.
size = len(batch)

# The first `size` entries of `out` and the remainder go in as the two rank
# arguments; the result is scaled by the batch size.
# NOTE(review): bpr_loss / reg_loss rebind the names that received train_and_evl's
# results earlier in the notebook.
bpr_loss = model.recommendation_loss(out[:size], out[size:], 0) * size
reg_loss = regularization_loss(
    model.embedding.weight,
    size,
    batch_usr,
    batch_pos_items,
    batch_neg_items,
)
loss = bpr_loss + reg_loss

In [None]:
# bpr_loss_batch_list = []
# reg_loss_batch_list = []
# final_loss_batch_list = []

In [None]:
# bpr_loss_batch_list.append(bpr_loss.item())
# reg_loss_batch_list.append(reg_loss.item())
# final_loss_batch_list.append(loss.item())

In [None]:
# bpr_loss = round(np.mean(bpr_loss_batch_list), 8)
# reg_loss = round(np.mean(reg_loss_batch_list), 8)
# final_loss = round(np.mean(final_loss_batch_list), 8)

In [None]:
# Report the three loss terms for the last batch. The "bpr loss" slot previously
# printed `loss` (the total) instead of `bpr_loss`.
print("bpr loss: ", bpr_loss, "reg loss: ", reg_loss, "final loss", loss)

In [None]:
# bpr_loss_epoch_list = []
# reg_loss_epoch_list = []
# final_loss_epoch_list = []
# recall_epoch_list = []
# precision_epoch_list = []

In [None]:
# Switch to eval mode and compute embeddings without tracking gradients.
model.eval()
with torch.no_grad():
    # NOTE(review): the original author marked this call with "?? ???" - unresolved question.
    embeds = model.get_embedding(edge_index, edge_weight)
    # The first n_users rows are taken as users, the following n_items rows as items.
    final_usr_embed, final_item_embed = torch.split(embeds, [n_users, n_items], dim=0)

In [None]:
# matrix = interact_matrix(train_df, n_users, n_items)

In [None]:
# Top-20 recall / precision from the project helper get_metrics.
# NOTE(review): the results are named test_* but computed on val_pos_list_df.
test_topK_recall, test_topK_precision = get_metrics(
    final_usr_embed,
    final_item_embed,
    val_pos_list_df,
    20,
)

In [None]:
# Show the two metrics computed in the previous cell.
print(
    'precision: ', test_topK_precision,
    'recall: ', test_topK_recall,
)

In [None]:
# Score every user against every item with a dot product of their embeddings.
# This is a dense (n_users x n_items) matrix; with the counts printed earlier
# (97855 x 29177) that is roughly 2.9e9 scores, so memory use is substantial.
relevance_score = final_usr_embed @ final_item_embed.t()
relevance_score.shape

In [None]:
# torch.topk returns (values, indices); keep the indices of the 20 highest scores per row.
topk_relevance_indices = torch.topk(relevance_score, k=20)[1]
topk_relevance_indices[:6]

In [None]:
# Shape check: one row per row of relevance_score, 20 columns (the k used above).
topk_relevance_indices.shape

In [None]:
# t = topk_relevance_indices
# t = torch.flatten(t)
# t.shape

In [None]:
# torch.max(t)

In [None]:
# torch.min(t)

In [None]:
# aa = list([68,561,1949,2478,4236,5117,5320,6173])
# train_df.loc[train_df['user_id_idx'].isin(aa)]

In [None]:
# Copy the top-k index tensor to host memory and wrap it in a DataFrame
# (one row per row of the tensor, one column per rank position).
topk_relevance_indices_df = pd.DataFrame(topk_relevance_indices.cpu().numpy())
topk_relevance_indices_df

In [None]:
# Collapse each user's top-k item indices into a single list-valued column.
# The lists are built from the tensor rather than from `df.values`, so the cell is
# idempotent: `df.values.tolist()` would fold the previously added 'top_rlvnt_itm'
# (and 'user_ID') columns into the lists when this cell is re-run.
topk_relevance_indices_df['top_rlvnt_itm'] = topk_relevance_indices.cpu().numpy().tolist()
topk_relevance_indices_df['top_rlvnt_itm']

In [None]:
# Inspect the frame after the list column was added.
topk_relevance_indices_df

In [None]:
# Copy the frame's index into a 'user_ID' column; the merge below joins it to
# user_id_idx, so the row position is used as the user index.
topk_relevance_indices_df['user_ID'] = topk_relevance_indices_df.index
topk_relevance_indices_df

In [None]:
# Drop the per-rank columns and keep only the user key and its recommendation list.
topk_relevance_indices_df = topk_relevance_indices_df.loc[:, ['user_ID', 'top_rlvnt_itm']]
topk_relevance_indices_df

In [None]:
# test_interacted_items = val_df.groupby('user_id_idx')['item_id_idx'].apply(list).reset_index()
# test_interacted_items

In [None]:
# Inspect the test-split frame returned by prepare_val_test; it is merged with the
# top-k recommendations in the next cell.
test_pos_list_df

In [None]:
# Attach each test user's top-k recommendation list. The left join keeps every row of
# test_pos_list_df, including any without a matching user_ID.
metrics_df = pd.merge(
    left=test_pos_list_df,
    right=topk_relevance_indices_df,
    how='left',
    left_on='user_id_idx',
    right_on='user_ID',
)
metrics_df

In [None]:
# Per row: the items that appear both in item_id_idx_list and in the top-k
# recommendation list.
metrics_df['intrsctn_itm'] = list(
    map(
        lambda truth, recs: list(set(truth).intersection(recs)),
        metrics_df['item_id_idx_list'],
        metrics_df['top_rlvnt_itm'],
    )
)
metrics_df