# LightGCN Recommender System Model

In [None]:
# import matplotlib.pyplot as plt
# import networkx as nx
# import numpy as np
# import pandas as pd
import yaml
from lightgcn import LightGCN
from sklearn.model_selection import train_test_split
from utils import *
# Do not truncate long cell contents when DataFrames are displayed
# (None removes the column-width limit).
# NOTE(review): `pd` is not imported explicitly here (the pandas import above is
# commented out); presumably it arrives through `from utils import *` — confirm.
pd.set_option('display.max_colwidth', None)

In [None]:
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device  # echo the selection as the cell output; to force CPU, assign device = "cpu" instead

In [None]:
# Load Config
# `%cd ..` is an IPython magic that moves the working directory one level up, so
# "params.yaml" below is resolved relative to the parent of the previous working
# directory. Re-running this cell moves up one more level each time.
%cd ..
with open("params.yaml") as config_file:
    # safe_load restricts parsing to plain YAML data (no arbitrary object construction).
    config = yaml.safe_load(config_file)
config  # echo the parsed configuration as the cell output

# Load Interaction Matrix from csv

In [None]:
# Read the preprocessed interaction table and expose the product identifier
# under the name `item_id`, which the rest of the notebook uses.
interaction_matrix = (
    pd.read_csv(config['data']['preprocessed'] + "interaction_matrix.csv")
    .rename(columns={"product_id": "item_id"})
)

In [None]:
# Work on an independent copy holding only the three columns used downstream.
edge_columns = ['user_id', 'item_id', 'weight']
im = interaction_matrix.loc[:, edge_columns].copy()

In [None]:
# Report the number of interaction rows and the distinct user / item counts.
print(
    'Total data size: ', len(im),
    ', unique user: ', im.user_id.nunique(),
    ', unique items: ', im.item_id.nunique(),
)

In [None]:
# Open question: should users who never purchased be removed at the beginning? Not for now.
# Alternative working sets, kept commented out for experimentation:
# mini_im = purchase_users(im)
#mini_im = im.sample(200000)  #100000
# Use the full interaction table; note this is an alias of `im`, not a copy.
mini_im = im

In [None]:
# Report the size of the working set next to the size of purchase_users(mini_im).
# NOTE(review): purchase_users is presumably provided by `utils` and returns the
# entries for users with at least one purchase — confirm.
print(
    'Mini dataset size: ', len(mini_im),
    ', Users at least purchased once: ', len(purchase_users(mini_im)),
)
# print('Valid data percentage: ', f'{len(mini_im)/len(im):.2%}')

### Prepare Train / Val / Test datasets

In [None]:
# Hold out 10% of the interactions, then halve the hold-out: 90% train, 5% test, 5% val.
# No random_state is passed, so the split differs from run to run.
train_df, holdout_df = train_test_split(mini_im, test_size=0.1)
test_df, val_df = train_test_split(holdout_df, test_size=0.5)

In [None]:
# Report distinct users / items in the working set, followed by the row count of each split.
print(
    'Mini set unique user: ', mini_im.user_id.nunique(),
    ', unique items: ', mini_im.item_id.nunique(),
)
for split_label, split_frame in (
    ("Train Size  : ", train_df),
    ("Val Size : ", val_df),
    ("Test Size : ", test_df),
):
    print(split_label, len(split_frame))

In [None]:
# Run the data pipeline over the three splits. `train_df` is rebound to the version
# returned by the helper.
# NOTE(review): the helper is presumably defined in `utils`; an earlier variant also
# returned val_u_i_matrix and test_u_i_matrix.
(
    n_users,
    n_items,
    train_df,
    train_pos_list_df,
    val_pos_list_df,
    test_pos_list_df,
) = prepare_val_test(train_df, val_df, test_df)

In [None]:
#users, pos_items, neg_items = pos_neg_edge_index(train_pos_list_df, 1, n_users, n_items)

In [None]:
# Report the node counts and the sizes of the frames produced by the data pipeline.
print("After data pipline")
print("n_users : ", n_users, ", n_items : ", n_items)
for size_label, frame in (
    ("train_df Size  : ", train_df),
    ("val_pos_list_df Size : ", val_pos_list_df),
    ("test_pos_list_df Size : ", test_pos_list_df),
):
    print(size_label, len(frame))
#print("train set size: ", len(users))

### Instantiate the model, then train and validate it

In [None]:
# --- Hyperparameters ---
latent_dim = 80       # embedding dimension (aim to 128)
n_layers = 3          # value passed to LightGCN as num_layers
LR = 0.005            # learning rate handed to Adam
K = 20                # cut-off for Recall@K
DECAY = 0.0001        # reg loss
BATCH_SIZE = 1024     # train mini batch size
n_neg = 3             # number of negative sample edges per each positive edge

EPOCHS = 50           # total number of epochs
checkpoint_dir = "model-checkpoints"

# The model is sized for users and items together (num_nodes = n_users + n_items).
total_nodes = n_users + n_items
model = LightGCN(num_nodes=total_nodes, embedding_dim=latent_dim, num_layers=n_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Show the shape of every learnable tensor as a quick sanity check.
param_shapes = [param.shape for param in model.parameters()]
print("Size of Learnable Embedding : ", param_shapes)

In [None]:
# Build the graph inputs (edge_index, edge_weight) from the training interactions.
# NOTE(review): the meaning of the positional `True` flag is defined by df_to_graph
# (presumably in `utils`) and is not visible here — confirm, and consider passing it
# by keyword.
edge_index, edge_weight = df_to_graph(train_df, True)

In [None]:
# Run the training / validation loop.
# Fix: a fragment of unrelated prose had been pasted into the middle of the
# `EPOCHS=EPOCHS` keyword argument, which made this cell a syntax error.
# NOTE(review): the five return values are presumably per-epoch histories of the BPR
# loss, regularisation loss, combined loss, and validation recall / precision at K —
# confirm against train_and_evl.
bpr_loss, reg_loss, final_loss, recall, precision = train_and_evl(
    n_users,
    n_items,
    n_neg,
    edge_index,
    edge_weight,
    train_pos_list_df,
    val_pos_list_df,
    model,
    optimizer,
    device=device,
    EPOCHS=EPOCHS,
    BATCH_SIZE=BATCH_SIZE,
    K=K,
    DECAY=DECAY,
    checkpoint_dir=checkpoint_dir,
    log_interval=10,
)

In [None]:
# Load the best checkpoint from the checkpoint directory (presumably written by the
# training loop, which receives the same checkpoint_dir).
# NOTE: despite its name, `best_model` is the checkpoint mapping, not a model instance.
best_ckpt_path = checkpoint_dir + "/LightGCN_best.pt"
best_model = torch.load(best_ckpt_path)

# Metadata stored next to the weights.
best_epoch = best_model['epoch']
best_val_precision = best_model['precision']
best_val_recall = best_model['recall']

# Rebuild a network with the same configuration and load the saved weights into it.
test_model = LightGCN(
    num_nodes=n_users + n_items,
    embedding_dim=latent_dim,
    num_layers=n_layers,
)
test_model.load_state_dict(best_model['model_state_dict'])

In [None]:
# Score the restored model on the held-out test positives.
# Fix: use the shared `K` hyperparameter instead of a literal 20, so the test metrics
# use the same cut-off that is passed to the training / validation loop. With the
# current K = 20 the result is unchanged.
test_p, test_recall = evaluation(
    test_model,
    n_users,
    n_items,
    edge_index,
    edge_weight,
    test_pos_list_df,
    K=K,
)

# Report test metrics next to the validation metrics saved with the checkpoint.
print(f"Best epoch {best_epoch}")
print(f"Test Precision: {test_p:>0.4f}, Recall: {test_recall:>0.4f}")
print(f"Val Precision: {best_val_precision:>0.4f}, Recall: {best_val_recall:>0.4f}")


