In [1]:
# import numpy as np
# import pandas as pd
# from surprise import SVD, Dataset, Reader
# from surprise.model_selection import train_test_split

## Load Data

In [1]:
import numpy as np
import yaml
import matplotlib.pyplot as plt
import recmetrics
from src.lightgcn import LightGCN
from src.train_lightgcn import *
from src.utils_v2 import *
import glob

In [2]:
# Use the first CUDA GPU when PyTorch reports one; otherwise fall back to the CPU.
# NOTE(review): `torch` is not imported by name in the import cell; presumably it
# arrives through one of the star imports there -- confirm.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device  # echo the selected device as the cell output

device(type='cpu')

In [3]:
# Step up one directory, then parse config.yaml from the new working directory.
# NOTE(review): `%cd ..` is relative, so running this cell again in the same
# kernel climbs one more level and the open() below resolves against that folder.
%cd ..
with open("config.yaml") as config_file:
    # Parsed settings; later cells read config['training']['checkpoints_dir'].
    config = yaml.safe_load(config_file)

/Users/yingkang/4thBrain/GNN-eCommerce


In [4]:
# Sub-folder of the configured checkpoints directory; the cells below load the
# processed CSV splits and the saved LightGCN weights from it.
checkpoint_run = '2023-02-15_060043/'
checkpoint_dir = config['training']['checkpoints_dir'] + checkpoint_run

In [5]:
# Read the three processed splits from the checkpoint folder, in the order
# train, test, val.
split_files = ('processed_train.csv', 'processed_test.csv', 'processed_val.csv')
train_df, test_df, val_df = [
    pd.read_csv(checkpoint_dir + file_name) for file_name in split_files
]

In [6]:
# Derive the edge index and edge weights from the training split only.
# NOTE(review): the meaning of the positional `True` flag is defined by
# df_to_graph (pulled in by a star import, not visible here) -- confirm, and
# consider passing it by keyword.
# NOTE(review): this runs before the later cell that rebases
# train_df['item_id_idx']; presumably the graph needs the un-rebased ids -- confirm.
edge_index, edge_weight = df_to_graph(train_df, True)

In [7]:
# Number of distinct user and item indices that occur in the training split.
n_users, n_items = (
    train_df[id_col].nunique() for id_col in ('user_id_idx', 'item_id_idx')
)

In [8]:
# Rebase the training item indices by subtracting the user count.
# NOTE(review): the column is overwritten in place, so running this cell twice
# in one kernel subtracts n_users twice -- run it exactly once after loading.
# NOTE(review): only train_df is shifted; presumably test_df / val_df are
# already in this index space -- confirm.
train_df['item_id_idx'] = train_df['item_id_idx'].sub(n_users)

In [9]:
# Stack train, test and val into one frame with a fresh 0..n-1 index, then
# build the interaction matrix from it via the project helper.
split_frames = [train_df, test_df, val_df]
combined = pd.concat(split_frames, ignore_index=True)
interactions_t = interact_matrix(combined, n_users, n_items)

##### Registered users list + Real purchase data

In [20]:
# Rows whose weight is exactly 1.0 (treated as purchases by this notebook),
# reduced to one row per distinct (user_id, user_id_idx) pair.
is_purchase = combined['weight'] == 1.0
purchased_users = combined.loc[is_purchase]
id_columns = ['user_id', 'user_id_idx']
p_user_list = purchased_users[id_columns].drop_duplicates()
p_user_list
# Recommendations could also be produced for users without purchases, but there
# would be nothing to check them against:
# users_list = combined[['user_id', 'user_id_idx']].drop_duplicates()
# users_list

Unnamed: 0,user_id,user_id_idx
0,492998340,196372
4,568760540,719382
24,451861530,119424
32,604677197,1306361
41,597163052,1163544
...,...,...
9773394,417026247,77289
9773815,505463598,221282
9774575,568939049,722693
9774966,477344956,164067


In [22]:
# Pick target user
target_users = list(p_user_list['user_id_idx'].sample(1))
test_interactions_t = torch.index_select(interactions_t, 0, torch.tensor(target_users)).to_dense()
print(f'Target users are : {target_users}')

Target users are : [867540]


In [None]:
# Purchase rows only (weight exactly 1.0), restricted to the id and weight
# columns that the next cell prints for the target users.
purchase_columns = ['user_id', 'item_id', 'weight', 'user_id_idx', 'item_id_idx']
df = combined.loc[combined['weight'] == 1.0, purchase_columns]
# No groupby here: a bare `df.groupby()` raises TypeError because pandas
# requires `by` or `level`, and the visible cells only use the flat frame.

In [24]:
# Narrow the frame to the rows that belong to the sampled target users and
# print them for comparison with the recommendations further down.
# NOTE(review): `df` is overwritten with its own subset; if target_users is
# re-sampled, re-run the cell that builds `df` before this one.
is_target_user = df['user_id_idx'].isin(target_users)
df = df.loc[is_target_user]
print(f"Real data: \n {df}")

Real data: 
          Unnamed: 0.1  Unnamed: 0    user_id  item_id  weight  user_id_idx  \
2663755       7211099     7211099  577444426  5668346    0.01       867540   
6160834       7211100     7211100  577444426  5759279    1.00       867540   
6200988       7211103     7211103  577444426  5815036    0.01       867540   
6238541       7211102     7211102  577444426  5809912    1.00       867540   
6909725       7211105     7211105  577444426  5854832    1.00       867540   
9411228       7211104     7211104  577444426  5850625    1.00       867540   
9721292       7211106     7211106  577444426  5861721    1.00       867540   
9728928       7211101     7211101  577444426  5809910    1.00       867540   

         item_id_idx  
2663755         6433  
6160834        14591  
6200988        23294  
6238541        21752  
6909725        34266  
9411228        32731  
9721292        36277  
9728928        21750  


## Load saved best LGCN model

In [14]:
# Load the saved checkpoint onto the CPU regardless of where it was trained.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
checkpoint_path = checkpoint_dir + "LightGCN_best.pt"
best_model = torch.load(checkpoint_path, map_location=torch.device('cpu'))

In [15]:
# "latent_dim": 80, "n_layers": 3,
test_model = LightGCN(n_users + n_items, 80, 3)
test_model.load_state_dict(best_model['model_state_dict'])

<All keys matched successfully>

## LGCN Recommendation

In [25]:
# Ask the loaded model for the top-k items for the sampled target users.
# Despite the `_df` suffix, the recorded output of this cell is a tensor.
k = 5
LGCN_rec_df = test_model.recommendK(
    edge_index,
    edge_weight,
    n_users,
    n_items,
    test_interactions_t,
    target_users,
    k,
)
print(f'LGCN Rec for : {target_users}; \n{LGCN_rec_df}')

LGCN Rec for : [867540]; 
tensor([[23643, 21751, 23639, 23642, 34246]])


### MAR@K Plot

In [None]:
# MAR@K score lists for the two baseline recommenders.
# NOTE(review): no visible cell fills these in, so they reach the plot empty.
random_mark, SVD_mark = [], []

In [None]:
# Collect one MAR@K and one MAP@K value from the model for every k in 1..10.
# NOTE(review): `test_pos_list_df` is not defined in any visible cell -- it must
# exist in the kernel (or come from a star import) for this loop to run.
# NOTE(review): this passes `interactions_t`, whereas the single-recommendation
# cell passes `test_interactions_t` -- confirm which one recommendK expects.
# NOTE(review): the loop rebinds `k`, which an earlier cell set to 5.
LGCN_mark, LGCN_mapk = [], []
for k in np.arange(1, 11):
    top_index_df = test_model.recommendK(
        edge_index, edge_weight, n_users, n_items, interactions_t, target_users, k
    )
    mark, mapk, metrics = test_model.MARK_MAPK(test_pos_list_df, top_index_df, k)
    LGCN_mark.append(mark)
    LGCN_mapk.append(mapk)


In [None]:
# One MAR@K curve per recommender; each score list is paired with the label at
# the same position in `names`, and k runs from 1 to 10 on the x-axis.
mark_scores = [random_mark, SVD_mark, LGCN_mark]
names = ['Random Recommender', 'SVD Recommender', 'LGCN Recommender']
index = range(1, 11)

# Open a wide figure first so the recmetrics plot is drawn at this size.
fig = plt.figure(figsize=(15, 7))
recmetrics.mark_plot(mark_scores, model_names=names, k_range=index)

## Hit Rate

## ROC Plot + Precision Recall Plot

## Average Precision

## Prediction Coverage + Coverage Plot

## Maybe not -- Long Tail Plot for item popularity and active users

Some items are extremely popular (a very large number of users show interest in them).

In [None]:
# fig = plt.figure(figsize=(15, 7))
# recmetrics.long_tail_plot(df=interaction_matrix,
#              item_id_column="item_id",
#              interaction_type="received interests",
#              percentage=0.5,
#              x_labels=False)
# interaction_matrix['item_id'].value_counts()