In [1]:
import sys
import pandas as pd
import logging

# Recommenders library: timer, LightGCN and SAR models, data splitter, ranking metrics, constants
from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.models.sar import SAR

# Surprise and scikit-learn imports
from surprise import Dataset, Reader, SVD, accuracy
from sklearn.model_selection import train_test_split

In [2]:
# Load the two input tables from CSV files
df1 = pd.read_csv('rats.csv')
df2 = pd.read_csv('icat.csv')

# Combine them row-wise on the shared 'itemId' key
merged_df = pd.merge(df1, df2, on='itemId')

# Build the modelling frame as a NEW object.
# pd.DataFrame(merged_df) is not guaranteed to copy, so assigning columns on
# its result could also modify merged_df. assign()/drop() return new frames
# and leave merged_df exactly as produced by the merge.
df3 = (
    merged_df
    .assign(
        # replace the category labels with their integer category codes
        Category=lambda frame: frame['Category'].astype('category').cat.codes,
        # expose the id columns under the names used by the rest of the notebook
        userID=lambda frame: frame['userId'],
        itemID=lambda frame: frame['itemId'],
    )
    # the original id columns are now redundant
    .drop(columns=['userId', 'itemId'])
)
df3.head()

Unnamed: 0,rating,ItemName,Category,Quality,userID,itemID
0,1.533462,Restaurant Fake,5,1.647351,13,0
1,1.283205,Restaurant Fake,5,1.647351,15,0
2,1.506836,Restaurant Fake,5,1.647351,17,0
3,1.260289,Restaurant Fake,5,1.647351,19,0
4,3.203168,Restaurant Fake,5,1.647351,23,0


In [3]:
# Number of items to recommend
TOP_K = 10

# Training settings
EPOCHS = 15
BATCH_SIZE = 1024

# Random seed; set to None for non-deterministic results
SEED = DEFAULT_SEED

# Configuration file name
yaml_file = "lightgcn.yaml"

# Column-name mapping.
# NOTE(review): "Category" is mapped as the timestamp column - confirm this is
# intended, since it is not obviously a time value.
header = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="Category",
)

In [4]:
# Stratified 75/25 split of the ratings into train and test sets.
# The seed comes from the notebook-level SEED constant rather than a literal,
# so changing SEED (e.g. to None) affects this step as well.
train, test = python_stratified_split(
    df3,
    ratio=0.75,
    col_user=header["col_user"],
    col_item=header["col_item"],
    seed=SEED,
)

# SAR Model

In [5]:
# Set the root log level to INFO. The level used to be DEBUG, which
# contradicted the stated intent and also enables debug output from every
# other library that logs through the root logger.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)-8s %(message)s')

# SAR model with Jaccard item similarity and time decay enabled.
# NOTE(review): **header passes col_timestamp="Category", so the time decay is
# computed from the 'Category' column - confirm this is intended.
model = SAR(
    similarity_type="jaccard",
    time_decay_coefficient=30,
    time_now=None,
    timedecay_formula=True,
    **header
)

In [6]:
# Train SAR and time it, the same way the LightGCN training further down is timed
with Timer() as train_time:
    model.fit(train)

print("Took {} seconds for training.".format(train_time.interval))

2023-05-15 13:34:47,350 INFO     Collecting user affinity matrix
2023-05-15 13:34:47,357 INFO     Calculating time-decayed affinities
2023-05-15 13:34:47,540 INFO     Creating index columns
2023-05-15 13:34:47,879 INFO     Building user affinity sparse matrix
2023-05-15 13:34:47,894 INFO     Calculating item co-occurrence
2023-05-15 13:34:47,929 INFO     Calculating item similarity
2023-05-15 13:34:47,930 INFO     Using jaccard based similarity
2023-05-15 13:34:47,932 INFO     Done training


In [7]:
# Recommend TOP_K items for the users in `test`; remove_seen=True asks the
# model to leave out items it has already seen for each user
top_k = model.recommend_k_items(
    test,
    top_k=TOP_K,
    remove_seen=True,
)

2023-05-15 13:44:27,543 INFO     Calculating recommendation scores
2023-05-15 13:44:27,640 INFO     Removing seen items


In [12]:
# Lookup table of distinct (itemID, ItemName) pairs, indexed by itemID
item_names = df3[['itemID', 'ItemName']].drop_duplicates().set_index('itemID')

# Attach the item names to the recommendations (inner join on itemID) ...
named_recs = top_k.join(item_names, on='itemID', how='inner')

# ... and order by user and prediction score, both descending
top_k_with_titles = named_recs.sort_values(by=['userID', 'prediction'], ascending=False)
display(top_k_with_titles.head(10))

Unnamed: 0,userID,itemID,prediction,ItemName
693070,99999,2,0.621051,Random Shopping Mall
693071,99999,1,0.619378,Fiction Nightclub
693072,99999,0,0.61197,Restaurant Fake
693073,99999,3,0.604664,Bogus Waterpark
693074,99999,4,0.599879,Unknown Nature Route
693075,99999,6,0.589236,Never Happened Festival
693076,99999,7,0.580514,False Tavern
693077,99999,8,0.552795,MakeBelieve Pub
693078,99999,9,0.551365,Another Sport Event
693079,99999,10,0.513836,Surprise Concert


In [13]:
# The four ranking metrics share one call signature, so the positional and
# keyword arguments are defined once and reused
args = [test, top_k]
kwargs = {
    'col_user': 'userID',
    'col_item': 'itemID',
    'col_rating': 'rating',
    'col_prediction': 'prediction',
    'relevancy_method': 'top_k',
    'k': TOP_K,
}

# Evaluate in the order MAP, NDCG, precision, recall
eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(*args, **kwargs)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

In [14]:
# One line per entry: model name, K, then the four ranking metrics
sar_report = [
    "Model: SAR",
    f"Top K:\t\t {TOP_K}",
    f"MAP:\t\t {eval_map:f}",
    f"NDCG:\t\t {eval_ndcg:f}",
    f"Precision@K:\t {eval_precision:f}",
    f"Recall@K:\t {eval_recall:f}",
]
print("\n".join(sar_report))

Model: SAR
Top K:		 10
MAP:		 0.054901
NDCG:		 0.094697
Precision@K:	 0.026674
Recall@K:	 0.219833


# LightGCN Model

In [15]:
# Wrap the train/test splits in the ImplicitCF data object that is later
# handed to LightGCN; seeded with the notebook-level SEED
data = ImplicitCF(
    train=train,
    test=test,
    seed=SEED,
)

  df = train if test is None else train.append(test)


In [18]:
# Settings passed to prepare_hparams alongside the YAML file
lightgcn_settings = dict(
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.015,
    eval_epoch=5,
    top_k=TOP_K,
    decay=0.0001,
    embed_size=64,
)

hparams = prepare_hparams(yaml_file, **lightgcn_settings)

In [19]:
# Build the LightGCN model from the hyper-parameters and the data object.
# Note: this rebinds `model`, which held the SAR model until now.
model = LightGCN(
    hparams,
    data,
    seed=SEED,
)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [20]:
# Train LightGCN inside a Timer so the elapsed time can be reported
with Timer() as train_time:
    model.fit()

elapsed = train_time.interval
print("Took {} seconds for training.".format(elapsed))

Epoch 1 (train)196.5s: train loss = 0.03209 = (mf)0.03114 + (embed)0.00095
Epoch 2 (train)175.9s: train loss = 0.00060 = (mf)0.00026 + (embed)0.00034
Epoch 3 (train)175.7s: train loss = 0.00046 = (mf)0.00014 + (embed)0.00032
Epoch 4 (train)174.7s: train loss = 0.00040 = (mf)0.00007 + (embed)0.00032
Epoch 5 (train)178.2s + (eval)25.6s: train loss = 0.00037 = (mf)0.00006 + (embed)0.00031, recall = 0.44415, ndcg = 0.20566, precision = 0.05273, map = 0.12741
Epoch 6 (train)182.1s: train loss = 0.00037 = (mf)0.00007 + (embed)0.00031
Epoch 7 (train)176.3s: train loss = 0.00036 = (mf)0.00006 + (embed)0.00030
Epoch 8 (train)176.6s: train loss = 0.00033 = (mf)0.00004 + (embed)0.00029
Epoch 9 (train)175.9s: train loss = 0.00034 = (mf)0.00006 + (embed)0.00028
Epoch 10 (train)177.2s + (eval)24.8s: train loss = 0.00031 = (mf)0.00004 + (embed)0.00027, recall = 0.46330, ndcg = 0.21596, precision = 0.05525, map = 0.13466
Epoch 11 (train)180.1s: train loss = 0.00029 = (mf)0.00003 + (embed)0.00026
Epoch

In [23]:
# Recommend TOP_K items for the users in `test` with the trained LightGCN
# model; remove_seen=True asks it to leave out already-seen items
topk_scores = model.recommend_k_items(
    test,
    top_k=TOP_K,
    remove_seen=True,
)
topk_scores.head(10)

Unnamed: 0,userID,itemID,prediction
0,0,0,-0.707872
1,0,9,-0.784956
2,0,7,-0.894916
3,0,12,-1.025794
4,0,2,-1.049443
5,0,4,-1.128146
6,0,8,-1.315912
7,0,3,-1.326609
8,0,13,-1.546036
9,0,6,-1.555954


In [22]:
# Evaluate the LightGCN recommendations in the order MAP, NDCG, precision, recall.
# Note: these names are rebound here, replacing the SAR metric values.
eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(test, topk_scores, k=TOP_K)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

# One line per entry: model name, then the four ranking metrics
lightgcn_report = [
    "Model: Light GCN",
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print("\n".join(lightgcn_report))

Model: Light GCN
MAP:	0.118860
NDCG:	0.193370
Precision@K:	0.050437
Recall@K:	0.420329


# SVD Model