# Import libraries

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader

from dataset import NCFDataset
from evaluation import Evaluation
from helpers.index_manager import IndexManager
from helpers.splitter import Splitter
from recom_ncf import NCFRecommender

# Loading data

In [2]:
# Name of the time column that is passed on to the datasets and the evaluator.
time_feature = 'z_timestamp'

# Only these columns are used by the notebook.
interaction_columns = ['user_id', 'item_id', 'rating_imp', 'timestamp', time_feature]

# usecols stops the parser from materialising columns that would be thrown
# away immediately. usecols follows the file's column order, so the frame is
# re-selected with the same list to keep the order requested here.
df = pd.read_csv('../data/interaction-clean.csv', usecols=interaction_columns)[interaction_columns]
df.head()

Unnamed: 0,user_id,item_id,rating_imp,timestamp,z_timestamp
0,76561197960432447,10,1,1738278781,0.559647
1,76561198071230926,10,1,1736206418,0.535447
2,76561198206216352,10,1,1738041574,0.556877
3,76561198110801124,10,1,1738015332,0.55657
4,76561199813732773,10,1,1737853720,0.554683


# Indexing data

In [3]:
# Fit an IndexManager on the interaction frame, then rewrite df's user_id /
# item_id columns with the fitted indices (the head below shows ids starting at 0).
index_manager = IndexManager()
index_manager.fit(df_interaction=df)
# NOTE(review): inplace=True overwrites df, so the raw ids shown by the loading
# cell are gone after this line. Re-running this cell on its own would fit on
# already-indexed ids; restart the kernel and run all cells instead.
index_manager.transform_interactions(df, inplace=True)
df.head()

Indexed 836887 users and 69001 items
User index range: 0-836886
Item index range: 0-69000


Unnamed: 0,user_id,item_id,rating_imp,timestamp,z_timestamp
0,0,0,1,1738278781,0.559647
1,1,0,1,1736206418,0.535447
2,2,0,1,1738041574,0.556877
3,3,0,1,1738015332,0.55657
4,4,0,1,1737853720,0.554683


# Creating datasets

In [4]:
# Split the indexed interactions into train / validation / test frames.
splitter = Splitter(df)
df_train, df_val, df_test = splitter.leave_k_out_split()

# Wrap the train and validation frames; df_test stays a frame for the evaluation cell.
train_dataset = NCFDataset(df_train, time_feature=time_feature)
val_dataset = NCFDataset(df_val, time_feature=time_feature)

# Loader settings shared by both splits; only the shuffle flag differs.
loader_kwargs = dict(
    batch_size=16384,
    num_workers=4,
    persistent_workers=True,
    prefetch_factor=2,
    pin_memory=True,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

Splitting data with leave-2-out strategy (1 for validation, 1 for testing)
Total users: 836887
Interactions per user: min=1, max=1035, avg=1.4
Note: 706515 users have fewer than 2 interactions.
These users will be placed entirely in the training set.
Split complete: 1156226 total interactions
Train set: 895482 interactions (77.4%)
Validation set: 130372 interactions (11.3%)
Test set: 130372 interactions (11.3%)


# Training model

In [5]:
# Hyperparameters, forwarded to NCFRecommender as keyword arguments.
params = {
    'factors': 8,
    'mlp_user_item_dim': 256,
    'mlp_time_dim': 8,
    'learning_rate': 0.001,
    'epochs': 10,
    'optimizer': 'adagrad',
    'dropout': 0.0,
    'weight_decay': 0.0001,
    'loss_fn': 'mse',
}

unique_users = index_manager.get_indexed_users()
unique_items = index_manager.get_indexed_items()

# Seed torch's global RNG before the model is built. The training log reports
# random (Gaussian) weight initialisation, and the shuffle=True train loader
# draws its ordering from the global RNG when no generator is passed, so
# without a seed a Restart & Run All does not reproduce the losses or metrics.
# NOTE(review): this covers torch only; if the project helpers sample with
# numpy or random, those need seeding as well — TODO confirm.
SEED = 42
torch.manual_seed(SEED)

model = NCFRecommender(unique_users, unique_items, **params)
model.fit(train_dataloader, eval_dataloader)

All weights initialized with Gaussian distribution (mean=0, std=0.01)
Epoch 1/10
Train loss: 0.244691, Validation loss: 0.230383
Epoch 2/10
Train loss: 0.174671, Validation loss: 0.152157
Epoch 3/10
Train loss: 0.134825, Validation loss: 0.146012
Epoch 4/10
Train loss: 0.127594, Validation loss: 0.143337
Epoch 5/10
Train loss: 0.121328, Validation loss: 0.141221
Epoch 6/10
Train loss: 0.112610, Validation loss: 0.139303
Epoch 7/10
Train loss: 0.098028, Validation loss: 0.138571
Epoch 8/10
Train loss: 0.080500, Validation loss: 0.140322
Early stopping triggered after 8 epochs
Training completed!


# Evaluating model

In [6]:
# Evaluate the trained model on the held-out test frame from the split cell.
# The log below shows predictions being generated for every test user against
# all indexed items, so this cell is slow to run.
evaluator = Evaluation(recommender=model, test_data=df_test, time_feature=time_feature)
# evaluate() returns the metrics object displayed by the next cell.
metrics = evaluator.evaluate()

Starting evaluation preparation...
Creating ground truth sets...
Ground truth created for 130372 users with an average of 1.0 items each
Ground truth creation completed in 1.95 seconds
Extracting timestamps for each user...
Timestamp extraction completed in 0.12 seconds
Generating predictions for 130372 users...
Processing predictions for 130372 users and 69001 items
Processing 1 of 130372 users... (0.00%)
Memory usage: 0.57421875 . Increased user batch size from 128 to 180
Memory usage: 0.57421875 . Increased item batch size from 1024 to 1440
Processing 129 of 130372 users... (0.10%)
Memory usage: 0.456787109375 . Increased user batch size from 180 to 318
Memory usage: 0.456787109375 . Increased item batch size from 1440 to 2546
Processing 309 of 130372 users... (0.24%)
Processing 627 of 130372 users... (0.48%)
Processing 945 of 130372 users... (0.72%)
Processing 1263 of 130372 users... (0.97%)
Processing 1581 of 130372 users... (1.21%)
Processing 1899 of 130372 users... (1.46%)
Proce

In [7]:
# Display the metrics returned by evaluator.evaluate() (Hit Ratio, NDCG and Recall at 10).
metrics

{'Hit Ratio@10': 0.03712453594330071,
 'NDCG@10': 0.017861551634027355,
 'Recall@10': 0.03712453594330071}