# Import libraries

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader

from dataset import NCFDataset
from recom_ncf import NCFRecommender
from evaluation import Evaluation
from helpers.index_manager import IndexManager
from helpers.splitter import Splitter

# Loading data

In [2]:
# Columns kept from the interaction file, in the order they should appear.
INTERACTION_COLUMNS = ['user_id', 'item_id', 'rating_imp', 'timestamp']

# `usecols` makes the parser skip every other column in the file instead of
# loading them only to drop them afterwards. The trailing selection restores
# the order listed above, because `usecols` alone returns file order.
df = pd.read_csv('../data/interaction-clean.csv', usecols=INTERACTION_COLUMNS)[INTERACTION_COLUMNS]
df.head()

Unnamed: 0,user_id,item_id,rating_imp,timestamp
0,76561197960432447,10,1,1738278781
1,76561198071230926,10,1,1736206418
2,76561198206216352,10,1,1738041574
3,76561198110801124,10,1,1738015332
4,76561199813732773,10,1,1737853720


# Indexing data

In [3]:
# Fit an IndexManager on the interaction frame, then rewrite the frame with it.
# In the recorded output the raw user/item ids become contiguous 0-based indices.
index_manager = IndexManager()
index_manager.fit(df_interaction=df)
# NOTE(review): inplace=True mutates `df`, so re-running this cell alone would
# fit and transform ids that are already indexed — presumably not idempotent;
# re-run the loading cell first. TODO confirm against IndexManager.
index_manager.transform_interactions(df, inplace=True)
df.head()

Indexed 836887 users and 69001 items
User index range: 0-836886
Item index range: 0-69000


Unnamed: 0,user_id,item_id,rating_imp,timestamp
0,0,0,1,1738278781
1,1,0,1,1736206418
2,2,0,1,1738041574
3,3,0,1,1738015332
4,4,0,1,1737853720


# Creating datasets

In [4]:
# Leave-k-out split of the indexed interactions into train / validation / test frames.
splitter = Splitter(df)
df_train, df_val, df_test = splitter.leave_k_out_split()

# Loader settings shared by both splits; only `shuffle` differs between them.
loader_kwargs = dict(
    batch_size=16384,
    num_workers=4,
    persistent_workers=True,
    prefetch_factor=2,
    pin_memory=True,
)

train_dataset = NCFDataset(df_train)
val_dataset = NCFDataset(df_val)

# Shuffle only the training batches; validation keeps the dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

Splitting data with leave-2-out strategy (1 for validation, 1 for testing)
Total users: 836887
Interactions per user: min=1, max=1035, avg=1.4
Note: 706515 users have fewer than 2 interactions.
These users will be placed entirely in the training set.
Split complete: 1156226 total interactions
Train set: 895482 interactions (77.4%)
Validation set: 130372 interactions (11.3%)
Test set: 130372 interactions (11.3%)


# Training model

In [5]:
# Seed PyTorch's global RNG so that re-running this cell is repeatable.
# A DataLoader built without an explicit `generator` draws its shuffle seed
# from this global RNG; the model's weight initialisation presumably does too
# (TODO confirm inside NCFRecommender). GPU kernels may still add nondeterminism.
SEED = 42
torch.manual_seed(SEED)

# Hyperparameters forwarded verbatim to NCFRecommender as keyword arguments.
# NOTE(review): in the recorded run the validation loss bottoms out at epoch 7
# (0.138419) and rises afterwards while the train loss keeps falling — consider
# fewer epochs or early stopping.
params = {
    'factors': 8,
    'mlp_user_item_dim': 256,
    'learning_rate': 0.001,
    'epochs': 10,
    'optimizer': 'adagrad',
    'dropout': 0.0,
    'weight_decay': 0.0001,
    'loss_fn': 'mse',
}

# Indexed users and items reported by the index manager fitted on the interactions.
unique_users = index_manager.get_indexed_users()
unique_items = index_manager.get_indexed_items()

# Train on the training loader; the validation loader is passed as the second argument.
model = NCFRecommender(unique_users, unique_items, **params)
model.fit(train_dataloader, eval_dataloader)

All weights initialized with Gaussian distribution (mean=0, std=0.01)
Epoch 1/10
Train loss: 0.245561, Validation loss: 0.235540
Epoch 2/10
Train loss: 0.184829, Validation loss: 0.155125
Epoch 3/10
Train loss: 0.136058, Validation loss: 0.146271
Epoch 4/10
Train loss: 0.127947, Validation loss: 0.143217
Epoch 5/10
Train loss: 0.121494, Validation loss: 0.141107
Epoch 6/10
Train loss: 0.113362, Validation loss: 0.139281
Epoch 7/10
Train loss: 0.099958, Validation loss: 0.138419
Epoch 8/10
Train loss: 0.082749, Validation loss: 0.139902
Epoch 9/10
Train loss: 0.071747, Validation loss: 0.141591
Epoch 10/10
Train loss: 0.066262, Validation loss: 0.142757
Training completed!


# Evaluating model

In [6]:
# Evaluate the trained recommender on the held-out test frame from the split.
evaluator = Evaluation(recommender=model, test_data=df_test)
# `metrics` holds whatever `evaluate()` returns; a later cell displays it.
metrics = evaluator.evaluate()

Starting evaluation preparation...
Creating ground truth sets...
Ground truth created for 130372 users with an average of 1.0 items each
Ground truth creation completed in 1.90 seconds
Generating predictions for 130372 users...
Processing predictions for 130372 users and 69001 items
Processing 1 of 130372 users... (0.00%)
Memory usage: 0.573486328125 . Increased user batch size from 128 to 212
Memory usage: 0.573486328125 . Increased item batch size from 1024 to 1696
Processing 129 of 130372 users... (0.10%)
Memory usage: 0.496826171875 . Increased user batch size from 212 to 405
Memory usage: 0.496826171875 . Increased item batch size from 1696 to 3243
Processing 341 of 130372 users... (0.26%)
Memory usage: 1.356201171875 . Reduced item batch size from 3243 to 1621
Processing 746 of 130372 users... (0.57%)
Memory usage: 0.79736328125 . Increased user batch size from 405 to 483
Memory usage: 0.79736328125 . Increased item batch size from 1621 to 1931
Processing 1151 of 130372 users... 

In [8]:
# Display the evaluation results returned by `evaluator.evaluate()`.
# NOTE(review): the execution count jumps from In [6] to In [8], so a cell was
# run in between and is no longer shown — verify with Restart Kernel → Run All.
metrics

{'Hit Ratio@10': 0.03712453594330071,
 'NDCG@10': 0.01787509744846914,
 'Recall@10': 0.03712453594330071}