# Import libraries

In [8]:
import pandas as pd
from torch.utils.data import DataLoader
from dataset import NCFDataset
from recom_ncf import NCFRecommender
from evaluation import Evaluation
from helpers.index_manager import IndexManager
from helpers.splitter import Splitter
from helpers.collate_fn import collate_fn
from helpers.h5_dataloader import H5DataLoader
import glob

# Loading data

In [9]:
# Load the cleaned interaction log.
# `usecols` limits parsing to the columns needed downstream, so any other
# columns in the CSV are never loaded into memory. The trailing selection keeps
# the column order fixed to the order listed here, independent of the order in
# the file.
interaction_columns = ['user_id', 'item_id', 'rating_imp', 'timestamp']
df = pd.read_csv('../data/interaction-clean.csv', usecols=interaction_columns)[interaction_columns]
df.head()

Unnamed: 0,user_id,item_id,rating_imp,timestamp
0,76561197960432447,10,1,1738278781
1,76561198071230926,10,1,1736206418
2,76561198206216352,10,1,1738041574
3,76561198110801124,10,1,1738015332
4,76561199813732773,10,1,1737853720


In [10]:
# Load the item metadata, keeping only the item identifier.
# `usecols` avoids parsing any other columns of the features file; the trailing
# double-bracket selection keeps the result a one-column DataFrame.
df_metadata = pd.read_csv('../data/metadata-features.csv', usecols=['item_id'])[['item_id']]
df_metadata.head()

Unnamed: 0,item_id
0,1430720
1,1430740
2,1430100
3,1430680
4,1430970


# Indexing data

In [11]:
# Build the user/item index from the interaction frame. The log printed by this
# cell reports the number of indexed users and items and their index ranges.
index_manager = IndexManager()
index_manager.fit(df_interaction=df)
# NOTE(review): with inplace=True these calls presumably rewrite the id columns
# of `df` / `df_metadata` with their indexed values, so re-running this cell on
# already-transformed frames may not be idempotent — confirm against IndexManager.
index_manager.transform_interactions(df, inplace=True)
# This is the cell's last expression, so whatever it returns is displayed below.
index_manager.transform_metadata(df_metadata, inplace=True)

Indexed 836887 users and 69001 items
User index range: 0-836886
Item index range: 0-69000


89949
36882
36872
36881
36886
...
41579
41580
41581
76829
41582


# Creating datasets

In [12]:
# Split the interactions into train / validation / test frames.
# Per the log printed by this cell, the default strategy is leave-2-out
# (1 interaction for validation, 1 for testing per user); users with fewer than
# 2 interactions are placed entirely in the training set.
# NOTE(review): `df` was passed through transform_interactions(..., inplace=True)
# in the indexing cell, so it presumably holds indexed ids here — confirm.
splitter = Splitter(df)
df_train, df_val, df_test = splitter.leave_k_out_split()

Splitting data with leave-2-out strategy (1 for validation, 1 for testing)
Total users: 836887
Interactions per user: min=1, max=1035, avg=1.4
Note: 706515 users have fewer than 2 interactions.
These users will be placed entirely in the training set.
Split complete: 1156226 total interactions
Train set: 895482 interactions (77.4%)
Validation set: 130372 interactions (11.3%)
Test set: 130372 interactions (11.3%)


In [13]:
# Settings shared by the training and validation loaders.
# pin_memory asks the DataLoader to return batches in pinned memory on the
# device named by pin_memory_device.
# Worker options (num_workers=2, persistent_workers=True, prefetch_factor=2)
# are intentionally not set here.
dataloader_params = dict(
    batch_size=2**11,
    pin_memory=True,
    pin_memory_device='cuda',
    collate_fn=collate_fn,
)

# Wrap the train / validation splits; both share the fitted index manager.
train_dataset = NCFDataset(df_train, index_manager=index_manager)
val_dataset = NCFDataset(df_val, index_manager=index_manager)

# Only the training loader shuffles; validation keeps dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **dataloader_params)
eval_dataloader = DataLoader(val_dataset, shuffle=False, **dataloader_params)

In [14]:
import time

# Benchmark: average wall-clock time for the training DataLoader to yield one batch.
#
# One iterator is created up front and reused for every sample. Calling
# iter(train_dataloader) inside the loop would start a fresh pass each time, so
# every sample would measure iterator start-up plus the first batch instead of
# the steady-state per-batch cost.
batch_iterator = iter(train_dataloader)

times = []
for i in range(20):
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    next(batch_iterator)
    times.append(time.perf_counter() - start_time)

# Skip the first sample: it carries the one-off start-up cost of the iterator.
print(sum(times[1:]) / len(times[1:]))

3.9730093102706108


# Training model

In [8]:
# Hyper-parameters forwarded to NCFRecommender as keyword arguments.
params = {
    'factors': 8,
    'mlp_user_item_dim': 128,
    'learning_rate': 0.001,
    'epochs': 5,
    'optimizer': 'adagrad',
    'dropout': 0.0,
    'weight_decay': 0.0001,
    'loss_fn': 'mse',
    'image_dim': 64,
}

# Indexed users and items, as reported by the fitted index manager.
unique_users = index_manager.get_indexed_users()
unique_items = index_manager.get_indexed_items()

# Build the recommender and train it, using the validation loader for evaluation.
model = NCFRecommender(unique_users, unique_items, **params)
model.fit(train_dataloader, eval_dataloader)

Epoch 1/5
Time taken to process a batch: 0.35227513313293457 seconds
Time taken to process a batch: 0.16077852249145508 seconds
Time taken to process a batch: 0.13482022285461426 seconds
Time taken to process a batch: 0.13246822357177734 seconds
Time taken to process a batch: 0.22655320167541504 seconds
Time taken to process a batch: 0.21555280685424805 seconds
Time taken to process a batch: 0.3228273391723633 seconds
Time taken to process a batch: 0.21659159660339355 seconds
Time taken to process a batch: 0.24698328971862793 seconds
Time taken to process a batch: 0.22623825073242188 seconds
Time taken to process a batch: 0.23589396476745605 seconds
Time taken to process a batch: 0.2157297134399414 seconds
Time taken to process a batch: 0.22769808769226074 seconds
Time taken to process a batch: 0.23836827278137207 seconds
Time taken to process a batch: 0.2392868995666504 seconds
Time taken to process a batch: 0.22526121139526367 seconds
Time taken to process a batch: 0.2389616966247558

KeyboardInterrupt: 

# Evaluating model

In [7]:
# Score the recommender on the held-out test interactions.
# NOTE(review): this uses `model` from the training cell. The recorded training
# output ends in KeyboardInterrupt and the saved execution counts are out of
# order, so the metrics shown may not come from a complete 5-epoch fit —
# re-run from a fresh kernel to confirm.
evaluator = Evaluation(recommender=model, test_data=df_test)
metrics = evaluator.evaluate()

Creating ground truth sets...
Generating predictions...
Processing predictions for 130372 users and 69001 items
Processing 1 of 130372 users... (0.00%)
Time taken to process a user batch: 0.5205566883087158 seconds
Memory usage: 0.292724609375 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 128 to 141
Increased item batch size from 1024 to 1126
Processing 129 of 130372 users... (0.10%)
Time taken to process a user batch: 0.4030911922454834 seconds
Memory usage: 0.181884765625 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 141 to 155
Increased item batch size from 1126 to 1239
Processing 270 of 130372 users... (0.21%)
Time taken to process a user batch: 0.4364657402038574 seconds
Memory usage: 0.19677734375 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 155 to 170
Increased item batch size from 1239 to 1363
Processing 425 of 130372 users... (0.33%)
Time taken to process a user batc

In [8]:
# Display the metrics returned by evaluator.evaluate() as this cell's output.
metrics

{'Hit Ratio@10': 0.012011781670910932,
 'NDCG@10': 0.0044630279227807325,
 'Recall@10': 0.012011781670910932}