# Import libraries

In [1]:
import pandas as pd
from torch.utils.data import DataLoader
from dataset import NCFDataset
from recom_ncf import NCFRecommender
from evaluation import Evaluation
from helpers.index_manager import IndexManager
import ast
import json
from helpers.dataloader_custom_functions import collate_fn
from ncf import ModelType
from itertools import combinations

# Loading data

In [2]:
# Interaction splits and item metadata, read from CSV.
# NOTE(review): the "leave2" file names suggest a leave-two-out split — confirm.
df_train = pd.read_csv('../data/train-leave2.csv')
df_val = pd.read_csv('../data/val-leave2.csv')
df_test = pd.read_csv('../data/test-leave2.csv')
df_features = pd.read_csv('../data/metadata.csv')

# Restore the index manager state persisted on disk.
# NOTE(review): the .pkl extension suggests this is unpickled — only load
# files that come from a trusted source.
index_manager = IndexManager()
index_manager.load('../data/index.pkl')

# Decode the JSON with an explicit encoding so the result does not depend on
# the platform's default locale encoding.
with open('../data/feature-dims.json', encoding='utf-8') as file:
    input_dims = json.load(file)

In [3]:
# Per-feature dimension spec handed to the dataset and the recommender.
# NOTE(review): presumably [input size, embedding size] — confirm against NCFRecommender.
feature_dims = {
    'mm_total_recommendation': [1, 16],
    'developers': [43743, 128],
    'genres': [28, 32],
    'publishers': [36059, 128],
}

# Candidate item features, in the same order as the spec above.
features = list(feature_dims)

In [4]:
def str_to_list(df, cols):
    """Parse stringified Python literals in the given columns back into objects.

    Each string value in ``cols`` is evaluated with ``ast.literal_eval``.
    Values that are not strings (already-parsed lists, missing values) are
    left untouched, so applying the function twice to the same frame is safe.

    Args:
        df: DataFrame to convert; the listed columns are overwritten in place.
        cols: Iterable of column names whose values hold literal strings.

    Returns:
        The same DataFrame object, with the listed columns parsed.

    Raises:
        ValueError, SyntaxError: If a string value is not a valid Python literal.
    """
    def _parse(value):
        # Only strings need parsing; literal_eval rejects other objects.
        return ast.literal_eval(value) if isinstance(value, str) else value

    for col in cols:
        df[col] = df[col].apply(_parse)
    return df

# Columns whose values are parsed from strings by str_to_list.
category_cols = ['developers', 'genres', 'publishers']

# Convert every frame with the same column list, keeping the original names.
df_train, df_val, df_test, df_features = (
    str_to_list(frame, category_cols)
    for frame in (df_train, df_val, df_test, df_features)
)

In [5]:
# range(1, 2) yields only size 1, so each feature is evaluated on its own.
feature_combinations = [
    combo
    for size in range(1, 2)
    for combo in combinations(features, size)
]
print(len(feature_combinations))
print(feature_combinations)

4
[('mm_total_recommendation',), ('developers',), ('genres',), ('publishers',)]


In [6]:
# Loader settings and model hyperparameters are the same for every feature
# combination, so they are built once instead of on every iteration.
# NOTE(review): 'pin_memory_device': 'cuda' assumes a CUDA device is present.
dataloader_params = {
    'batch_size': 2048,
    'num_workers': 6,
    'persistent_workers': True,
    'prefetch_factor': 4,
    'pin_memory': True,
    'pin_memory_device': 'cuda',
    'collate_fn': collate_fn,
}
params = {
    'factors': 32,
    'mlp_user_item_dim': 128,
    'learning_rate': 0.001,
    'epochs': 50,
    'optimizer': 'adagrad',
    'dropout': 0.3,
    'weight_decay': 0.0005,
    'loss_fn': 'mse',
}

# One entry per feature combination, in the order of feature_combinations.
# Re-initialised here so re-running the cell does not append to stale results.
ndcg = []
recall = []
arp = []
poprate = []

# NOTE(review): no random seed is set in the visible cells, so differences
# between combinations may partly reflect run-to-run variation.
for comb in feature_combinations:
    # Restrict the dimension spec to the features in this combination.
    feature_dim = {feature: feature_dims[feature] for feature in comb}

    dataset_params = {
        'df_features': df_features,
        'feature_dims': feature_dim,
    }

    train_dataset = NCFDataset(df_train, **dataset_params)
    val_dataset = NCFDataset(df_val, **dataset_params)

    # Only the training loader shuffles.
    train_dataloader = DataLoader(train_dataset, shuffle=True, **dataloader_params)
    val_dataloader = DataLoader(val_dataset, shuffle=False, **dataloader_params)

    # NOTE(review): these lookups look loop-invariant; hoist them once
    # get_indexed_values is confirmed to be side-effect free.
    unique_users = index_manager.get_indexed_values('user_id')
    unique_items = index_manager.get_indexed_values('item_id')

    # The model receives only the item id plus the feature columns under test.
    model = NCFRecommender(
        unique_users,
        unique_items,
        mlp_feature_dims=feature_dim,
        df_features=df_features[['item_id'] + list(comb)],
        model_type=ModelType.LATE_FUSION,
        **params,
    )
    model.fit(train_dataloader, val_dataloader)

    # Score the fitted model on the held-out test split.
    evaluator = Evaluation(recommender=model, test_data=df_test)
    metrics = evaluator.evaluate()

    ndcg.append(metrics['NDCG@10'])
    recall.append(metrics['Recall@10'])
    arp.append(metrics['ARP@10'])
    poprate.append(metrics['Pop Ratio@10'])

Epoch 1/50
Train loss: 0.179662, Validation loss: 0.227659
Epoch 2/50
Train loss: 0.162369, Validation loss: 0.228319
Epoch 3/50
Train loss: 0.161645, Validation loss: 0.228057
Epoch 4/50
Train loss: 0.161224, Validation loss: 0.228093
Epoch 5/50
Train loss: 0.161034, Validation loss: 0.227995
Epoch 6/50
Train loss: 0.160860, Validation loss: 0.227875
Early stopping triggered after 6 epochs
Training completed!
Creating ground truth sets...
Generating predictions...
Processing 1 of 121682 users... (0.00%)
Memory usage: 0.61328125
Increased user batch size from 128 to 141
Increased item batch size from 1024 to 1126
Processing 129 of 121682 users... (0.11%)
Memory usage: 0.61328125
Increased user batch size from 141 to 155
Increased item batch size from 1126 to 1239
Processing 270 of 121682 users... (0.22%)
Memory usage: 0.61328125
Increased user batch size from 155 to 170
Increased item batch size from 1239 to 1363
Processing 425 of 121682 users... (0.35%)
Memory usage: 0.61328125
Increa

In [7]:
# Collect the per-combination results into one table, one row per feature subset.
metric_columns = {
    'feature': feature_combinations,
    'ndcg': ndcg,
    'recall': recall,
    'arp': arp,
    'poprate': poprate,
}
df_metrics = pd.DataFrame(metric_columns)
df_metrics

Unnamed: 0,feature,ndcg,recall,arp,poprate
0,"(mm_total_recommendation,)",0.013537,0.024161,3281.4,0.8
1,"(developers,)",0.004852,0.014719,86234.1,0.4
2,"(genres,)",0.003399,0.01061,59713.567654,0.295655
3,"(publishers,)",0.001892,0.001964,13696.4,0.4
