# Import libraries

In [1]:
import pandas as pd
from torch.utils.data import DataLoader
from dataset import NCFDataset
from recom_ncf import NCFRecommender
from evaluation import Evaluation
from helpers.index_manager import IndexManager
import ast
import json
from helpers.dataloader_custom_functions import collate_fn

# Loading data

In [2]:
# Interaction splits: one CSV each for training, validation and testing.
df_train = pd.read_csv('../data/train-leave2.csv')
df_val = pd.read_csv('../data/val-leave2.csv')
df_test = pd.read_csv('../data/test-leave2.csv')

# Item metadata; supplies the feature columns used further down.
df_features = pd.read_csv('../data/metadata.csv')

# Restore the previously saved index from disk.
index_manager = IndexManager()
index_manager.load('../data/index.pkl')

# Per-feature input dimensions, stored as JSON next to the data files.
with open('../data/feature-dims.json') as dims_file:
    input_dims = json.load(dims_file)

In [3]:
# Uncomment to run with an empty feature list instead:
# features = []
features = [
    'tags',
    'supported_languages',
    'genres',
    'categories',
    'mm_total_recommendation',
    'z_total_recommendation',
    'log_total_recommendation',
    'mm_price',
    'z_price',
    'log_price',
    'mm_released_date',
    'z_released_date',
    'log_released_date',
]

# Map each feature to an (input_dim, 8) pair. Features that have no entry in
# `input_dims` fall back to an input dimension of 1.
# NOTE(review): the constant 8 is presumably an embedding size -- confirm
# against NCFRecommender / NCFDataset.
feature_dims = {name: (input_dims.get(name, 1), 8) for name in features}

In [4]:
def str_to_list(df, cols):
    """Parse stringified Python literals in the given columns into objects.

    Each string value in ``cols`` is evaluated with ``ast.literal_eval``
    (e.g. ``"['a', 'b']"`` becomes ``['a', 'b']``). Values that are not
    strings -- lists produced by an earlier call, or missing values -- are
    left unchanged, so calling this function again on the same frame (for
    example when the notebook cell is re-run) is a no-op rather than an error.

    Args:
        df: DataFrame whose columns are converted. It is modified in place.
        cols: Iterable of column names to convert.

    Returns:
        The same ``df`` object, to allow ``df = str_to_list(df, cols)``.

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
        ValueError, SyntaxError: If a string value is not a valid literal.
    """
    def _parse(value):
        # Only raw strings need decoding; literal_eval rejects anything else.
        return ast.literal_eval(value) if isinstance(value, str) else value

    for col in cols:
        df[col] = df[col].apply(_parse)
    return df

# Columns stored in the CSV files as stringified lists.
category_cols = ['tags', 'publishers', 'developers', 'genres', 'categories', 'supported_languages']

# Convert the same columns in every frame, in the order train, val, test,
# metadata, and rebind each name to the returned frame.
df_train, df_val, df_test, df_features = (
    str_to_list(frame, category_cols)
    for frame in (df_train, df_val, df_test, df_features)
)

In [5]:
# Evaluation results per feature, keyed by feature name.
metrics = {}

# DataLoader settings shared by the training and validation loaders of every run.
dataloader_params = {
    'batch_size': 2**13,
    'num_workers': 4,
    'persistent_workers': True,
    'prefetch_factor': 4,
    'pin_memory': True,
    'pin_memory_device': 'cuda',
    'collate_fn': collate_fn,
}

# Keyword arguments forwarded unchanged to NCFRecommender for every run.
params = {
    'factors': 32,
    'mlp_user_item_dim': 64,
    'learning_rate': 0.001,
    'epochs': 20,
    'optimizer': 'adagrad',
    'dropout': 0.2,
    'weight_decay': 0.0001,
    'loss_fn': 'mse',
}

# Train and evaluate one model per feature, each run seeing only that feature.
for feature in features:
    feature_dim = {feature: feature_dims[feature]}
    dataset_params = {'df_features': df_features, 'feature_dims': feature_dim}

    train_dataset = NCFDataset(df_train, **dataset_params)
    val_dataset = NCFDataset(df_val, **dataset_params)

    # Only the training loader shuffles.
    train_dataloader = DataLoader(train_dataset, shuffle=True, **dataloader_params)
    val_dataloader = DataLoader(val_dataset, shuffle=False, **dataloader_params)

    unique_users = index_manager.get_indexed_values('user_id')
    unique_items = index_manager.get_indexed_values('item_id')

    # The model receives just the item id and the single feature under test.
    model = NCFRecommender(
        unique_users,
        unique_items,
        mlp_feature_dims=feature_dim,
        df_features=df_features[['item_id', feature]],
        **params,
    )
    model.fit(train_dataloader, val_dataloader)

    evaluator = Evaluation(recommender=model, test_data=df_test)
    metrics[feature] = evaluator.evaluate()

Epoch 1/20
Train loss: 0.235830, Validation loss: 0.239136
Epoch 2/20
Train loss: 0.226644, Validation loss: 0.232569
Epoch 3/20
Train loss: 0.208872, Validation loss: 0.220253
Epoch 4/20
Train loss: 0.181337, Validation loss: 0.211083
Epoch 5/20
Train loss: 0.163239, Validation loss: 0.210726
Epoch 6/20
Train loss: 0.155896, Validation loss: 0.211990
Epoch 7/20
Train loss: 0.152138, Validation loss: 0.212582
Epoch 8/20
Train loss: 0.149553, Validation loss: 0.212140
Early stopping triggered after 8 epochs
Training completed!
Creating ground truth sets...
Generating predictions...
Processing 1 of 121682 users... (0.00%)
Memory usage: 0.213623046875 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 128 to 141
Increased item batch size from 1024 to 1126
Processing 129 of 121682 users... (0.11%)
Memory usage: 0.135498046875 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 141 to 155
Increased item batch size from 1126 to 1

KeyboardInterrupt: 

In [None]:
# Display the per-feature evaluation results gathered in `metrics`.
metrics

In [7]:
def _metric_column(metric_name):
    """Return the values of ``metric_name`` in the order of ``features``.

    A feature with no entry in ``metrics`` (for example because the training
    loop was interrupted before reaching it) yields NaN instead of raising
    ``KeyError``, so the result always has one value per feature. The metric
    name itself is still looked up strictly: a misspelled name raises
    ``KeyError`` for any feature that was evaluated.
    """
    return [
        metrics[feature][metric_name] if feature in metrics else float('nan')
        for feature in features
    ]


hitratio = _metric_column('Hit Ratio@10')
ndcg = _metric_column('NDCG@10')
recall = _metric_column('Recall@10')

In [8]:
# One row per feature, one column per metric.
metric_columns = {
    'feature': features,
    'hitratio': hitratio,
    'ndcg': ndcg,
    'recall': recall,
}
df_metrics = pd.DataFrame(metric_columns)
df_metrics

Unnamed: 0,feature,hitratio,ndcg,recall
0,tags,0.040121,0.01988,0.040121
