# Import libraries

In [1]:
import ast

import pandas as pd
import torch
from torch.utils.data import DataLoader

from dataset import NCFDataset
from evaluation import Evaluation
from helpers.category_index_manager import CategoryIndexManager
from helpers.dataloader_custom_functions import collate_fn
from helpers.get_feature_dims import get_feature_dims
from helpers.index_manager import IndexManager
from helpers.splitter import Splitter
from recom_ncf import NCFRecommender

# Loading data

In [2]:
# List-valued item attributes; the CSV stores each as a stringified Python list.
categorical_metadata = ['tags', 'supported_languages', 'genres', 'categories']

# Numeric item attributes, each available under three prefixes
# (presumably min-max, z-score and log scaling -- confirm with the data prep).
numeric_metadata = [
    'mm_total_recommendation', 'z_total_recommendation', 'log_total_recommendation',
    'mm_price', 'z_price', 'log_price',
    'mm_released_date', 'z_released_date', 'log_released_date',
]

# Every metadata feature evaluated in this notebook, categorical ones first.
metadata_features = categorical_metadata + numeric_metadata

In [3]:
# Load only the columns this notebook uses. `usecols` stops pandas from parsing
# the CSV's other columns into memory; the trailing selection then fixes the
# column order, because `usecols` does not honour the order of the list.
interaction_columns = ['user_id', 'item_id', 'rating_imp', 'timestamp'] + metadata_features
df = pd.read_csv('../data/interaction-metadata.csv', usecols=interaction_columns)[interaction_columns]
df.head()

Unnamed: 0,user_id,item_id,rating_imp,timestamp,tags,supported_languages,genres,categories,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,mm_released_date,z_released_date,log_released_date
0,76561197960432447,10,1,1738278781,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
1,76561198071230926,10,1,1736206418,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
2,76561198206216352,10,1,1738041574,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
3,76561198110801124,10,1,1738015332,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
4,76561199813732773,10,1,1737853720,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],"['Multi-player', 'PvP', 'Online PvP', 'Shared/...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349


In [4]:
# The list-valued metadata columns arrive from the CSV as stringified Python
# lists (e.g. "['Action', 'FPS']"); turn them into real lists.
# 'publishers' and 'developers' are not loaded by this notebook; add them to
# list_columns if they are added back to metadata_features.
list_columns = ['tags', 'supported_languages', 'genres', 'categories']


def _parse_list_cell(value):
    """Parse a stringified Python literal; return non-string values unchanged.

    Passing non-strings through keeps this cell safe to re-run: values that
    were already parsed into lists (or are missing) are left as they are
    instead of making ast.literal_eval raise.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


for column in list_columns:
    df[column] = df[column].apply(_parse_list_cell)

# Indexing data

In [5]:
# Replace raw user/item identifiers with contiguous 0-based indices
# (the log below reports the resulting index ranges).
index_manager = IndexManager()
# Learn the id -> index mappings from the interaction frame.
index_manager.fit(df_interaction=df)
# inplace=True rewrites `df` itself: user_id / item_id hold indices from here on.
# NOTE(review): re-running this cell on the already-indexed `df` may not be
# idempotent -- confirm against IndexManager.
index_manager.transform(df, inplace=True)
df.head()

Indexed 832660 users and 68396 items
User index range: 0-832659
Item index range: 0-68395


Unnamed: 0,user_id,item_id,rating_imp,timestamp,tags,supported_languages,genres,categories,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,mm_released_date,z_released_date,log_released_date
0,0,0,1,1738278781,"[Action, FPS, Multiplayer, Shooter, Classic, T...","[English, French, German, Italian, Spanish - S...",[Action],"[Multi-player, PvP, Online PvP, Shared/Split S...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
1,1,0,1,1736206418,"[Action, FPS, Multiplayer, Shooter, Classic, T...","[English, French, German, Italian, Spanish - S...",[Action],"[Multi-player, PvP, Online PvP, Shared/Split S...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
2,2,0,1,1738041574,"[Action, FPS, Multiplayer, Shooter, Classic, T...","[English, French, German, Italian, Spanish - S...",[Action],"[Multi-player, PvP, Online PvP, Shared/Split S...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
3,3,0,1,1738015332,"[Action, FPS, Multiplayer, Shooter, Classic, T...","[English, French, German, Italian, Spanish - S...",[Action],"[Multi-player, PvP, Online PvP, Shared/Split S...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
4,4,0,1,1737853720,"[Action, FPS, Multiplayer, Shooter, Classic, T...","[English, French, German, Italian, Spanish - S...",[Action],"[Multi-player, PvP, Online PvP, Shared/Split S...",0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349


# Splitting data

In [6]:
# Split the interactions into train / validation / test frames.
# leave_k_out_split() is called with its defaults; the log below reports a
# leave-2-out strategy (1 interaction for validation, 1 for test) with users
# that have fewer than 2 interactions kept entirely in the training set.
# NOTE(review): how the held-out interactions are chosen (e.g. latest by
# timestamp) is not visible here -- confirm in Splitter.
splitter = Splitter(df)
df_train, df_val, df_test = splitter.leave_k_out_split()

Splitting data with leave-2-out strategy (1 for validation, 1 for testing)
Total users: 832660
Interactions per user: min=1, max=1034, avg=1.4
Note: 703199 users have fewer than 2 interactions.
These users will be placed entirely in the training set.
Split complete: 1149955 total interactions
Train set: 891033 interactions (77.5%)
Validation set: 129461 interactions (11.3%)
Test set: 129461 interactions (11.3%)


In [7]:
# Feature ablation: train and evaluate one NCF model per metadata feature so
# the ranking metrics of the individual features can be compared.
metrics = {}  # feature name -> dict returned by Evaluation.evaluate()

# One metadata row per item. The explicit copy makes sure the in-place
# transform below works on an independent frame and avoids a possible
# SettingWithCopyWarning on the result of drop_duplicates().
df_features = df[['item_id'] + metadata_features].drop_duplicates(subset='item_id').copy()

# List-valued columns that CategoryIndexManager has to fit and transform.
categorical_features = ['tags', 'supported_languages', 'genres', 'categories']
category_index_manager = CategoryIndexManager()
category_index_manager.fit(df_features, categorical_features)
category_index_manager.transform(df_features, categorical_features, inplace=True)

# Everything below is identical for every feature, so it is built once
# instead of on every loop iteration.
SEED = 42

dataloader_params = {
    'batch_size': 2**13,
    'num_workers': 4,
    'persistent_workers': True,
    'prefetch_factor': 4,
    'pin_memory': True,
    'pin_memory_device': 'cuda',  # requires a CUDA-capable machine
    'collate_fn': collate_fn,
}

params = {
    'factors': 32,
    'mlp_user_item_dim': 64,
    'learning_rate': 0.005,
    'epochs': 1,
    'optimizer': 'adagrad',
    'dropout': 0.2,
    'weight_decay': 0.0001,
    'loss_fn': 'mse',
}

unique_users = index_manager.get_indexed_users()
unique_items = index_manager.get_indexed_items()

for feature in metadata_features:
    # Label the training/evaluation log so each run can be told apart.
    print(f'Feature: {feature}')

    # Start every run from the same torch RNG state so differences between
    # features are not just initialisation/shuffling noise.
    # NOTE(review): this only seeds torch; any numpy/random use inside the
    # project modules is not covered.
    torch.manual_seed(SEED)

    # NOTE(review): dims are computed from the full interaction frame `df`,
    # not from the category-indexed `df_features` -- confirm that this is
    # what get_feature_dims expects.
    feature_dims = get_feature_dims(df, [feature])

    dataset_params = {
        'df_features': df_features,
        'feature_dims': feature_dims,
    }

    train_dataset = NCFDataset(df_train, **dataset_params)
    val_dataset = NCFDataset(df_val, **dataset_params)

    # Only the training data is shuffled; validation order stays fixed.
    train_dataloader = DataLoader(train_dataset, shuffle=True, **dataloader_params)
    val_dataloader = DataLoader(val_dataset, shuffle=False, **dataloader_params)

    model = NCFRecommender(
        unique_users,
        unique_items,
        mlp_feature_dims=feature_dims,
        df_features=df_features,
        **params,
    )
    model.fit(train_dataloader, val_dataloader)

    # Score the freshly trained model on the held-out test interactions.
    evaluator = Evaluation(recommender=model, test_data=df_test)
    metrics[feature] = evaluator.evaluate()

Epoch 1/1
Train loss: 0.181554, Validation loss: 0.152132
Training completed!
Creating ground truth sets...
Generating predictions...
Processing 1 of 129461 users... (0.00%)
Memory usage: 0.208984375 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 128 to 141
Increased item batch size from 1024 to 1126
Processing 129 of 129461 users... (0.10%)
Memory usage: 0.141845703125 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 141 to 155
Increased item batch size from 1126 to 1239
Processing 270 of 129461 users... (0.21%)
Memory usage: 0.152099609375 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 155 to 170
Increased item batch size from 1239 to 1363
Processing 425 of 129461 users... (0.33%)
Memory usage: 0.169677734375 . Increasing batch size with increasing rate of 1.1
Increased user batch size from 170 to 187
Increased item batch size from 1363 to 1499
Processing 595 of 129461 users... (

KeyboardInterrupt: 

In [8]:
# Show the raw per-feature metrics collected by the training loop above.
# NOTE(review): the saved output lists 'publishers' and 'developers', which are
# not in metadata_features -- presumably left over from an earlier run; re-run
# the notebook to refresh it.
metrics

{'publishers': {'Hit Ratio@10': 0.013007778404307089,
  'NDCG@10': 0.008701784235183826,
  'Recall@10': 0.013007778404307089},
 'developers': {'Hit Ratio@10': 0.009732660801322407,
  'NDCG@10': 0.005949515582659989,
  'Recall@10': 0.009732660801322407}}

In [9]:
def _metric_column(metric_name):
    """Collect one metric across metadata_features, in that order.

    Features missing from `metrics` (for example because the training loop
    was interrupted before reaching them) yield NaN instead of raising
    KeyError, so the returned list always lines up with metadata_features.
    """
    return [
        metrics.get(feature, {}).get(metric_name, float('nan'))
        for feature in metadata_features
    ]


hitratio = _metric_column('Hit Ratio@10')
ndcg = _metric_column('NDCG@10')
recall = _metric_column('Recall@10')

In [10]:
# Assemble the per-feature ranking metrics into one table, one row per feature.
metric_columns = dict(
    feature=metadata_features,
    hitratio=hitratio,
    ndcg=ndcg,
    recall=recall,
)
df_metrics = pd.DataFrame(metric_columns)
df_metrics

Unnamed: 0,feature,hitratio,ndcg,recall
0,publishers,0.033137,0.017441,0.033137
1,developers,0.018662,0.010835,0.018662
2,tags,0.033925,0.017372,0.033925
3,supported_languages,0.026016,0.014591,0.026016
4,genres,0.01962,0.011927,0.01962
5,categories,0.028225,0.014162,0.028225
6,mm_total_recommendation,0.026456,0.01467,0.026456
7,z_total_recommendation,0.01748,0.010773,0.01748
8,log_total_recommendation,0.021396,0.013094,0.021396
9,mm_price,0.01232,0.008114,0.01232
