# Import libraries

In [1]:
import pandas as pd
from torch.utils.data import DataLoader
from dataset import NCFDataset
from recom_ncf import NCFRecommender
from evaluation import Evaluation
from helpers.index_manager import IndexManager
from helpers.splitter import Splitter
import ast

# Loading data

In [2]:
# Load the cleaned interaction log, then keep only the four columns used downstream.
interaction_columns = ['user_id', 'item_id', 'rating_imp', 'timestamp']
df = pd.read_csv('../data/interaction-clean.csv')
df = df[interaction_columns]
df.head()

Unnamed: 0,user_id,item_id,rating_imp,timestamp
0,76561197960432447,10,1,1738278781
1,76561198071230926,10,1,1736206418
2,76561198206216352,10,1,1738041574
3,76561198110801124,10,1,1738015332
4,76561199813732773,10,1,1737853720


In [3]:
# Load the per-item metadata table (raw attributes plus pre-computed feature columns).
metadata_csv_path = '../data/metadata-features-extracted.csv'
df_metadata = pd.read_csv(metadata_csv_path)
df_metadata.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,...,pub_encoded,dev_encoded,tag_encoded,lang_encoded,cat_encoded,gen_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474


In [4]:
# The *_encoded columns come back from the CSV as text (e.g. "[0, 0, 1, ...]"), so each
# cell is parsed into a Python list with ast.literal_eval.
# Cells that are already lists are passed through untouched: this makes the notebook
# cell safe to re-run on a live kernel, where ast.literal_eval would otherwise raise
# ValueError on non-string input. Any other non-string value (e.g. NaN) still raises.
encoded_columns = ['pub_encoded', 'dev_encoded', 'tag_encoded', 'lang_encoded', 'gen_encoded', 'cat_encoded']
for encoded_column in encoded_columns:
    df_metadata[encoded_column] = df_metadata[encoded_column].apply(
        lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
    )

In [5]:
# Candidate metadata features, evaluated one at a time by the training loop.
# Two families: multi-hot "<prefix>_encoded" columns, and three scalar attributes
# each available under three scalings ("mm", "z", "log").
encoded_prefixes = ('pub', 'dev', 'tag', 'lang', 'gen', 'cat')
scaling_prefixes = ('mm', 'z', 'log')
scaled_attributes = ('total_recommendation', 'price', 'released_date')

metadata_features = [f'{prefix}_encoded' for prefix in encoded_prefixes]
metadata_features += [
    f'{scaling}_{attribute}'
    for attribute in scaled_attributes
    for scaling in scaling_prefixes
]

# Indexing data

In [6]:
# Fit the index manager on the interaction table; the log printed by this cell reports
# how many users and items were indexed and their index ranges.
index_manager = IndexManager()
index_manager.fit(df_interaction=df)
# Both transforms are called with inplace=True, so df and df_metadata are presumably
# rewritten in place; the df.head() output of this cell shows user_id / item_id as
# indices starting at 0 instead of the raw ids.
# NOTE(review): because of the in-place rewrite, re-running this cell without reloading
# the CSVs would feed already-indexed ids back into fit() — TODO confirm how
# IndexManager behaves in that case.
index_manager.transform_interactions(df, inplace=True)
index_manager.transform_metadata(df_metadata, inplace=True)
df.head()

Indexed 836887 users and 69001 items
User index range: 0-836886
Item index range: 0-69000


Unnamed: 0,user_id,item_id,rating_imp,timestamp
0,0,0,1,1738278781
1,1,0,1,1736206418
2,2,0,1,1738041574
3,3,0,1,1738015332
4,4,0,1,1737853720


# Splitting data

In [7]:
# Split the indexed interactions into train / validation / test frames.
# leave_k_out_split() is called with its defaults; per the log of this run that is a
# leave-2-out strategy (1 interaction for validation, 1 for testing), and users with
# fewer than 2 interactions are kept entirely in the training set.
splitter = Splitter(df)
df_train, df_val, df_test = splitter.leave_k_out_split()

Splitting data with leave-2-out strategy (1 for validation, 1 for testing)
Total users: 836887
Interactions per user: min=1, max=1035, avg=1.4
Note: 706515 users have fewer than 2 interactions.
These users will be placed entirely in the training set.
Split complete: 1156226 total interactions
Train set: 895482 interactions (77.4%)
Validation set: 130372 interactions (11.3%)
Test set: 130372 interactions (11.3%)


In [8]:
# Train one NCF model per candidate metadata feature and record its test-set metrics.
# Result: metrics[feature] -> dict returned by Evaluation.evaluate() for that feature.
metrics = {}

# Loop-invariant settings are built once instead of on every iteration.
# DataLoader options shared by the train and validation loaders (only `shuffle` differs).
loader_kwargs = dict(batch_size=16384, num_workers=4, persistent_workers=True, prefetch_factor=2, pin_memory=True)
# Hyper-parameters that do not depend on the feature being evaluated.
base_params = {'factors': 16, 'mlp_user_item_dim': 32, 'learning_rate': 0.001, 'epochs': 100, 'optimizer': 'adagrad', 'dropout': 0.5, 'weight_decay': 1e-05, 'loss_fn': 'mse'}
# Assumes these getters depend only on the already-fitted index_manager and that the
# model does not mutate what they return — TODO confirm.
unique_users = index_manager.get_indexed_users()
unique_items = index_manager.get_indexed_items()

for feature in metadata_features:
    # Datasets expose only the single feature under evaluation.
    train_dataset = NCFDataset(df_train, df_metadata=df_metadata, metadata_features=[feature])
    val_dataset = NCFDataset(df_val, df_metadata=df_metadata, metadata_features=[feature])

    train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    eval_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

    # Input dimensionality of the selected feature, as reported by the dataset.
    metadata_feature_dims = train_dataset.get_feature_dims()

    # A fresh embedding-dims list is created per iteration so no mutable state is
    # shared between models; one embedding size (8) for the one feature in use.
    params = {**base_params, 'mlp_metadata_embedding_dims': [8]*1, 'mlp_metadata_feature_dims': metadata_feature_dims}

    model = NCFRecommender(unique_users, unique_items, **params)
    model.fit(train_dataloader, eval_dataloader)

    # NOTE(review): metrics are computed on df_test; if they are used to pick features,
    # consider ranking on validation data and keeping the test set for the final model.
    evaluator = Evaluation(recommender=model, test_data=df_test, df_metadata=df_metadata, metadata_features=[feature])
    metrics[feature] = evaluator.evaluate()

Items in interactions: 62465
Items in metadata: 91284
Items in interactions with metadata: 61907
Items in interactions WITHOUT metadata: 558
Items in interactions: 28601
Items in metadata: 91284
Items in interactions with metadata: 28361
Items in interactions WITHOUT metadata: 240
All weights initialized with Gaussian distribution (mean=0, std=0.01)
Epoch 1/100
Train loss: 0.248616, Validation loss: 0.247876
Epoch 2/100
Train loss: 0.247203, Validation loss: 0.246832
Epoch 3/100
Train loss: 0.246217, Validation loss: 0.245971
Epoch 4/100
Train loss: 0.245341, Validation loss: 0.245155
Epoch 5/100
Train loss: 0.244477, Validation loss: 0.244316
Epoch 6/100
Train loss: 0.243556, Validation loss: 0.243387
Epoch 7/100
Train loss: 0.242498, Validation loss: 0.242285
Epoch 8/100
Train loss: 0.241208, Validation loss: 0.240929
Epoch 9/100
Train loss: 0.239604, Validation loss: 0.239236
Epoch 10/100
Train loss: 0.237600, Validation loss: 0.237131
Epoch 11/100
Train loss: 0.235158, Validation l

In [9]:
# Display the raw per-feature metrics dict filled in by the training loop.
metrics

{'pub_encoded': {'Hit Ratio@10': 0.032836805449022795,
  'NDCG@10': 0.014553707970829602,
  'Recall@10': 0.032836805449022795},
 'dev_encoded': {'Hit Ratio@10': 0.028802196790721935,
  'NDCG@10': 0.011136635010792392,
  'Recall@10': 0.028802196790721935},
 'tag_encoded': {'Hit Ratio@10': 0.026240296996287545,
  'NDCG@10': 0.013835045237743673,
  'Recall@10': 0.026240296996287545},
 'lang_encoded': {'Hit Ratio@10': 0.023402264289878193,
  'NDCG@10': 0.012177368863336448,
  'Recall@10': 0.023402264289878193},
 'gen_encoded': {'Hit Ratio@10': 0.028180897738778264,
  'NDCG@10': 0.013459431920974001,
  'Recall@10': 0.028180897738778264},
 'cat_encoded': {'Hit Ratio@10': 0.01790261712637683,
  'NDCG@10': 0.008837353163368047,
  'Recall@10': 0.01790261712637683},
 'mm_total_recommendation': {'Hit Ratio@10': 0.03585125640474949,
  'NDCG@10': 0.01741366467396306,
  'Recall@10': 0.03585125640474949},
 'z_total_recommendation': {'Hit Ratio@10': 0.0001764182493173381,
  'NDCG@10': 5.46762184192287

In [10]:
# Pull each metric out of `metrics` as a list aligned with metadata_features order.
hitratio, ndcg, recall = (
    [metrics[feature][metric_key] for feature in metadata_features]
    for metric_key in ('Hit Ratio@10', 'NDCG@10', 'Recall@10')
)

In [11]:
# One row per evaluated feature, one column per metric.
metric_columns = {
    'feature': metadata_features,
    'hitratio': hitratio,
    'ndcg': ndcg,
    'recall': recall,
}
df_metrics = pd.DataFrame(data=metric_columns)
df_metrics

Unnamed: 0,feature,hitratio,ndcg,recall
0,pub_encoded,0.032837,0.014554,0.032837
1,dev_encoded,0.028802,0.011137,0.028802
2,tag_encoded,0.02624,0.013835,0.02624
3,lang_encoded,0.023402,0.012177,0.023402
4,gen_encoded,0.028181,0.013459,0.028181
5,cat_encoded,0.017903,0.008837,0.017903
6,mm_total_recommendation,0.035851,0.017414,0.035851
7,z_total_recommendation,0.000176,5.5e-05,0.000176
8,log_total_recommendation,0.000176,5.5e-05,0.000176
9,mm_price,0.037132,0.017961,0.037132
