# Import libraries

In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from dataset import NCFDataset
from recom_ncf import NCFRecommender
from evaluation import Evaluation
from helpers.index_manager import IndexManager
from helpers.splitter import Splitter

# Loading data

In [2]:
# Names of the item-metadata columns handed to NCFDataset and Evaluation below.
# A wider set that is currently disabled:
#   ['log_total_recommendation', 'cat_encoded', 'gen_encoded', 'publishers']
metadata_features = [
    'log_total_recommendation',
]

In [3]:
# Read the cleaned interaction log and keep only the four columns used in
# this notebook, in this order.
df = pd.read_csv('../data/interaction-clean.csv')[
    ['user_id', 'item_id', 'rating_imp', 'timestamp']
]
df.head()

Unnamed: 0,user_id,item_id,rating_imp,timestamp
0,76561197960432447,10,1,1738278781
1,76561198071230926,10,1,1736206418
2,76561198206216352,10,1,1738041574
3,76561198110801124,10,1,1738015332
4,76561199813732773,10,1,1737853720


In [4]:
# Load the pre-extracted item metadata (one row per item, keyed by item_id).
# List-valued columns arrive as strings here and are parsed in a later cell.
df_metadata = pd.read_csv('../data/metadata-features-extracted.csv')
df_metadata.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,...,mm_price,z_price,log_price,lang_encoded,cat_encoded,gen_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",...,0.003679,-0.049534,6.549651,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",...,0.001047,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",...,0.001047,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",...,0.015784,1.516628,8.006034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",...,0.007889,0.495218,7.312553,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474


# Encoding categorical metadata features

In [5]:
import ast

# List-valued columns are read from the CSV as strings, so each one is parsed
# back into a Python list. Pairs are (destination column, source column): the
# first five are parsed in place, while the parsed 'tags' lists are written to
# a new 'tag_encoded' column and the raw 'tags' column is left as-is.
for parsed_col, raw_col in [
    ('publishers', 'publishers'),
    ('developers', 'developers'),
    ('lang_encoded', 'lang_encoded'),
    ('gen_encoded', 'gen_encoded'),
    ('cat_encoded', 'cat_encoded'),
    ('tag_encoded', 'tags'),
]:
    df_metadata[parsed_col] = df_metadata[raw_col].apply(ast.literal_eval)

In [6]:
# Rank publishers and developers by the summed `total_recommendations` of
# their items and keep the names of the 256 highest-ranked of each.
pop_pub = (
    df_metadata
    .explode(column='publishers')
    .groupby(by='publishers')['total_recommendations']
    .sum()
    .reset_index()
    .sort_values(by='total_recommendations', ascending=False)
    .head(256)['publishers']
    .unique()
)
pop_dev = (
    df_metadata
    .explode(column='developers')
    .groupby(by='developers')['total_recommendations']
    .sum()
    .reset_index()
    .sort_values(by='total_recommendations', ascending=False)
    .head(256)['developers']
    .unique()
)
# todo: add pop_tag
# pop_tag = df_metadata.explode(column='tags').groupby(by='tags').count().reset_index().sort_values(by='count', ascending=False)
# pop_tag = pop_tag[:256]['tags'].unique()

In [7]:
import hashlib

def encode_high_cardinality(values, top_values, num_buckets=256):
    """Multi-hot encode ``values`` with dedicated slots plus hashed buckets.

    The returned list has ``len(top_values) + num_buckets`` positions:

    * a value present in ``top_values`` sets the slot at its index in
      ``top_values``;
    * any other value is hashed (MD5 of ``str(value)``) into one of the
      ``num_buckets`` trailing slots, so distinct rare values may share a slot.

    Args:
        values: Iterable of values to encode for one row.
        top_values: Sequence (list, tuple, NumPy array, ...) of hashable
            values that each receive a dedicated slot.
        num_buckets: Number of trailing hash buckets for all other values.

    Returns:
        A list of ``0``/``1`` ints of length ``len(top_values) + num_buckets``.
    """
    num_top = len(top_values)
    encoding = [0] * (num_top + num_buckets)

    # One dict serves both membership and index lookup. Testing
    # `val in top_values` directly would rescan the whole sequence for every
    # value (and do an element-wise comparison on a NumPy array).
    value_to_idx = {val: idx for idx, val in enumerate(top_values)}

    for val in values:
        try:
            idx = value_to_idx.get(val)
        except TypeError:
            # Unhashable values can never be dict keys, so they are never top
            # values; route them to the hash buckets like any other value.
            idx = None

        if idx is not None:
            encoding[idx] = 1
        else:
            # MD5 rather than built-in hash(): the bucket must be identical
            # across interpreter runs so encodings stay reproducible.
            digest = hashlib.md5(str(val).encode()).hexdigest()
            encoding[num_top + int(digest, 16) % num_buckets] = 1

    return encoding

# Quick sanity check: 'a' is a top value (slot 0), while 'c' and 'e' are not
# and are hashed into the two trailing bucket slots.
encode_high_cardinality(['a', 'c', 'e'], ['a'], num_buckets=2)

[1, 1, 1]

In [8]:
# Replace each row's list of publisher / developer names with its multi-hot
# encoding against the corresponding popular-name vocabulary.
for column, top_values in (('publishers', pop_pub), ('developers', pop_dev)):
    df_metadata[column] = df_metadata[column].apply(
        lambda names, top=top_values: encode_high_cardinality(names, top)
    )
df_metadata.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,...,z_price,log_price,lang_encoded,cat_encoded,gen_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date,tag_encoded
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",...,-0.049534,6.549651,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931,"[Turn-Based Tactics, Strategy, Wargame, Auto B..."
1,Mine Crazy: The Korean Grinder,1430740,['English'],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",...,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682,"[Casual, RPG, Simulation, Clicker, Farming Sim..."
2,Fade,1430100,['English'],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",...,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149,"[Horror, RPG, Survival Horror, Top-Down Shoote..."
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",...,1.516628,8.006034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398,"[Action, Adventure, RPG, Souls-like, Singlepla..."
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",...,0.495218,7.312553,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474,"[Early Access, Visual Novel, Card Battler, Car..."


# Indexing data

In [9]:
# Fit the index manager on the interaction frame, then rewrite both frames
# in place (inplace=True) so they use its indices instead of raw ids.
# NOTE(review): presumably maps user_id/item_id to contiguous 0-based ints,
# as the printed index ranges suggest — confirm in helpers/index_manager.
index_manager = IndexManager()
index_manager.fit(df_interaction=df)
index_manager.transform_interactions(df, inplace=True)
index_manager.transform_metadata(df_metadata, inplace=True)
df.head()

Indexed 836887 users and 69001 items
User index range: 0-836886
Item index range: 0-69000


Unnamed: 0,user_id,item_id,rating_imp,timestamp
0,0,0,1,1738278781
1,1,0,1,1736206418
2,2,0,1,1738041574
3,3,0,1,1738015332
4,4,0,1,1737853720


# Creating datasets

In [10]:
# Split the indexed interactions into train / validation / test frames.
splitter = Splitter(df)
df_train, df_val, df_test = splitter.leave_k_out_split()

# Both datasets are built against the same metadata frame and feature list.
dataset_kwargs = dict(df_metadata=df_metadata, metadata_features=metadata_features)
train_dataset = NCFDataset(df_train, **dataset_kwargs)
val_dataset = NCFDataset(df_val, **dataset_kwargs)

# Loader settings shared by training and validation; only `shuffle` differs.
loader_kwargs = dict(
    batch_size=16384,
    num_workers=4,
    persistent_workers=True,
    prefetch_factor=2,
    pin_memory=True,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

Splitting data with leave-2-out strategy (1 for validation, 1 for testing)
Total users: 836887
Interactions per user: min=1, max=1035, avg=1.4
Note: 706515 users have fewer than 2 interactions.
These users will be placed entirely in the training set.
Split complete: 1156226 total interactions
Train set: 895482 interactions (77.4%)
Validation set: 130372 interactions (11.3%)
Test set: 130372 interactions (11.3%)
Items in interactions: 62465
Items in metadata: 91284
Items in interactions with metadata: 61907
Items in interactions WITHOUT metadata: 558
Items in interactions: 28601
Items in metadata: 91284
Items in interactions with metadata: 28361
Items in interactions WITHOUT metadata: 240


# Training model

In [11]:
# Per-feature input sizes as reported by the training dataset.
metadata_feature_dims = train_dataset.get_feature_dims()

# Hyperparameters forwarded to NCFRecommender as keyword arguments.
params = {
    'factors': 8,
    'mlp_user_item_dim': 256,
    'learning_rate': 0.001,
    'epochs': 10,
    'optimizer': 'adagrad',
    'dropout': 0.0,
    'weight_decay': 0.0001,
    'loss_fn': 'mse',
    # One embedding size per metadata feature in use (currently one feature).
    'mlp_metadata_embedding_dims': [8] * 1,
    'mlp_metadata_feature_dims': metadata_feature_dims,
}

unique_users = index_manager.get_indexed_users()
unique_items = index_manager.get_indexed_items()

model = NCFRecommender(unique_users, unique_items, **params)
model.fit(train_dataloader, eval_dataloader)

All weights initialized with Gaussian distribution (mean=0, std=0.01)
Epoch 1/10
Train loss: 0.246424, Validation loss: 0.236327
Epoch 2/10
Train loss: 0.187293, Validation loss: 0.165338
Epoch 3/10
Train loss: 0.152059, Validation loss: 0.159027
Epoch 4/10
Train loss: 0.146095, Validation loss: 0.154211
Epoch 5/10
Train loss: 0.141160, Validation loss: 0.150619
Epoch 6/10
Train loss: 0.138004, Validation loss: 0.148990
Epoch 7/10
Train loss: 0.136482, Validation loss: 0.148569
Epoch 8/10
Train loss: 0.135612, Validation loss: 0.148360
Epoch 9/10
Train loss: 0.134729, Validation loss: 0.148121
Epoch 10/10
Train loss: 0.133539, Validation loss: 0.147829
Training completed!


# Evaluating model

In [12]:
# Score the trained model on the held-out test split, passing the same
# metadata frame and feature list that were used to build the datasets.
evaluator = Evaluation(
    recommender=model,
    test_data=df_test,
    df_metadata=df_metadata,
    metadata_features=metadata_features,
)
metrics = evaluator.evaluate()

Starting evaluation preparation...
Creating ground truth sets...
Ground truth created for 130372 users with an average of 1.0 items each
Ground truth creation completed in 1.85 seconds
Analyzing metadata features...
Analyzed 1 metadata features
Metadata analysis completed in 0.02 seconds
Generating predictions for 130372 users...
Processing predictions for 130372 users and 91889 items
Precomputing metadata features...
Metadata precomputation completed in 0.39 seconds
Processing 1 of 130372 users... (0.00%)
Memory usage: 0.587646484375 . Increased user batch size from 128 to 176
Memory usage: 0.587646484375 . Increased item batch size from 1024 to 1407
Processing 129 of 130372 users... (0.10%)
Memory usage: 0.45361328125 . Increased user batch size from 176 to 313
Memory usage: 0.45361328125 . Increased item batch size from 1407 to 2505
Processing 305 of 130372 users... (0.23%)
Processing 618 of 130372 users... (0.47%)
Processing 931 of 130372 users... (0.71%)
Processing 1244 of 130372 

In [13]:
# Display the metrics returned by evaluator.evaluate().
metrics

{'Hit Ratio@10': 0.010516061731046543,
 'NDCG@10': 0.006159693978149528,
 'Recall@10': 0.010516061731046543}