In [1]:
import pandas as pd
import json
from helpers.category_index_manager import CategoryIndexManager
from helpers.index_manager import IndexManager
from helpers.splitter import Splitter
import ast

# Loading data

In [2]:
# Read the cleaned interaction table (one row per user review of an item)
# and preview its first rows.
interaction_csv = '../data/interaction-clean.csv'
df_interaction = pd.read_csv(interaction_csv)
df_interaction.head()

Unnamed: 0,user_id,review,timestamp,rating,item_id,rating_exp,rating_imp,mm_timestamp,z_timestamp,log_timestamp
0,76561197960432447,A legendary tactical shooter that shaped the g...,1738278781,True,10,1,1,0.998997,0.559647,19.927005
1,76561198071230926,"The best CS sure, but server browser is the il...",1736206418,True,10,1,1,0.994407,0.535447,19.922399
2,76561198206216352,Some of the best memories of my childhood were...,1738041574,True,10,1,1,0.998472,0.556877,19.926479
3,76561198110801124,This game feels so much better than CS2. I kno...,1738015332,True,10,1,1,0.998414,0.55657,19.92642
4,76561199813732773,its very fun to play you can make friends out ...,1737853720,True,10,1,1,0.998056,0.554683,19.926062


In [3]:
# Read the per-item metadata feature table and preview its first rows.
# List-valued columns (publishers, tags, ...) arrive as strings at this point.
metadata_csv = '../data/metadata-features.csv'
df_metadata = pd.read_csv(metadata_csv)
df_metadata.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,...,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,released_timestamp,mm_released_date,z_released_date,log_released_date
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",...,0.0,-0.046711,-23.025851,0.003679,-0.049534,6.549651,1612656000.0,0.855371,0.003497,20.428931
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",...,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305,1602115000.0,0.84327,-0.103532,20.414682
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",...,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305,1603930000.0,0.845353,-0.085109,20.417149
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",...,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034,1678320000.0,0.930761,0.670236,20.513398
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",...,5.6e-05,-0.035483,5.493061,0.007889,0.495218,7.312553,1639613000.0,0.886321,0.277211,20.464474


In [4]:
# Attach item metadata to every interaction (left join on item_id), then
# discard interactions whose item has no metadata name after the join.
df_interaction_metadata = df_interaction.merge(df_metadata, how='left', on='item_id')
df_interaction_metadata.dropna(subset=['name'], inplace=True)
df_interaction_metadata.head()

Unnamed: 0,user_id,review,timestamp,rating,item_id,rating_exp,rating_imp,mm_timestamp,z_timestamp,log_timestamp,...,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,released_timestamp,mm_released_date,z_released_date,log_released_date
0,76561197960432447,A legendary tactical shooter that shaped the g...,1738278781,True,10,1,1,0.998997,0.559647,19.927005,...,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,973036800.0,0.12102,-6.491067,18.473349
1,76561198071230926,"The best CS sure, but server browser is the il...",1736206418,True,10,1,1,0.994407,0.535447,19.922399,...,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,973036800.0,0.12102,-6.491067,18.473349
2,76561198206216352,Some of the best memories of my childhood were...,1738041574,True,10,1,1,0.998472,0.556877,19.926479,...,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,973036800.0,0.12102,-6.491067,18.473349
3,76561198110801124,This game feels so much better than CS2. I kno...,1738015332,True,10,1,1,0.998414,0.55657,19.92642,...,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,973036800.0,0.12102,-6.491067,18.473349
4,76561199813732773,its very fun to play you can make friends out ...,1737853720,True,10,1,1,0.998056,0.554683,19.926062,...,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,973036800.0,0.12102,-6.491067,18.473349


# Convert categorical columns to lists

In [5]:
def _parse_list_literal(value):
    """Parse a string-encoded Python literal; return any other value unchanged.

    Passing non-strings through keeps this cell idempotent: re-running it
    after the columns already hold lists no longer raises ``ValueError``
    from ``ast.literal_eval``.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# The CSV stores list-valued columns as their string repr (e.g. "['English']");
# turn each of them back into real Python lists.
for column in ['publishers', 'developers', 'tags', 'supported_languages', 'genres', 'categories']:
    df_metadata[column] = df_metadata[column].apply(_parse_list_literal)

# Indexing data

In [6]:
# Columns holding raw identifiers in each frame.
interaction_id_columns = ['user_id', 'item_id']
metadata_id_columns = ['item_id']

# One IndexManager is fitted on both frames (interactions first, then
# metadata) and then applied to both in place, so the two frames share the
# same id mapping. The cell output shows raw item ids replaced by integers.
index_manager = IndexManager()
index_manager.fit(df_interaction_metadata, interaction_id_columns)
index_manager.fit(df_metadata, metadata_id_columns)
index_manager.transform(df_interaction_metadata, interaction_id_columns, inplace=True)
index_manager.transform(df_metadata, metadata_id_columns, inplace=True)

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,...,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,released_timestamp,mm_released_date,z_released_date,log_released_date
0,Clash of Warlords,68396,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,2021-02-07,"[Turn-Based Tactics, Strategy, Wargame, Auto B...",...,0.000000,-0.046711,-23.025851,0.003679,-0.049534,6.549651,1.612656e+09,0.855371,0.003497,20.428931
1,Mine Crazy: The Korean Grinder,12027,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,2020-10-08,"[Casual, RPG, Simulation, Clicker, Farming Sim...",...,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,1.602115e+09,0.843270,-0.103532,20.414682
2,Fade,12017,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,2020-10-29,"[Horror, RPG, Survival Horror, Top-Down Shoote...",...,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,1.603930e+09,0.845353,-0.085109,20.417149
3,Clash: Artifacts of Chaos,12026,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2023-03-09,"[Action, Adventure, RPG, Souls-like, Singlepla...",...,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034,1.678320e+09,0.930761,0.670236,20.513398
4,Astatos,12031,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,2021-12-16,"[Early Access, Visual Novel, Card Battler, Car...",...,0.000056,-0.035483,5.493061,0.007889,0.495218,7.312553,1.639613e+09,0.886321,0.277211,20.464474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91279,Survive Avalon,16733,[English],[Sky Empire Games],[Sky Empire Games],"[Single-player, Partial Controller Support, Fa...","[Action, Adventure]",0,2021-05-31,"[Action, Survival, Open World, Adventure, Surv...",...,0.000000,-0.046711,-23.025851,0.006837,0.359030,7.169350,1.622419e+09,0.866581,0.102631,20.441951
91280,Cam Circle VR,16734,[English],[Reality Inside Ltd.],[Reality Inside Ltd.],"[Tracked Controller Support, VR Only]",[Utilities],0,2021-11-16,"[Utilities, VR, Software]",...,0.000000,-0.046711,-23.025851,0.001574,-0.321910,5.700444,1.637021e+09,0.883345,0.250892,20.461111
91281,Neon Light,16735,[English],[Dani Gas],[Louie Inc],"[Single-player, Steam Achievements, Family Sha...","[Adventure, Casual, Indie]",0,2021-05-17,"[Casual, Adventure, Arcade, Platformer, 2D Pla...",...,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,1.621210e+09,0.865192,0.090349,20.440347
91282,G for Gravity,91283,[English],[OGW G5],[MasterDroid],"[Single-player, Steam Achievements]","[Casual, Free To Play, Indie]",0,2021-05-10,[Unknown],...,0.000000,-0.046711,-23.025851,0.000000,-0.525511,-23.025851,1.620605e+09,0.864498,0.084208,20.439544


In [7]:
# List-valued metadata columns whose entries get replaced by integer indices.
categorical_features = [
    'publishers',
    'developers',
    'genres',
    'categories',
    'supported_languages',
    'tags',
]

# Fit the category index on the metadata, then rewrite those columns in place.
category_index_manager = CategoryIndexManager()
category_index_manager.fit(df_metadata, categorical_features)
category_index_manager.transform(df_metadata, categorical_features, inplace=True)

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,...,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,released_timestamp,mm_released_date,z_released_date,log_released_date
0,Clash of Warlords,68396,"[0, 1]",[0],[0],"[0, 1, 2]",[0],0,2021-02-07,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",...,0.000000,-0.046711,-23.025851,0.003679,-0.049534,6.549651,1.612656e+09,0.855371,0.003497,20.428931
1,Mine Crazy: The Korean Grinder,12027,[2],[1],[1],"[0, 2]","[1, 2, 3, 4]",0,2020-10-08,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 14]",...,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,1.602115e+09,0.843270,-0.103532,20.414682
2,Fade,12017,[2],[2],[2],"[0, 2]","[2, 3]",0,2020-10-29,"[27, 17, 28, 29, 30, 31, 32, 7, 33, 24, 34, 35...",...,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,1.603930e+09,0.845353,-0.085109,20.417149
3,Clash: Artifacts of Chaos,12026,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 12]",[3],[3],"[0, 3, 4, 5, 6, 2]","[5, 6, 2]",759,2023-03-09,"[38, 39, 17, 40, 14, 41, 42, 43, 9, 44, 30, 45...",...,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034,1.678320e+09,0.930761,0.670236,20.513398
4,Astatos,12031,"[2, 0, 1]",[4],[4],"[0, 7, 8, 9, 10, 11, 3, 1, 2]","[6, 2, 0, 7]",243,2021-12-16,"[53, 54, 55, 56, 57, 1, 58, 44, 0, 59, 9, 60, ...",...,0.000056,-0.035483,5.493061,0.007889,0.495218,7.312553,1.639613e+09,0.886321,0.277211,20.464474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91279,Survive Avalon,16733,[2],[60661],[50327],"[0, 5, 2]","[5, 6]",0,2021-05-31,"[38, 32, 31, 39, 28, 76, 72, 27, 44, 85, 88, 2...",...,0.000000,-0.046711,-23.025851,0.006837,0.359030,7.169350,1.622419e+09,0.866581,0.102631,20.441951
91280,Cam Circle VR,16734,[2],[21958],[18521],"[12, 13]",[14],0,2021-11-16,"[364, 92, 392]",...,0.000000,-0.046711,-23.025851,0.001574,-0.321910,5.700444,1.637021e+09,0.883345,0.250892,20.461111
91281,Neon Light,16735,[2],[22443],[608],"[0, 3, 2]","[6, 1, 2]",0,2021-05-17,"[16, 39, 120, 67, 66, 65, 23, 25, 26, 49, 24, ...",...,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,1.621210e+09,0.865192,0.090349,20.440347
91282,G for Gravity,91283,[2],[60662],[50328],"[0, 3]","[1, 8, 2]",0,2021-05-10,[119],...,0.000000,-0.046711,-23.025851,0.000000,-0.525511,-23.025851,1.620605e+09,0.864498,0.084208,20.439544


# Compute the input dimension of each categorical metadata feature

In [8]:
# For every categorical feature, flatten its per-item lists into one long
# series and count the distinct values it contains.
feature_dims = {
    column: df_metadata[column].explode().nunique()
    for column in categorical_features
}

# Splitting datasets

In [9]:
# Split the merged interactions into train / validation / test frames.
# The logged output reports a leave-2-out strategy: one interaction per user
# for validation and one for testing.
splitter = Splitter(df_interaction_metadata)
split_frames = splitter.leave_k_out_split()
df_train, df_val, df_test = split_frames

Splitting data with leave-2-out strategy (1 for validation, 1 for testing)
Note: Ensuring test samples contain only positive interactions (where rating_imp == 1)
Total users: 832660
Interactions per user: min=1, max=1034, avg=1.4


100%|██████████| 832660/832660 [01:55<00:00, 7181.00it/s] 
  test_df = pd.concat(test_dfs, ignore_index=True)


7779 users had insufficient positive interactions for testing.
Split complete: 1149955 total interactions
Train set: 898812 interactions (78.2%)
Validation set: 129461 interactions (11.3%)
Test set: 121682 interactions (10.6%)
Test set positive ratio: 100.0% (should be 100%)


# Save to files

In [27]:
# Write the three interaction splits and the indexed metadata as CSV files
# (row index omitted).
for frame, csv_path in (
    (df_train, '../data/train-leave2.csv'),
    (df_val, '../data/val-leave2.csv'),
    (df_test, '../data/test-leave2.csv'),
    (df_metadata, '../data/metadata.csv'),
):
    frame.to_csv(csv_path, index=False)

# Persist the fitted index managers so the same mappings can be reloaded later.
index_manager.save('../data/index.pkl')
category_index_manager.save('../data/category-index.pkl')

# Store the per-feature distinct-value counts as JSON.
with open('../data/feature-dims.json', 'w') as dims_file:
    json.dump(feature_dims, dims_file)
# `user_interaction` was never defined by any cell in this notebook, so a
# fresh "Restart & Run All" failed here with NameError. Build it explicitly.
# NOTE(review): the schema is assumed to be {user index -> list of item indices
# seen in the training split}; confirm against whatever consumes
# user-interaction.json before relying on it.
user_interaction = {
    str(user): items.tolist()  # plain str/int values so json can serialize them
    for user, items in df_train.groupby('user_id')['item_id']
}
with open('../data/user-interaction.json', 'w') as file:
    file.write(json.dumps(user_interaction))