In [1]:
import pandas as pd
import json
from helpers.category_index_manager import CategoryIndexManager
from helpers.index_manager import IndexManager
from helpers.splitter import Splitter
import ast

# Loading data

In [2]:
# Load the cleaned interaction log, keeping only the columns used downstream.
interaction_columns = ['user_id', 'item_id', 'rating_imp', 'timestamp']
df_interaction = pd.read_csv('../data/interaction-clean.csv')[interaction_columns]
df_interaction.head()

Unnamed: 0,user_id,item_id,rating_imp,timestamp
0,76561197960432447,10,1,1738278781
1,76561198071230926,10,1,1736206418
2,76561198206216352,10,1,1738041574
3,76561198110801124,10,1,1738015332
4,76561199813732773,10,1,1737853720


In [3]:
# Columns kept from the metadata feature file, grouped by what they describe.
metadata_columns = [
    # identifiers
    'item_id', 'name',
    # list-valued categorical features (string-encoded lists in the CSV)
    'publishers', 'developers', 'tags', 'supported_languages', 'genres', 'categories',
    # recommendation count: raw value plus its mm_/z_/log_ variants
    'total_recommendations', 'mm_total_recommendation', 'z_total_recommendation', 'log_total_recommendation',
    # price variants
    'mm_price', 'z_price', 'log_price',
    # release-date variants
    'mm_released_date', 'z_released_date', 'log_released_date',
]
df_metadata = pd.read_csv('../data/metadata-features.csv')[metadata_columns]
df_metadata.head()

Unnamed: 0,item_id,name,publishers,developers,tags,supported_languages,genres,categories,total_recommendations,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,mm_released_date,z_released_date,log_released_date
0,1430720,Clash of Warlords,['XINLINE GAMES'],['XINLINE GAMES'],"['Turn-Based Tactics', 'Strategy', 'Wargame', ...","['Simplified Chinese', 'Traditional Chinese']",['Strategy'],"['Single-player', 'In-App Purchases', 'Family ...",0,0.0,-0.046711,-23.025851,0.003679,-0.049534,6.549651,0.855371,0.003497,20.428931
1,1430740,Mine Crazy: The Korean Grinder,['RealMono Inc.'],['Dano Sato'],"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",['English'],"['Casual', 'Indie', 'RPG', 'Simulation']","['Single-player', 'Family Sharing']",0,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.84327,-0.103532,20.414682
2,1430100,Fade,['Azimyth Studios'],['Azimyth Studios'],"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",['English'],"['Indie', 'RPG']","['Single-player', 'Family Sharing']",0,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.845353,-0.085109,20.417149
3,1430680,Clash: Artifacts of Chaos,['Nacon'],['ACE Team'],"['Action', 'Adventure', 'RPG', 'Souls-like', '...","['English', 'French', 'Italian', 'German', 'Sp...","['Action', 'Adventure', 'Indie']","['Single-player', 'Steam Achievements', 'Steam...",759,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034,0.930761,0.670236,20.513398
4,1430970,Astatos,['Studio Klondike'],['Studio Klondike Australia'],"['Early Access', 'Visual Novel', 'Card Battler...","['English', 'Simplified Chinese', 'Traditional...","['Adventure', 'Indie', 'Strategy', 'Early Acce...","['Single-player', 'Multi-player', 'PvP', 'Onli...",243,5.6e-05,-0.035483,5.493061,0.007889,0.495218,7.312553,0.886321,0.277211,20.464474


In [4]:
# Restrict the metadata to items that appear at least once in the interactions.
item_ids = df_interaction['item_id'].unique()
has_interaction = df_metadata['item_id'].isin(item_ids)
df_metadata = df_metadata.loc[has_interaction]
df_metadata.head()

Unnamed: 0,item_id,name,publishers,developers,tags,supported_languages,genres,categories,total_recommendations,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,mm_released_date,z_released_date,log_released_date
1,1430740,Mine Crazy: The Korean Grinder,['RealMono Inc.'],['Dano Sato'],"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",['English'],"['Casual', 'Indie', 'RPG', 'Simulation']","['Single-player', 'Family Sharing']",0,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.84327,-0.103532,20.414682
2,1430100,Fade,['Azimyth Studios'],['Azimyth Studios'],"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",['English'],"['Indie', 'RPG']","['Single-player', 'Family Sharing']",0,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.845353,-0.085109,20.417149
3,1430680,Clash: Artifacts of Chaos,['Nacon'],['ACE Team'],"['Action', 'Adventure', 'RPG', 'Souls-like', '...","['English', 'French', 'Italian', 'German', 'Sp...","['Action', 'Adventure', 'Indie']","['Single-player', 'Steam Achievements', 'Steam...",759,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034,0.930761,0.670236,20.513398
4,1430970,Astatos,['Studio Klondike'],['Studio Klondike Australia'],"['Early Access', 'Visual Novel', 'Card Battler...","['English', 'Simplified Chinese', 'Traditional...","['Adventure', 'Indie', 'Strategy', 'Early Acce...","['Single-player', 'Multi-player', 'PvP', 'Onli...",243,5.6e-05,-0.035483,5.493061,0.007889,0.495218,7.312553,0.886321,0.277211,20.464474
6,1430640,Into The Haze,['MSOFT'],['MSOFT'],"['Early Access', 'Action', 'Survival', 'Surviv...","['English', 'Thai']","['Action', 'Adventure', 'Indie', 'Early Access']","['Single-player', 'Family Sharing']",123,2.8e-05,-0.041027,4.812184,0.007889,0.495218,7.312553,0.855173,0.001742,20.428699


In [5]:
# Left-join the metadata onto every interaction, then drop the interactions
# whose `name` is missing after the join.
df_interaction_metadata = (
    df_interaction
    .merge(df_metadata, on='item_id', how='left')
    .dropna(subset=['name'])
)
df_interaction_metadata.head()

Unnamed: 0,user_id,item_id,rating_imp,timestamp,name,publishers,developers,tags,supported_languages,genres,...,total_recommendations,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,mm_released_date,z_released_date,log_released_date
0,76561197960432447,10,1,1738278781,Counter-Strike,['Valve'],['Valve'],"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],...,157570.0,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
1,76561198071230926,10,1,1736206418,Counter-Strike,['Valve'],['Valve'],"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],...,157570.0,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
2,76561198206216352,10,1,1738041574,Counter-Strike,['Valve'],['Valve'],"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],...,157570.0,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
3,76561198110801124,10,1,1738015332,Counter-Strike,['Valve'],['Valve'],"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],...,157570.0,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349
4,76561199813732773,10,1,1737853720,Counter-Strike,['Valve'],['Valve'],"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...","['English', 'French', 'German', 'Italian', 'Sp...",['Action'],...,157570.0,0.036216,7.233859,11.967625,0.005258,0.154748,6.906755,0.12102,-6.491067,18.473349


# Convert categorical columns to lists

In [6]:
def _parse_list_cell(value):
    """Return `value` as a Python list.

    Strings such as "['Action', 'Indie']" are parsed with `ast.literal_eval`.
    Values that are already lists are returned unchanged, so re-running this
    cell on an already-converted frame is a no-op instead of raising
    `ValueError`. Any other input still goes through `ast.literal_eval`
    and fails exactly as it did before.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


# Columns that the CSV stores as string representations of Python lists.
list_columns = ['publishers', 'developers', 'tags', 'supported_languages', 'genres', 'categories']
for column in list_columns:
    df_metadata[column] = df_metadata[column].apply(_parse_list_cell)

# Indexing data

In [7]:
# Fit the ID index on the merged interactions, then transform both frames
# with inplace=True.
interaction_id_columns = ['user_id', 'item_id']
index_manager = IndexManager()
index_manager.fit(df_interaction_metadata, interaction_id_columns)
index_manager.transform(df_interaction_metadata, interaction_id_columns, inplace=True)
# Kept as the last expression: its return value is what the cell displays.
index_manager.transform(df_metadata, ['item_id'], inplace=True)

Unnamed: 0,item_id,name,publishers,developers,tags,supported_languages,genres,categories,total_recommendations,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,mm_released_date,z_released_date,log_released_date
1,12027,Mine Crazy: The Korean Grinder,[RealMono Inc.],[Dano Sato],"[Casual, RPG, Simulation, Clicker, Farming Sim...",[English],"[Casual, Indie, RPG, Simulation]","[Single-player, Family Sharing]",0,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.843270,-0.103532,20.414682
2,12017,Fade,[Azimyth Studios],[Azimyth Studios],"[Horror, RPG, Survival Horror, Top-Down Shoote...",[English],"[Indie, RPG]","[Single-player, Family Sharing]",0,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.845353,-0.085109,20.417149
3,12026,Clash: Artifacts of Chaos,[Nacon],[ACE Team],"[Action, Adventure, RPG, Souls-like, Singlepla...","[English, French, Italian, German, Spanish - S...","[Action, Adventure, Indie]","[Single-player, Steam Achievements, Steam Trad...",759,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034,0.930761,0.670236,20.513398
4,12031,Astatos,[Studio Klondike],[Studio Klondike Australia],"[Early Access, Visual Novel, Card Battler, Car...","[English, Simplified Chinese, Traditional Chin...","[Adventure, Indie, Strategy, Early Access]","[Single-player, Multi-player, PvP, Online PvP,...",243,0.000056,-0.035483,5.493061,0.007889,0.495218,7.312553,0.886321,0.277211,20.464474
6,12025,Into The Haze,[MSOFT],[MSOFT],"[Early Access, Action, Survival, Survival Horr...","[English, Thai]","[Action, Adventure, Indie, Early Access]","[Single-player, Family Sharing]",123,0.000028,-0.041027,4.812184,0.007889,0.495218,7.312553,0.855173,0.001742,20.428699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91278,16732,The Scorchfarer,[Michiyuki Project],[Michiyuki Project],[Unknown],[English],"[Adventure, Casual, Simulation, Free To Play]","[Single-player, Steam Achievements, Stats]",0,0.000000,-0.046711,-23.025851,0.000000,-0.525511,-23.025851,0.898224,0.382485,20.477815
91279,16733,Survive Avalon,[Sky Empire Games],[Sky Empire Games],"[Action, Survival, Open World, Adventure, Surv...",[English],"[Action, Adventure]","[Single-player, Partial Controller Support, Fa...",0,0.000000,-0.046711,-23.025851,0.006837,0.359030,7.169350,0.866581,0.102631,20.441951
91280,16734,Cam Circle VR,[Reality Inside Ltd.],[Reality Inside Ltd.],"[Utilities, VR, Software]",[English],[Utilities],"[Tracked Controller Support, VR Only]",0,0.000000,-0.046711,-23.025851,0.001574,-0.321910,5.700444,0.883345,0.250892,20.461111
91281,16735,Neon Light,[Louie Inc],[Dani Gas],"[Casual, Adventure, Arcade, Platformer, 2D Pla...",[English],"[Adventure, Casual, Indie]","[Single-player, Steam Achievements, Family Sha...",0,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.865192,0.090349,20.440347


In [8]:
# List-valued metadata columns handed to the category index manager.
categorical_features = [
    'publishers',
    'developers',
    'genres',
    'categories',
    'supported_languages',
    'tags',
]
category_index_manager = CategoryIndexManager()
category_index_manager.fit(df_metadata, categorical_features)
# Kept as the last expression: the returned frame is the cell's displayed output.
category_index_manager.transform(df_metadata, categorical_features, inplace=True)

Unnamed: 0,item_id,name,publishers,developers,tags,supported_languages,genres,categories,total_recommendations,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,mm_released_date,z_released_date,log_released_date
1,12027,Mine Crazy: The Korean Grinder,[0],[0],"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]",[0],"[0, 1, 2, 3]","[0, 1]",0,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.843270,-0.103532,20.414682
2,12017,Fade,[1],[1],"[12, 1, 13, 14, 15, 16, 17, 18, 19, 8, 20, 21,...",[0],"[1, 2]","[0, 1]",0,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.845353,-0.085109,20.417149
3,12026,Clash: Artifacts of Chaos,[2],[2],"[25, 26, 1, 27, 11, 28, 29, 30, 23, 31, 15, 32...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[4, 5, 1]","[0, 2, 3, 4, 5, 1]",759,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034,0.930761,0.670236,20.513398
4,12031,Astatos,[3],[3],"[40, 41, 42, 43, 44, 45, 46, 31, 47, 48, 23, 4...","[0, 10, 11]","[5, 1, 6, 7]","[0, 6, 7, 8, 9, 10, 2, 11, 1]",243,0.000056,-0.035483,5.493061,0.007889,0.495218,7.312553,0.886321,0.277211,20.464474
6,12025,Into The Haze,[4],[4],"[40, 25, 17, 13, 54, 55, 56, 30, 57, 34, 58, 5...","[0, 13]","[4, 5, 1, 7]","[0, 1]",123,0.000028,-0.041027,4.812184,0.007889,0.495218,7.312553,0.855173,0.001742,20.428699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91278,16732,The Scorchfarer,[36056],[43740],[168],[0],"[5, 0, 3, 11]","[0, 2, 17]",0,0.000000,-0.046711,-23.025851,0.000000,-0.525511,-23.025851,0.898224,0.382485,20.477815
91279,16733,Survive Avalon,[36057],[43741],"[25, 17, 16, 26, 13, 61, 56, 12, 31, 70, 73, 2...",[0],"[4, 5]","[0, 4, 1]",0,0.000000,-0.046711,-23.025851,0.006837,0.359030,7.169350,0.866581,0.102631,20.441951
91280,16734,Cam Circle VR,[13895],[16517],"[354, 78, 386]",[0],[13],"[12, 13]",0,0.000000,-0.046711,-23.025851,0.001574,-0.321910,5.700444,0.883345,0.250892,20.461111
91281,16735,Neon Light,[465],[16859],"[0, 26, 99, 191, 122, 192, 7, 9, 10, 36, 8, 29...",[0],"[5, 0, 1]","[0, 2, 1]",0,0.000000,-0.046711,-23.025851,0.001047,-0.390004,5.293305,0.865192,0.090349,20.440347


# Compute the input dimension of each categorical metadata feature

In [9]:
# For each categorical feature, count the distinct values found across all of
# its per-item lists.
feature_dims = {
    feature: df_metadata[feature].explode().nunique()
    for feature in categorical_features
}

# Splitting datasets

In [10]:
# Split the indexed interactions; the splitter's result is unpacked as
# (train, validation, test).
splitter = Splitter(df_interaction_metadata)
split_frames = splitter.leave_k_out_split()
df_train, df_val, df_test = split_frames

Splitting data with leave-2-out strategy (1 for validation, 1 for testing)
Note: Ensuring test samples contain only positive interactions (where rating_imp == 1)
Total users: 832660
Interactions per user: min=1, max=1034, avg=1.4


100%|██████████| 832660/832660 [01:56<00:00, 7168.04it/s] 
  test_df = pd.concat(test_dfs, ignore_index=True)


7779 users had insufficient positive interactions for testing.
Split complete: 1149955 total interactions
Train set: 898812 interactions (78.2%)
Validation set: 129461 interactions (11.3%)
Test set: 121682 interactions (10.6%)
Test set positive ratio: 100.0% (should be 100%)


# Save to files

In [11]:
# Write the three interaction splits and the metadata frame as CSV files
# without the DataFrame index.
csv_outputs = [
    (df_train, '../data/train-leave2.csv'),
    (df_val, '../data/val-leave2.csv'),
    (df_test, '../data/test-leave2.csv'),
    (df_metadata, '../data/metadata.csv'),
]
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)

# Persist both index managers through their own save() methods.
index_manager.save('../data/index.pkl')
category_index_manager.save('../data/category-index.pkl')

# Store the per-feature dimensions as a JSON object.
with open('../data/feature-dims.json', 'w') as dims_file:
    serialized_dims = json.dumps(feature_dims)
    dims_file.write(serialized_dims)