In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/metadata-clean.csv')
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,699.0,2021-02-07
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,199.0,2020-10-08
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,199.0,2020-10-29
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2999.0,2023-03-09
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,1499.0,2021-12-16


## Numerical feature encoding

In [3]:
def min_max(values: pd.Series):
    min_time = values.min()
    max_time = values.max()
    normalized = (values - min_time) / (max_time - min_time)
    return normalized

def z_score(values: pd.Series):
    mean_time = values.mean()
    std_time = values.std()
    standardized = (values - mean_time) / std_time
    return standardized

def log_normalize(values: pd.Series):
    min_time = values.min()
    offset_timestamps = values - min_time
    epsilon = 1e-10
    return np.log(offset_timestamps + epsilon)

### Total recommendation

In [4]:
df['mm_total_recommendation'] = min_max(df['total_recommendations'])
df['z_total_recommendation'] = z_score(df['total_recommendations'])
df['log_total_recommendation'] = log_normalize(df['total_recommendations'])

df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,mm_total_recommendation,z_total_recommendation,log_total_recommendation
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,699.0,2021-02-07,0.0,-0.046711,-23.025851
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,199.0,2020-10-08,0.0,-0.046711,-23.025851
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,199.0,2020-10-29,0.0,-0.046711,-23.025851
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2999.0,2023-03-09,0.000174,-0.011641,6.632002
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,1499.0,2021-12-16,5.6e-05,-0.035483,5.493061


### Price

In [5]:
df['mm_price'] = min_max(df['price'])
df['z_price'] = z_score(df['price'])
df['log_price'] = log_normalize(df['price'])

df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,699.0,2021-02-07,0.0,-0.046711,-23.025851,0.003679,-0.023378,6.549651
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,199.0,2020-10-08,0.0,-0.046711,-23.025851,0.001047,-0.367941,5.293305
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,199.0,2020-10-29,0.0,-0.046711,-23.025851,0.001047,-0.367941,5.293305
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2999.0,2023-03-09,0.000174,-0.011641,6.632002,0.015784,1.561612,8.006034
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,1499.0,2021-12-16,5.6e-05,-0.035483,5.493061,0.007889,0.527923,7.312553


## Categorical features encoding

In [6]:
import ast
df['supported_languages'] = df['supported_languages'].apply(ast.literal_eval)
df['developers'] = df['developers'].apply(ast.literal_eval)
df['publishers'] = df['publishers'].apply(ast.literal_eval)
df['categories'] = df['categories'].apply(ast.literal_eval)
df['genres'] = df['genres'].apply(ast.literal_eval)

In [7]:
unique_lang = sorted(df['supported_languages'].explode().unique())
unique_dev = sorted(df['developers'].explode().unique())
unique_pub = sorted(df['publishers'].explode().unique())
unique_cat = sorted(df['categories'].explode().unique())
unique_gen = sorted(df['genres'].explode().unique())

In [8]:
print(f'Number of unique languages: {len(unique_lang)}')
print(f'Number of unique developers: {len(unique_dev)}')
print(f'Number of unique publishers: {len(unique_pub)}')
print(f'Number of unique categories: {len(unique_cat)}')
print(f'Number of unique genres: {len(unique_gen)}')

Number of unique languages: 103
Number of unique developers: 60664
Number of unique publishers: 50330
Number of unique categories: 42
Number of unique genres: 28


### Multi-hot encoding

In [9]:
def encode(values, unique_values):
    encoding = [0] * len(unique_values)
    
    value_to_idx = {val: idx for idx, val in enumerate(unique_values)}
    
    for val in values:
        if val in unique_values:
            encoding[value_to_idx[val]] = 1
    
    return encoding

df['lang_encoded'] = df['supported_languages'].apply(lambda x: encode(x, unique_lang))
df['cat_encoded'] = df['categories'].apply(lambda x: encode(x, unique_cat))
df['gen_encoded'] = df['genres'].apply(lambda x: encode(x, unique_gen))
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,lang_encoded,cat_encoded,gen_encoded
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,0.0,-0.046711,-23.025851,0.003679,-0.023378,6.549651,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,0.0,-0.046711,-23.025851,0.001047,-0.367941,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,0.0,-0.046711,-23.025851,0.001047,-0.367941,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,0.000174,-0.011641,6.632002,0.015784,1.561612,8.006034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,5.6e-05,-0.035483,5.493061,0.007889,0.527923,7.312553,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."


### Developers and Publishers (high cardinality)

In [10]:
pop_dev = df.explode(column='developers').groupby(by='developers')['total_recommendations'].sum().reset_index().sort_values(by='total_recommendations', ascending=False)
pop_pub = df.explode(column='publishers').groupby(by='publishers')['total_recommendations'].sum().reset_index().sort_values(by='total_recommendations', ascending=False)

In [11]:
pop_dev = pop_dev[:1000]['developers'].unique()
pop_pub = pop_pub[:1000]['publishers'].unique()

In [12]:
import hashlib

def encode_high_cardinality(values, top_values, num_buckets=1000):
    encoding = [0] * (len(top_values) + num_buckets)
    
    value_to_idx = {val: idx for idx, val in enumerate(top_values)}
    
    for val in values:
        if val in top_values:
            encoding[value_to_idx[val]] = 1
        else:
            hash_value = int(hashlib.md5(str(val).encode()).hexdigest(), 16)
            bucket = hash_value % num_buckets
            encoding[len(top_values) + bucket] = 1
    
    return encoding

encode_high_cardinality(['a', 'c', 'e'], ['a'], num_buckets=2)

[1, 1, 1]

In [13]:
df['dev_encoded'] = df['developers'].apply(lambda x: encode_high_cardinality(x, pop_dev, num_buckets=1000))
df['pub_encoded'] = df['publishers'].apply(lambda x: encode_high_cardinality(x, pop_pub, num_buckets=1000))
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,...,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,lang_encoded,cat_encoded,gen_encoded,dev_encoded,pub_encoded
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,...,-0.046711,-23.025851,0.003679,-0.023378,6.549651,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,...,-0.046711,-23.025851,0.001047,-0.367941,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,...,-0.046711,-23.025851,0.001047,-0.367941,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,...,-0.011641,6.632002,0.015784,1.561612,8.006034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,...,-0.035483,5.493061,0.007889,0.527923,7.312553,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Date features encoding

In [14]:
df['released_date'] = pd.to_datetime(df['released_date'])
df['released_timestamp'] = df['released_date'].apply(lambda x: x.timestamp())

In [15]:
df['mm_released_date'] = min_max(df['released_timestamp'])
df['z_released_date'] = z_score(df['released_timestamp'])
df['log_released_date'] = log_normalize(df['released_timestamp'])

df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,...,log_price,lang_encoded,cat_encoded,gen_encoded,dev_encoded,pub_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,...,6.549651,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,...,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,...,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,...,8.006034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,...,7.312553,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474


In [16]:
df['released_year'] = df['released_date'].dt.year.astype('category')
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,...,lang_encoded,cat_encoded,gen_encoded,dev_encoded,pub_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date,released_year
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931,2021
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682,2020
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149,2020
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398,2023
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474,2021


In [17]:
df['released_decade'] = (df['released_date'].dt.year // 10 * 10).astype('category')
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,...,cat_encoded,gen_encoded,dev_encoded,pub_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date,released_year,released_decade
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,...,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931,2021,2020
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682,2020,2020
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149,2020,2020
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398,2023,2020
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,...,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474,2021,2020


In [18]:
df['released_year'] = df['released_date'].dt.year.astype('category')
df['released_month'] = df['released_date'].dt.month.astype('category')
df['released_day_of_week'] = df['released_date'].dt.dayofweek.astype('category')
df['released_quarter'] = df['released_date'].dt.quarter.astype('category')
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,...,pub_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date,released_year,released_decade,released_month,released_day_of_week,released_quarter
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931,2021,2020,2,6,1
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682,2020,2020,10,3,4
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149,2020,2020,10,3,4
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398,2023,2020,3,3,1
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474,2021,2020,12,3,4


In [19]:
df_interactions = pd.read_csv('data/review-clean.csv')
df_interactions.head()

Unnamed: 0,user_id,review,timestamp,rating,item_id,user_idx,item_idx,rating_exp,rating_imp,mm_timestamp,...,relative_current_timestamp,relative_last_interaction_timestamp,hour_sin,hour_cos,weekday_sin,weekday_cos,day_of_month_sin,day_of_month_cos,month_sin,month_cos
0,76561197960432447,A legendary tactical shooter that shaped the g...,1738278781,True,10,0,0,1,1,0.998997,...,0.061671,0.082497,-0.258819,0.965926,0.433884,-0.900969,0.0,1.0,0.0,1.0
1,76561198071230926,"The best CS sure, but server browser is the il...",1736206418,True,10,1,0,1,1,0.994407,...,0.06014,1.0,-0.258819,0.965926,0.0,1.0,0.0,1.0,0.0,1.0
2,76561198206216352,Some of the best memories of my childhood were...,1738041574,True,10,2,0,1,1,0.998472,...,0.061456,1.0,0.965926,0.258819,0.781831,0.62349,0.0,1.0,0.0,1.0
3,76561198110801124,This game feels so much better than CS2. I kno...,1738015332,True,10,3,0,1,1,0.998414,...,0.061433,1.0,-0.5,0.866025,0.0,1.0,0.0,1.0,0.0,1.0
4,76561199813732773,its very fun to play you can make friends out ...,1737853720,True,10,4,0,1,1,0.998056,...,0.061294,1.0,0.258819,0.965926,-0.781831,0.62349,0.0,1.0,0.0,1.0


In [20]:
max_timestamp = df_interactions['timestamp'].max() # Latest date in the interaction data
reference_date = pd.to_datetime(max_timestamp, unit='s')
reference_date

Timestamp('2025-02-05 04:58:51')

In [21]:
df['age'] = (reference_date - pd.to_datetime(df['released_date'])).dt.days.clip(lower=0) // 365.25
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,...,released_timestamp,mm_released_date,z_released_date,log_released_date,released_year,released_decade,released_month,released_day_of_week,released_quarter,age
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,...,1612656000.0,0.855371,0.003497,20.428931,2021,2020,2,6,1,3.0
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,...,1602115000.0,0.84327,-0.103532,20.414682,2020,2020,10,3,4,4.0
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,...,1603930000.0,0.845353,-0.085109,20.417149,2020,2020,10,3,4,4.0
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,...,1678320000.0,0.930761,0.670236,20.513398,2023,2020,3,3,1,1.0
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,...,1639613000.0,0.886321,0.277211,20.464474,2021,2020,12,3,4,3.0


In [22]:
age_buckets = ['new_release', 'recent', 'established', 'classic']
df['age_category'] = pd.cut(
    df['age'], 
    bins=[0, 1, 3, 10, float('inf')],
    labels=age_buckets
).astype('category')
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,...,mm_released_date,z_released_date,log_released_date,released_year,released_decade,released_month,released_day_of_week,released_quarter,age,age_category
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,...,0.855371,0.003497,20.428931,2021,2020,2,6,1,3.0,recent
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,...,0.84327,-0.103532,20.414682,2020,2020,10,3,4,4.0,established
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,...,0.845353,-0.085109,20.417149,2020,2020,10,3,4,4.0,established
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,...,0.930761,0.670236,20.513398,2023,2020,3,3,1,1.0,new_release
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,...,0.886321,0.277211,20.464474,2021,2020,12,3,4,3.0,recent


In [23]:
df['mm_age'] = min_max(df['age'])
df['z_age'] = z_score(df['age'])
df['log_age'] = log_normalize(df['age'])

df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,price,released_date,...,released_year,released_decade,released_month,released_day_of_week,released_quarter,age,age_category,mm_age,z_age,log_age
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",[XINLINE GAMES],[XINLINE GAMES],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,699.0,2021-02-07,...,2021,2020,2,6,1,3.0,recent,0.111111,-0.16781,1.098612
1,Mine Crazy: The Korean Grinder,1430740,[English],[Dano Sato],[RealMono Inc.],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,199.0,2020-10-08,...,2020,2020,10,3,4,4.0,established,0.148148,0.154011,1.386294
2,Fade,1430100,[English],[Azimyth Studios],[Azimyth Studios],"[Single-player, Family Sharing]","[Indie, RPG]",0,199.0,2020-10-29,...,2020,2020,10,3,4,4.0,established,0.148148,0.154011,1.386294
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",[ACE Team],[Nacon],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2999.0,2023-03-09,...,2023,2020,3,3,1,1.0,new_release,0.037037,-0.81145,1e-10
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",[Studio Klondike Australia],[Studio Klondike],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,1499.0,2021-12-16,...,2021,2020,12,3,4,3.0,recent,0.111111,-0.16781,1.098612


## Save file

In [24]:
df.to_csv('data/metadata-features-extracted.csv', index=False)