In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned game metadata and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a row
# index that was written into the CSV - consider index_col=0 if it is not needed.
metadata_path = '../../data/metadata-clean.csv'
df = pd.read_csv(metadata_path)
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,price
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",699.0
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",199.0
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",199.0
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",2999.0
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",1499.0


## Numerical feature encoding

In [3]:
def min_max(values: pd.Series) -> pd.Series:
    """Rescale a numeric series linearly onto the interval [0, 1].

    Parameters
    ----------
    values : pd.Series
        Numeric data to rescale.

    Returns
    -------
    pd.Series
        ``(values - min) / (max - min)``, carrying the index of ``values``.
        When the minimum equals the maximum (a constant column) the range is
        zero; the result is then all zeros rather than the NaN that ``0 / 0``
        would produce.
    """
    lowest = values.min()
    span = values.max() - lowest
    shifted = values - lowest
    if span == 0:
        # Constant column: avoid 0 / 0. Multiplying by 0.0 gives float zeros
        # and keeps any missing entries missing.
        return shifted * 0.0
    return shifted / span

def z_score(values: pd.Series) -> pd.Series:
    """Standardize a numeric series to zero mean and unit standard deviation.

    Parameters
    ----------
    values : pd.Series
        Numeric data to standardize.

    Returns
    -------
    pd.Series
        ``(values - mean) / std``, carrying the index of ``values``. ``std`` is
        the pandas default (sample standard deviation).
        When the series holds at most one distinct value there is no spread to
        divide by; the result is then all zeros instead of NaN or rounding
        noise.
    """
    centered = values - values.mean()
    if values.nunique() <= 1:
        # Zero or undefined spread: checking distinct values (rather than
        # std == 0) also covers constant float columns whose computed mean is
        # off by one ulp. Multiplying by 0.0 keeps missing entries missing.
        return centered * 0.0
    return centered / values.std()

def log_normalize(values: pd.Series, epsilon: float = 1e-10) -> pd.Series:
    """Take the natural log of a series after shifting its minimum to zero.

    Parameters
    ----------
    values : pd.Series
        Numeric data to transform.
    epsilon : float, default 1e-10
        Positive offset added before the logarithm so the minimum (which is
        shifted to exactly zero) does not hit ``log(0)``. With the default,
        the minimum maps to ``log(1e-10)`` (about -23.03), far away from the
        other values; pass ``epsilon=1.0`` to map the minimum to 0 instead.

    Returns
    -------
    pd.Series
        ``log(values - min + epsilon)``, carrying the index of ``values``.
    """
    shifted = values - values.min()
    return np.log(shifted + epsilon)

### Total recommendations

In [4]:
# Add min-max, z-score and log-scaled variants of the recommendation count.
recommendation_scalers = {
    'mm_total_recommendation': min_max,
    'z_total_recommendation': z_score,
    'log_total_recommendation': log_normalize,
}
for new_column, scale in recommendation_scalers.items():
    df[new_column] = scale(df['total_recommendations'])

df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,price,mm_total_recommendation,z_total_recommendation,log_total_recommendation
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",699.0,0.0,-0.046711,-23.025851
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",199.0,0.0,-0.046711,-23.025851
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",199.0,0.0,-0.046711,-23.025851
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",2999.0,0.000174,-0.011641,6.632002
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",1499.0,5.6e-05,-0.035483,5.493061


### Price

In [5]:
# Add min-max, z-score and log-scaled variants of the price.
price_scalers = {
    'mm_price': min_max,
    'z_price': z_score,
    'log_price': log_normalize,
}
for new_column, scale in price_scalers.items():
    df[new_column] = scale(df['price'])

df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,price,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price
0,Clash of Warlords,1430720,"['Simplified Chinese', 'Traditional Chinese']",['XINLINE GAMES'],['XINLINE GAMES'],"['Single-player', 'In-App Purchases', 'Family ...",['Strategy'],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",699.0,0.0,-0.046711,-23.025851,0.003679,-0.049534,6.549651
1,Mine Crazy: The Korean Grinder,1430740,['English'],['Dano Sato'],['RealMono Inc.'],"['Single-player', 'Family Sharing']","['Casual', 'Indie', 'RPG', 'Simulation']",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",199.0,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305
2,Fade,1430100,['English'],['Azimyth Studios'],['Azimyth Studios'],"['Single-player', 'Family Sharing']","['Indie', 'RPG']",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",199.0,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305
3,Clash: Artifacts of Chaos,1430680,"['English', 'French', 'Italian', 'German', 'Sp...",['ACE Team'],['Nacon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Adventure', 'Indie']",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",2999.0,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034
4,Astatos,1430970,"['English', 'Simplified Chinese', 'Traditional...",['Studio Klondike Australia'],['Studio Klondike'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'Indie', 'Strategy', 'Early Acce...",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",1499.0,5.6e-05,-0.035483,5.493061,0.007889,0.495218,7.312553


## Categorical feature encoding

In [6]:
import ast
def _parse_list_literal(cell):
    """Turn a stringified Python list into a real list; pass other values through."""
    # After the first run the cells already hold lists, and ast.literal_eval
    # raises ValueError when handed a list, so only strings are parsed. This
    # keeps the cell safe to re-run without reloading the data.
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# The CSV stores these list-valued columns as their string representation.
for list_column in ('supported_languages', 'categories', 'genres'):
    df[list_column] = df[list_column].apply(_parse_list_literal)

In [7]:
# Collect the distinct labels of each list-valued column, in sorted order.
unique_lang = list(df['supported_languages'].explode().unique())
unique_lang.sort()

unique_cat = list(df['categories'].explode().unique())
unique_cat.sort()

unique_gen = list(df['genres'].explode().unique())
unique_gen.sort()

In [8]:
# Report how many distinct labels each vocabulary holds.
n_lang, n_cat, n_gen = len(unique_lang), len(unique_cat), len(unique_gen)
print(f'Number of unique languages: {n_lang}')
print(f'Number of unique categories: {n_cat}')
print(f'Number of unique genres: {n_gen}')

Number of unique languages: 103
Number of unique categories: 42
Number of unique genres: 28


### Multi-hot encoding

In [9]:
def encode(values, unique_values):
    """Build a multi-hot vector marking which vocabulary entries occur in ``values``.

    Parameters
    ----------
    values : iterable
        Labels present on one row. Labels must be hashable; labels that are
        not in ``unique_values`` are ignored.
    unique_values : sequence
        The vocabulary. Position ``i`` of the result corresponds to
        ``unique_values[i]``.

    Returns
    -------
    list of int
        A list of ``len(unique_values)`` zeros and ones.
    """
    position_of = {label: position for position, label in enumerate(unique_values)}
    encoding = [0] * len(unique_values)

    for label in values:
        # One dict lookup replaces the former linear scan of the vocabulary
        # list followed by a second lookup.
        position = position_of.get(label)
        if position is not None:
            encoding[position] = 1

    return encoding

# Multi-hot encode each list-valued column against its own sorted vocabulary.
df['lang_encoded'] = df['supported_languages'].apply(encode, args=(unique_lang,))
df['cat_encoded'] = df['categories'].apply(encode, args=(unique_cat,))
df['gen_encoded'] = df['genres'].apply(encode, args=(unique_gen,))
df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,price,mm_total_recommendation,z_total_recommendation,log_total_recommendation,mm_price,z_price,log_price,lang_encoded,cat_encoded,gen_encoded
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",['XINLINE GAMES'],['XINLINE GAMES'],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",699.0,0.0,-0.046711,-23.025851,0.003679,-0.049534,6.549651,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Mine Crazy: The Korean Grinder,1430740,[English],['Dano Sato'],['RealMono Inc.'],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",199.0,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,Fade,1430100,[English],['Azimyth Studios'],['Azimyth Studios'],"[Single-player, Family Sharing]","[Indie, RPG]",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",199.0,0.0,-0.046711,-23.025851,0.001047,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",['ACE Team'],['Nacon'],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",2999.0,0.000174,-0.011641,6.632002,0.015784,1.516628,8.006034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",['Studio Klondike Australia'],['Studio Klondike'],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",1499.0,5.6e-05,-0.035483,5.493061,0.007889,0.495218,7.312553,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."


## Date feature encoding

In [10]:
# Parse the release dates and express them as seconds since the Unix epoch.
df['released_date'] = pd.to_datetime(df['released_date'])
# Vectorised replacement for calling Timestamp.timestamp() row by row; missing
# dates (NaT) come out as NaN.
# NOTE(review): assumes the dates are timezone-naive, as the YYYY-MM-DD values
# in the preview suggest - confirm.
df['released_timestamp'] = (
    (df['released_date'] - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
)

In [11]:
# Add min-max, z-score and log-scaled variants of the release timestamp.
release_date_scalers = {
    'mm_released_date': min_max,
    'z_released_date': z_score,
    'log_released_date': log_normalize,
}
for new_column, scale in release_date_scalers.items():
    df[new_column] = scale(df['released_timestamp'])

df.head()

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,...,mm_price,z_price,log_price,lang_encoded,cat_encoded,gen_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",['XINLINE GAMES'],['XINLINE GAMES'],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",...,0.003679,-0.049534,6.549651,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931
1,Mine Crazy: The Korean Grinder,1430740,[English],['Dano Sato'],['RealMono Inc.'],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",...,0.001047,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682
2,Fade,1430100,[English],['Azimyth Studios'],['Azimyth Studios'],"[Single-player, Family Sharing]","[Indie, RPG]",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",...,0.001047,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",['ACE Team'],['Nacon'],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",...,0.015784,1.516628,8.006034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",['Studio Klondike Australia'],['Studio Klondike'],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",...,0.007889,0.495218,7.312553,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474


In [12]:
# df['released_year'] = df['released_date'].dt.year.astype('category')
# df.head()

In [13]:
# df['released_decade'] = (df['released_date'].dt.year // 10 * 10).astype('category')
# df.head()

In [14]:
# df['released_year'] = df['released_date'].dt.year.astype('category')
# df['released_month'] = df['released_date'].dt.month.astype('category')
# df['released_day_of_week'] = df['released_date'].dt.dayofweek.astype('category')
# df['released_quarter'] = df['released_date'].dt.quarter.astype('category')
# df.head()

In [15]:
# df_interactions = pd.read_csv('data/review-clean.csv')
# df_interactions.head()

In [16]:
# max_timestamp = df_interactions['timestamp'].max() # Latest date in the interaction data
# reference_date = pd.to_datetime(max_timestamp, unit='s')
# reference_date

In [17]:
# df['age'] = (reference_date - pd.to_datetime(df['released_date'])).dt.days.clip(lower=0) // 365.25
# df.head()

In [18]:
# age_buckets = ['new_release', 'recent', 'established', 'classic']
# df['age_category'] = pd.cut(
#     df['age'],
#     bins=[0, 1, 3, 10, float('inf')],
#     labels=age_buckets
# ).astype('category')
# df.head()

In [19]:
# df['mm_age'] = min_max(df['age'])
# df['z_age'] = z_score(df['age'])
# df['log_age'] = log_normalize(df['age'])
#
# df.head()

## Save file

In [20]:
# Persist the engineered features; index=False leaves the row index out of the file.
# NOTE(review): list-valued columns are written in their string form, and the
# input was read from '../../data/' while this writes under 'data/' - confirm
# the target directory is the intended one.
features_path = 'data/metadata-features-extracted.csv'
df.to_csv(features_path, index=False)

In [21]:

# Preview the first five rows of the saved feature table.
df.head(n=5)

Unnamed: 0,name,item_id,supported_languages,developers,publishers,categories,genres,total_recommendations,released_date,tags,...,mm_price,z_price,log_price,lang_encoded,cat_encoded,gen_encoded,released_timestamp,mm_released_date,z_released_date,log_released_date
0,Clash of Warlords,1430720,"[Simplified Chinese, Traditional Chinese]",['XINLINE GAMES'],['XINLINE GAMES'],"[Single-player, In-App Purchases, Family Sharing]",[Strategy],0,2021-02-07,"['Turn-Based Tactics', 'Strategy', 'Wargame', ...",...,0.003679,-0.049534,6.549651,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1612656000.0,0.855371,0.003497,20.428931
1,Mine Crazy: The Korean Grinder,1430740,[English],['Dano Sato'],['RealMono Inc.'],"[Single-player, Family Sharing]","[Casual, Indie, RPG, Simulation]",0,2020-10-08,"['Casual', 'RPG', 'Simulation', 'Clicker', 'Fa...",...,0.001047,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1602115000.0,0.84327,-0.103532,20.414682
2,Fade,1430100,[English],['Azimyth Studios'],['Azimyth Studios'],"[Single-player, Family Sharing]","[Indie, RPG]",0,2020-10-29,"['Horror', 'RPG', 'Survival Horror', 'Top-Down...",...,0.001047,-0.390004,5.293305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1603930000.0,0.845353,-0.085109,20.417149
3,Clash: Artifacts of Chaos,1430680,"[English, French, Italian, German, Spanish - S...",['ACE Team'],['Nacon'],"[Single-player, Steam Achievements, Steam Trad...","[Action, Adventure, Indie]",759,2023-03-09,"['Action', 'Adventure', 'RPG', 'Souls-like', '...",...,0.015784,1.516628,8.006034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1678320000.0,0.930761,0.670236,20.513398
4,Astatos,1430970,"[English, Simplified Chinese, Traditional Chin...",['Studio Klondike Australia'],['Studio Klondike'],"[Single-player, Multi-player, PvP, Online PvP,...","[Adventure, Indie, Strategy, Early Access]",243,2021-12-16,"['Early Access', 'Visual Novel', 'Card Battler...",...,0.007889,0.495218,7.312553,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...",1639613000.0,0.886321,0.277211,20.464474
