In [3]:
import ast
import os

import numpy as np
import pandas as pd
# Show every column, and up to 200 characters per cell, when frames are displayed.
for _option, _value in (('display.max_columns', None), ('display.max_colwidth', 200)):
    pd.set_option(_option, _value)

In [4]:
# Load the raw Steam store export (path is relative to the notebook's folder).
# keep_default_na=True is pandas' default: its standard NaN markers are parsed as missing.
df = pd.read_csv('../Data/Raw/steam_store_games_only.csv', keep_default_na=True)
df.head()

Unnamed: 0.1,Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,platforms,metacritic,categories,release_date,controller_support,price_overview
0,0,game,Dota 2,570,0.0,True,"[1241930, 652720]",['Valve'],['Valve'],"{'windows': True, 'mac': True, 'linux': True}","{'score': 90, 'url': 'https://www.metacritic.com/game/pc/dota-2?ftag=MCD-06-10aaa1f'}","[{'id': 1, 'description': 'Multi-player'}, {'id': 9, 'description': 'Co-op'}, {'id': 29, 'description': 'Steam Trading Cards'}, {'id': 30, 'description': 'Steam Workshop'}, {'id': 40, 'description...","{'coming_soon': False, 'date': '9 Jul, 2013'}",,
1,1,game,Counter-Strike: Global Offensive,730,0.0,True,,"['Valve', 'Hidden Path Entertainment']",['Valve'],"{'windows': True, 'mac': True, 'linux': True}","{'score': 83, 'url': 'https://www.metacritic.com/game/pc/counter-strike-global-offensive?ftag=MCD-06-10aaa1f'}","[{'id': 1, 'description': 'Multijugador'}, {'id': 22, 'description': 'Logros de Steam'}, {'id': 28, 'description': 'Compat. total con control'}, {'id': 29, 'description': 'Cromos de Steam'}, {'id'...","{'coming_soon': False, 'date': '21 AGO 2012'}",full,
2,2,game,Apex Legends™,1172470,0.0,True,,['Respawn Entertainment'],['Electronic Arts'],"{'windows': True, 'mac': False, 'linux': False}","{'score': 88, 'url': 'https://www.metacritic.com/game/pc/apex-legends?ftag=MCD-06-10aaa1f'}","[{'id': 1, 'description': 'Multi-player'}, {'id': 49, 'description': 'PvP'}, {'id': 36, 'description': 'Online PvP'}, {'id': 9, 'description': 'Co-op'}, {'id': 38, 'description': 'Online Co-op'}, ...","{'coming_soon': False, 'date': '4 Nov, 2020'}",full,
3,3,game,PUBG: BATTLEGROUNDS,578080,0.0,True,,"['KRAFTON, Inc.']","['KRAFTON, Inc.']","{'windows': True, 'mac': False, 'linux': False}",,"[{'id': 1, 'description': 'Multi-player'}, {'id': 49, 'description': 'PvP'}, {'id': 36, 'description': 'Online PvP'}, {'id': 15, 'description': 'Stats'}, {'id': 41, 'description': 'Remote Play on ...","{'coming_soon': False, 'date': '21 Dec, 2017'}",,
4,4,game,New World,1063730,0.0,False,,['Amazon Games'],['Amazon Games'],"{'windows': True, 'mac': False, 'linux': False}","{'score': 70, 'url': 'https://www.metacritic.com/game/pc/new-world?ftag=MCD-06-10aaa1f'}","[{'id': 1, 'description': 'Multi-player'}, {'id': 20, 'description': 'MMO'}, {'id': 49, 'description': 'PvP'}, {'id': 36, 'description': 'Online PvP'}, {'id': 9, 'description': 'Co-op'}, {'id': 38...","{'coming_soon': False, 'date': '28 Sep, 2021'}",,"{'currency': 'SGD', 'initial': 3400, 'final': 1700, 'discount_percent': 50, 'initial_formatted': 'S$34.00', 'final_formatted': 'S$17.00'}"


In [5]:
# Drop the leftover index column that was saved into the raw CSV.
# Rebinding with errors='ignore' (instead of inplace=True) lets this cell be
# re-run without raising KeyError once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Overall size of the frame, followed by the missing-value count of each column.
print(df.shape)
df.isna().sum()

(61640, 14)


type                    182
name                      7
steam_appid               0
required_age            182
is_free                 182
dlc                   51483
developers              324
publishers              182
platforms               182
metacritic            57783
categories              964
release_date            182
controller_support    48445
price_overview         8818
dtype: int64

## Clean Data

### type

In [7]:
# Inspect which record types occur in the 'type' column.
df['type'].unique()

array(['game', nan, 'hardware'], dtype=object)

In [8]:
# Keep only rows with game type (this also removes rows whose type is NaN).
# Take an explicit copy so the column assignments in later cells modify an
# independent frame rather than a slice of the raw data, which would trigger
# pandas' SettingWithCopyWarning.
df = df[df['type'] == 'game'].copy()
df.shape

(61457, 14)

In [9]:
# Missing-value count per column for the current frame.
df.isnull().sum()

type                      0
name                      6
steam_appid               0
required_age              0
is_free                   0
dlc                   51300
developers              141
publishers                0
platforms                 0
metacritic            57600
categories              782
release_date              0
controller_support    48263
price_overview         8635
dtype: int64

### name

In [10]:
# Fill missing names with an empty string, then confirm no nulls remain.
df['name'] = df['name'].where(df['name'].notna(), '')
df['name'].isna().sum()

0

### dlc

In [11]:
# Missing DLC lists become the text '[]' so every row holds a stringified list.
df['dlc'] = df['dlc'].where(df['dlc'].notna(), '[]')
df['dlc'].head()

0    [1241930, 652720]
1                   []
2                   []
3                   []
4                   []
Name: dlc, dtype: object

### developers

In [12]:
# Missing developer lists become the text '[]' so every row holds a stringified list.
df['developers'] = df['developers'].where(df['developers'].notna(), '[]')
df['developers'].head()

0                                 ['Valve']
1    ['Valve', 'Hidden Path Entertainment']
2                 ['Respawn Entertainment']
3                         ['KRAFTON, Inc.']
4                          ['Amazon Games']
Name: developers, dtype: object

### platforms

In [13]:
# Since there are 3 fixed platforms, encode before train-test split

def fix_platforms(x):
    """Convert a stringified platform-support dict into 0/1 flags.

    Parameters
    ----------
    x : str or any
        Text such as "{'windows': True, 'mac': False, 'linux': True}".
        Any non-string value (e.g. NaN for a missing cell) is treated as
        "no platform information".

    Returns
    -------
    tuple of int
        ``(windows, mac, linux)``; each entry is 1 when the platform is
        flagged as supported and 0 otherwise.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is a string that is not a valid Python literal.
    KeyError
        If the parsed dict lacks one of the three platform keys.
    """
    if not isinstance(x, str):
        return 0, 0, 0
    # literal_eval only accepts Python literals, so text read from the CSV
    # can never execute code the way eval() would.
    data = ast.literal_eval(x)
    return tuple(int(bool(data[key])) for key in ('windows', 'mac', 'linux'))
        
# List the distinct raw values of the 'platforms' column.
df['platforms'].unique()

array(["{'windows': True, 'mac': True, 'linux': True}",
       "{'windows': True, 'mac': False, 'linux': False}",
       "{'windows': True, 'mac': True, 'linux': False}",
       "{'windows': True, 'mac': False, 'linux': True}",
       "{'windows': False, 'mac': True, 'linux': False}",
       "{'windows': False, 'mac': True, 'linux': True}",
       "{'windows': False, 'mac': False, 'linux': True}"], dtype=object)

In [14]:
# One (windows, mac, linux) flag tuple per row.
res = df['platforms'].map(fix_platforms)

In [15]:
# Split the per-row flag tuples into one list per platform, attach them as
# columns, and drop the raw 'platforms' text column.
windows = [flags[0] for flags in res]
mac = [flags[1] for flags in res]
linux = [flags[2] for flags in res]
df['windows'], df['mac'], df['linux'] = windows, mac, linux
df = df.drop(columns='platforms')
df.loc[[0]]

Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,metacritic,categories,release_date,controller_support,price_overview,windows,mac,linux
0,game,Dota 2,570,0.0,True,"[1241930, 652720]",['Valve'],['Valve'],"{'score': 90, 'url': 'https://www.metacritic.com/game/pc/dota-2?ftag=MCD-06-10aaa1f'}","[{'id': 1, 'description': 'Multi-player'}, {'id': 9, 'description': 'Co-op'}, {'id': 29, 'description': 'Steam Trading Cards'}, {'id': 30, 'description': 'Steam Workshop'}, {'id': 40, 'description...","{'coming_soon': False, 'date': '9 Jul, 2013'}",,,1,1,1


### metacritic

In [16]:
# Leave NaN as is (impute after train test split).
# Parse each stringified metacritic dict once. literal_eval only accepts
# Python literals, so text read from the CSV cannot execute code the way
# eval() would.
metacritic_parsed = df['metacritic'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else None
)
df['metacritic_score'] = metacritic_parsed.apply(
    lambda entry: entry['score'] if isinstance(entry, dict) else None
)
df['metacritic_url'] = metacritic_parsed.apply(
    lambda entry: entry['url'] if isinstance(entry, dict) else None
)

In [17]:
# The raw metacritic column is no longer needed once score and url are extracted.
df = df.drop(columns=['metacritic'])
df.loc[[0]]

Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,categories,release_date,controller_support,price_overview,windows,mac,linux,metacritic_score,metacritic_url
0,game,Dota 2,570,0.0,True,"[1241930, 652720]",['Valve'],['Valve'],"[{'id': 1, 'description': 'Multi-player'}, {'id': 9, 'description': 'Co-op'}, {'id': 29, 'description': 'Steam Trading Cards'}, {'id': 30, 'description': 'Steam Workshop'}, {'id': 40, 'description...","{'coming_soon': False, 'date': '9 Jul, 2013'}",,,1,1,1,90.0,https://www.metacritic.com/game/pc/dota-2?ftag=MCD-06-10aaa1f


### categories

In [18]:
# Collect every description text seen for each category id, to reveal the
# different spellings/languages used for the same id.
# literal_eval only accepts Python literals, so text read from the CSV cannot
# execute code the way eval() would.
dic = {}
for arr in df['categories'].values:
    if not isinstance(arr, str):
        continue  # missing categories are NaN, not text
    for data in ast.literal_eval(arr):
        dic.setdefault(data['id'], set()).add(data['description'])

dic

{1: {'Multi-player', 'Multijoueur', 'Multijugador'},
 9: {'Co-op', 'Cooperativos', 'Coopération'},
 29: {'Cartes à échanger Steam', 'Cromos de Steam', 'Steam Trading Cards'},
 30: {'Steam Workshop', 'Workshop Steam', 'Мастерская Steam'},
 40: {'SteamVR Collectibles'},
 35: {'Compras dentro de la aplicación', 'In-App Purchases'},
 8: {'Antitriche Valve activée',
  'Con sist. antitrampas de Valve',
  'Valve Anti-Cheat enabled'},
 22: {'Logros de Steam',
  'Steam Achievements',
  'Steam-Errungenschaften',
  'Достижения Steam'},
 28: {'Compat. total con control',
  'Compat. total con mando',
  'Full controller support',
  'Контроллер (полностью)'},
 15: {'Estadísticas', 'Stats'},
 41: {'Remote Play on Phone', 'Remote Play para móviles'},
 42: {'Remote Play on Tablet', 'Remote Play para tabletas'},
 43: {'Remote Play on TV', 'Remote Play para TV'},
 49: {'JcJ', 'PvP'},
 36: {'JcJ en ligne', 'JcJ en línea', 'Online PvP'},
 38: {'Cooperativos en línea', 'Coopération en ligne', 'Online Co-op'}

In [19]:
# Canonical English label for each Steam category id. The raw data carries the
# same id with descriptions in several languages, so ids are mapped to a single
# name. Id 0 is a placeholder for rows that have no categories.
mapping = {1: 'Multi-player',
 9: 'Co-op',
 29: 'Steam Trading Cards',
 30: 'Steam Workshop',
 40: 'SteamVR Collectibles',
 35: 'In-App Purchases',
 8: 'Valve Anti-Cheat enabled',
 22: 'Steam Achievements',
 28: 'Full controller support',
 15: 'Stats',
 41: 'Remote Play on Phone',
 42: 'Remote Play on Tablet',
 43: 'Remote Play on TV',
 49: 'PvP', 
 36: 'Online PvP',
 38: 'Online Co-op', 
 20: 'MMO',
 27: 'Cross-Platform Multiplayer', 
 13: 'Captions available', 
 18: 'Partial Controller Support',
 17: 'Includes level editor',
 14: 'Commentary available',
 2: 'Single-player',
 47: 'LAN PvP',
 48: 'LAN Co-op',
 23: 'Steam Cloud',
 16: 'Includes Source SDK',
 44: 'Remote Play Together',
 37: 'Shared/Split Screen PvP',
 39: 'Shared/Split Screen Co-op',
 24: 'Shared/Split Screen', 
 53: 'VR Supported',
 51: 'Steam Workshop',  # same label as id 30
 52: 'Tracked Controller Support',
 25: 'Steam Leaderboards',
 31: 'VR Support',
 32: 'Steam Turn Notifications',
 54: 'VR Only',  # English label; 'Solo RV' is the Spanish storefront text
 19: 'Mods',
 6: 'Mods (require HL2)',
 10: 'Game demo',
 21: 'Downloadable Content',
 50: 'Additional High-Quality Audio',
 33: 'Native Steam Controller Support',
 0: 'No Category'}

In [20]:
# Replace each row's list of category dicts with the list of their integer ids.
# Rows without categories (NaN) get the placeholder id 0, which `mapping`
# labels 'No Category'. The placeholder must be the int 0: `mapping` is keyed
# by ints, so the string "0" would look up to None.
# literal_eval only accepts Python literals, so text read from the CSV cannot
# execute code the way eval() would.
df['categories'] = df['categories'].apply(
    lambda x: [int(entry['id']) for entry in ast.literal_eval(x)] if isinstance(x, str) else [0]
)

# Map the category ids to category names using the mapping dictionary
# (ids that are absent from `mapping` become None).
df['categories'] = df['categories'].apply(lambda ids: [mapping.get(i) for i in ids])

df.loc[[0]]

Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,categories,release_date,controller_support,price_overview,windows,mac,linux,metacritic_score,metacritic_url
0,game,Dota 2,570,0.0,True,"[1241930, 652720]",['Valve'],['Valve'],"[Multi-player, Co-op, Steam Trading Cards, Steam Workshop, SteamVR Collectibles, In-App Purchases, Valve Anti-Cheat enabled]","{'coming_soon': False, 'date': '9 Jul, 2013'}",,,1,1,1,90.0,https://www.metacritic.com/game/pc/dota-2?ftag=MCD-06-10aaa1f


### release_date

In [21]:
# Parse the stringified release-date dict; non-string cells are left untouched.
# literal_eval only accepts Python literals, so text read from the CSV cannot
# execute code the way eval() would.
df['release_date'] = df['release_date'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [22]:
# Count the games flagged as not yet released (rows with a release_date only).
temp = df[df['release_date'].notnull()]
unreleased_mask = temp['release_date'].apply(lambda info: info['coming_soon'])
len(temp[unreleased_mask])

118

In [23]:
# Show the release_date entries of the games flagged as coming soon.
temp.loc[temp['release_date'].apply(lambda info: info['coming_soon']), 'release_date']

9244         {'coming_soon': True, 'date': 'Coming soon'}
25858        {'coming_soon': True, 'date': 'Coming soon'}
27804        {'coming_soon': True, 'date': 'Coming soon'}
31507      {'coming_soon': True, 'date': 'December 2024'}
33572    {'coming_soon': True, 'date': 'To be announced'}
                               ...                       
61629       {'coming_soon': True, 'date': '28 Sep, 2023'}
61631        {'coming_soon': True, 'date': 'Coming soon'}
61633        {'coming_soon': True, 'date': 'Coming soon'}
61635        {'coming_soon': True, 'date': 'Coming soon'}
61637        {'coming_soon': True, 'date': 'Coming soon'}
Name: release_date, Length: 118, dtype: object

In [24]:
# Split the parsed release-date dict into a boolean 'coming_soon' column and
# the raw date text (kept as a string; it is not parsed into a datetime here).
release_info = df['release_date']
df['coming_soon'] = release_info.apply(lambda info: info['coming_soon'])
df['release_date'] = release_info.apply(lambda info: info['date'])
df.loc[[0]]

Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,categories,release_date,controller_support,price_overview,windows,mac,linux,metacritic_score,metacritic_url,coming_soon
0,game,Dota 2,570,0.0,True,"[1241930, 652720]",['Valve'],['Valve'],"[Multi-player, Co-op, Steam Trading Cards, Steam Workshop, SteamVR Collectibles, In-App Purchases, Valve Anti-Cheat enabled]","9 Jul, 2013",,,1,1,1,90.0,https://www.metacritic.com/game/pc/dota-2?ftag=MCD-06-10aaa1f,False


### controller_support

In [25]:
# Inspect the distinct values of 'controller_support' before encoding it.
df['controller_support'].unique()

array([nan, 'full'], dtype=object)

In [26]:
# Encode controller support as a flag: 1 when a value is present ("full"),
# 0 when the cell is NaN.
df['controller_support'] = df['controller_support'].notna().astype('int64')

df.loc[[0]]

Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,categories,release_date,controller_support,price_overview,windows,mac,linux,metacritic_score,metacritic_url,coming_soon
0,game,Dota 2,570,0.0,True,"[1241930, 652720]",['Valve'],['Valve'],"[Multi-player, Co-op, Steam Trading Cards, Steam Workshop, SteamVR Collectibles, In-App Purchases, Valve Anti-Cheat enabled]","9 Jul, 2013",0,,1,1,1,90.0,https://www.metacritic.com/game/pc/dota-2?ftag=MCD-06-10aaa1f,False


### price_overview

In [27]:
# Parse the stringified price dict; non-string cells (NaN) are left untouched.
# literal_eval only accepts Python literals, so text read from the CSV cannot
# execute code the way eval() would.
df['price_overview'] = df['price_overview'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df['price_overview'][:5]

0                                                                                                                                          NaN
1                                                                                                                                          NaN
2                                                                                                                                          NaN
3                                                                                                                                          NaN
4    {'currency': 'SGD', 'initial': 3400, 'final': 1700, 'discount_percent': 50, 'initial_formatted': 'S$34.00', 'final_formatted': 'S$17.00'}
Name: price_overview, dtype: object

In [28]:
# Give rows without price data a placeholder dict with the same keys as real
# entries, so the per-key extraction works for every row (the actual values
# are imputed after the train/test split).
fix_index = df.index[df['price_overview'].isnull()]
lst = [
    value if isinstance(value, dict) else {
        'currency': '',
        'initial': None,
        'final': None,
        'discount_percent': None,
        'initial_formatted': 'NaN',
        'final_formatted': 'NaN',
    }
    for value in df['price_overview']
]
df['price_overview'] = lst
# Sanity check: this should display an empty frame.
df[df['price_overview'].isnull()]

Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,categories,release_date,controller_support,price_overview,windows,mac,linux,metacritic_score,metacritic_url,coming_soon


In [29]:
import re
def sub_price(s):
    """Strip everything except digits from a formatted price string.

    The sentinel text 'NaN' (used for rows without price data) maps to None;
    any other string is returned with all non-digit characters removed,
    e.g. 'S$34.00' -> '3400'.
    """
    return None if s == 'NaN' else re.sub("[^0-9]", "", s)

# Copy the plain fields of the price dict into their own columns.
for _field in ('currency', 'initial', 'final', 'discount_percent'):
    df[_field] = df['price_overview'].apply(lambda price, key=_field: price[key])

# The formatted prices are reduced to their digits via sub_price.
for _field in ('initial_formatted', 'final_formatted'):
    df[_field] = df['price_overview'].apply(lambda price, key=_field: sub_price(price[key]))

In [30]:
# The raw price dict is redundant now that its fields are separate columns.
df = df.drop(columns=['price_overview'])
df.head()

Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,categories,release_date,controller_support,windows,mac,linux,metacritic_score,metacritic_url,coming_soon,currency,initial,final,discount_percent,initial_formatted,final_formatted
0,game,Dota 2,570,0.0,True,"[1241930, 652720]",['Valve'],['Valve'],"[Multi-player, Co-op, Steam Trading Cards, Steam Workshop, SteamVR Collectibles, In-App Purchases, Valve Anti-Cheat enabled]","9 Jul, 2013",0,1,1,1,90.0,https://www.metacritic.com/game/pc/dota-2?ftag=MCD-06-10aaa1f,False,,,,,,
1,game,Counter-Strike: Global Offensive,730,0.0,True,[],"['Valve', 'Hidden Path Entertainment']",['Valve'],"[Multi-player, Steam Achievements, Full controller support, Steam Trading Cards, Steam Workshop, In-App Purchases, Valve Anti-Cheat enabled, Stats, Remote Play on Phone, Remote Play on Tablet, Rem...",21 AGO 2012,1,1,1,1,83.0,https://www.metacritic.com/game/pc/counter-strike-global-offensive?ftag=MCD-06-10aaa1f,False,,,,,,
2,game,Apex Legends™,1172470,0.0,True,[],['Respawn Entertainment'],['Electronic Arts'],"[Multi-player, PvP, Online PvP, Co-op, Online Co-op, Steam Achievements, Full controller support, Steam Trading Cards, In-App Purchases]","4 Nov, 2020",1,1,0,0,88.0,https://www.metacritic.com/game/pc/apex-legends?ftag=MCD-06-10aaa1f,False,,,,,,
3,game,PUBG: BATTLEGROUNDS,578080,0.0,True,[],"['KRAFTON, Inc.']","['KRAFTON, Inc.']","[Multi-player, PvP, Online PvP, Stats, Remote Play on Phone, Remote Play on Tablet]","21 Dec, 2017",0,1,0,0,,,False,,,,,,
4,game,New World,1063730,0.0,False,[],['Amazon Games'],['Amazon Games'],"[Multi-player, MMO, PvP, Online PvP, Co-op, Online Co-op, Steam Achievements, In-App Purchases]","28 Sep, 2021",0,1,0,0,70.0,https://www.metacritic.com/game/pc/new-world?ftag=MCD-06-10aaa1f,False,SGD,3400.0,1700.0,50.0,3400.0,1700.0


In [31]:
# Summarise column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61457 entries, 0 to 61639
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   type                61457 non-null  object 
 1   name                61457 non-null  object 
 2   steam_appid         61457 non-null  int64  
 3   required_age        61457 non-null  object 
 4   is_free             61457 non-null  object 
 5   dlc                 61457 non-null  object 
 6   developers          61457 non-null  object 
 7   publishers          61457 non-null  object 
 8   categories          61457 non-null  object 
 9   release_date        61457 non-null  object 
 10  controller_support  61457 non-null  int64  
 11  windows             61457 non-null  int64  
 12  mac                 61457 non-null  int64  
 13  linux               61457 non-null  int64  
 14  metacritic_score    3857 non-null   float64
 15  metacritic_url      3857 non-null   object 
 16  comi

In [33]:
# Persist the cleaned frame. to_csv writes the index by default, so it is
# stored as an unnamed first column; a plain read_csv will surface it as
# 'Unnamed: 0' unless index_col=0 is passed.
# NOTE(review): consider index=False once downstream readers are confirmed not to rely on that column.
df.to_csv("../Data/steam_store_games_clean.csv")