# Cleaning data

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import json
import re

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("archive/SteamApps.csv")
temp = pd.read_csv("archive/steam_games.csv")
df = pd.concat([df, temp])
df

Unnamed: 0.2,Unnamed: 0,type,name,steam_appid,required_age,is_free,detailed_description,about_the_game,short_description,fullgame,...,controller_support,dlc,demos,achievements,reviews,recommendations,drm_notice,metacritic,alternate_appid,Unnamed: 0.1
0,0,demo,Pin Them Demo,1904630.0,0,True,,,,"{'appid': '1764220', 'name': 'Pin Them'}",...,,,,,,,,,,
1,1,game,Al-Qadim: The Genie's Curse,1904640.0,0,False,"As an outcast, betrothed to the caliph's daugh...","As an outcast, betrothed to the caliph's daugh...",Experience the mysterious Al-Qadim game world ...,,...,,,,,,,,,,
2,2,game,Dungeons & Dragons - Stronghold: Kingdom Simul...,1904650.0,0,False,Build and command your own kingdom in the Dung...,Build and command your own kingdom in the Dung...,Run your own kingdom in the legendary Dungeons...,,...,,,,,,,,,,
3,3,game,Chapel 3-D: The Ascent,1904680.0,0,False,"<h1>🔥 WISHLIST TO SURVIVE 🔥</h1><p><img src=""h...","<a href=""https://steamcommunity.com/linkfilter...","🔥 Chapel 3-D: The Ascent is a break-neck, viol...",,...,,,,,,,,,,
4,4,game,VTuber Gallery : Anime Pose,1904690.0,0,True,<strong>VTuber Gallery</strong> is <strong>#1 ...,<strong>VTuber Gallery</strong> is <strong>#1 ...,VTuber Gallery is #1 anime pose app that allow...,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20554,270,video,The Chair,966800.0,13,False,<h1>Steam Video</h1><p>This content is only av...,Richard Sullivan has spent ten years on death ...,Richard Sullivan has spent ten years on death ...,,...,full,,,,,,,,,270.0
20555,271,video,Head,966810.0,13,False,<h1>Steam Video</h1><p>This content is only av...,A group of 20-something puppets go on a weeken...,A group of 20-something puppets go on a weeken...,,...,full,,,,,,,,,271.0
20556,272,video,The Barn,966820.0,13,False,<h1>Steam Video</h1><p>This content is only av...,It is Halloween 1989. Best friends Sam and Jos...,Halloween 1989. Sam and Josh are enjoying what...,,...,full,,,,,,,,,272.0
20557,273,video,Be My Cat: A Film for Anne,966830.0,13,False,<h1>Steam Video</h1><p>This content is only av...,Obsessed with convincing Hollywood actress Ann...,Obsessed with convincing Hollywood actress Ann...,,...,full,,,,,,,,,273.0


In [4]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.drop(columns=["header_image", "capsule_image", "capsule_imagev5", "website",
                 "screenshots", "background", "background_raw", "content_descriptors",
                 "support_info", "short_description", "about_the_game", "short_description",
                 "detailed_description", "legal_notice", "pc_requirements", "mac_requirements",
                 "linux_requirements", "movies", "ratings",
                 "ext_user_account_notice", "achievements", "reviews"], inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(3)

Unnamed: 0,type,name,steam_appid,required_age,is_free,fullgame,developers,publishers,package_groups,platforms,...,price_overview,packages,genres,controller_support,dlc,demos,recommendations,drm_notice,metacritic,alternate_appid
0,demo,Pin Them Demo,1904630.0,0,True,"{'appid': '1764220', 'name': 'Pin Them'}",['PRODUKTIVKELLER Studios'],['PRODUKTIVKELLER Studios'],[],"{'windows': 'True', 'mac': 'False', 'linux': '...",...,,,,,,,,,,
1,game,Al-Qadim: The Genie's Curse,1904640.0,0,False,,['Cyberlore Studios'],['SNEG'],"[{'name': 'default', 'title': ""Buy Al-Qadim: T...","{'windows': 'True', 'mac': 'False', 'linux': '...",...,"{'currency': 'VND', 'initial': 8000000, 'final...",[685932],"[{'id': '1', 'description': 'Action'}, {'id': ...",,,,,,,
2,game,Dungeons & Dragons - Stronghold: Kingdom Simul...,1904650.0,0,False,,['Stormfront Studios'],['SNEG'],"[{'name': 'default', 'title': 'Buy Dungeons & ...","{'windows': 'True', 'mac': 'False', 'linux': '...",...,"{'currency': 'VND', 'initial': 8000000, 'final...",[685935],"[{'id': '28', 'description': 'Simulation'}, {'...",,,,,,,


## Get platforms

___

In [5]:
def true_false_string_to_json(x):
    if pd.isna(x):
        return {}
    try:
        return json.loads(x.replace("\'", "\"").replace("\"True\"", "true").replace("\"False\"", "false"))
    except:
        return json.loads(x.replace("\'", "\"").replace("True", "true").replace("False", "false"))


def simple_string_to_json(x, returning={}):
    if pd.isna(x):
        return returning
    else:
        return json.loads(x.replace("\'", "\""))


def string_with_apostrophe_to_json(x):
    if pd.isna(x):
        return {}
    else:
        try:
            temp = re.sub(r"\b'\B|\B'\b|\B'\B", "\"", x)
            return json.loads(temp)
        except:
            return {}

In [6]:
df["platforms"].dropna().head(5)

0    {'windows': 'True', 'mac': 'False', 'linux': '...
1    {'windows': 'True', 'mac': 'False', 'linux': '...
2    {'windows': 'True', 'mac': 'False', 'linux': '...
3    {'windows': 'True', 'mac': 'False', 'linux': '...
4    {'windows': 'True', 'mac': 'False', 'linux': '...
Name: platforms, dtype: object

In [7]:
df["platforms"] = df["platforms"].apply(true_false_string_to_json)

In [8]:
df["platform_windows"] = df["platforms"].apply(lambda x: x.get("windows"))
df["platform_mac"] = df["platforms"].apply(lambda x: x.get("mac"))
df["platform_linux"] = df["platforms"].apply(lambda x: x.get("linux"))
df.drop(columns=["platforms"], inplace=True)

## Get required age

___

In [9]:
df["required_age"].unique()

array([0, 18, 16, 12, 13, 17, 10, 15, 7, 3, 5, 171, 14, 6, 11, '0', '15',
       '3', '16', '17', '12', '18', '10', '11', '7', '13', '14', '18+', 1,
       4, '6', '1'], dtype=object)

In [10]:
def age_to_number(x):
    if isinstance(x, int):
        return x
    else:
        x = x.replace("+", "")
        return int(x)

df["required_age"] = df["required_age"].apply(age_to_number)

## Get release date

___

In [11]:
def convert_to_datetime(x):
    try:
        return pd.to_datetime(x)
    except:
        return None

In [12]:
df["release_date"] = df["release_date"].apply(true_false_string_to_json)

In [13]:
df["coming_date"] = df["release_date"].apply(lambda x: convert_to_datetime(x["date"]))
df["coming_soon"] = df["release_date"].apply(lambda x: x["coming_soon"])
df.drop(columns=["release_date"], inplace=True)

## Simplify the varibles

Including packages, prices and discounts, etc.

___

In [14]:
df["package_groups"].dropna().head(5)

0                                                   []
1    [{'name': 'default', 'title': "Buy Al-Qadim: T...
2    [{'name': 'default', 'title': 'Buy Dungeons & ...
3                                                   []
4                                                   []
Name: package_groups, dtype: object

In [15]:
df["package_number"] = df["package_groups"].apply(lambda x: len(x))
df.drop(columns=["package_groups"], inplace=True)

In [16]:
df["price_overview"].dropna().head(5)

1    {'currency': 'VND', 'initial': 8000000, 'final...
2    {'currency': 'VND', 'initial': 8000000, 'final...
5    {'currency': 'VND', 'initial': 7350000, 'final...
6    {'currency': 'VND', 'initial': 34900000, 'fina...
9    {'currency': 'VND', 'initial': 53500000, 'fina...
Name: price_overview, dtype: object

In [17]:
df["price_overview"] = df["price_overview"].apply(simple_string_to_json)

In [18]:
df["currency"] = df["price_overview"].apply(lambda x: x.get("currency", None))
df["initial_price"] = df["price_overview"].apply(lambda x: x.get("initial", None))
df["final_price"] = df["price_overview"].apply(lambda x: x.get("final", None))
df["discount_percent"] = df["price_overview"].apply(lambda x: x.get("discount_percent", None))
df.drop(columns=["price_overview"], inplace=True)

In [19]:
df["fullgame"].dropna().head()

0              {'appid': '1764220', 'name': 'Pin Them'}
5     {'appid': '469800', 'name': 'Evolution Board G...
8     {'appid': '1712110', 'name': 'Deep Space Outpo...
16    {'appid': '1884430', 'name': 'Succubus Girl St...
19                {'appid': '1827970', 'name': 'STAMP'}
Name: fullgame, dtype: object

In [20]:
def string_with_apostrophe_to_json(x):
    if pd.isna(x):
        return {}
    else:
        try:
            temp = re.sub(r"\b'\B|\B'\b|\B'\B", "\"", x)
            return json.loads(temp)
        except:
            return {}

In [21]:
df["fullgame"] = df["fullgame"].apply(string_with_apostrophe_to_json)
df["fullgame_appid"] = df["fullgame"].apply(lambda x: x.get("appid", None))
df.drop(columns=["fullgame"], inplace=True)

## Get alternative games

___

In [22]:
df["demos"].dropna().head()

6     [{'appid': 1955620, 'description': ''}]
9     [{'appid': 2612930, 'description': ''}]
13    [{'appid': 1990120, 'description': ''}]
34    [{'appid': 2129060, 'description': ''}]
53    [{'appid': 2021590, 'description': ''}]
Name: demos, dtype: object

In [23]:
def get_appid_with_exception(x):
    try:
        return x.get("appid", None)
    except:
        return get_appid_with_exception(x[0])

In [24]:
df["demos"] = df["demos"].apply(string_with_apostrophe_to_json)
df["demo_appid"] = df["demos"].apply(get_appid_with_exception)
df.drop(columns=["demos"], inplace=True)

In [25]:
df["recommendations"].dropna().head()

9        {'total': 252}
13      {'total': 1096}
21       {'total': 415}
53     {'total': 12769}
107      {'total': 110}
Name: recommendations, dtype: object

In [26]:
df["recommendations"] = df["recommendations"].apply(simple_string_to_json)
df["recommendations"] = df["recommendations"].apply(lambda x: x.get("total", None))

In [27]:
df["metacritic"] = df["metacritic"].apply(simple_string_to_json)
df["metacritic"] = df["metacritic"].apply(lambda x: x.get("score", None))

## Get developers

___

In [28]:
np.sort(df.loc[:, "developers"].apply(lambda x: None if pd.isna(x) else len(x)).unique())

array([  5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.,  13.,  14.,  15.,
        16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,  26.,
        27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,  36.,  37.,
        38.,  39.,  40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,  48.,
        49.,  50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,  59.,
        60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,  70.,
        71.,  72.,  73.,  74.,  75.,  76.,  77.,  78.,  79.,  80.,  81.,
        82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,  91.,  92.,
        93.,  94.,  96.,  97.,  99., 100., 101., 102., 103., 104., 105.,
       106., 107., 108., 109., 110., 111., 112., 113., 114., 115., 119.,
       123., 124., 126., 128., 131., 133., 134., 135., 136., 137., 139.,
       141., 144., 146., 148., 149., 151., 152., 155., 156., 157., 164.,
       166., 167., 171., 178., 181., 185., 187., 202., 205., 218., 220.,
       228., 231., 235., 251., 252., 307., 341., 38

In [29]:
df["developers_amount"] = df["developers"].apply(lambda x: None if pd.isna(x) else len(x))
df["publishers_amount"] = df["publishers"].apply(lambda x: None if pd.isna(x) else len(x))
df.drop(columns=["developers", "publishers"], inplace=True)

## Get categories

Check if the game is single-player or multi-player, has VR support, or controller supp.

___

In [30]:
df.loc[0, "categories"]

"[{'id': 10, 'description': 'Game demo'}]"

In [31]:
false_table = np.full((df.shape[0], 2), False)

player_table = pd.DataFrame(false_table, columns=["single", "multi"])
df = pd.concat([df, player_table], axis=1)

In [32]:
df["categories"] = df["categories"].apply(lambda x: simple_string_to_json(x, []))

In [33]:
player_mode_code = {
	"Single-player": "single",
	"Multi-player": "multi",
	"PvP": "multi",
	"Online PvP": "multi",
	"Shared/Split Screen PvP": "multi",
	"Cross-Platform Multiplayer": "multi",
	"Remote Play Together": "multi",
	"Co-op": "multi",
	"Online Co-op": "multi",
	"Shared/Split Screen Co-op": "multi",
	"LAN PvP": "multi",
	"MMO": "multi",
	"LAN Co-op": "multi"
}

count = 0
for i in range(df.shape[0]):
    for category in df.loc[i, "categories"]:
        key = player_mode_code.get(category["description"], "")
        if len(key) != "":
            df.loc[i, key] = True
    if count % 10000 == 0:
        print(f"Step {count} done")
    count += 1

df[["single", "multi"]]

Step 0 done
Step 10000 done
Step 20000 done
Step 30000 done
Step 40000 done
Step 50000 done
Step 60000 done
Step 70000 done
Step 80000 done
Step 90000 done
Step 100000 done
Step 110000 done


Unnamed: 0,single,multi
0,False,False
1,True,False
2,True,False
3,True,False
4,False,False
...,...,...
115965,False,False
115966,False,False
115967,False,False
115968,False,False


In [34]:
# Some games can have both singleplayer and multiplayer modes but they must have either one.
(~(~df["single"] & ~df["multi"])).sum()

102584

In [35]:
false_table = np.full((df.shape[0], 2), False)

player_table = pd.DataFrame(false_table, columns=["support_vr", "support_controller"])
df = pd.concat([df, player_table], axis=1)

In [36]:
count = 0
for i in range(df.shape[0]):
    for category in df.loc[i, "categories"]:
        if category["description"].find("VR") != -1:
            df.loc[i, "support_vr"] = True
        elif category["description"].find("ontrol") != -1:   # Control, case-insensitive
            df.loc[i, "support_controller"] = True
    if count % 10000 == 0:
        print(f"Step {count} done")
    count += 1

df[["support_vr", "support_controller"]]

Step 0 done
Step 10000 done
Step 20000 done
Step 30000 done
Step 40000 done
Step 50000 done
Step 60000 done
Step 70000 done
Step 80000 done
Step 90000 done
Step 100000 done
Step 110000 done


Unnamed: 0,support_vr,support_controller
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
115965,False,True
115966,False,True
115967,False,True
115968,False,True


In [37]:
df.drop(columns=["categories"], inplace=True)

In [39]:
df.drop(columns=[""], inplace=True)

In [40]:
df.columns

Index(['type', 'name', 'steam_appid', 'required_age', 'is_free',
       'supported_languages', 'packages', 'genres', 'controller_support',
       'dlc', 'recommendations', 'drm_notice', 'metacritic', 'alternate_appid',
       'platform_windows', 'platform_mac', 'platform_linux', 'coming_date',
       'coming_soon', 'package_number', 'currency', 'initial_price',
       'final_price', 'discount_percent', 'fullgame_appid', 'demo_appid',
       'developers_amount', 'publishers_amount', 'single', 'multi',
       'support_vr', 'support_controller'],
      dtype='object')

## Get languages

Selected languages: English, French, German, Spanish, Portuguese, Italian, Russian, Japanese, Chinese (both simplified and traditional), Korean, and Arabic.

___

In [41]:
def get_language_list(x):
    if pd.isna(x):
        return []
    x = x.replace("<strong>*</strong>", "")
    x = x.replace("<br>", "")
    x = x.replace("languages with full audio support", "")
    x = x.split(", ")
    return x

df["supported_languages"] = df["supported_languages"].apply(get_language_list)
df["supported_languages"].head()

0                                                   []
1           [English, French, German, Spanish - Spain]
2                            [English, French, German]
3    [English, French, German, Spanish - Spain, Por...
4                                            [English]
Name: supported_languages, dtype: object

In [42]:
with open("language_to_code.json", encoding="utf8") as file:
    language_codes = file.read()
    language_codes = json.loads(language_codes)

In [43]:
false_table = np.full((df.shape[0], 11), False)

lang_table = pd.DataFrame(false_table, columns=["en", "fr", "de", "es", "po", "zh", "ja", "ko", "it", "ru", "ar"])
df = pd.concat([df, lang_table], axis=1)

In [44]:
count = 0
for i in range(df.shape[0]):
    for lang in df.loc[i, "supported_languages"]:
        key = language_codes.get(lang, "")
        if len(key) != "":
            df.loc[i, key] = True
    if count % 10000 == 0:
        print(f"Step {count} done")
    count += 1

df[["en", "fr", "de", "es", "po", "zh", "ja", "ko", "it", "ru", "ar"]]

Step 0 done
Step 10000 done
Step 20000 done
Step 30000 done
Step 40000 done
Step 50000 done
Step 60000 done
Step 70000 done
Step 80000 done
Step 90000 done
Step 100000 done
Step 110000 done


Unnamed: 0,en,fr,de,es,po,zh,ja,ko,it,ru,ar
0,False,False,False,False,False,False,False,False,False,False,False
1,True,True,True,True,False,False,False,False,False,False,False
2,True,True,True,False,False,False,False,False,False,False,False
3,True,True,True,True,True,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
115965,True,False,False,False,False,False,False,False,False,False,False
115966,True,False,False,False,False,False,False,False,False,False,False
115967,True,False,False,False,False,False,False,False,False,False,False
115968,True,False,False,False,False,False,False,False,False,False,False


In [45]:
df[["en", "fr", "de", "es", "po", "zh", "ja", "ko", "it", "ru", "ar"]].sum()

en    103255
fr     30944
de     31417
es     30279
po     19632
zh     30914
ja     26932
ko     17287
it     22665
ru     25726
ar      4501
dtype: int64

In [46]:
df.rename(columns={
    "en": "lang_en",
    "fr": "lang_fr",
    "de": "lang_de",
    "es": "lang_es",
    "po": "lang_po",
    "zh": "lang_zh",
    "ja": "lang_ja",
    "ko": "lang_ko",
    "it": "lang_it",
    "ru": "lang_ru",
    "ar": "lang_ar"
}, inplace=True)

In [48]:
df.drop(columns=["supported_languages", ""], inplace=True)

In [49]:
df.columns

Index(['type', 'name', 'steam_appid', 'required_age', 'is_free', 'packages',
       'genres', 'controller_support', 'dlc', 'recommendations', 'drm_notice',
       'metacritic', 'alternate_appid', 'platform_windows', 'platform_mac',
       'platform_linux', 'coming_date', 'coming_soon', 'package_number',
       'currency', 'initial_price', 'final_price', 'discount_percent',
       'fullgame_appid', 'demo_appid', 'developers_amount',
       'publishers_amount', 'single', 'multi', 'support_vr',
       'support_controller', 'lang_en', 'lang_fr', 'lang_de', 'lang_es',
       'lang_po', 'lang_zh', 'lang_ja', 'lang_ko', 'lang_it', 'lang_ru',
       'lang_ar'],
      dtype='object')

## Extract genres

___

In [50]:
false_table = np.full((df.shape[0], 2), False)

genre_table = pd.DataFrame(false_table, columns=["tool", "nsfw"])
df = pd.concat([df, genre_table], axis=1)

In [51]:
df["genres"] = df["genres"].apply(lambda x: simple_string_to_json(x, []))

In [52]:
genre_code = {
	"Animation & Modeling": "tool",
	"Design & Illustration": "tool",
	"Photo Editing": "tool",
	"Utilities": "tool",
	"Web Publishing": "tool",
	"Video Production": "tool",
	"Audio Production": "tool",
	"Software Training": "tool",
	"Accounting": "tool",
	"Movie": "tool",
	"Gore": "nsfw",
	"Documentary": "tool",
	"Nudity": "nsfw",
	"Sexual Content": "nsfw",
	"Tutorial": "tool"
}

count = 0
for i in range(df.shape[0]):
    for genre in df.loc[i, "genres"]:
        key = genre_code.get(genre["description"], "")
        if len(key) != "":
            df.loc[i, key] = True
    if count % 10000 == 0:
        print(f"Step {count} done")
    count += 1

df[["tool", "nsfw"]]

Step 0 done
Step 10000 done
Step 20000 done
Step 30000 done
Step 40000 done
Step 50000 done
Step 60000 done
Step 70000 done
Step 80000 done
Step 90000 done
Step 100000 done
Step 110000 done


Unnamed: 0,tool,nsfw
0,False,False
1,False,False
2,False,False
3,False,False
4,True,False
...,...,...
115965,True,False
115966,True,False
115967,True,False
115968,True,False


In [55]:
df.drop(columns=["genres", ""], inplace=True)

In [56]:
df.columns

Index(['type', 'name', 'steam_appid', 'required_age', 'is_free', 'packages',
       'controller_support', 'dlc', 'recommendations', 'drm_notice',
       'metacritic', 'alternate_appid', 'platform_windows', 'platform_mac',
       'platform_linux', 'coming_date', 'coming_soon', 'package_number',
       'currency', 'initial_price', 'final_price', 'discount_percent',
       'fullgame_appid', 'demo_appid', 'developers_amount',
       'publishers_amount', 'single', 'multi', 'support_vr',
       'support_controller', 'lang_en', 'lang_fr', 'lang_de', 'lang_es',
       'lang_po', 'lang_zh', 'lang_ja', 'lang_ko', 'lang_it', 'lang_ru',
       'lang_ar', 'tool', 'nsfw'],
      dtype='object')

## Done!!!

___

In [57]:
df.drop(columns=["packages"], inplace=True)

In [58]:
df.loc[80000, :]

type                                          game
name                  Fill and Cross World Contest
steam_appid                              1398600.0
required_age                                     0
is_free                                      False
controller_support                             NaN
dlc                                            NaN
recommendations                                NaN
drm_notice                                     NaN
metacritic                                     NaN
alternate_appid                                NaN
platform_windows                              True
platform_mac                                 False
platform_linux                               False
coming_date                    2020-09-18 00:00:00
coming_soon                                  False
package_number                                 468
currency                                       VND
initial_price                            7350000.0
final_price                    

In [59]:
df.to_csv("archive/cleaned_steam_db_v1.csv")