In [1]:
import shutil

import pandas as pd
import numpy as np
import torchtext
from bs4 import BeautifulSoup
import requests

# Exploring the Data

In [2]:
# Load the three Steam CSV exports in one pass: core metadata, descriptions, media links.
main_df, desc_df, media_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/steam/steam.csv",
        "../data/steam/steam_description_data.csv",
        "../data/steam/steam_media_data.csv",
    )
]

In [3]:
main_df.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [4]:
desc_df.head()

Unnamed: 0,steam_appid,detailed_description,about_the_game,short_description
0,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...
1,20,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...
2,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...
3,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...
4,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...


In [5]:
media_df.head()

Unnamed: 0,steam_appid,header_image,screenshots,background,movies
0,10,https://steamcdn-a.akamaihd.net/steam/apps/10/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/10/...,
1,20,https://steamcdn-a.akamaihd.net/steam/apps/20/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/20/...,
2,30,https://steamcdn-a.akamaihd.net/steam/apps/30/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/30/...,
3,40,https://steamcdn-a.akamaihd.net/steam/apps/40/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/40/...,
4,50,https://steamcdn-a.akamaihd.net/steam/apps/50/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/50/...,


Each CSV is keyed by the Steam app id (`appid` in the main file, `steam_appid` in the description and media files), so we can merge all three CSVs into one.

In [6]:
# Attach the description columns to the core metadata by matching app ids.
merged_df = pd.merge(main_df, desc_df, left_on="appid", right_on="steam_appid")

In [7]:
# Attach the media columns the same way; merged_df already carries a
# `steam_appid` column from the previous merge, so pandas suffixes the two
# key columns with _x / _y.
merged_df = pd.merge(merged_df, media_df, left_on="appid", right_on="steam_appid")

In [8]:
# Print the full column list (the table preview below elides the middle columns),
# then show the first rows of the merged frame.
print(merged_df.columns)
merged_df.head()

Index(['appid', 'name', 'release_date', 'english', 'developer', 'publisher',
       'platforms', 'required_age', 'categories', 'genres', 'steamspy_tags',
       'achievements', 'positive_ratings', 'negative_ratings',
       'average_playtime', 'median_playtime', 'owners', 'price',
       'steam_appid_x', 'detailed_description', 'about_the_game',
       'short_description', 'steam_appid_y', 'header_image', 'screenshots',
       'background', 'movies'],
      dtype='object')


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,price,steam_appid_x,detailed_description,about_the_game,short_description,steam_appid_y,header_image,screenshots,background,movies
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,7.19,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,10,https://steamcdn-a.akamaihd.net/steam/apps/10/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/10/...,
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,3.99,20,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...,20,https://steamcdn-a.akamaihd.net/steam/apps/20/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/20/...,
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,...,3.99,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,30,https://steamcdn-a.akamaihd.net/steam/apps/30/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/30/...,
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,3.99,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,40,https://steamcdn-a.akamaihd.net/steam/apps/40/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/40/...,
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,...,3.99,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,50,https://steamcdn-a.akamaihd.net/steam/apps/50/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/50/...,


We can drop several columns from this new merged df.

In [9]:
# Keep only appid, name, developer, publisher, genres, detailed_description
# and header_image. Each unwanted column is listed exactly once, and the result
# is reassigned instead of mutated in place so the frame shown by earlier cells
# is not silently changed underneath them.
merged_df = merged_df.drop(
    columns=[
        # Redundant join keys left behind by the two merges.
        "steam_appid_x",
        "steam_appid_y",
        # Columns from the main table.
        "platforms",
        "required_age",
        "release_date",
        "positive_ratings",
        "negative_ratings",
        "median_playtime",
        "price",
        "owners",
        "categories",
        "steamspy_tags",
        "achievements",
        "average_playtime",
        "english",
        # Columns from the description table.
        "about_the_game",
        "short_description",
        # Columns from the media table.
        "background",
        "screenshots",
        "movies",
    ]
)

In [10]:
merged_df.head()

Unnamed: 0,appid,name,developer,publisher,genres,detailed_description,header_image
0,10,Counter-Strike,Valve,Valve,Action,Play the world's number 1 online action game. ...,https://steamcdn-a.akamaihd.net/steam/apps/10/...
1,20,Team Fortress Classic,Valve,Valve,Action,One of the most popular online action games of...,https://steamcdn-a.akamaihd.net/steam/apps/20/...
2,30,Day of Defeat,Valve,Valve,Action,Enlist in an intense brand of Axis vs. Allied ...,https://steamcdn-a.akamaihd.net/steam/apps/30/...
3,40,Deathmatch Classic,Valve,Valve,Action,Enjoy fast-paced multiplayer gaming with Death...,https://steamcdn-a.akamaihd.net/steam/apps/40/...
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,Action,Return to the Black Mesa Research Facility as ...,https://steamcdn-a.akamaihd.net/steam/apps/50/...


In [11]:
# Persist the trimmed frame: the cleaning section reads this file back, so the
# export has to run for the notebook to work from a fresh kernel / clean checkout.
merged_df.to_csv("../data/datasetv1.csv", index=False)

# Cleaning the Dataset

In [12]:
# Load the datasetv1.csv snapshot as the starting point for the cleaning steps.
dataset = pd.read_csv("../data/datasetv1.csv")

In [13]:
# Use the shorter column name `description` from here on.
dataset = dataset.rename(columns={"detailed_description": "description"})

First I'm going to remove all HTML, new-lines, tabs, and extra spaces from the description column.

In [14]:
# Walk the rows in order and stop at the first description containing "<"
# (presumably the start of an HTML tag). The loop variables deliberately outlive
# the loop: the following cells read `r.description` as the sample to clean.
for i, r in dataset.iterrows():
    if "<" in r.description:
        break

In [15]:
r.description

'1998. HALF-LIFE sends a shock through the game industry with its combination of pounding action and continuous, immersive storytelling. Valve\'s debut title wins more than 50 game-of-the-year awards on its way to being named "Best PC Game Ever" by PC Gamer, and launches a franchise with more than eight million retail units sold worldwide.<br><br>\r\n\t\tNOW. By taking the suspense, challenge and visceral charge of the original, and adding startling new realism and responsiveness, Half-Life 2 opens the door to a world where the player\'s presence affects everything around him, from the physical environment to the behaviors even the emotions of both friends and enemies.<br><br>\r\n\t\tThe player again picks up the crowbar of research scientist Gordon Freeman, who finds himself on an alien-infested Earth being picked to the bone, its resources depleted, its populace dwindling. Freeman is thrust into the unenviable role of rescuing the world from the wrong he unleashed back at Black Mesa.

In [16]:
# Strip the markup from the sample row.
# - The parser is named explicitly so the result does not depend on which
#   optional parser happens to be installed (and to avoid bs4's
#   "no parser was explicitly specified" warning).
# - Text nodes are joined with a space so words on either side of a tag such as
#   <br> or <li> are not glued together; surplus spaces are collapsed next.
text = BeautifulSoup(r.description, "html.parser").get_text(" ")

In [17]:
# split() with no arguments breaks on any run of whitespace (spaces, tabs,
# newlines) and drops leading/trailing blanks, so re-joining with a single
# space normalises all spacing in one step.
text = " ".join(text.split())

In [18]:
text

'1998. HALF-LIFE sends a shock through the game industry with its combination of pounding action and continuous, immersive storytelling. Valve\'s debut title wins more than 50 game-of-the-year awards on its way to being named "Best PC Game Ever" by PC Gamer, and launches a franchise with more than eight million retail units sold worldwide. NOW. By taking the suspense, challenge and visceral charge of the original, and adding startling new realism and responsiveness, Half-Life 2 opens the door to a world where the player\'s presence affects everything around him, from the physical environment to the behaviors even the emotions of both friends and enemies. The player again picks up the crowbar of research scientist Gordon Freeman, who finds himself on an alien-infested Earth being picked to the bone, its resources depleted, its populace dwindling. Freeman is thrust into the unenviable role of rescuing the world from the wrong he unleashed back at Black Mesa. And a lot of people he cares 

Now we can write a function to apply it to every row of data.

In [19]:
def clean_description(desc: str) -> str:
    """Strip HTML markup from a description and normalise its whitespace.

    Args:
        desc: Raw description text, possibly containing HTML tags, newlines
            and tabs.

    Returns:
        The visible text only, with every run of whitespace collapsed to a
        single space and no leading or trailing whitespace.
    """
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, which makes results environment-dependent and emits a warning.
    soup = BeautifulSoup(desc, "html.parser")
    # Join text nodes with a space so content separated only by tags
    # (e.g. "one<br>two", "<li>a</li><li>b</li>") is not fused into one word.
    # The occasional extra space this introduces is collapsed below.
    text = soup.get_text(" ")
    return " ".join(text.split())

In [20]:
# Run the cleaner over every description, replacing the raw HTML text.
dataset["description"] = dataset["description"].map(clean_description)

In [21]:
dataset.description

0        Play the world's number 1 online action game. ...
1        One of the most popular online action games of...
2        Enlist in an intense brand of Axis vs. Allied ...
3        Enjoy fast-paced multiplayer gaming with Death...
4        Return to the Black Mesa Research Facility as ...
                               ...                        
27070    This is my first indie game on Steam. I played...
27071    Have you ever been so lonely that no one but y...
27072    Super Star Blast is a space based game with ch...
27073    Pursue a snow-white deer through an enchanted ...
27074    A portal has opened and dark magic is pouring ...
Name: description, Length: 27075, dtype: object

The genres column can hold several genres per game, separated by `;`. As a quick shortcut, I'm going to keep only the first genre from each value. I realize this biases the selection towards genres that come first alphabetically, but I'm okay with that.

# Keep only the first ';'-separated genre. The vectorised .str accessor replaces
# the per-row Python lambda and passes missing values through as NaN instead of
# raising AttributeError on them.
dataset["genres"] = dataset["genres"].str.split(";").str[0]

dataset

In [22]:
# Save the cleaned frame as the v2 snapshot; index=False keeps the row index
# out of the file.
dataset.to_csv("../data/datasetv2.csv", index=False)

# Downloading the Header Images

Next we need the header images to train our image generator.

In [155]:
# Take the header-image URL of the first row as a test case for downloading.
img_url = dataset.loc[0, "header_image"]
img_url

'https://steamcdn-a.akamaihd.net/steam/apps/10/header.jpg?t=1528733245'

In [156]:
# Stream the image so the body can be copied straight to disk.
# - requests has no default timeout; without one a stalled connection would
#   hang the kernel indefinitely.
# - raise_for_status() fails here on a 4xx/5xx response instead of letting an
#   error page be written out as if it were a JPEG.
r = requests.get(img_url, stream=True, timeout=10)
r.raise_for_status()
r

<Response [200]>

In [157]:
# Have the raw stream undo any gzip/deflate transfer encoding, so the bytes
# written below are the image itself rather than its compressed form.
r.raw.decode_content = True

# Copy the response body to disk in chunks instead of loading it all into memory.
with open("temp.jpg", "wb") as image_file:
    shutil.copyfileobj(r.raw, image_file)