In [1]:
import shutil

import pandas as pd
import numpy as np
import torchtext
from bs4 import BeautifulSoup
import requests

# Exploring the Data

In [2]:
# Load the three Steam CSV files (main table, descriptions, media) in one pass.
main_df, desc_df, media_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/steam/steam.csv",
        "../data/steam/steam_description_data.csv",
        "../data/steam/steam_media_data.csv",
    )
)

In [3]:
main_df.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [4]:
desc_df.head()

Unnamed: 0,steam_appid,detailed_description,about_the_game,short_description
0,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...
1,20,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...
2,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...
3,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...
4,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...


In [5]:
media_df.head()

Unnamed: 0,steam_appid,header_image,screenshots,background,movies
0,10,https://steamcdn-a.akamaihd.net/steam/apps/10/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/10/...,
1,20,https://steamcdn-a.akamaihd.net/steam/apps/20/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/20/...,
2,30,https://steamcdn-a.akamaihd.net/steam/apps/30/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/30/...,
3,40,https://steamcdn-a.akamaihd.net/steam/apps/40/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/40/...,
4,50,https://steamcdn-a.akamaihd.net/steam/apps/50/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/50/...,


Each CSV has an app-id column (`appid` in the main file, `steam_appid` in the description and media files), so we can merge all three into a single DataFrame.

In [6]:
# Join the description columns onto the main table. The key is called
# "appid" on the left and "steam_appid" on the right.
merged_df = pd.merge(main_df, desc_df, left_on="appid", right_on="steam_appid")

In [7]:
# Join the media columns on the app id as well. Both inputs carry a
# "steam_appid" column at this point, so pandas disambiguates them with
# the default "_x" / "_y" suffixes.
merged_df = pd.merge(merged_df, media_df, left_on="appid", right_on="steam_appid")

In [8]:
print(merged_df.columns)
merged_df.head()

Index(['appid', 'name', 'release_date', 'english', 'developer', 'publisher',
       'platforms', 'required_age', 'categories', 'genres', 'steamspy_tags',
       'achievements', 'positive_ratings', 'negative_ratings',
       'average_playtime', 'median_playtime', 'owners', 'price',
       'steam_appid_x', 'detailed_description', 'about_the_game',
       'short_description', 'steam_appid_y', 'header_image', 'screenshots',
       'background', 'movies'],
      dtype='object')


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,price,steam_appid_x,detailed_description,about_the_game,short_description,steam_appid_y,header_image,screenshots,background,movies
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,7.19,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,10,https://steamcdn-a.akamaihd.net/steam/apps/10/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/10/...,
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,3.99,20,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...,20,https://steamcdn-a.akamaihd.net/steam/apps/20/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/20/...,
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,...,3.99,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,30,https://steamcdn-a.akamaihd.net/steam/apps/30/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/30/...,
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,3.99,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,40,https://steamcdn-a.akamaihd.net/steam/apps/40/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/40/...,
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,...,3.99,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,50,https://steamcdn-a.akamaihd.net/steam/apps/50/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/50/...,


We can drop several columns from this new merged DataFrame.

In [9]:
# Trim the merged frame down to the identifying, categorical, description
# and image columns. Each column is listed exactly once ("background" used to
# appear twice), and the result is reassigned instead of mutating in place.
merged_df = merged_df.drop(
    columns=[
        # Redundant join keys left over from the merges ("appid" is kept).
        "steam_appid_x",
        "steam_appid_y",
        # Release / platform metadata.
        "platforms",
        "required_age",
        "release_date",
        "english",
        # Engagement, rating and pricing statistics.
        "positive_ratings",
        "negative_ratings",
        "average_playtime",
        "median_playtime",
        "owners",
        "price",
        "achievements",
        # Community tags ("categories" and "genres" are kept).
        "steamspy_tags",
        # Alternative description texts ("detailed_description" is kept).
        "about_the_game",
        "short_description",
        # Media other than "header_image" and "screenshots".
        "background",
        "movies",
    ]
)

In [10]:
merged_df.head()

Unnamed: 0,appid,name,developer,publisher,categories,genres,detailed_description,header_image,screenshots
0,10,Counter-Strike,Valve,Valve,Multi-player;Online Multi-Player;Local Multi-P...,Action,Play the world's number 1 online action game. ...,https://steamcdn-a.akamaihd.net/steam/apps/10/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn..."
1,20,Team Fortress Classic,Valve,Valve,Multi-player;Online Multi-Player;Local Multi-P...,Action,One of the most popular online action games of...,https://steamcdn-a.akamaihd.net/steam/apps/20/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn..."
2,30,Day of Defeat,Valve,Valve,Multi-player;Valve Anti-Cheat enabled,Action,Enlist in an intense brand of Axis vs. Allied ...,https://steamcdn-a.akamaihd.net/steam/apps/30/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn..."
3,40,Deathmatch Classic,Valve,Valve,Multi-player;Online Multi-Player;Local Multi-P...,Action,Enjoy fast-paced multiplayer gaming with Death...,https://steamcdn-a.akamaihd.net/steam/apps/40/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,Single-player;Multi-player;Valve Anti-Cheat en...,Action,Return to the Black Mesa Research Facility as ...,https://steamcdn-a.akamaihd.net/steam/apps/50/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn..."


In [11]:
# merged_df.to_csv("../data/datasetv1.csv", index=False)

# Cleaning the Dataset

In [12]:
# Shorten the column name; the other description variants were dropped earlier.
merged_df = merged_df.rename(columns={"detailed_description": "description"})

First I'm going to remove all HTML, new-lines, tabs, and extra spaces from the description column.

In [13]:
def clean_description(desc: str) -> str:
    """Strip HTML markup from a description and normalise its whitespace.

    Args:
        desc: Raw description text, possibly containing HTML tags.

    Returns:
        The visible text with every run of whitespace (spaces, tabs,
        new-lines) collapsed to a single space and no leading/trailing space.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines. "html.parser" ships with Python.
    soup = BeautifulSoup(desc, "html.parser")
    # Join text fragments with a space so words separated only by a tag
    # (e.g. "first line<br>second line") are not fused together; the
    # split/join below collapses any doubled spaces this introduces.
    text = soup.get_text(separator=" ")
    return " ".join(text.split())

In [14]:
# Replace every raw description with its cleaned, plain-text version.
merged_df["description"] = merged_df["description"].map(clean_description)

In [15]:
merged_df.description

0        Play the world's number 1 online action game. ...
1        One of the most popular online action games of...
2        Enlist in an intense brand of Axis vs. Allied ...
3        Enjoy fast-paced multiplayer gaming with Death...
4        Return to the Black Mesa Research Facility as ...
                               ...                        
27070    This is my first indie game on Steam. I played...
27071    Have you ever been so lonely that no one but y...
27072    Super Star Blast is a space based game with ch...
27073    Pursue a snow-white deer through an enchanted ...
27074    A portal has opened and dark magic is pouring ...
Name: description, Length: 27075, dtype: object

In [None]:
# merged_df.to_csv("../data/datasetv3.csv", index=False)

In [18]:
# Reload the dataset from disk.
# NOTE(review): the only cell that writes "../data/datasetv3.csv" is the
# commented-out `to_csv` call just before this one, so a fresh
# Restart & Run All relies on that file already existing.
dataset = pd.read_csv("../data/datasetv3.csv")

In [28]:
# Accumulator for the distinct genre labels that occur in the dataset.
genres = set()

In [29]:
# Each "genres" cell is a ";"-separated list; add every label it contains.
for genre_field in dataset.genres.unique():
    genres.update(genre_field.split(";"))

In [30]:
genres

{'Accounting',
 'Action',
 'Adventure',
 'Animation & Modeling',
 'Audio Production',
 'Casual',
 'Design & Illustration',
 'Documentary',
 'Early Access',
 'Education',
 'Free to Play',
 'Game Development',
 'Gore',
 'Indie',
 'Massively Multiplayer',
 'Nudity',
 'Photo Editing',
 'RPG',
 'Racing',
 'Sexual Content',
 'Simulation',
 'Software Training',
 'Sports',
 'Strategy',
 'Tutorial',
 'Utilities',
 'Video Production',
 'Violent',
 'Web Publishing'}

In [41]:
from collections import Counter

In [42]:
# Tally of how many rows carry each genre label.
genre_counter = Counter()

In [43]:
# Count one occurrence per label for every row's ";"-separated genre list.
for genre_field in dataset.genres:
    genre_counter.update(genre_field.split(";"))

In [51]:
genre_counter.most_common()

[('Indie', 19421),
 ('Action', 11903),
 ('Casual', 10210),
 ('Adventure', 10032),
 ('Strategy', 5247),
 ('Simulation', 5194),
 ('RPG', 4311),
 ('Early Access', 2954),
 ('Free to Play', 1704),
 ('Sports', 1322),
 ('Racing', 1024),
 ('Violent', 843),
 ('Massively Multiplayer', 723),
 ('Gore', 537),
 ('Nudity', 266),
 ('Sexual Content', 245),
 ('Utilities', 146),
 ('Design & Illustration', 87),
 ('Animation & Modeling', 79),
 ('Education', 51),
 ('Video Production', 38),
 ('Software Training', 31),
 ('Audio Production', 29),
 ('Web Publishing', 28),
 ('Game Development', 17),
 ('Photo Editing', 12),
 ('Accounting', 6),
 ('Documentary', 1),
 ('Tutorial', 1)]

In [66]:
# Genre labels whose rows are removed from the dataset.
# Per the counts above, this is every label attached to fewer than 150 rows,
# plus "Nudity" and "Sexual Content".
# NOTE(review): the selection rationale is inferred from those counts —
# confirm the intended criteria.
drop_genres = [
    "Tutorial",
    "Documentary",
    "Accounting",
    "Photo Editing",
    "Game Development",
    "Sexual Content",
    "Software Training",
    "Education",
    "Web Publishing",
    "Design & Illustration",
    "Utilities",
    "Nudity",
    "Video Production",
    "Animation & Modeling",
    "Audio Production"
]

In [67]:
# Keep only the rows whose genre list shares no label with `drop_genres`.
dataset = dataset[
    dataset.genres.apply(
        lambda genre_field: set(genre_field.split(";")).isdisjoint(drop_genres)
    )
]

In [68]:
# Start a fresh tally so the counts reflect the filtered dataset only.
genre_counter = Counter()

In [69]:
# Count one occurrence per label for every remaining row's genre list.
for genre_field in dataset.genres:
    genre_counter.update(genre_field.split(";"))

In [70]:
genre_counter.most_common()

[('Indie', 19143),
 ('Action', 11794),
 ('Casual', 10022),
 ('Adventure', 9859),
 ('Strategy', 5217),
 ('Simulation', 5099),
 ('RPG', 4233),
 ('Early Access', 2883),
 ('Free to Play', 1669),
 ('Sports', 1319),
 ('Racing', 1020),
 ('Massively Multiplayer', 717),
 ('Violent', 708),
 ('Gore', 455)]

In [72]:
# dataset.to_csv("../data/datasetv4.csv", index=False)

In [73]:
# Distribution of the first-listed genre of each row.
dataset.genres.map(lambda genre_field: genre_field.partition(";")[0]).value_counts()

Action                   11212
Adventure                 5253
Casual                    4359
Indie                     2615
Violent                    708
Simulation                 628
Strategy                   532
RPG                        407
Free to Play               391
Racing                     197
Sports                      83
Gore                        81
Massively Multiplayer       15
Early Access                 9
Name: genres, dtype: int64