In [3]:
import pandas as pd 
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np

import datetime
import matplotlib.dates as mdates

sns.set_style('whitegrid')

plt.rcParams['figure.figsize'] = [15, 10]

# KBQ

What should a company make for their next game?

# Understanding the data

## Loading the dataframe

In [4]:
steam_df = pd.read_csv('steam.csv')
steam_df

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27070,1065230,Room of Pandora,2019-04-24,1,SHEN JIAWEI,SHEN JIAWEI,windows,0,Single-player;Steam Achievements,Adventure;Casual;Indie,Adventure;Indie;Casual,7,3,0,0,0,0-20000,2.09
27071,1065570,Cyber Gun,2019-04-23,1,Semyon Maximov,BekkerDev Studio,windows,0,Single-player,Action;Adventure;Indie,Action;Indie;Adventure,0,8,1,0,0,0-20000,1.69
27072,1065650,Super Star Blast,2019-04-24,1,EntwicklerX,EntwicklerX,windows,0,Single-player;Multi-player;Co-op;Shared/Split ...,Action;Casual;Indie,Action;Indie;Casual,24,0,1,0,0,0-20000,3.99
27073,1066700,New Yankee 7: Deer Hunters,2019-04-17,1,Yustas Game Studio,Alawar Entertainment,windows;mac,0,Single-player;Steam Cloud,Adventure;Casual;Indie,Indie;Casual;Adventure,0,2,0,0,0,0-20000,5.19


## Checking for missing data

In [5]:
steam_df.isna().sum()

appid               0
name                0
release_date        0
english             0
developer           0
publisher           0
platforms           0
required_age        0
categories          0
genres              0
steamspy_tags       0
achievements        0
positive_ratings    0
negative_ratings    0
average_playtime    0
median_playtime     0
owners              0
price               0
dtype: int64

In [6]:
steam_df.dtypes

appid                 int64
name                 object
release_date         object
english               int64
developer            object
publisher            object
platforms            object
required_age          int64
categories           object
genres               object
steamspy_tags        object
achievements          int64
positive_ratings      int64
negative_ratings      int64
average_playtime      int64
median_playtime       int64
owners               object
price               float64
dtype: object

## Statistics

In [26]:
pd.Series(steam_df['owners'].unique())

0       10000000-20000000
1        5000000-10000000
2         2000000-5000000
3       20000000-50000000
4     100000000-200000000
5      50000000-100000000
6             20000-50000
7          500000-1000000
8           100000-200000
9            50000-100000
10        1000000-2000000
11          200000-500000
12                0-20000
dtype: object

In [30]:
ave_owners = np.array([15_000_000, 7_500_000, 3_500_000, 35_000_000, 
                       150_000_000, 75_000_000, 35_000, 750_000, 150_000, 
                       75_000, 1_500_000, 350_000, 10_000])
owner_dict = dict(zip(steam_df['owners'].unique(), ave_owners))
owner_dict

{'10000000-20000000': 15000000,
 '5000000-10000000': 7500000,
 '2000000-5000000': 3500000,
 '20000000-50000000': 35000000,
 '100000000-200000000': 150000000,
 '50000000-100000000': 75000000,
 '20000-50000': 35000,
 '500000-1000000': 750000,
 '100000-200000': 150000,
 '50000-100000': 75000,
 '1000000-2000000': 1500000,
 '200000-500000': 350000,
 '0-20000': 10000}

In [32]:
main_df = steam_df.replace({'owners': owner_dict})

In [33]:
main_df.drop(columns=['appid', 'english']).describe()

Unnamed: 0,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
count,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0
mean,0.354903,45.248864,1000.559,211.027147,149.804949,146.05603,134090.5,6.078193
std,2.406044,352.670281,18988.72,4284.938531,1827.038141,2353.88008,1328089.0,7.874922
min,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,0.0
25%,0.0,0.0,6.0,2.0,0.0,0.0,10000.0,1.69
50%,0.0,7.0,24.0,9.0,0.0,0.0,10000.0,3.99
75%,0.0,23.0,126.0,42.0,0.0,0.0,35000.0,7.19
max,18.0,9821.0,2644404.0,487076.0,190625.0,190625.0,150000000.0,421.99


# Univariate Analysis

## Counting values

### Number of games by developer

In [39]:
main_df['developer'].value_counts()

Choice of Games               94
KOEI TECMO GAMES CO., LTD.    72
Ripknot Systems               62
Laush Dmitriy Sergeevich      51
Nikita "Ghost_RUS"            50
                              ..
CRAPPY ZOMBIE GAME STUDIO      1
Ramon Mujica                   1
Oomst Games                    1
Joe Censored Games             1
Adept Studios GD               1
Name: developer, Length: 17113, dtype: int64

### Number of games by publisher

In [40]:
main_df['publisher'].value_counts()

Big Fish Games           212
Strategy First           136
Ubisoft                  111
THQ Nordic                98
Square Enix               97
                        ... 
Tonka3D                    1
guyhezi                    1
Robert Gammon              1
Phun Peeticharoenthum      1
SHEN JIAWEI                1
Name: publisher, Length: 14354, dtype: int64

### Number of games by age restriction

In [42]:
main_df['required_age'].value_counts()

0     26479
18      308
16      192
12       73
7        12
3        11
Name: required_age, dtype: int64

### Number of games by platform

In [44]:
main_df['platforms'].value_counts()

windows              18398
windows;mac;linux     4623
windows;mac           3439
windows;linux          610
mac                      3
mac;linux                1
linux                    1
Name: platforms, dtype: int64

### Number of games by category

Note that categories are not mutually exclusive. Therefore, the total count per category does not correspond to the total number of games, that is, some games may be counted in multiple categories.

#### Wrangling

In [89]:
# Find all unique categories
categories = np.array(list(set([cat for entry in main_df['categories']
                                .str.split(';').tolist() 
                                for cat in entry])))
pd.Series(categories)

0                   Online Co-op
1                  Single-player
2        Full controller support
3                 Steam Workshop
4           SteamVR Collectibles
5             Captions available
6       Valve Anti-Cheat enabled
7                            MMO
8            Shared/Split Screen
9                     VR Support
10    Partial Controller Support
11    Cross-Platform Multiplayer
12            Mods (require HL2)
13                         Co-op
14          Commentary available
15                   Local Co-op
16            Steam Leaderboards
17                  Multi-player
18                   Steam Cloud
19           Steam Trading Cards
20            Steam Achievements
21                         Stats
22            Local Multi-Player
23      Steam Turn Notifications
24           Includes Source SDK
25              In-App Purchases
26                          Mods
27         Includes level editor
28           Online Multi-Player
dtype: object

#### Counting

In [152]:
# Number of games by unique combination of categories
cat_per_entry = main_df['categories'].value_counts()
cat_per_entry

Single-player                                                                                                              6110
Single-player;Steam Achievements                                                                                           2334
Single-player;Steam Achievements;Steam Trading Cards                                                                        848
Single-player;Partial Controller Support                                                                                    804
Single-player;Steam Trading Cards                                                                                           792
                                                                                                                           ... 
Single-player;Steam Achievements;Steam Trading Cards;Captions available;Partial Controller Support;Commentary available       1
Single-player;Steam Achievements;Full controller support;Steam Trading Cards;VR Support;Steam Cloud;Stea

In [153]:
# Number of games by category
cat_counts = (pd.DataFrame((cat, cat_per_entry[cat_per_entry
                                              .index
                                              .str.contains(cat)]
                           .sum()) for cat in categories)
              .rename(columns={0:'categories', 1:'count'})
              .set_index('categories')
              .squeeze().sort_values(ascending=False))
cat_counts

  cat_counts = (pd.DataFrame((cat, cat_per_entry[cat_per_entry


categories
Single-player                 25678
Steam Achievements            14130
Steam Trading Cards            7918
Steam Cloud                    7219
Full controller support        5695
Partial Controller Support     4234
Multi-player                   3974
Steam Leaderboards             3439
Co-op                          2604
Online Multi-Player            2487
Shared/Split Screen            2152
Stats                          1878
Local Multi-Player             1615
Cross-Platform Multiplayer     1081
Online Co-op                   1071
Local Co-op                    1059
Includes level editor          1036
Steam Workshop                  897
Captions available              721
In-App Purchases                690
MMO                             421
VR Support                      231
Commentary available            144
Valve Anti-Cheat enabled         94
Steam Turn Notifications         63
SteamVR Collectibles             40
Includes Source SDK              35
Mods             

### Number of games by genre

Note that genres are not mutually exclusive. Therefore, the total count per genre does not correspond to the total number of games, that is, some games may be counted in multiple genres.

#### Wrangling

In [169]:
genres = np.array(list(set([gen for entry in main_df['genres']
                                .str.split(';').tolist() 
                                for gen in entry])))
pd.Series(genres)

0                 Adventure
1     Design & Illustration
2                    Nudity
3                       RPG
4                   Violent
5                      Gore
6                  Strategy
7          Audio Production
8                Accounting
9               Documentary
10                   Action
11                    Indie
12        Software Training
13               Simulation
14           Sexual Content
15                 Tutorial
16                Utilities
17            Photo Editing
18     Animation & Modeling
19                   Sports
20    Massively Multiplayer
21                Education
22                   Racing
23                   Casual
24           Web Publishing
25             Early Access
26         Game Development
27         Video Production
28             Free to Play
dtype: object

#### Counting

In [170]:
# Number of games by unique combination of genres
gen_per_entry = main_df['genres'].value_counts()
gen_per_entry

Action;Indie                                                             1852
Casual;Indie                                                             1482
Action;Adventure;Indie                                                   1229
Adventure;Indie                                                          1170
Action;Casual;Indie                                                      1004
                                                                         ... 
Sexual Content;Violent;Gore;Action;Casual;Free to Play;Indie;Strategy       1
Nudity;Violent;Gore;Adventure;Indie;RPG;Early Access                        1
Violent;Action;Adventure;Casual;Indie;RPG;Early Access                      1
Indie;Simulation;Web Publishing                                             1
Casual;Free to Play;Massively Multiplayer;RPG;Early Access                  1
Name: genres, Length: 1552, dtype: int64

In [171]:
# Number of games by genres
gen_counts = (pd.DataFrame((gen, gen_per_entry[gen_per_entry
                                              .index
                                              .str.contains(gen)]
                           .sum()) for gen in genres)
              .rename(columns={0:'genres', 1:'count'})
              .set_index('genres')
              .squeeze().sort_values(ascending=False))
gen_counts

genres
Indie                    19421
Action                   11903
Casual                   10210
Adventure                10032
Strategy                  5247
Simulation                5194
RPG                       4311
Early Access              2954
Free to Play              1704
Sports                    1322
Racing                    1024
Violent                    843
Massively Multiplayer      723
Gore                       537
Nudity                     266
Sexual Content             245
Utilities                  146
Design & Illustration       87
Animation & Modeling        79
Education                   51
Video Production            38
Software Training           31
Audio Production            29
Web Publishing              28
Game Development            17
Photo Editing               12
Accounting                   6
Tutorial                     1
Documentary                  1
Name: count, dtype: int64

### Number of games by SteamSpy tags

Note that SteamSpy tags are not mutually exclusive. Therefore, the total count per SteamSpy tag does not correspond to the total number of games, that is, some games may be counted in multiple SteamSpy tags.

#### Wrangling

In [172]:
tags = np.array(list(set([tag for entry in main_df['steamspy_tags']
                                .str.split(';').tolist() 
                                for tag in entry])))
pd.Series(tags)

0                         Demons
1                          Short
2      Choose Your Own Adventure
3                       RPGMaker
4                          Blood
                 ...            
334                        Faith
335                      Economy
336                      Shooter
337          Interactive Fiction
338                  Bullet Hell
Length: 339, dtype: object

#### Counting

In [173]:
# Number of games by unique combination of tags
tag_per_entry = main_df['steamspy_tags'].value_counts()
tag_per_entry

Action;Indie;Casual                      845
Action;Adventure;Indie                   714
Early Access;Action;Indie                507
Adventure;Indie;Casual                   442
Indie;Casual                             378
                                        ... 
Action;Adventure;Warhammer 40K             1
Anime;RPG;Cute                             1
Action;Casual;Tower Defense                1
RPG;Turn-Based;Co-op                       1
Early Access;Adventure;Sexual Content      1
Name: steamspy_tags, Length: 6423, dtype: int64

In [177]:
# Number of games by tags
tag_counts = (pd.DataFrame((tag, tag_per_entry[tag_per_entry
                                              .index
                                              .str.contains(tag)]
                           .sum()) for tag in tags)
              .rename(columns={0:'tags', 1:'count'})
              .set_index('tags')
              .squeeze().sort_values(ascending=False))
tag_counts

tags
Indie                                16232
Action                               10344
Casual                                8205
Adventure                             7796
Strategy                              4180
                                     ...  
Sailing                                  1
Stylized                                 1
2.5D                                     1
Music-Based Procedural Generation        1
Spectacle fighter                        1
Name: count, Length: 339, dtype: int64

### Checking for overlapping values in categories, genres, and tags.

In [185]:
(np.intersect1d(categories, genres), 
 np.intersect1d(genres, tags), 
 np.intersect1d(tags, categories))

(array([], dtype='<U26'),
 array(['Action', 'Adventure', 'Animation & Modeling', 'Audio Production',
        'Casual', 'Design & Illustration', 'Documentary', 'Early Access',
        'Education', 'Free to Play', 'Game Development', 'Gore', 'Indie',
        'Massively Multiplayer', 'Nudity', 'Photo Editing', 'RPG',
        'Racing', 'Sexual Content', 'Simulation', 'Software Training',
        'Sports', 'Strategy', 'Utilities', 'Video Production', 'Violent',
        'Web Publishing'], dtype='<U33'),
 array(['Co-op'], dtype='<U33'))

**Proposition.**

We can neglect the tags that overlap with genres and categories so that we can define tags to be more specific classifications (determined by the community) for the game apart from its designated genres and categories.

### Number of games by price

In [227]:
# There are 282 unique prices
main_df['price'].value_counts()

3.99     3211
0.79     2892
0.00     2560
6.99     2050
7.19     1304
         ... 
20.51       1
11.95       1
3.92        1
6.59        1
6.10        1
Name: price, Length: 282, dtype: int64

### Feature engineering: Price range

In [None]:
# 1 GBP to PHP conversion for potential analysis
GBP_to_PHP = 65.37

In [230]:
# Custom binning to categorize price range
price_binning = np.linspace(0, 422, 20)
price_binning = np.array([-0.1, 0.31, 40, 50, 100, 500])

main_df['binned_price'] = pd.cut(main_df['price'], price_binning)
main_df['binned_price'].value_counts()

(0.31, 40.0]      24400
(-0.1, 0.31]       2560
(40.0, 50.0]         77
(50.0, 100.0]        27
(100.0, 500.0]       11
Name: binned_price, dtype: int64