In [11]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [5]:
# read and merge data
steam_data = pd.read_csv('../Data/steam_store_games_clean.csv', index_col=0)
steamspy_data = pd.read_csv('../Data/steam_spy_clean.csv', index_col=0)
    
merged = steam_data.merge(steamspy_data, how='inner', left_on='steam_appid', right_on='appid', suffixes=('', '_steamspy'))
print(merged.shape)
merged.head()

(59115, 29)


Unnamed: 0,type,name,steam_appid,required_age,is_free,dlc,developers,publishers,categories,release_date,...,name_steamspy,positive,negative,owners,average_forever,median_forever,price,initialprice,languages,tags
0,game,Dota 2,570,0.0,True,"[1241930, 652720]",['Valve'],['Valve'],"['Multi-player', 'Co-op', 'Steam Trading Cards...",2013-07-09,...,Dota 2,1607914,337779,200000000-500000000,40686,897,0.0,0.0,"English, Bulgarian, Czech, Danish, Dutch, Finn...",Free to Play;MOBA;Multiplayer
1,game,Apex Legends™,1172470,0.0,True,[],['Respawn Entertainment'],['Electronic Arts'],"['Multi-player', 'PvP', 'Online PvP', 'Co-op',...",2020-11-04,...,Apex Legends,510084,104286,50000000-100000000,7380,743,0.0,0.0,"English, French, Italian, German, Spanish - Sp...",Free to Play;Multiplayer;Battle Royale
2,game,PUBG: BATTLEGROUNDS,578080,0.0,True,[],"['KRAFTON, Inc.']","['KRAFTON, Inc.']","['Multi-player', 'PvP', 'Online PvP', 'Stats',...",2017-12-21,...,PUBG: BATTLEGROUNDS,1228783,924298,50000000-100000000,22786,6637,0.0,0.0,"English, Korean, Simplified Chinese, French, G...",Survival;Shooter;Multiplayer
3,game,New World,1063730,0.0,False,[],['Amazon Games'],['Amazon Games'],"['Multi-player', 'MMO', 'PvP', 'Online PvP', '...",2021-09-28,...,New World,176134,75910,50000000-100000000,7834,2978,3999.0,3999.0,"English, French, Italian, German, Spanish - Sp...",Massively Multiplayer;Open World;MMORPG
4,game,Team Fortress 2,440,0.0,True,[629330],['Valve'],['Valve'],"['Multi-player', 'Cross-Platform Multiplayer',...",2007-10-10,...,Team Fortress 2,881554,58658,50000000-100000000,10279,399,0.0,0.0,"English, Danish, Dutch, Finnish, French, Germa...",Free to Play;Hero Shooter;Multiplayer


In [6]:
# remove overlapping columns and reindex to reorder columns
df = merged[[
    'steam_appid',
    'type',
    'name',
    'release_date',
    'developers',
    'publishers',
    'dlc',
    'required_age',
    'is_free', 
    'categories',
    'tags',
    'languages',
    'positive',
    'negative',
    'owners',
    'average_forever',
    'median_forever',
    'price', 
    'initialprice',
    'controller_support', 
    'windows', 'mac', 'linux',
    'metacritic_score', 'metacritic_url',
    'initial', 'final', 'discount_percent',
]]

    
df = df.rename({
    'steam_appid': 'appid',
    'tags': 'steamspy_tags',
    'positive': 'positive_ratings',
    'negative': 'negative_ratings',
    'average_forever': 'average_playtime',
    'median_forever': 'median_playtime'
}, axis=1)

df.head()

Unnamed: 0,appid,type,name,release_date,developers,publishers,dlc,required_age,is_free,categories,...,initialprice,controller_support,windows,mac,linux,metacritic_score,metacritic_url,initial,final,discount_percent
0,570,game,Dota 2,2013-07-09,['Valve'],['Valve'],"[1241930, 652720]",0.0,True,"['Multi-player', 'Co-op', 'Steam Trading Cards...",...,0.0,0,1,1,1,90.0,https://www.metacritic.com/game/pc/dota-2?ftag...,,,
1,1172470,game,Apex Legends™,2020-11-04,['Respawn Entertainment'],['Electronic Arts'],[],0.0,True,"['Multi-player', 'PvP', 'Online PvP', 'Co-op',...",...,0.0,1,1,0,0,88.0,https://www.metacritic.com/game/pc/apex-legend...,,,
2,578080,game,PUBG: BATTLEGROUNDS,2017-12-21,"['KRAFTON, Inc.']","['KRAFTON, Inc.']",[],0.0,True,"['Multi-player', 'PvP', 'Online PvP', 'Stats',...",...,0.0,0,1,0,0,,,,,
3,1063730,game,New World,2021-09-28,['Amazon Games'],['Amazon Games'],[],0.0,False,"['Multi-player', 'MMO', 'PvP', 'Online PvP', '...",...,3999.0,0,1,0,0,70.0,https://www.metacritic.com/game/pc/new-world?f...,2505.692288,1252.846144,50.0
4,440,game,Team Fortress 2,2007-10-10,['Valve'],['Valve'],[629330],0.0,True,"['Multi-player', 'Cross-Platform Multiplayer',...",...,0.0,0,1,1,1,92.0,https://www.metacritic.com/game/pc/team-fortre...,,,


### Drop columns

In [None]:
# remove not useful columns
df = df.drop(['metacritic_url', # not useful for analysis
              'average_2weeks', 'median_2weeks' # not interested in temporally specific columns
              ], axis=1)


### Train Test split
Conduct data processing and EDA with training data to prevent data leakage

In [8]:
# example: owners as target variable
X_train, X_test, y_train, y_test = train_test_split(df.drop(['owners'], axis=1), df['owners'], test_size=0.2, random_state=123)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(47292, 27)
(11823, 27)
(47292,)
(11823,)


### Handling missing values

In [9]:
# Columns with null values
X_train.columns[X_train.isnull().sum() > 0]

Index(['metacritic_score', 'metacritic_url', 'initial', 'final',
       'discount_percent'],
      dtype='object')

In [13]:
# Impute with median

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

### Scaling and Normalising

### Outlier Analysis

### Correlation Analysis

### One Hot Encoding

In [54]:
# Function only for variables with a list of values
def find_unique(var_name):
    s = set()
    for ls in df[var_name]:
        for v in ls:
            s.add(v)
    return len(s)

list_cols = ['developers', 'publishers', 'dlc', 'categories', 'steamspy_tags', 'languages']#, 'genres']
print("Unique values:")
for col in list_cols:
    print(col, find_unique(col))

Unique values:
developers 1726
publishers 1492
dlc 14
categories 49
steamspy_tags 69
languages 60


### More Exploratory Data Analysis