### Exploratory Data Analysis of Steam data

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# load the raw playtime records
# (columns used further down: steam_id, appid, playtime_forever, playtime_2weeks)
playtime_df = pd.read_csv('../data/steam_playtime.csv')

In [None]:
# peek at the five rows with the highest playtime_forever
playtime_df.sort_values(by='playtime_forever', ascending=False).head()

In [None]:
# number of rows before any filtering
len(playtime_df)

### Domain knowledge based filtering
Experiment with these filters to see which combinations give better results in the z-score and IQR outlier steps, and thereby in the overall model.

In [None]:
# Counter-Strike is excluded from the analysis: appids 730, 10 and 240
counter_strike_appids = [730, 10, 240]
is_counter_strike = playtime_df['appid'].isin(counter_strike_appids)
playtime_df = playtime_df[~is_counter_strike]

In [None]:
# drop user playtimes over x minutes
# upper cutoff: keep rows with playtime_forever strictly below 1,000,000
playtime_df = playtime_df[playtime_df['playtime_forever'] < 1000000]

In [None]:
# remove user playtimes under x minutes
# lower cutoff: keep rows with playtime_forever strictly above 60
playtime_df = playtime_df[playtime_df['playtime_forever'] > 60]

In [None]:
# rows with more than 5000 minutes of playtime in the last two weeks
large_2week_playtimes = playtime_df[playtime_df['playtime_2weeks'] > 5000]

In [None]:
# drop users that are in large_2week_playtimes
# (this removes every row of such a user, not only the offending row)
playtime_df = playtime_df[~playtime_df['steam_id'].isin(large_2week_playtimes['steam_id'])]

In [None]:
# get mean for playtime_forever grouped by steam_id
# result is a Series indexed by steam_id
grouped_by_users_mean = playtime_df.groupby('steam_id')['playtime_forever'].mean()

In [None]:
# per-user mean playtime, largest first, converted from minutes to hours
grouped_by_users_mean.sort_values(ascending=False).div(60)

In [None]:
# keep only rows below 5000 minutes of playtime in the last two weeks
# (5000 minutes over 14 days is roughly 5.9 hours a day)
# NOTE(review): rows where playtime_2weeks is NaN also fail this comparison
# and are dropped - confirm that is intended
below_2week_cap = playtime_df['playtime_2weeks'] < 5000
playtime_df = playtime_df[below_2week_cap]

### Z-SCORE

In [None]:
# get mean for playtime_forever
# computed over all remaining rows (not per user); read later by get_zscore
users_mean = playtime_df['playtime_forever'].mean()

In [None]:
# get standard deviation for playtime_forever
# pandas' default sample standard deviation (ddof=1); read later by get_zscore
users_std = playtime_df['playtime_forever'].std()

In [None]:
# NOTE: re-run the two cells above whenever playtime_df is filtered again,
# otherwise these values no longer describe the current frame
print(users_mean, users_std)

In [None]:
def get_zscore(value, values):
    """Return the absolute z-score of ``value``.

    The mean and standard deviation are read from the notebook-level
    ``users_mean`` and ``users_std`` variables, which must already be defined.

    NOTE(review): ``values`` is accepted but never used; the statistics are
    not derived from it.
    """
    centered = value - users_mean
    return np.abs(centered / users_std)

In [None]:
# compute z-scores for all values
# get_zscore works element-wise on a whole Series, so it is called once on the
# column instead of once per row through apply()
playtime_column = playtime_df['playtime_forever']
z_scores = get_zscore(playtime_column, playtime_column)
# assign() builds a new frame, which avoids writing a column into a frame that
# was produced by boolean filtering (SettingWithCopyWarning)
playtime_df = playtime_df.assign(**{'z-score': z_scores})

# find outliers: rows more than 3 standard deviations away from the mean
outliers = playtime_df[playtime_df['z-score']>3]
outliers.sort_values(by='playtime_forever', ascending=False)

In [None]:
# z-score too big around the 131k mark
# inliers: rows whose absolute z-score is strictly below 3
# (a row with a z-score of exactly 3 lands in neither outliers nor inliers)
inliers = playtime_df[playtime_df['z-score']<3]
inliers.sort_values(by='playtime_forever', ascending=False)

In [None]:
# mean and standard deviation of playtime_forever once the z-score outliers are removed
print(inliers['playtime_forever'].mean(), inliers['playtime_forever'].std())

In [None]:
# z-score does not take care of crazy playtime_2weeks values
# (the z-score is computed on playtime_forever only)
inliers.sort_values(by=['playtime_2weeks'], ascending=False)

### Interquartile Range

In [None]:
# outlier bounds: 1.5 * IQR below the first and above the third quartile
playtime_minutes = playtime_df['playtime_forever']
q1, q3 = np.quantile(playtime_minutes, [0.25, 0.75])
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5*iqr, q3 + 1.5*iqr

# records that fall strictly outside of the lower and upper bound
too_low = playtime_minutes < lower_bound
too_high = playtime_minutes > upper_bound
iqr_outliers = playtime_df[too_low | too_high]
iqr_outliers.sort_values(by='playtime_forever', ascending=False)

In [None]:
# inliers are everything that is not an outlier: between() is inclusive on both
# ends, so a row sitting exactly on lower_bound or upper_bound is kept instead
# of silently dropping out of both the outlier and the inlier set
iqr_inliers = playtime_df[playtime_df['playtime_forever'].between(lower_bound, upper_bound)]
iqr_inliers.sort_values(by='playtime_forever', ascending=False)

In [None]:
# check which 2-week playtimes remain after the IQR filter
iqr_inliers.sort_values(by=['playtime_2weeks'], ascending=False)

In [None]:
# write the IQR-filtered rows to disk without the index
# NOTE(review): every column present in playtime_df is exported, including the
# helper 'z-score' column if the z-score cell was run - confirm that is wanted
iqr_inliers.to_csv('../data/steam_playtime_clean_iqr.csv', index=False)

### EXPLORATION & PLOTS

In [None]:
# plot distribution of playtime_forever as a 10-bin count histogram
# histplot replaces the deprecated distplot(..., kde=False)
sns.histplot(playtime_df['playtime_forever'], bins=10)

In [None]:
# histogram of playtime_forever across its full observed range
playtimes = playtime_df['playtime_forever']
num_bins = 20
range_min = playtimes.min()
range_max = playtimes.max()

plt.hist(playtimes, bins=num_bins, range=(range_min, range_max))

# axis labels and title
plt.gca().set(
    xlabel='Playtime (minutes)',
    ylabel='Frequency',
    title='Distribution of playtime',
)

# render the figure
plt.show()

In [None]:
# rows ordered by playtime in the last two weeks, largest first
sort_by_2weeks = playtime_df.sort_values(by='playtime_2weeks', ascending=False)

In [None]:
# plot the distribution of playtime_2weeks as a 10-bin count histogram
# histplot replaces the deprecated distplot(..., kde=False)
sns.histplot(sort_by_2weeks['playtime_2weeks'], bins=10)


In [None]:
# order rows by playtime_forever (largest first), then renumber them;
# reset_index() keeps the previous row labels as an extra 'index' column
ordered_by_playtime = playtime_df.sort_values('playtime_forever', ascending=False)
sort_by_playtime = ordered_by_playtime.reset_index()

In [None]:
# plot distribution of playtimes
# select the playtime column explicitly: DataFrame.plot(kind='hist') would
# overlay a histogram for every numeric column, including the ids and the
# 'index' column that reset_index() added
sort_by_playtime['playtime_forever'].plot(kind = 'hist')

In [None]:
# display the sorted frame
sort_by_playtime

In [None]:
# line plot of playtime_forever against the row position after sorting
# (the index was renumbered by reset_index(), so x is the rank of the row)
sort_by_playtime.plot(y='playtime_forever', use_index=True)

In [None]:
# 20-bin histogram of playtime_forever with a KDE curve on top
# sns.set changes the figure size (and seaborn theme) for all later plots too
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.histplot(data=sort_by_playtime, x='playtime_forever', bins=20, kde=True)


In [None]:
# horizontal box plot of playtime_forever
sns.boxplot(data=sort_by_playtime, x='playtime_forever')
plt.show()

In [None]:
# per-game summary of playtime_forever, rounded to two decimals;
# the result has two-level columns such as ('playtime_forever', 'mean')
summary_stats = ['count', 'mean', 'median', 'min', 'max']
playtime_by_game = (
    playtime_df
    .groupby('appid')
    .agg({'playtime_forever': summary_stats})
    .round(2)
)

In [None]:
# sort by mean
# games with the highest mean playtime first
playtime_by_game.sort_values(by=('playtime_forever', 'mean'), ascending=False)

In [None]:
# sort by count
# ascending, so games with the fewest playtime rows come first
playtime_by_game.sort_values(by=('playtime_forever', 'count'), ascending=True)

In [None]:
# playtime_by_game where count is 1
# i.e. games with exactly one playtime row
playtime_by_game[playtime_by_game[('playtime_forever', 'count')] == 1]

In [None]:
# plot the mean playtime of every game (one bar per appid)
plt.figure(figsize=(20,10))
sns.barplot(x=playtime_by_game.index, y = playtime_by_game[('playtime_forever', 'mean')])
plt.title('Average Playtime by Game')
plt.xlabel('Game')
# explicit y label: without it the axis is presumably labelled with the Series
# name, i.e. the raw column tuple ('playtime_forever', 'mean')
plt.ylabel('Average playtime (minutes)')
# render the figure, as the other plotting cells do
plt.show()


In [None]:
# get appids where count is 1
# (counts come from playtime_by_game, so they reflect playtime_df as it was
# when that aggregation cell was run)
only_one_playtime = playtime_by_game[playtime_by_game[('playtime_forever', 'count')] == 1].index

In [None]:
# exclude games with only one playtime
playtime_df = playtime_df[~playtime_df['appid'].isin(only_one_playtime)]

In [None]:
# display the filtered frame
playtime_df

### GAME METADATA

In [None]:
# load the game metadata table
games_df = pd.read_csv('../data/steam_app_metadata.csv')

In [None]:
# number of metadata rows before de-duplication
print(len(games_df))

In [None]:
games_df

In [None]:
# show duplicates for appid
# (duplicated() flags every repeat after the first occurrence)
games_df[games_df.duplicated(subset=['appid'])]

In [None]:
# remove duplicate appid
# (drop_duplicates keeps the first row of each appid)
games_df = games_df.drop_duplicates(subset=['appid'])

In [None]:
# get duplicates for name
# rows whose name already appeared earlier in games_df;
# the first row carrying each name is not part of this frame
dupl_game_names = games_df[games_df.duplicated(subset=['name'])]

In [None]:
# Point playtime rows that reference a duplicate-named game at the first
# ("original") appid carrying that name in games_df.
# Duplicates without a name are skipped: NaN never compares equal to anything,
# so no original entry can be looked up for them.
named_duplicates = dupl_game_names.dropna(subset=['name'])
first_appid_by_name = (
    games_df
    .dropna(subset=['name'])
    .drop_duplicates(subset=['name'])
    .set_index('name')['appid']
)
# duplicate appid -> original appid, built once instead of scanning both
# frames once per duplicate row
appid_remap = dict(zip(
    named_duplicates['appid'],
    named_duplicates['name'].map(first_appid_by_name),
))
# assign() builds a new frame rather than writing into a filtered frame
playtime_df = playtime_df.assign(appid=playtime_df['appid'].replace(appid_remap))


In [None]:
# remove duplicates from games_df
# keeps the first row for each name; later rows with the same name are dropped
games_df = games_df.drop_duplicates(subset=['name'])

In [None]:
# count nulls for each column
games_df.isnull().sum()

In [None]:
# drop games whose description is missing or an empty string
description_missing = games_df['description'].isnull() | games_df['description'].eq('')
no_description = games_df[description_missing]
has_description = ~games_df['appid'].isin(no_description['appid'])
games_df = games_df[has_description]

In [None]:
# drop games that have neither a developer nor a publisher
# a missing developer may show up as the literal string 'None' or as NaN
# (e.g. an empty CSV cell), so both are treated as missing; otherwise such rows
# survive and the publisher imputation would copy a NaN developer
developer_missing = games_df['developer'].isnull() | (games_df['developer'] == 'None')
no_dev_or_pub = games_df[developer_missing & games_df['publisher'].isnull()]
games_df = games_df[~games_df['appid'].isin(no_dev_or_pub['appid'])]

In [None]:
# where publisher is null, take the developer of the same row instead
publisher = games_df['publisher']
games_df['publisher'] = publisher.where(publisher.notna(), games_df['developer'])

In [None]:
# mean and median 'description' length in characters
# the length Series is computed once and shared by both statistics
description_lengths = games_df['description'].str.len()
print('mean is', round(description_lengths.mean()), 'characters')
print('median is', description_lengths.median(), 'characters')

In [None]:
# show games whose 'description' is longer than 10000 characters
games_df[games_df['description'].str.len() > 10000]