### Exploratory Data Analysis of Steam data

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# load the raw Steam playtime export into a DataFrame
playtime_df = pd.read_csv(filepath_or_buffer='../data/steam_playtime.csv')

In [None]:
# number of rows in the loaded frame
playtime_df.shape[0]

In [None]:
# Domain-knowledge filters used in the first iteration of the dataset.
# They are kept here, disabled, for reference.

# drop counterstrike, i.e. rows whose appid is 730, 10 or 240
# playtime_df = playtime_df[~playtime_df['appid'].isin([730, 10, 240])]

# drop user playtimes over x minutes
# playtime_df = playtime_df[playtime_df['playtime_forever'] < 130000]

# remove user playtimes under 60 minutes
#playtime_df = playtime_df[playtime_df['playtime_forever'] > 60]

# Active filter: keep only rows whose playtime_2weeks is below 5000.
# NOTE(review): a NaN playtime_2weeks also fails this comparison, so such rows
# are dropped as well -- confirm that is intended.
playtime_df = playtime_df.loc[playtime_df['playtime_2weeks'].lt(5000)]

In [None]:
# the 20 rows with the largest playtime_forever
playtime_df.sort_values(by='playtime_forever', ascending=False).iloc[:20]

### Z-SCORE

In [None]:
# mean of the playtime_forever column (used by the z-score cells)
users_mean = playtime_df.loc[:, 'playtime_forever'].mean()

In [None]:
# sample standard deviation of playtime_forever (ddof=1 is the pandas default)
users_std = playtime_df.loc[:, 'playtime_forever'].std(ddof=1)

In [None]:
# show the mean and standard deviation side by side
print(users_mean, users_std, sep=' ')

In [None]:
def get_zscore(value, values):
    """Return the absolute z-score of ``value``.

    The mean and standard deviation are read from the notebook-level
    ``users_mean`` and ``users_std``. The ``values`` argument is accepted
    but is never used by the body.
    """
    return np.abs((value - users_mean) / users_std)

In [None]:
# Compute the absolute z-score of every playtime_forever value.
# get_zscore only does arithmetic and np.abs, so it accepts the whole column
# at once; this replaces a Python-level call per row with one vectorized call.
# assign() builds a new frame instead of writing a column into a frame that
# was produced by boolean filtering, which avoids SettingWithCopyWarning.
playtime_df = playtime_df.assign(**{
    'z-score': get_zscore(playtime_df['playtime_forever'],
                          playtime_df['playtime_forever']),
})

# find outliers: rows more than 3 standard deviations from the mean
outliers = playtime_df[playtime_df['z-score'] > 3]
outliers

In [None]:
# the 20 outlier rows with the largest playtime_forever
outliers.sort_values(by='playtime_forever', ascending=False).iloc[:20]

In [None]:
# Inliers are the rows that are not outliers. Outliers are selected with
# z-score > 3, so the boundary value 3 belongs here (<=); with a strict <
# a row sitting exactly on the threshold would land in neither set.
inliers = playtime_df[playtime_df['z-score'] <= 3]
inliers

In [None]:
# the 20 inlier rows with the largest playtime_2weeks
inliers.sort_values(by='playtime_2weeks', ascending=False).iloc[:20]

In [None]:
# display the current playtime_df as the cell output
playtime_df

In [None]:
# number of distinct steam_id values
playtime_df['steam_id'].unique().size

In [None]:
# rows ordered by playtime_2weeks, largest first
sort_by_playtime_2weeks = playtime_df.sort_values(
    by=['playtime_2weeks'],
    ascending=False,
)

In [None]:
# Two weeks contain 20160 minutes; what is a reasonable maximum playtime per day?
# Stats put average daily playtime at 1.2 hours, so try a cap of 5 hours per day,
# i.e. 4200 minutes in two weeks.
sort_by_playtime_2weeks.loc[sort_by_playtime_2weeks['playtime_2weeks'].lt(4200)]

In [None]:
# view of the rows whose playtime_2weeks is below 5000
playtime_df.loc[playtime_df['playtime_2weeks'].lt(5000)]

In [None]:
# Order rows by playtime_forever, largest first, then renumber them 0..n-1.
# reset_index() keeps the previous index as a regular 'index' column.
sort_by_playtime = (
    playtime_df
    .sort_values(by='playtime_forever', ascending=False)
    .reset_index(drop=False)
)

In [None]:
# display the frame sorted by playtime_forever as the cell output
sort_by_playtime

In [None]:
# line plot of playtime_forever against the (reset) row index
sort_by_playtime.plot.line(use_index=True, y='playtime_forever')

In [None]:
# histogram of playtime_forever: 20 bins with a KDE overlay, on a large figure
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.histplot(
    data=sort_by_playtime,
    x='playtime_forever',
    bins=20,
    kde=True,
)


In [None]:
# plot sort_by_playtime with playtime_forever on the y axis
# (the x axis is the frame's index). The original cell ended on a dangling
# `sort_by_playtime.`, which is a SyntaxError; this completes the call.
sort_by_playtime.plot(y='playtime_forever', use_index=True)
plt.ylabel('playtime_forever')
plt.show()

In [None]:
# median playtime_forever per appid (one row per game, indexed by appid)
playtime_by_game = playtime_df.groupby('appid')[['playtime_forever']].median()


In [None]:
# order games by their median playtime, largest first
playtime_by_game = playtime_by_game.sort_values(
    by=['playtime_forever'],
    ascending=False,
)

In [None]:
# first five rows of the per-game frame
playtime_by_game.iloc[:5]

In [None]:
# plot median playtime by game
plt.figure(figsize=(20,10))
# appid is numeric, and seaborn orders numeric categories by value. Passing
# the ids as strings keeps the bars in whatever row order playtime_by_game
# currently has (e.g. sorted by playtime).
sns.barplot(x=playtime_by_game.index.astype(str), y=playtime_by_game['playtime_forever'])
# playtime_by_game is built with a per-appid median, so the title says so
plt.title('Median Playtime by Game')
plt.xlabel('Game')
plt.ylabel('Median playtime_forever')
plt.show()


In [None]:
# distribution of playtime_forever as a histogram (values sorted descending first)
sortplaytime = playtime_df['playtime_forever'].sort_values(ascending=False)
sortplaytime.plot.hist()

In [None]:
# the same distribution drawn with seaborn
sns.histplot(data=sortplaytime)
plt.show()

In [None]:
# histogram of playtime_forever split into 100 bins
playtime_df.loc[:, 'playtime_forever'].hist(bins=100)

In [None]:
# box plot of the playtime_forever column
plt.boxplot(x=playtime_df['playtime_forever'])
plt.title('Box Plot')
plt.ylabel('playtime')
plt.show()