# Steam EDA

In [8]:
# Stretch the notebook container to the full browser width so wide tables and
# charts are not squeezed. `display` is imported from IPython.display because
# importing it from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#### Import packages

In [61]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

import os
import sys

# Point both the PySpark workers and the driver at the interpreter that runs
# this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

#### Create SparkSession

In [2]:
# Build (or reuse) a Spark session running in local mode for this analysis.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("steam-analysis-eda")
    .getOrCreate()
)

#### Display full datasets structure

In [3]:
%%bash

# List the data folder, limited to two levels of depth.
tree -L 2 ../data

../data
├── steam-dataset
│   ├── steam_analysis.Achievements_Percentages
│   ├── steam_analysis.App_ID_Info
│   ├── steam_analysis.App_ID_Info_Old
│   ├── steam_analysis.Friends
│   ├── steam_analysis.Games_1
│   ├── steam_analysis.Games_2
│   ├── steam_analysis.Games_Daily
│   ├── steam_analysis.Games_Developers
│   ├── steam_analysis.Games_Developers_Old
│   ├── steam_analysis.Games_Genres
│   ├── steam_analysis.Games_Genres_Old
│   ├── steam_analysis.Games_Publishers
│   ├── steam_analysis.Games_Publishers_Old
│   ├── steam_analysis.Groups
│   └── steam_analysis.Player_Summaries
├── steam-review
│   └── steam-review.csv
├── steam-store-games
│   ├── steam.csv
│   ├── steam_description_data.csv
│   ├── steam_media_data.csv
│   ├── steam_requirements_data.csv
│   ├── steamspy_tag_data.csv
│   └── steam_support_info.csv
└── video-games-sales
    └── vgsales.csv

19 directories, 8 files


#### Load global steam store dataset

In [9]:
# Resolve the dataset from the ../data folder listed earlier in the notebook
# rather than a user-specific absolute home path, so the notebook runs from any checkout.
# The explicit file:// scheme is kept so the path points at the local filesystem.
games_dataset_path = "file://" + os.path.abspath("../data/steam-store-games/steam.csv")

# Only the header is used; no schema is supplied or inferred, so every column
# is loaded as a string (see the printed schema).
games_df = spark.read.csv(games_dataset_path, header=True)
games_df.printSchema()

root
 |-- appid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- english: string (nullable = true)
 |-- developer: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- platforms: string (nullable = true)
 |-- required_age: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- steamspy_tags: string (nullable = true)
 |-- achievements: string (nullable = true)
 |-- positive_ratings: string (nullable = true)
 |-- negative_ratings: string (nullable = true)
 |-- average_playtime: string (nullable = true)
 |-- median_playtime: string (nullable = true)
 |-- owners: string (nullable = true)
 |-- price: string (nullable = true)



#### Display some rows

In [29]:
# Preview the first 20 rows with long cell values truncated (show()'s defaults, spelled out).
games_df.show(n=20, truncate=True)

+-----+--------------------+------------+-------+----------------+---------+-----------------+------------+--------------------+-------------------+--------------------+------------+----------------+----------------+----------------+---------------+-----------------+-----+
|appid|                name|release_date|english|       developer|publisher|        platforms|required_age|          categories|             genres|       steamspy_tags|achievements|positive_ratings|negative_ratings|average_playtime|median_playtime|           owners|price|
+-----+--------------------+------------+-------+----------------+---------+-----------------+------------+--------------------+-------------------+--------------------+------------+----------------+----------------+----------------+---------------+-----------------+-----+
|   10|      Counter-Strike|  2000-11-01|      1|           Valve|    Valve|windows;mac;linux|           0|Multi-player;Onli...|             Action|Action;FPS;Multip...|         

#### Get single game row by name

In [22]:
# Keep only the rows whose name matches exactly, then print them.
games_df.where(F.col("name") == "Conan Exiles").show()

+------+------------+------------+-------+---------+---------+---------+------------+--------------------+--------------------+--------------------+------------+----------------+----------------+----------------+---------------+--------------+-----+
| appid|        name|release_date|english|developer|publisher|platforms|required_age|          categories|              genres|       steamspy_tags|achievements|positive_ratings|negative_ratings|average_playtime|median_playtime|        owners|price|
+------+------------+------------+-------+---------+---------+---------+------------+--------------------+--------------------+--------------------+------------+----------------+----------------+----------------+---------------+--------------+-----+
|440900|Conan Exiles|  2018-05-08|      1|   Funcom|   Funcom|  windows|          18|Single-player;Mul...|Action;Adventure;...|Survival;Open Wor...|          25|           22001|            9594|            4431|           5167|500000-1000000|33.99|


#### Get the most recently released game(s) in the dataset

In [197]:
# release_date is a string column, so this is the greatest string value;
# for the yyyy-MM-dd values shown above that coincides with the latest date.
last_release_date = (
    games_df
    .agg(F.max("release_date").alias("last_release_date"))
    .first()["last_release_date"]
)

# Print every game released on that date.
games_df.where(F.col("release_date") == last_release_date).show()

+------+-----------------+------------+-------+-------------+---------+---------+------------+--------------------+--------------------+-------------------+------------+----------------+----------------+----------------+---------------+-------+-----+
| appid|             name|release_date|english|    developer|publisher|platforms|required_age|          categories|              genres|      steamspy_tags|achievements|positive_ratings|negative_ratings|average_playtime|median_playtime| owners|price|
+------+-----------------+------------+-------+-------------+---------+---------+------------+--------------------+--------------------+-------------------+------------+----------------+----------------+----------------+---------------+-------+-----+
|905370|Conqueror's Blade|  2019-05-01|      1|Booming Games|   My.com|  windows|           0|Online Multi-Play...|Action;Free to Pl...|Action;Strategy;RPG|           0|             259|             235|               0|              0|0-20000|11.

#### Filter non-int values on positive_ratings and get max

In [199]:
# positive_ratings is loaded as a string. Take the first run of digits from each
# value and cast it to an integer before aggregating; values without any digit
# yield an empty string (NOTE(review): expected to cast to null and be skipped
# by max under Spark's default non-ANSI cast rules — confirm for your Spark config).
# The pattern is a raw string: '\d' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
max_positive_ratings = (
    games_df
    .select(
        F.regexp_extract("positive_ratings", r'\d+', 0)
        .cast(IntegerType())
        .alias('positive_ratings')
    )
    .agg(F.max('positive_ratings').alias('max_positive_ratings'))
    .first()['max_positive_ratings']
)

print('Max positive ratings :', max_positive_ratings)

Max positive ratings : 2644404


#### Get game name with max positive ratings

In [201]:
# Print the name of every game whose positive_ratings equals the maximum found above.
(
    games_df
    .where(F.col('positive_ratings') == max_positive_ratings)
    .select('name')
    .show(truncate=False)
)

+--------------------------------+
|name                            |
+--------------------------------+
|Counter-Strike: Global Offensive|
+--------------------------------+



#### Get 50 most used tags in dataset

In [203]:
# Split the ';'-separated steamspy_tags into one row per tag, count the
# occurrences of each tag and print the 50 most frequent ones in full.
(
    games_df
    .withColumn('steamspy_tags', F.split("steamspy_tags", ";"))
    .select(F.explode('steamspy_tags').alias('tags'))
    .groupBy('tags')
    .count()
    .orderBy(F.desc('count'))
    .show(50, truncate=False)
)

+---------------------+-----+
|tags                 |count|
+---------------------+-----+
|Indie                |16231|
|Action               |10322|
|Casual               |8205 |
|Adventure            |7770 |
|Strategy             |4172 |
|Simulation           |3284 |
|Early Access         |2967 |
|RPG                  |2784 |
|Free to Play         |1662 |
|Puzzle               |1116 |
|VR                   |961  |
|Sports               |781  |
|Racing               |765  |
|Platformer           |648  |
|Nudity               |570  |
|Sexual Content       |557  |
|Visual Novel         |546  |
|Violent              |545  |
|Anime                |538  |
|Horror               |493  |
|Point & Click        |492  |
|Gore                 |476  |
|Hidden Object        |460  |
|FPS                  |405  |
|Multiplayer          |405  |
|Massively Multiplayer|377  |
|Pixel Graphics       |249  |
|Shoot 'Em Up         |245  |
|Open World           |243  |
|Survival             |235  |
|Space    

#### Collect the 50 most used tags and their counts on the driver

In [188]:

# Same tag-count pipeline as the displayed table, but the 50 top rows are
# brought back to the driver as a list of (tag, count) tuples.
# DataFrame.take() is used directly instead of going through .rdd.map(tuple),
# which avoids converting the whole DataFrame to an RDD; each returned
# Row is turned into a plain tuple so the result type is unchanged.
tags_counts = [
    tuple(row)
    for row in (
        games_df
        .withColumn('steamspy_tags', F.split("steamspy_tags", ";"))
        .select(F.explode('steamspy_tags').alias('tags'))
        .groupBy('tags')
        .count()
        .sort(F.col('count').desc())
        .take(50)
    )
]

#### Visualize as a bar chart

In [194]:
import plotly.graph_objects as go

# Unzip the (tag, count) pairs into the two bar-chart axes.
# An empty list would make zip(*...) yield nothing and break the unpacking,
# so fall back to two empty sequences in that case.
x, y = zip(*tags_counts) if tags_counts else ((), ())

fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y))
# Label both axes so the figure can be read on its own.
fig.update_layout(
    title = '50 most used tags on Steam Store',
    xaxis_title = 'Tag',
    yaxis_title = 'Count',
)
fig.show()

In [205]:
%%bash

# NOTE(review): `git add` is called without any pathspec, so it has nothing to
# stage — confirm which path was intended.
git add

# NOTE(review): the remaining lines look like .gitignore patterns rather than
# shell commands; if they are part of this %%bash cell, bash would try to
# execute each of them — confirm they belong in a .gitignore file instead.

# ignore data folder
data/

# ignore logs
# NOTE(review): this pattern only matches files ending in `.logs` — confirm the extension.
logs/*.logs

# ignore virtualenv
venv/

# ignore specific files
/docs/notes.md
/config/.env
