In [1]:
# Just take all width for viz
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
spark = SparkSession \
    .builder \
    .master('local[*]') \
    .appName("steam-analysis-eda") \
    .getOrCreate()

In [7]:
games_dataset_path = "file:///home/neadex/steam-analysis/data/steam-store-games/steam.csv"

games_df = spark.read.csv(games_dataset_path, header=True)
games_df.printSchema()

root
 |-- appid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- english: string (nullable = true)
 |-- developer: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- platforms: string (nullable = true)
 |-- required_age: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- steamspy_tags: string (nullable = true)
 |-- achievements: string (nullable = true)
 |-- positive_ratings: string (nullable = true)
 |-- negative_ratings: string (nullable = true)
 |-- average_playtime: string (nullable = true)
 |-- median_playtime: string (nullable = true)
 |-- owners: string (nullable = true)
 |-- price: string (nullable = true)



In [6]:
sales_dataset_path = "file:///home/neadex/steam-analysis/data/video-games-sales/vgsales.csv"

sales_df = spark.read.csv(sales_dataset_path, header=True)
sales_df.printSchema()

root
 |-- Rank: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: string (nullable = true)
 |-- EU_Sales: string (nullable = true)
 |-- JP_Sales: string (nullable = true)
 |-- Other_Sales: string (nullable = true)
 |-- Global_Sales: string (nullable = true)



In [49]:
cols = (
    'name', 
    'Global_Sales','EU_Sales', "NA_Sales", "JP_Sales",
    "platform", 
    'genre',
    'price',
    'release_date'
)

games_df.join(sales_df, on = 'Name') \
    .sort(F.col('Global_Sales').desc()) \
    .selectExpr(*cols) \
    .where(F.col('platform') == "PC") \
    .withColumn('EU_NA_Sales_diff', (((sales_df['EU_Sales'] - sales_df['NA_Sales']) ** 2) / sales_df['Global_Sales'] )) \
    .sort(F.col('EU_NA_Sales_diff')) \
    .show(20, truncate=False)

+----------------------------------+------------+--------+--------+--------+--------+------------+-----+------------+---------------------+
|name                              |Global_Sales|EU_Sales|NA_Sales|JP_Sales|platform|genre       |price|release_date|EU_NA_Sales_diff     |
+----------------------------------+------------+--------+--------+--------+--------+------------+-----+------------+---------------------+
|Warhammer 40,000: Space Marine    |0.2         |0.09    |0.09    |0       |PC      |Shooter     |19.99|2011-09-08  |0.0                  |
|Mount & Blade: Warband            |0.02        |0.01    |0.01    |0       |PC      |Role-Playing|14.99|2010-03-31  |0.0                  |
|Saints Row: The Third             |0.38        |0.16    |0.16    |0       |PC      |Action      |6.99 |2011-11-17  |0.0                  |
|Mafia II                          |0.4         |0.17    |0.17    |0       |PC      |Action      |19.99|2011-03-22  |0.0                  |
|Battleborn         

In [52]:
games_df.join(sales_df, on = 'Name') \
    .sort(F.col('Global_Sales').desc()) \
    .selectExpr(*cols) \
    .where(F.col('platform') == "PC") \
    .withColumn('JP_NA_Sales_diff', (((sales_df['JP_Sales'] - sales_df['NA_Sales']) ** 2) / sales_df['Global_Sales'] )) \
    .sort(F.col('JP_NA_Sales_diff')) \
    .show(20, truncate=False)

+-------------------------------+------------+--------+--------+--------+--------+------------+-----+------------+----------------+
|name                           |Global_Sales|EU_Sales|NA_Sales|JP_Sales|platform|genre       |price|release_date|JP_NA_Sales_diff|
+-------------------------------+------------+--------+--------+--------+--------+------------+-----+------------+----------------+
|Counter-Strike: Source         |0.09        |0.07    |0       |0       |PC      |Shooter     |7.19 |2004-11-01  |0.0             |
|Vietcong                       |0.02        |0.02    |0       |0       |PC      |Shooter     |3.99 |2018-05-25  |0.0             |
|Railroad Tycoon 3              |0.04        |0.03    |0       |0       |PC      |Strategy    |5.99 |2007-05-04  |0.0             |
|Tomb Raider: Anniversary       |0.04        |0.03    |0       |0       |PC      |Action      |6.99 |2007-06-05  |0.0             |
|Bully: Scholarship Edition     |0.03        |0.02    |0       |0       |PC 

### JP_Sales

-> Si les résultats sont filtrés avec platform == "pc", JP_Sales = 0
    
    Conclusion : le marché japonais est complètement négligeable pour un jeu pc
    
 
Prochaine question : Les jeux les plus vendus au japon sont-ils des jeux qui se sont ausi bien vendus ailleurs ? 
    => Corrélation entre JP_Sales et avg(NA_Sales, EU_Sales)

In [62]:
sales_df \
    .filter(F.col('JP_Sales') != 0) \
    .sort(F.col('JP_Sales').desc()) \
    .show(truncate=False)

+----+----------------------------------------------+--------+----+------------+----------------+--------+--------+--------+-----------+------------+
|Rank|Name                                          |Platform|Year|Genre       |Publisher       |NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|
+----+----------------------------------------------+--------+----+------------+----------------+--------+--------+--------+-----------+------------+
|13  |Pokemon Gold/Pokemon Silver                   |GB      |1999|Role-Playing|Nintendo        |9       |6.18    |7.2     |0.71       |23.1        |
|2   |Super Mario Bros.                             |NES     |1985|Platform    |Nintendo        |29.08   |3.58    |6.81    |0.77       |40.24       |
|7   |New Super Mario Bros.                         |DS      |2006|Platform    |Nintendo        |11.38   |9.23    |6.5     |2.9        |30.01       |
|21  |Pokemon Diamond/Pokemon Pearl                 |DS      |2006|Role-Playing|Nintendo        |6.4