## Review scraping

In [1]:
# Stretch the notebook container to the full browser width for visualizations.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from the internal `IPython.core.display` module is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import findspark
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

import plotly.graph_objects as go

import os

In [None]:
# Create (or reuse) the Spark session for this notebook: local master using
# all available cores ('local[*]'), with 10g of driver memory.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "10g")
    .appName("steam-analysis-eda")
    .getOrCreate()
)

In [None]:
# The data folder is the "data/" directory located in the parent of the
# current working directory (os.path.realpath("") resolves to the cwd).
parent_dir = os.path.dirname(os.path.realpath(""))
dataset_path = os.path.join(parent_dir, "data/")

# Load the games extract from the local filesystem.
games_extract_uri = "file://" + dataset_path + "extracts/steam-dataset_games_28-12_5"
base_df = spark.read.parquet(games_extract_uri)

# Quick inspection: schema, first rows, then the total number of rows.
base_df.printSchema()
base_df.show()
n_rows = base_df.count()
print('N rows =', n_rows)

In [None]:
# Fetch reviews from the Steam store API (one request per app, no pagination)

import json
import requests
from tqdm import tqdm_notebook

def get_reviews(appid, timeout=30):
    """Fetch the review payload of one app from the Steam store API.

    A single request is sent per call (no pagination is performed).

    Parameters
    ----------
    appid
        Application id substituted into the request URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a single stalled connection would block the calling loop forever.

    Returns
    -------
    dict
        The decoded JSON body when the server answers with HTTP 200.
        Otherwise (non-200 status, network failure, or a body that is not
        valid JSON) a placeholder dict with ``status`` and ``msg`` keys.
    """
    # Built per call: the caller mutates the returned dict (adds 'appid'),
    # so a shared module-level constant must not be returned here.
    error = {"status" : "error", "msg" : "error while fetching API"}

    url = 'https://store.steampowered.com/appreviews/{appid}?json=1'
    try:
        r = requests.get(url.format(appid=appid), timeout=timeout)
    except requests.RequestException:
        # Connection error, timeout, too many redirects, ...
        return error

    if r.status_code != 200:
        return error

    try:
        return r.json()
    except ValueError:
        # HTTP 200 but the body could not be decoded as JSON.
        return error

# Collect the app ids to query. `.distinct()` makes the list match its name:
# each appid is requested once even if the games extract repeats it.
distinct_appid = [
    row.appid
    for row in base_df.select('appid').distinct().collect()
]

# One API call per app; every payload is tagged with its appid.
res = []
for appid in tqdm_notebook(distinct_appid):
    data = get_reviews(appid)
    data['appid'] = appid # attach appid for join
    res.append(data)

# Persist all raw payloads as a single JSON array.
with open(dataset_path + "extracts/reviews.json", 'w') as f:
    json.dump(res, f, indent = 4)

In [None]:
# To delete after verification

dataset_path = os.path.join(os.path.dirname(os.path.realpath("")), "data/")

# The scraped file is one multi-line JSON array, hence multiLine=True.
rev_df = spark.read.json("file://" + dataset_path + "extracts/reviews.json", multiLine=True)

# Number the rows of each app (windowed on 'appid'), highest total_reviews first.
# row_number() never assigns the same position twice, unlike rank(), so keeping
# position 1 yields exactly one row per appid even when review counts tie.
window = Window.partitionBy('appid').orderBy(F.col('query_summary.total_reviews').desc())

# Keep only the query with the maximum number of reviews for each app
rev_df = rev_df \
    .withColumn('rank', F.row_number().over(window)) \
    .filter(F.col('rank') == 1)

# Flatten the query_summary struct into top-level columns and drop the helpers.
# Rows are NOT deduplicated on 'total_reviews' alone: different apps can share
# the same review count, and such a dedupe would merge them into a single row.
rev_df = rev_df \
    .select('*', 'query_summary.*') \
    .drop('query_summary') \
    .drop('rank')

# Write cleaned reviews as parquet file
rev_df \
    .write \
    .mode('overwrite') \
    .parquet("file://" + dataset_path + "extracts/steam-reviews.parquet")

print('DataFrame correctly saved to disk !')

In [None]:
# Example: querying directly from the parquet file

# Location of the cleaned reviews parquet dataset on the local filesystem.
reviews_parquet_uri = "file://" + dataset_path + "extracts/steam-reviews.parquet"

# Read a single row through Spark SQL just to print the stored schema.
schema_probe_sql = 'SELECT * FROM parquet.`{}` LIMIT 1'.format(reviews_parquet_uri)
spark.sql(schema_probe_sql).printSchema()

# Review-count summary: one (appid, total_reviews) pair per stored row.
review_counts_sql = """SELECT
        appid,
        total_reviews
        FROM parquet.`{}`""".format(reviews_parquet_uri)
revsum_df = spark.sql(review_counts_sql)

# Show the apps with the most reviews first.
revsum_df.orderBy(F.desc('total_reviews')).show()