In [None]:
from pyspark.sql import SparkSession
import numpy as np
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Spark settings applied to the session builder, in order.
spark_options = [
    ("spark.driver.memory", "8g"),
    ("spark.executor.memory", "4g"),
    ("spark.executor.instances", 18),
]

# Apply each setting, then create the session (or reuse an already-running one).
session_builder = SparkSession.builder
for option_name, option_value in spark_options:
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [None]:
# Path to the review sample CSV, relative to the notebook's working directory.
# Deliberately NOT named `dir`: that would shadow the Python builtin, and with
# the assignment commented out the builtin function itself was being passed to
# `spark.read.csv`, which fails on a fresh kernel.
REVIEWS_CSV_PATH = "resources/data/all_reviews_100000_sample.csv"

# Read the CSV with a header row and let Spark infer the column types.
df_cleaned = spark.read.csv(REVIEWS_CSV_PATH, header=True, inferSchema=True)

# Preview the first 10 rows
df_cleaned.show(10)

In [None]:
# Per-game playtime statistics, used below to chart which games have the most
# reviewer playtime and the most reviews.

# Sum of the playtime each reviewer had logged when writing their review.
playtime_sum = F.sum("author_playtime_at_review").alias(
    "Total Reviewer Playtime (At time of review)"
)
# Number of review rows per game.
review_count = F.count("*").alias("Review Count")
# Total playtime divided by the review count, rounded to a whole number.
mean_playtime = F.round(
    F.col("Total Reviewer Playtime (At time of review)") / F.col("Review Count")
)

per_game = df_cleaned.groupBy("game").agg(playtime_sum, review_count)
total_playtimes = per_game.withColumn("Average Playtime Per Review", mean_playtime)

total_playtimes.show()

In [None]:
# Pull the two top-10 slices of the aggregated table into pandas for plotting.

# Ten games with the largest summed reviewer playtime.
ranked_by_playtime = total_playtimes.orderBy(
    F.col("Total Reviewer Playtime (At time of review)").desc()
)
top_10_reviewer_playtimes_df = ranked_by_playtime.limit(10).toPandas()

# Ten games with the most reviews.
ranked_by_review_count = total_playtimes.orderBy(F.col("Review Count").desc())
top_10_review_count_df = ranked_by_review_count.limit(10).toPandas()

In [None]:
# Display the ten games with the highest total reviewer playtime.
top_10_reviewer_playtimes_df

In [None]:
# Display the ten games with the most reviews.
top_10_review_count_df

In [None]:
# Bar graphs of the top 10 games by total reviewer playtime and by review count.
# Omitting stats on average playtime per review as would be skewed towards games with fewer reviews


def _plot_top_games(ax, data, value_col, title, xlabel, label_fmt):
    """Draw one horizontal bar chart of games ranked by ``value_col``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : pandas DataFrame with a "game" column and a ``value_col`` column.
    value_col : name of the column plotted as bar length.
    title : title for the panel.
    xlabel : label for the value axis.
    label_fmt : printf-style format used for the value label on each bar.
    """
    sns.barplot(
        y=data["game"],
        x=data[value_col],
        ax=ax,
        palette="rocket",
        orient="h",
    )
    ax.set_title(title)
    ax.set_ylabel("Game Title")
    ax.set_xlabel(xlabel)
    # Annotate every bar with its value.
    for container in ax.containers:
        ax.bar_label(container, fmt=label_fmt)


fig, axes = plt.subplots(2, 1, figsize=(6, 12))

# Plot 1: Top 10 games with the highest total reviewer playtime
# NOTE(review): the axis label says hours, but nothing in this notebook
# establishes the unit of `author_playtime_at_review` -- confirm against the data source.
_plot_top_games(
    axes[0],
    top_10_reviewer_playtimes_df,
    "Total Reviewer Playtime (At time of review)",
    title="Top 10 Games with Highest Playtime of Reviewers",
    xlabel="Playtime (Hours)",
    label_fmt="%.1f",
)

# Plot 2: Top 10 games with most reviews
# Counts are whole numbers, so label them without a decimal place
# (previously rendered as e.g. "1234.0").
_plot_top_games(
    axes[1],
    top_10_review_count_df,
    "Review Count",
    title="Top 10 Games with the Most Reviews",
    xlabel="Review Count",
    label_fmt="%.0f",
)

# Keep the stacked panels, long game titles and axis labels from overlapping.
fig.tight_layout()

plt.show()

## Visualizations - Scatter Plots

In [None]:
# Exploring whether reviewer playtime and weighted vote score show any pattern.
# Only the two columns needed for the scatter plot are collected into pandas.
scatter_columns = ["weighted_vote_score", "author_playtime_at_review"]

playtime_votescore_df = df_cleaned.select(*scatter_columns).toPandas()

In [None]:
# Scatter of reviewer playtime (x) against weighted vote score (y).
# seaborn draws on the current Axes and returns it, so labels are set on that object.
scatter_ax = sns.scatterplot(
    data=playtime_votescore_df,
    x="author_playtime_at_review",
    y="weighted_vote_score",
)

scatter_ax.set_title("Scatterplot of Author Playtime at Review vs Weighted Vote Score")
scatter_ax.set_ylabel("Weighted Vote Score (Steam metric)")
scatter_ax.set_xlabel("Author Playtime At Review (hours)")

plt.show()