In [34]:
# %pip install pyspark  # uncomment if pyspark is not installed in this kernel's environment

In [35]:
from pyspark.sql import SparkSession
import numpy as np
import pyspark.sql.functions as F

In [36]:
# Start a local Spark session (or reuse one that is already running);
# "local[*]" runs Spark in-process using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [37]:
# Bare expression: the notebook displays the session object as this cell's output
spark

In [38]:
# Read the sampled reviews CSV: the first line is treated as the header row
# and column types are inferred from the data.
# NOTE(review): a later cell still has to cast weighted_vote_score to float,
# so inference may not be producing numeric types here — confirm with
# df.printSchema() and check whether the review text needs the
# multiLine / escape reader options.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("resources/data/all_reviews_100000_sample.csv")
)

# Preview the first rows (show() prints 20 by default)
df.show()

+----------------+-------+--------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+---------+-------------------------------------+-----------------+-----------------+--------+--------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+---------------------+--------------------+
|recommendationid|  appid|                game|   author_steamid|author_num_games_owned|author_num_reviews|author_playtime_forever|author_playtime_last_two_weeks|author_playtime_at_review|author_last_played| language|                               review|timestamp_created|timestamp_updated|voted_up|votes_up|votes_funny|weighted_vote_score|comment_count|steam_purchase|received_for_free|written_during_early_access|hidden_in_steam_china|steam_china_location|
+----------------+-------+--------------------+-----------------+---------------

In [39]:
# Print the column names as a plain list for quick reference
print(f"Columns in Original Dataframe:\n\n{df.columns}\n")

# Print per-column summary statistics. This is a standalone call rather than
# being embedded in an f-string because show() prints the table itself and
# returns None.
df.summary().show()


Columns in Original Dataframe:

['recommendationid', 'appid', 'game', 'author_steamid', 'author_num_games_owned', 'author_num_reviews', 'author_playtime_forever', 'author_playtime_last_two_weeks', 'author_playtime_at_review', 'author_last_played', 'language', 'review', 'timestamp_created', 'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score', 'comment_count', 'steam_purchase', 'received_for_free', 'written_during_early_access', 'hidden_in_steam_china', 'steam_china_location']

+-------+---------------------------+--------------------+-------------------------+--------------------+----------------------+--------------------+-----------------------+------------------------------+-------------------------+--------------------+------------------+--------------------+--------------------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------------------------------+--------------------

## Initial Data Cleanup / Feature Engineering

In [40]:
# weighted_vote_score must be numeric before it is compared with 0, so cast it
# to float first. (Filtering on the uncast column was tried earlier and gave a
# broken result — the column was not already a float.)
df = df.withColumn(
    "weighted_vote_score",
    F.col("weighted_vote_score").cast("float"),
)

# Keep only the reviews whose weighted vote score is non-zero
df_cleaned = df.where(F.col("weighted_vote_score") != 0)
df_cleaned.show(10)

+----------------+-------+--------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+--------+-------------------------------------+-----------------+-----------------+--------+--------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+---------------------+--------------------+
|recommendationid|  appid|                game|   author_steamid|author_num_games_owned|author_num_reviews|author_playtime_forever|author_playtime_last_two_weeks|author_playtime_at_review|author_last_played|language|                               review|timestamp_created|timestamp_updated|voted_up|votes_up|votes_funny|weighted_vote_score|comment_count|steam_purchase|received_for_free|written_during_early_access|hidden_in_steam_china|steam_china_location|
+----------------+-------+--------------------+-----------------+-----------------

In [41]:
# Remove columns that are not used in this analysis:
#   - review: the text content is not evaluated (no NLP in this notebook)
#   - hidden_in_steam_china / steam_china_location
df_cleaned = df_cleaned.drop(
    "review",
    "hidden_in_steam_china",
    "steam_china_location",
)

# Preview the trimmed DataFrame
df_cleaned.show()

+----------------+-------+--------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+---------+-----------------+-----------------+--------+--------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+
|recommendationid|  appid|                game|   author_steamid|author_num_games_owned|author_num_reviews|author_playtime_forever|author_playtime_last_two_weeks|author_playtime_at_review|author_last_played| language|timestamp_created|timestamp_updated|voted_up|votes_up|votes_funny|weighted_vote_score|comment_count|steam_purchase|received_for_free|written_during_early_access|
+----------------+-------+--------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+---------+-----------------+-------

In [None]:
# Convert total author playtime to hours by dividing by 60
# (assumes the raw value is in minutes — TODO confirm against the data source).
# NOTE(review): this overwrites the column in place, so executing the cell a
# second time divides by 60 again. After a kernel restart, run the notebook
# from the load cell instead of re-running this cell on its own.
df_cleaned = df_cleaned.withColumn(
    "author_playtime_forever",
    F.col("author_playtime_forever").cast("float") / 60,
)

df_cleaned.show()

+----------------+-------+--------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+---------+-----------------+-----------------+--------+--------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+
|recommendationid|  appid|                game|   author_steamid|author_num_games_owned|author_num_reviews|author_playtime_forever|author_playtime_last_two_weeks|author_playtime_at_review|author_last_played| language|timestamp_created|timestamp_updated|voted_up|votes_up|votes_funny|weighted_vote_score|comment_count|steam_purchase|received_for_free|written_during_early_access|
+----------------+-------+--------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+---------+-----------------+-------

In [None]:
# Optional tidy-up: round the playtime value to a whole number
# (0 decimal places) so the column is easier to read.
df_cleaned = df_cleaned.withColumn(
    "author_playtime_forever",
    F.round(F.col("author_playtime_forever"), 0),
)
df_cleaned.show()

+----------------+-------+--------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+---------+-----------------+-----------------+--------+--------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+
|recommendationid|  appid|                game|   author_steamid|author_num_games_owned|author_num_reviews|author_playtime_forever|author_playtime_last_two_weeks|author_playtime_at_review|author_last_played| language|timestamp_created|timestamp_updated|voted_up|votes_up|votes_funny|weighted_vote_score|comment_count|steam_purchase|received_for_free|written_during_early_access|
+----------------+-------+--------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+---------+-----------------+-------