In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Create (or reuse, if one already exists in this kernel) the Spark session
# used by every cell below.
# NOTE(review): the "local" master runs Spark with a single worker thread;
# confirm that is intentional given the size of the review datasets.
spark = (
    SparkSession.builder
    .appName("AmazonReviews")
    .master("local")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/06 16:43:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType
#"reviewerID": "A8WEXFRWX1ZHH",
# "asin": "0209688726",
# "style": {"Color:": " AC"},
# "reviewerName": "Goldengate",
# Explicit schema applied when reading the review JSON files.
# Every field is declared nullable. "style" is a nested struct from which
# only the "Color:" key is declared here.
schema = (
    StructType()
    .add("overall", FloatType(), True)
    .add("verified", BooleanType(), True)
    .add("reviewTime", StringType(), True)
    .add("reviewerID", StringType(), True)
    .add("asin", StringType(), True)
    .add("style", StructType().add("Color:", StringType(), True), True)
    .add("reviewerName", StringType(), True)
    .add("reviewText", StringType(), True)
    .add("unixReviewTime", IntegerType(), True)
)

In [4]:
electronics_dataset_path = '../Electronics.json'
pet_supplies_dataset_path = '../Pet_Supplies.json'

# Columns declared in the schema that are not used by the rest of the notebook.
columns_to_drop = ['verified', 'reviewTime', 'unixReviewTime', 'style', 'reviewerName']


def _load_reviews(path, label):
    """Read one review file, drop unused columns and remove rows without text.

    Prints two lines, both prefixed with ``"<label> row count"``: first the
    total number of rows read, then the number of rows whose ``reviewText``
    is not null (i.e. the size of the returned DataFrame).

    Both numbers come from a single aggregation so the JSON file is scanned
    once here instead of once per ``count()`` call.

    Args:
        path: Location of the JSON review file to read with ``schema``.
        label: Human-readable dataset name used in the printed messages.

    Returns:
        DataFrame without ``columns_to_drop`` and without rows whose
        ``reviewText`` is null.
    """
    reviews = spark.read.schema(schema).json(path).drop(*columns_to_drop)

    # count(*) counts every row; count(reviewText) counts only non-null
    # values, which is exactly what remains after na.drop on that column.
    totals = reviews.selectExpr(
        'count(*) AS total_rows',
        'count(reviewText) AS rows_with_text',
    ).first()
    print(f'{label} row count', totals['total_rows'])
    print(f'{label} row count', totals['rows_with_text'])

    # process null values
    return reviews.na.drop(subset=['reviewText'])


df_electronics = _load_reviews(electronics_dataset_path, 'electronics')
df_pet_supplies = _load_reviews(pet_supplies_dataset_path, 'pet supplies')

                                                                                

electronics row count 20994353


                                                                                

electronics row count 20984669


                                                                                

pet supplies row count 6542483




pet supplies row count 6538687


                                                                                

In [5]:
# Partition each category into train/test subsets using the same weights
# and the same fixed seed for both categories.
split_weights = [0.8, 0.2]
split_seed = 42

(training_df_electronics,
 testing_df_electronics) = df_electronics.randomSplit(split_weights, seed=split_seed)
(training_df_pet_supplies,
 testing_df_pet_supplies) = df_pet_supplies.randomSplit(split_weights, seed=split_seed)

In [6]:
# Report the size of each split. Every count() is a separate Spark action.
split_sizes_to_report = [
    ('electronics training row count', training_df_electronics),
    ('electronic testing row count', testing_df_electronics),
    ('pet supplies training row count', training_df_pet_supplies),
    ('pet supplies testing row count', testing_df_pet_supplies),
]
for message, split_df in split_sizes_to_report:
    print(message, split_df.count())

                                                                                

electronics training row count 16785007


                                                                                

electronic testing row count 4199662


                                                                                

pet supplies training row count 5230764




pet supplies testing row count 1307923


                                                                                

In [7]:
from pyspark.sql.functions import rand
# Seed for the shuffle below, matching the seed used for the train/test split
# so that the shuffle order is no longer different on every run.
# NOTE(review): a seeded rand() is only repeatable while the input
# partitioning stays the same — confirm if exact reproducibility matters.
SHUFFLE_SEED = 42

# Combine the two categories: electronics rows followed by pet-supplies rows.
# union() matches columns by position; both frames were read with the same
# schema and had the same columns dropped.
combined_training_data = training_df_electronics.union(training_df_pet_supplies)
combined_testing_data = testing_df_electronics.union(testing_df_pet_supplies)

# Shuffle both sets by sorting on a random column so the categories are
# interleaved rather than stored back to back.
combined_training_data = combined_training_data.orderBy(rand(seed=SHUFFLE_SEED))
combined_testing_data = combined_testing_data.orderBy(rand(seed=SHUFFLE_SEED))

In [8]:
# Persist both combined sets as JSON, replacing any output from an earlier run.
# Spark treats each path as an output directory of part files, not a single file.
outputs_to_write = (
    (combined_training_data, "combined_training_data.json"),
    (combined_testing_data, "combined_testing_data.json"),
)
for combined_df, output_path in outputs_to_write:
    combined_df.write.json(output_path, mode="overwrite")

                                                                                