In [1]:
import pandas as pd
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, pandas_udf,col, lower, regexp_replace
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
from pyspark.ml.feature import CountVectorizer, StringIndexer, Tokenizer, StopWordsRemover
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import confusion_matrix
from pyspark.ml import PipelineModel, Pipeline, Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.ml.util import DefaultParamsWritable, DefaultParamsReadable

In [3]:
# Relative path of the gzip-compressed review file that is passed to readJSON.
file_path = "Industrial_and_Scientific_5.json.gz"

In [9]:
import json
import gzip

def readJSON(file_path, inChunks, percentage):
    """
    Read a gzip-compressed text file that holds one JSON object per line.

    :file_path: The path of the .gz file to be read.
    :inChunks:  If True, only the leading share of the file given by the
                percentage parameter is read.
                If False, the entire file is read and percentage is ignored.
    :percentage: Fraction (0 to 1) of the file's lines to read when inChunks
                 is True, e.g. 0.25 keeps the first quarter of the lines.
    :return: A list with the parsed JSON object of every readable line, in
             file order. Lines that are not valid JSON are reported on stdout
             and skipped.
    """

    json_objects = []

    # None means "no limit": the whole file is read in a single pass.
    line_limit = None

    if inChunks is not False:
        # A first pass counts the lines so the requested fraction can be turned
        # into an absolute number of lines to keep.
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            total_lines = sum(1 for _ in f)
        # Clamp at zero so a negative percentage simply yields no lines.
        line_limit = max(round(total_lines * percentage), 0)

    # JSON text is decoded as UTF-8 explicitly instead of relying on the
    # platform's default encoding.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # i is zero-based, so stopping once i reaches line_limit keeps
            # exactly line_limit lines.
            if line_limit is not None and i >= line_limit:
                break
            try:
                json_objects.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Error parsing line: {line}")

    return json_objects

In [10]:
# Load the review records; inChunks=False together with percentage=1 requests
# the entire file.
read_json = readJSON(file_path, inChunks=False, percentage=1)

In [11]:
# Build a pandas DataFrame from the list of parsed JSON records.
df1 = pd.DataFrame.from_records(read_json)

In [12]:
# df1 now holds the data; drop the name of the raw record list so its memory
# can be reclaimed.
del read_json

In [13]:
# Preview the first rows to see which columns the raw reviews provide.
df1.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"11 27, 2017",A1JB7HFWHRYHT7,B0000223SI,{'Size:': ' 1-(Pack)'},Alex W.,This worked really well for what I used it for...,Couldn't have been happier with it's performance,1511740800,,
1,5.0,True,"11 4, 2017",A2FCLJG5GV8SD6,B0000223SI,{'Size:': ' 1-(Pack)'},Randall Harris,Fast cutting and good adheasive.,Good paper.,1509753600,,
2,5.0,False,"10 27, 2017",A3IT9B33NWYQSL,B0000223SI,{'Size:': ' 1-(Pack)'},A. C.,Worked great for my lapping bench. I would li...,Handy!,1509062400,,
3,4.0,True,"01 13, 2018",AUL5LCV4TT73P,B0000223SK,{'Size:': ' 1-Pack'},TnT,As advertised,As advertised,1515801600,,
4,5.0,True,"10 7, 2017",A1V3I3L5JKO7TM,B0000223SK,{'Size:': ' 1-Pack'},John Jones,seems like a pretty good value as opposed to b...,seems like a pretty good value as opposed to b...,1507334400,,


In [76]:
# Keep only the rating, reviewer and product columns used further down.
# Selecting by name (instead of dropping the other columns in place) makes the
# cell idempotent: re-running an in-place drop raises KeyError once the
# columns are already gone.
df1 = df1[['overall', 'reviewerID', 'asin']].copy()

In [77]:
# Check the dtypes of the remaining columns.
df1.dtypes

overall       float64
reviewerID     object
asin           object
dtype: object

In [14]:
# List the column labels of df1.
# NOTE(review): the output recorded below still lists all twelve raw columns
# because this cell was executed (In [14]) before the column drop (In [76]);
# run top to bottom it would show only the three remaining columns.
df1.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='object')

In [78]:
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

# Obtain the Spark session in this cell so it does not rely on a cell further
# down having been executed first. getOrCreate() hands back the already
# running session when there is one, so building it here is harmless.
spark = SparkSession.builder \
    .appName("Recommendation System with Spark") \
    .getOrCreate()

# Explicit schema for the three remaining columns: a double rating plus two
# nullable string identifiers.
schema = StructType([
    StructField("overall", DoubleType(), True),
    StructField("reviewerID", StringType(), True),
    StructField("asin", StringType(), True)
])

# Convert the pandas DataFrame to a Spark DataFrame with the defined schema
data = spark.createDataFrame(df1, schema=schema)

In [2]:
# Start the Spark session used throughout the notebook, or reuse the one that
# is already running.
spark = (
    SparkSession.builder
    .appName("Recommendation System with Spark")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/05/28 08:40:47 WARN Utils: Your hostname, MedWalid resolves to a loopback address: 127.0.1.1; using 172.19.159.144 instead (on interface eth0)
24/05/28 08:40:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/28 08:40:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [79]:
# Print the first 20 rows of the Spark DataFrame as a sanity check.
data.show()

[Stage 52:>                                                         (0 + 1) / 1]

+-------+--------------+----------+
|overall|    reviewerID|      asin|
+-------+--------------+----------+
|    5.0|A1JB7HFWHRYHT7|B0000223SI|
|    5.0|A2FCLJG5GV8SD6|B0000223SI|
|    5.0|A3IT9B33NWYQSL|B0000223SI|
|    4.0| AUL5LCV4TT73P|B0000223SK|
|    5.0|A1V3I3L5JKO7TM|B0000223SK|
|    5.0|A20X7NCNZ7T5ZK|B0000223SK|
|    5.0|A3OBWQ8DTRLW2Q|B0000223SI|
|    5.0|A398INYG0ZBUZB|B0000223SK|
|    5.0| AEBM08OO8Y9BJ|B0000223SK|
|    4.0|A358U1JEA514P6|B0000223SI|
|    5.0|A1Z584ZH824BU1|B0000223SI|
|    5.0|A3OBWQ8DTRLW2Q|B0000223SK|
|    5.0|A2B40VHCBDLC43|B0000223SK|
|    5.0|A34O4UAC27ECL6|B0000223SI|
|    5.0| ACDP5UBE4ZW3T|B0000223SI|
|    3.0|A11MN6521EQ9QD|B0000223SK|
|    4.0|A358U1JEA514P6|B0000223SK|
|    4.0| AAEPD6U1H2X37|B0000223SK|
|    5.0|A1ICJY2HU1QLS1|B0000223SK|
|    4.0|A1XUBWNW0UFITU|B0000223SK|
+-------+--------------+----------+
only showing top 20 rows



                                                                                

In [80]:
# Print column names, types and nullability to confirm the schema was applied.
data.printSchema()

root
 |-- overall: double (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)



In [81]:
# Switch to recommender-oriented column names:
# overall -> rating, reviewerID -> userID, asin -> productID.
data = (
    data
    .withColumnRenamed("overall", "rating")
    .withColumnRenamed("reviewerID", "userID")
    .withColumnRenamed("asin", "productID")
)

In [82]:
# Print the first 20 rows to confirm the new column names.
data.show()

+------+--------------+----------+
|rating|        userID| productID|
+------+--------------+----------+
|   5.0|A1JB7HFWHRYHT7|B0000223SI|
|   5.0|A2FCLJG5GV8SD6|B0000223SI|
|   5.0|A3IT9B33NWYQSL|B0000223SI|
|   4.0| AUL5LCV4TT73P|B0000223SK|
|   5.0|A1V3I3L5JKO7TM|B0000223SK|
|   5.0|A20X7NCNZ7T5ZK|B0000223SK|
|   5.0|A3OBWQ8DTRLW2Q|B0000223SI|
|   5.0|A398INYG0ZBUZB|B0000223SK|
|   5.0| AEBM08OO8Y9BJ|B0000223SK|
|   4.0|A358U1JEA514P6|B0000223SI|
|   5.0|A1Z584ZH824BU1|B0000223SI|
|   5.0|A3OBWQ8DTRLW2Q|B0000223SK|
|   5.0|A2B40VHCBDLC43|B0000223SK|
|   5.0|A34O4UAC27ECL6|B0000223SI|
|   5.0| ACDP5UBE4ZW3T|B0000223SI|
|   3.0|A11MN6521EQ9QD|B0000223SK|
|   4.0|A358U1JEA514P6|B0000223SK|
|   4.0| AAEPD6U1H2X37|B0000223SK|
|   5.0|A1ICJY2HU1QLS1|B0000223SK|
|   4.0|A1XUBWNW0UFITU|B0000223SK|
+------+--------------+----------+
only showing top 20 rows



In [83]:
from pyspark.ml.feature import StringIndexer

# The ALS model further down is configured with "userIDIndex" as its user
# column, so map every string userID to a numeric index stored in that column.
indexer = StringIndexer(inputCol="userID", outputCol="userIDIndex")
user_index_model = indexer.fit(data)
data = user_index_model.transform(data)

                                                                                

In [84]:
from pyspark.ml.feature import StringIndexer

# Same treatment for the items: map every string productID to a numeric index
# stored in "productIDIndex", the item column the ALS model is configured with.
indexer = StringIndexer(inputCol="productID", outputCol="productIDIndex")
product_index_model = indexer.fit(data)
data = product_index_model.transform(data)

                                                                                

In [85]:
# Print the first rows to inspect the two new index columns.
data.show()

+------+--------------+----------+-----------+--------------+
|rating|        userID| productID|userIDIndex|productIDIndex|
+------+--------------+----------+-----------+--------------+
|   5.0|A1JB7HFWHRYHT7|B0000223SI|     1078.0|        1465.0|
|   5.0|A2FCLJG5GV8SD6|B0000223SI|     8222.0|        1465.0|
|   5.0|A3IT9B33NWYQSL|B0000223SI|     5689.0|        1465.0|
|   4.0| AUL5LCV4TT73P|B0000223SK|     1371.0|         720.0|
|   5.0|A1V3I3L5JKO7TM|B0000223SK|     7535.0|         720.0|
|   5.0|A20X7NCNZ7T5ZK|B0000223SK|     7742.0|         720.0|
|   5.0|A3OBWQ8DTRLW2Q|B0000223SI|     5800.0|        1465.0|
|   5.0|A398INYG0ZBUZB|B0000223SK|      153.0|         720.0|
|   5.0| AEBM08OO8Y9BJ|B0000223SK|     1814.0|         720.0|
|   4.0|A358U1JEA514P6|B0000223SI|     3527.0|        1465.0|
|   5.0|A1Z584ZH824BU1|B0000223SI|     7669.0|        1465.0|
|   5.0|A3OBWQ8DTRLW2Q|B0000223SK|     5800.0|         720.0|
|   5.0|A2B40VHCBDLC43|B0000223SK|     4949.0|         720.0|
|   5.0|

In [86]:
# Bring a single row back to the driver so it can be inspected as a Row object.
first_rows = data.limit(1).collect()
specific_line = first_rows[0]

In [87]:
# Display the collected Row.
specific_line

Row(rating=5.0, userID='A1JB7HFWHRYHT7', productID='B0000223SI', userIDIndex=1078.0, productIDIndex=1465.0)

In [88]:
# Count the rows currently in the Spark DataFrame.
num_rows = data.count()
num_rows

                                                                                

77071

In [89]:
# Discard rows containing nulls first, then collapse exact duplicate rows.
rows_without_nulls = data.dropna()
data = rows_without_nulls.drop_duplicates()

In [90]:
# Count the rows again to see how many remain in the DataFrame.
num_rows = data.count()
num_rows

                                                                                

72131

In [91]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [103]:
# Hold out roughly 30% of the rows for evaluation. The fixed seed makes the
# random split repeatable instead of changing on every run.
# NOTE(review): exact repeatability also presumes the input rows are
# partitioned the same way between runs - confirm if this matters.
(train, test) = data.randomSplit([0.7, 0.3], seed=42)

In [104]:
# Configure the ALS matrix-factorization estimator on the indexed user/item
# columns. coldStartStrategy="drop" removes predictions for users or items that
# were not seen during training, so the evaluation metric is not NaN.
# The fixed seed makes the random factor initialisation repeatable.
als = ALS(maxIter=5, regParam=0.01, userCol="userIDIndex", itemCol="productIDIndex", ratingCol="rating",
          coldStartStrategy="drop", seed=42)

In [105]:
# Fit the ALS model on the training split.
model = als.fit(train)

                                                                                

In [106]:
# Score the held-out split; the model adds a "prediction" column, which the
# evaluator below reads.
predictions = model.transform(test)

In [107]:
# Compare the predicted ratings with the true ones on the held-out split and
# report the root-mean-square error.
# NOTE(review): the recorded RMSE (~5.7) is larger than any rating shown in the
# previews above (3.0-5.0), which suggests the model is badly off; revisiting
# regParam / rank / maxIter looks worthwhile - TODO confirm.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

[Stage 274:>  (0 + 8) / 8][Stage 289:> (0 + 0) / 10][Stage 290:> (0 + 0) / 10]0]

                                                                                

Root-mean-square error = 5.709008848948894


In [108]:
# Export the cleaned, indexed ratings as CSV with a header row. coalesce(1)
# reduces the data to one partition so a single part file is produced; Spark
# writes it inside a directory named 'forKafkatesting.csv'.
# mode('overwrite') lets this cell be re-run: the default save mode raises an
# error when the target path already exists.
data.coalesce(1).write.mode('overwrite').option('header', 'true').csv('forKafkatesting.csv')

                                                                                