In [21]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col,expr
spark = SparkSession.builder \
    .appName("Load CSV Example") \
    .getOrCreate()
df = spark.read.json('/home/lplab/Desktop/janav_220962049/movies 1.json')
df.show(3)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This movie needed...|1164844800|A1I7QGUDP043DG|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
only showing top 3 rows



In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
user_indexer = StringIndexer(inputCol="user_id", outputCol="userIndex",handleInvalid="keep")
df = user_indexer.fit(df).transform(df)


product_indexer = StringIndexer(inputCol="product_id", outputCol="productIndex", handleInvalid="keep")
df = product_indexer.fit(df).transform(df)
df.show(3)
df = df.select(
    col("userIndex").alias("userId"),
    col("productIndex").alias("itemId"),
    col("score").alias("rating")
)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+---------+------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|userIndex|productIndex|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+---------+------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|     32.0|       731.0|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|      3.0|       731.0|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This movie needed...|1164844800|A1I7QGUDP043DG|    312.0|       731.0|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+---------+------------+

In [24]:
(training_data, test_data) = df.randomSplit([0.6, 0.4])
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="userId",
    itemCol="itemId",
    ratingCol="rating",
    coldStartStrategy="drop"
)
model = als.fit(training_data)
predictions = model.transform(test_data)
predictions.show()

+------+------+------+------------+
|userId|itemId|rating|  prediction|
+------+------+------+------------+
|1238.0|  63.0|   4.0| -0.96577525|
|2366.0|  21.0|   4.0|   3.9407632|
|3997.0|  21.0|   2.0|   1.9703816|
| 540.0|   7.0|   5.0|   0.4127052|
|4161.0|  63.0|   5.0|-0.046322126|
|1339.0| 680.0|   4.0|  -0.6463967|
|2393.0|  21.0|   5.0|    4.925954|
| 451.0|   6.0|   5.0|   0.9437881|
|2525.0|  37.0|   5.0|  0.36404803|
|2833.0|  21.0|   1.0|   0.9851908|
|  53.0|  37.0|   1.0|   3.9476726|
|1005.0|   7.0|   1.0|  0.99807584|
|1005.0|   7.0|   1.0|  0.99807584|
|1005.0|   7.0|   1.0|  0.99807584|
|1133.0|   7.0|   5.0|  0.15591462|
|3577.0| 303.0|   3.0|   3.0138671|
|1212.0|   6.0|   4.0|   1.4729717|
|4182.0|  85.0|   3.0|  -1.7397054|
| 108.0|  37.0|   4.0|   2.4454935|
|4452.0|  85.0|   4.0|   1.2296405|
+------+------+------+------------+
only showing top 20 rows



In [26]:
test_df = df  # This is a placeholder; replace with actual test data if available

# Predict on test data
predictions = model.transform(test_df)

# Initialize evaluator
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 1.8611114155543285
