In [6]:
from pyspark.sql import SparkSession

In [None]:
# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Recommendation System")
    .getOrCreate()
)

### 1: Load data and split train/test.

The MovieLens 1M dataset can be downloaded from [GroupLens](https://grouplens.org/datasets/movielens/1m/).

In [8]:
# Read the raw ratings file as plain text: one row per line, in a single `value` column.
lines = spark.read.format("text").load("data/ml-1m/ratings.dat")
lines.show(n=5)

+--------------------+
|               value|
+--------------------+
|1::1193::5::97830...|
|1::661::3::978302109|
|1::914::3::978301968|
|1::3408::4::97830...|
|1::2355::5::97882...|
+--------------------+
only showing top 5 rows


In [9]:
from pyspark.sql import functions as F

# Split every raw line on ":". The file separates fields with "::", so the
# resulting array has an empty string between consecutive fields and the real
# values end up at the even positions (0, 2, 4, 6).
parts = lines.withColumn("parts", F.split(F.col("value"), ":"))
parts.show(n=5, truncate=False)

+---------------------+-----------------------------+
|value                |parts                        |
+---------------------+-----------------------------+
|1::1193::5::978300760|[1, , 1193, , 5, , 978300760]|
|1::661::3::978302109 |[1, , 661, , 3, , 978302109] |
|1::914::3::978301968 |[1, , 914, , 3, , 978301968] |
|1::3408::4::978300275|[1, , 3408, , 4, , 978300275]|
|1::2355::5::978824291|[1, , 2355, , 5, , 978824291]|
+---------------------+-----------------------------+
only showing top 5 rows


In [10]:
# Build the typed ratings table. Only the even positions of `parts` hold data;
# the odd positions are the empty strings left by splitting "::" on ":".
dtRatings = parts.select(
    F.col("parts").getItem(0).cast("int").alias("userId"),
    F.col("parts").getItem(2).cast("int").alias("movieId"),
    F.col("parts").getItem(4).cast("double").alias("rating"),
    F.col("parts").getItem(6).cast("long").alias("timestamp"),
)
dtRatings.show(n=5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|   1193|   5.0|978300760|
|     1|    661|   3.0|978302109|
|     1|    914|   3.0|978301968|
|     1|   3408|   4.0|978300275|
|     1|   2355|   5.0|978824291|
+------+-------+------+---------+
only showing top 5 rows


In [11]:
# Report the dataset shape as (rows, columns).
n_rows = dtRatings.count()
n_cols = len(dtRatings.columns)
print(f"Data size: ({n_rows}, {n_cols})")

Data size: (1000209, 4)


In [12]:
# Hold out roughly 20% of the ratings for evaluation. randomSplit assigns rows
# at random, so a fixed seed is passed to keep the split (and every metric
# computed from it) stable across runs on the same input data.
train, test = dtRatings.randomSplit([0.8, 0.2], seed=42)

In [13]:
# Report the shape of each split as (rows, columns).
train_shape = (train.count(), len(train.columns))
test_shape = (test.count(), len(test.columns))
print(f"Training set: ({train_shape[0]}, {train_shape[1]})")
print(f"Test set: ({test_shape[0]}, {test_shape[1]})")

Training set: (799732, 4)
Test set: (200477, 4)


### 2: Build and evaluate ALS model on pyspark

In [14]:
from pyspark.ml.recommendation import ALS
from datetime import datetime

start_time = datetime.now()
# coldStartStrategy="drop": drop rows that could not be predicted (users/movies
#   unseen during training). With the default strategy ("nan") those rows keep a
#   NaN prediction, and any NaN in the predictions makes the RMSE NaN.
# regParam: regularization parameter:
# - Small (~0.01): the model fits the training data closely and is prone to overfitting
# - Medium (0.05-0.1): usually a good bias-variance trade-off
# - Large (> 1.0): the model becomes overly smoothed -> underfitting
# seed: fixes the random initialization so that training is reproducible.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(train)
end_time = datetime.now()

print(f'Execute time {end_time - start_time}')


Execute time 0:00:09.516900


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out ratings with the fitted model and preview the result.
test_predictions = model.transform(test)
test_predictions.show(n=5)

+------+-------+------+---------+----------+
|userId|movieId|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|     1|    531|   4.0|978302149|  4.430085|
|     1|    594|   4.0|978302268| 4.1480737|
|     1|    608|   4.0|978301398|  4.163809|
|     1|   1197|   3.0|978302268|  4.033935|
|     1|   1246|   4.0|978302091| 4.8618646|
+------+-------+------+---------+----------+
only showing top 5 rows


Check whether the predictions contain NaN values, which would make the RMSE NaN.

In [None]:
# Count the test rows whose prediction is NaN.
nan_predictions = test_predictions.where(F.isnan(F.col("prediction")))
print(nan_predictions.count())

36


Evaluate model

In [67]:
# Compare the predicted ratings with the true ratings on the test set using RMSE.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(test_predictions)
# The bare `rmse` expression that used to sit here was removed: it was not the
# last expression of the cell, so it displayed nothing.
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 0.896118982168018


### 3: Recommend products to users and recommend users to products
Get 10 recommended movies for each user

In [77]:
# Top-10 movie recommendations for every user known to the model.
userRecs = model.recommendForAllUsers(numItems=10)

Get 10 recommended users for each movie

In [None]:
# Top-10 user recommendations for every movie known to the model.
movieRecs = model.recommendForAllItems(numUsers=10)

In [None]:
# Get 10 recommended movies for a subset of 5 users.
# limit(5) is used instead of take(5): take() returns a list of Rows,
# not a DataFrame like limit() does.
users = (
    dtRatings
    .select(als.getUserCol())
    .distinct()
    .limit(5)
)
userSubRecs = model.recommendForUserSubset(users, numItems=10)

# Get 10 recommended users for a subset of 5 movies.
movies = (
    dtRatings
    .select(als.getItemCol())
    .distinct()
    .limit(5)
)
movieSubRecs = model.recommendForItemSubset(movies, numUsers=10)

List the 10 recommended users for each of the 5 movies

In [92]:
# Pull the `recommendations` column to the driver and show it as a plain Python list.
movieSubRecs.select('recommendations').toPandas()['recommendations'].tolist()

[[Row(userId=5072, rating=5.385042190551758),
  Row(userId=1664, rating=5.365973949432373),
  Row(userId=3440, rating=5.326171875),
  Row(userId=3177, rating=5.30164098739624),
  Row(userId=283, rating=5.297595977783203),
  Row(userId=3772, rating=5.2307000160217285),
  Row(userId=4236, rating=5.2023234367370605),
  Row(userId=5862, rating=5.160823822021484),
  Row(userId=1080, rating=5.138598918914795),
  Row(userId=4216, rating=5.123500823974609)],
 [Row(userId=2431, rating=6.533290386199951),
  Row(userId=3009, rating=6.424563407897949),
  Row(userId=878, rating=5.717471122741699),
  Row(userId=4880, rating=5.6871843338012695),
  Row(userId=3480, rating=5.599314212799072),
  Row(userId=41, rating=5.56636905670166),
  Row(userId=6031, rating=5.560342311859131),
  Row(userId=3787, rating=5.529158115386963),
  Row(userId=2274, rating=5.528017997741699),
  Row(userId=1751, rating=5.526898384094238)],
 [Row(userId=3009, rating=7.146150588989258),
  Row(userId=1535, rating=6.7440557479858