In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("movielens32m_colab_filtering").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/22 10:47:50 INFO SparkEnv: Registering MapOutputTracker
24/12/22 10:47:50 INFO SparkEnv: Registering BlockManagerMaster
24/12/22 10:47:50 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/12/22 10:47:51 INFO SparkEnv: Registering OutputCommitCoordinator


In [14]:
movies = spark.read.csv("gs://2425_fall_150200313/project/movies.csv", header=True, inferSchema=True)
ratings = spark.read.csv("gs://2425_fall_150200313/project/ratings.csv", header=True, inferSchema=True)

                                                                                

In [15]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [16]:
ratings = ratings.drop("timestamp")

In [17]:
ratings.show(10)

[Stage 17:>                                                         (0 + 1) / 1]

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     17|   4.0|
|     1|     25|   1.0|
|     1|     29|   2.0|
|     1|     30|   5.0|
|     1|     32|   5.0|
|     1|     34|   2.0|
|     1|     36|   1.0|
|     1|     80|   5.0|
|     1|    110|   3.0|
|     1|    111|   5.0|
+------+-------+------+
only showing top 10 rows



                                                                                

In [18]:
ratings.describe().show()



+-------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|
+-------+------------------+------------------+------------------+
|  count|          32000204|          32000204|          32000204|
|   mean|100278.50641102163|29318.610121829224|3.5403956487277393|
| stddev|  57949.0462332529| 50958.16087967011| 1.058986213945308|
|    min|                 1|                 1|               0.5|
|    max|            200948|            292757|               5.0|
+-------+------------------+------------------+------------------+



                                                                                

In [20]:
# group by user_id and count the number of ratings for each user
user_rating_counts = ratings.groupBy("userId").count()

# sort by the count of ratings in descending order
user_rating_counts.sort("count", ascending=False).show()



+------+-----+
|userId|count|
+------+-----+
|175325|33332|
| 17035| 9577|
| 55653| 9178|
|123465| 9044|
|171795| 9016|
| 10202| 7748|
|198515| 7594|
| 49305| 7488|
| 22744| 7372|
|  7858| 7322|
| 14674| 6407|
| 53192| 6265|
|133878| 6074|
| 57304| 6061|
|129705| 5812|
|139018| 5806|
| 43703| 5784|
| 68797| 5693|
|103925| 5655|
|119247| 5654|
+------+-----+
only showing top 20 rows



                                                                                

## Using ALS algorithm provided by Pyspark MLlib

In [22]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import functions as F
from pyspark.ml.evaluation import RegressionEvaluator

In [24]:
als = ALS(
    userCol="userId", 
    itemCol="movieId", 
    ratingCol="rating", 
    rank=10,          # Number of latent factors (adjust as needed)
    maxIter=10,       # Maximum number of iterations
    regParam=0.1,     # Regularization parameter (adjust for overfitting)
    coldStartStrategy="drop"  # Drop rows where we can't make predictions
)

In [25]:
train_data, test_data = ratings.randomSplit([0.7, 0.3], seed=123)

model = als.fit(train_data)

predictions = model.transform(test_data)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")



Root Mean Squared Error (RMSE): 0.8025341101498348


                                                                                

In [27]:
top10_recos = model.recommendForAllUsers(10)

In [28]:
top10_recos.show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{135735, 5.68260...|
|     3|[{185291, 5.72309...|
|     4|[{270306, 5.10728...|
|     6|[{225437, 8.47182...|
|     8|[{275847, 5.95986...|
|    11|[{206447, 5.35142...|
|    12|[{186087, 5.83219...|
|    31|[{275847, 6.25373...|
|    33|[{289897, 6.15281...|
|    34|[{199342, 5.16130...|
|    35|[{193817, 5.73754...|
|    36|[{199342, 5.32439...|
|    37|[{199342, 4.89357...|
|    44|[{199342, 5.98434...|
|    53|[{151989, 6.44114...|
|    60|[{231287, 5.44635...|
|    61|[{199342, 5.75335...|
|    64|[{275847, 5.37640...|
|    65|[{274047, 4.51002...|
|    66|[{199342, 6.38508...|
+------+--------------------+
only showing top 20 rows



                                                                                

In [31]:
single_user = top10_recos.take(1)

                                                                                

In [33]:
single_user[0]

Row(userId=1, recommendations=[Row(movieId=135735, rating=5.682607650756836), Row(movieId=227066, rating=5.321781158447266), Row(movieId=126060, rating=5.245600700378418), Row(movieId=289897, rating=5.225883960723877), Row(movieId=286911, rating=5.225883960723877), Row(movieId=165689, rating=5.212541103363037), Row(movieId=263389, rating=5.1936845779418945), Row(movieId=193817, rating=5.16832160949707), Row(movieId=165497, rating=5.151379585266113), Row(movieId=228511, rating=5.112616062164307)])