<a href="https://colab.research.google.com/github/julioger/Assignment_Week09_5025201079/blob/main/Collaborative_Filtering_5025201079.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Assignment Week 9 / Big Data A

Julio Geraldi Soeiono/5025201079

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=f186a3f6ad27ca2fe914f8d54de53237af587d37f9da4105cb28794ddfef9118
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row, SparkSession

In [3]:
# Create a SparkSession
spark = SparkSession.builder \
        .master("local") \
        .appName("myApp") \
        .config("spark.sql.session.timeZone", "UTC") \
        .config("spark.sql.session.timeout", "48h") \
        .getOrCreate()

In [5]:
lines = spark.read.text("./sample_data/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

## Build Recomendation model using ALS

In [6]:
max_iters = [5, 10, 17]
reg_params = [0.1, 0.5, 0.9]

results = {}

In [7]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
for max_iter in max_iters:
    for reg_param in reg_params:
        als = ALS(maxIter=max_iter, regParam=reg_param, userCol="userId", itemCol="movieId", ratingCol="rating",
                  coldStartStrategy="drop")
        model = als.fit(training)

        predictions = model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)

        results[(max_iter, reg_param)] = rmse
        print(f"Root-mean-square error for maxIter={max_iter}, regParam={reg_param} = {rmse}")

Root-mean-square error for maxIter=5, regParam=0.1 = 1.0924181097051857
Root-mean-square error for maxIter=5, regParam=0.5 = 1.2719653344123234
Root-mean-square error for maxIter=5, regParam=0.9 = 1.4895659731214166
Root-mean-square error for maxIter=10, regParam=0.1 = 1.0522477212470325
Root-mean-square error for maxIter=10, regParam=0.5 = 1.2614666776902745
Root-mean-square error for maxIter=10, regParam=0.9 = 1.4895579527506728
Root-mean-square error for maxIter=17, regParam=0.1 = 1.0537366583592385
Root-mean-square error for maxIter=17, regParam=0.5 = 1.2614991730676248
Root-mean-square error for maxIter=17, regParam=0.9 = 1.489557982815122


In [8]:
best_params = min(results, key=results.get)
best_rmse = results[best_params]
print(f"\nBest hyperparameters: maxIter={best_params[0]}, regParam={best_params[1]} with RMSE={best_rmse}")


Best hyperparameters: maxIter=10, regParam=0.1 with RMSE=1.0522477212470325


In [9]:
als = ALS(maxIter=best_params[0], regParam=best_params[1], userCol="userId", itemCol="movieId", ratingCol="rating",
               coldStartStrategy="drop")
model = als.fit(training)

In [10]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 3.8394387},...|
|    10|[{92, 3.2256405},...|
|     0|[{92, 3.2177768},...|
|     1|[{62, 3.1430864},...|
|    21|[{52, 4.294122}, ...|
|    11|[{18, 4.79582}, {...|
|    12|[{46, 4.720094}, ...|
|    22|[{75, 4.679727}, ...|
|     2|[{8, 4.4959283}, ...|
|    13|[{93, 2.7825341},...|
|     3|[{30, 3.6952896},...|
|    23|[{55, 4.76673}, {...|
|     4|[{2, 3.4510968}, ...|
|    24|[{52, 4.512777}, ...|
|    14|[{52, 4.717275}, ...|
|     5|[{55, 3.7353773},...|
|    15|[{46, 3.9840033},...|
|    25|[{47, 2.9848151},...|
|    26|[{23, 4.4985633},...|
|     6|[{25, 3.705084}, ...|
+------+--------------------+
only showing top 20 rows



In [11]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{2, 1.1400391}, ...|
|     40|[{28, 3.544061}, ...|
|     10|[{23, 3.38582}, {...|
|     50|[{23, 4.025022}, ...|
|     80|[{3, 3.6170764}, ...|
|     70|[{8, 3.2217858}, ...|
|     60|[{3, 2.2034435}, ...|
|     90|[{23, 4.576513}, ...|
|     30|[{11, 4.734014}, ...|
|      0|[{28, 2.7173338},...|
|     31|[{12, 2.9552052},...|
|     81|[{28, 3.836219}, ...|
|     91|[{12, 3.0490267},...|
|      1|[{12, 2.1476226},...|
|     41|[{4, 3.0755599}, ...|
|     61|[{6, 2.0846412}, ...|
|     51|[{22, 3.950915}, ...|
|     21|[{26, 2.127709}, ...|
|     11|[{16, 1.4927329},...|
|     71|[{25, 2.8108008},...|
+-------+--------------------+
only showing top 20 rows



In [12]:
# Generate top 10 movie recommendations for a specific set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{23, 4.4985633},...|
|    19|[{94, 3.5228574},...|
|    29|[{46, 4.2372036},...|
+------+--------------------+



In [13]:
# Generate top 10 user recommendations for a specific set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
movieSubSetRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[{23, 3.992788}, ...|
|     26|[{12, 2.301162}, ...|
|     29|[{8, 4.663761}, {...|
+-------+--------------------+



The best model is the best hyperparameter combination based on the lowest RMSE