# Import Libraries

In [None]:
# Install PySpark into the running kernel's environment (%pip targets the kernel,
# unlike !pip). Pinned to the version this notebook was originally executed with.
%pip install -q pyspark==3.3.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=d16ecf5670f572124ca89434d09cc3131a4c780244004a38ccdbd39d2bb36e0f
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row,SparkSession

# Initialize Spark


In [None]:
# Build the SparkSession used by every later cell: local master, app name "myApp",
# session time zone set to UTC.
# NOTE(review): "spark.sql.session.timeout" does not look like a documented Spark SQL
# option — confirm it has any effect before relying on it.
spark = (
    SparkSession.builder
    .master("local")
    .appName("myApp")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.session.timeout", "48h")
    .getOrCreate()
)

# Import Dataset

In [None]:
# Read the raw ratings file as lines of text; each line is split on "::" into
# four fields that are parsed as userId, movieId, rating and timestamp.
lines = spark.read.text("./sample_data/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

# 80/20 train/test split. The seed is fixed so that re-running the notebook
# produces the same split (and therefore comparable RMSE values) each time.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

# Build Recommendation model using ALS

In [None]:
# Candidate values explored by the grid search: number of ALS iterations
# and regularisation strength.
max_iters, reg_params = [5, 10, 20], [0.1, 0.5, 1.0]

# Maps (max_iter, reg_param) -> RMSE; filled in by the grid-search cell.
results = dict()

In [None]:
# Grid search: for every (maxIter, regParam) pair, fit ALS on the training data,
# predict on the test data and record the RMSE in `results`.
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
# NOTE(review): hyperparameters are selected on the same split that the RMSE is
# reported on, so the best RMSE is optimistic — consider a separate validation split.

# The evaluator does not depend on the hyperparameters, so it is built once.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

for max_iter in max_iters:
    for reg_param in reg_params:
        # Fixed seed so each configuration gives the same model on every run.
        als = ALS(maxIter=max_iter, regParam=reg_param, userCol="userId", itemCol="movieId", ratingCol="rating",
                  coldStartStrategy="drop", seed=42)
        model = als.fit(training)

        predictions = model.transform(test)
        rmse = evaluator.evaluate(predictions)

        results[(max_iter, reg_param)] = rmse
        print(f"Root-mean-square error for maxIter={max_iter}, regParam={reg_param} = {rmse}")

Root-mean-square error for maxIter=5, regParam=0.1 = 0.9688068171784219
Root-mean-square error for maxIter=5, regParam=0.5 = 1.1791751971038336
Root-mean-square error for maxIter=5, regParam=1.0 = 1.4626570255844364
Root-mean-square error for maxIter=10, regParam=0.1 = 0.958388386780552
Root-mean-square error for maxIter=10, regParam=0.5 = 1.1778582283934038
Root-mean-square error for maxIter=10, regParam=1.0 = 1.462666678476791
Root-mean-square error for maxIter=20, regParam=0.1 = 0.9610906911437301
Root-mean-square error for maxIter=20, regParam=0.5 = 1.1779658482486925
Root-mean-square error for maxIter=20, regParam=1.0 = 1.4626666930936816


Dengan referensi kode yang sudah ada, saya menambahkan perulangan untuk max_iters dan reg_params.

In [None]:

# Select the (maxIter, regParam) entry with the smallest recorded RMSE and report it.
best_params, best_rmse = min(results.items(), key=lambda entry: entry[1])
print(f"\nBest hyperparameters: maxIter={best_params[0]}, regParam={best_params[1]} with RMSE={best_rmse}")


Best hyperparameters: maxIter=10, regParam=0.1 with RMSE=0.958388386780552


Dari proses Build Recommendation model dengan ALS, didapatkan hasil akhir bahwa hyperparameter terbaik ada di maxIter=10, regParam=0.1 dengan RMSE=0.958388386780552

# Generate Movie Recommendation

In [None]:
# Refit ALS on the training split using the best (maxIter, regParam) pair found above.
# This rebinds `als` and `model`, which otherwise still refer to the last
# configuration tried in the grid-search loop.
# The seed is fixed so that the refit is reproducible across runs.
als = ALS(maxIter=best_params[0], regParam=best_params[1], userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

# Result

In [None]:
# For every user known to the model, compute the 10 highest-scoring movies,
# then print the first rows of the result.
userRecs = model.recommendForAllUsers(numItems=10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 3.1537251},...|
|    10|[{40, 2.8611674},...|
|     0|[{9, 2.789438}, {...|
|     1|[{62, 2.9791405},...|
|    21|[{53, 4.2900853},...|
|    11|[{32, 4.8312974},...|
|    12|[{46, 4.462131}, ...|
|    22|[{51, 4.4230843},...|
|     2|[{83, 4.917964}, ...|
|    13|[{93, 3.1314921},...|
|     3|[{51, 4.369176}, ...|
|    23|[{49, 4.7948055},...|
|     4|[{53, 3.7900708},...|
|    24|[{52, 4.4978967},...|
|    14|[{52, 4.247112}, ...|
|     5|[{55, 3.7527733},...|
|    15|[{46, 3.8292143},...|
|    25|[{47, 3.2942047},...|
|    26|[{51, 4.855834}, ...|
|     6|[{25, 3.946157}, ...|
+------+--------------------+
only showing top 20 rows



In [None]:
# For every movie known to the model, compute the 10 highest-scoring users,
# then print the first rows of the result.
movieRecs = model.recommendForAllItems(numUsers=10)
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 3.9930067},...|
|     40|[{10, 2.8611674},...|
|     10|[{12, 3.507947}, ...|
|     50|[{23, 3.4926114},...|
|     80|[{3, 3.406635}, {...|
|     70|[{21, 3.2781315},...|
|     60|[{22, 2.8751035},...|
|     90|[{17, 4.592823}, ...|
|     30|[{11, 4.6378994},...|
|      0|[{28, 2.5745838},...|
|     31|[{12, 3.1792202},...|
|     81|[{28, 4.092317}, ...|
|     91|[{25, 2.64118}, {...|
|      1|[{12, 3.2607908},...|
|     41|[{4, 3.2625012}, ...|
|     61|[{6, 2.2553048}, ...|
|     51|[{26, 4.855834}, ...|
|     21|[{26, 2.7172592},...|
|     11|[{2, 3.3703382}, ...|
|     71|[{25, 2.932577}, ...|
+-------+--------------------+
only showing top 20 rows



In [None]:
# Take three distinct user ids from the ratings and compute the top 10 movie
# recommendations for just those users.
# No ordering is applied before limit(3), so which three users are picked is
# whatever Spark happens to return first.
users = (
    ratings
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(dataset=users, numItems=10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{51, 4.855834}, ...|
|    19|[{90, 3.4796317},...|
|    29|[{90, 3.7731807},...|
+------+--------------------+



In [None]:

# Take three distinct movie ids from the ratings and compute the top 10 user
# recommendations for just those movies.
# No ordering is applied before limit(3), so which three movies are picked is
# whatever Spark happens to return first.
movies = (
    ratings
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)
movieSubSetRecs = model.recommendForItemSubset(dataset=movies, numUsers=10)
movieSubSetRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[{23, 4.2193503},...|
|     26|[{0, 1.9150383}, ...|
|     29|[{8, 4.482235}, {...|
+-------+--------------------+



# Summary

To analyze the MovieLens dataset, I took the following steps:

1. I loaded the dataset and preprocessed it to create both training and test sets.
2. Then, I experimented with various combinations of maxIter and regParam values, evaluating each model's performance on the test set and recording the corresponding RMSE in a dictionary.
3. Using the RMSE results, I selected the best hyperparameter combination that yielded the lowest RMSE and displayed the results.
4. With the best hyperparameters, I trained the model and generated recommendations for all users, all movies, as well as for subsets of users and movies.
5. Finally, I presented the top 10 recommendations for each user, movie, and specified subset in the output.