In [5]:
!pip install pyspark
!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=6257eed039922469b4db0eacebad78e4473ddc4181fa7fedba61f4ff9da4a23f
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [73]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [74]:
# Read the data from the txt file.
# getOrCreate() reuses the running SparkSession if one exists and otherwise starts one,
# so this cell no longer depends on a `spark` variable left over from earlier kernel state.
spark = SparkSession.builder.getOrCreate()
# NOTE(review): the path assumes Google Drive is already mounted at /content/gdrive - confirm.
# Each line of the file becomes a Row with a single string column named `value`.
lines = spark.read.text("/content/gdrive/My Drive/kuliah/BigData/dataset_movie.txt").rdd

In [76]:
def _to_rating(fields):
    """Build one rating Row from the "::"-separated fields of an input line.

    Only the first four fields are read: user id, movie id, rating, timestamp.
    """
    user, movie, score, stamp = fields[0], fields[1], fields[2], fields[3]
    return Row(userId=int(user), movieId=int(movie),
               rating=float(score), timestamp=int(stamp))


# Split every raw text line on the "::" delimiter, then convert the pieces to typed Rows.
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(_to_rating)

In [78]:
# Turn the RDD of Row objects into a DataFrame (columns come from the Row field names).
ratings = spark.createDataFrame(ratingsRDD)
# 80/20 train/test split. The fixed seed makes the split repeatable across runs on the
# same input, so the RMSE reported below can be compared between runs; without it every
# execution evaluates on a different test set.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [126]:
# Configure the ALS (alternating least squares) collaborative-filtering estimator.
# coldStartStrategy="drop" removes prediction rows that are NaN (users or movies not
# seen during training), so the RMSE computed on the test set is not NaN.
# seed is fixed so the random initialisation of the factors is repeatable between runs.
# NOTE(review): the notebook's closing remark names maxIter=10 / regParam=0.1 as the best
# setting, while this cell still uses maxIter=20 / regParam=1.0 - confirm which is intended.
als = ALS(maxIter=20, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)

In [127]:
# Fit the ALS estimator on the training split; `model` is the fitted model used by the
# later cells for test-set predictions and for generating recommendations.
model = als.fit(training)

In [128]:
# Evaluate the fitted model on the held-out test split using root-mean-square error
# between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.4818548378515297


In [84]:
# For every movie, the 10 users the model scores highest.
movieRecs = model.recommendForAllItems(numUsers=10)

# For every user, the 10 movies the model scores highest.
userRecs = model.recommendForAllUsers(numItems=10)



In [86]:
# Take three distinct user ids and three distinct movie ids from the ratings.
# distinct() is followed by limit(3) with no ordering, so which three ids are
# selected is not fixed by this code.
users = (
    ratings
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
movies = (
    ratings
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)

# Top 10 movie recommendations for the selected users.
userSubsetRecs = model.recommendForUserSubset(dataset=users, numItems=10)
# Top 10 user recommendations for the selected movies.
movieSubSetRecs = model.recommendForItemSubset(dataset=movies, numUsers=10)

In [92]:
    # Print the per-user recommendations; show() prints only the first 20 rows and
    # truncates long cell values, so each recommendation list appears cut off.
    userRecs.show() 

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{80, 5.867346}, ...|
|    10|[{85, 4.105188}, ...|
|     0|[{18, 5.0230374},...|
|     1|[{22, 4.317766}, ...|
|    21|[{53, 5.0754404},...|
|    11|[{7, 5.676603}, {...|
|    12|[{90, 6.4377403},...|
|    22|[{75, 4.9906764},...|
|     2|[{8, 5.1904807}, ...|
|    13|[{93, 3.8239582},...|
|     3|[{30, 6.4142666},...|
|    23|[{90, 5.6623626},...|
|     4|[{52, 4.6091857},...|
|    24|[{29, 5.5523977},...|
|    14|[{29, 4.9809847},...|
|     5|[{55, 4.8910055},...|
|    15|[{46, 4.99285}, {...|
|    25|[{71, 4.0238376},...|
|    26|[{38, 6.200528}, ...|
|     6|[{25, 5.138626}, ...|
+------+--------------------+
only showing top 20 rows



In [93]:
  # Print the per-movie user recommendations; show() prints only the first 20 rows and
  # truncates long cell values, so each recommendation list appears cut off.
  movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 4.7652183},...|
|     40|[{8, 4.9868965}, ...|
|     10|[{23, 3.9799986},...|
|     50|[{23, 4.2144923},...|
|     80|[{26, 5.957684}, ...|
|     70|[{8, 4.7956357}, ...|
|     60|[{24, 3.5635986},...|
|     90|[{12, 6.4377403},...|
|     30|[{3, 6.4142666}, ...|
|      0|[{12, 3.5649214},...|
|     31|[{12, 3.6212664},...|
|     81|[{28, 4.9016094},...|
|     91|[{23, 3.5883782},...|
|      1|[{15, 3.638225}, ...|
|     41|[{8, 4.1039505}, ...|
|     61|[{6, 2.436781}, {...|
|     51|[{3, 5.179548}, {...|
|     21|[{26, 2.967746}, ...|
|     11|[{18, 4.021506}, ...|
|     71|[{25, 4.0238376},...|
+-------+--------------------+
only showing top 20 rows



In [94]:
  # Print the movie recommendations for the three selected users (long cells are truncated).
  userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{38, 6.200528}, ...|
|    19|[{30, 5.002263}, ...|
|    29|[{90, 4.122185}, ...|
+------+--------------------+



In [95]:
  # Print the user recommendations for the three selected movies (long cells are truncated).
  movieSubSetRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[{26, 2.2675679},...|
|     26|[{27, 3.3084774},...|
|     29|[{24, 5.5523977},...|
+-------+--------------------+



In [None]:
    # Shut down the Spark session once the analysis is finished; cells that use `spark`
    # need a new session before they can run again.
    spark.stop()

According to the RMSE, the best model uses maxIter = 10 and regParam = 0.1.
