<a href="https://colab.research.google.com/github/icarocarmona/pos-tech-dtat/blob/main/f3_big_data/ALS_RecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configurando bibliotecas e dependencias

In [None]:
!pip install pyspark

In [None]:
!pip install findspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [5]:
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [9]:
lines = spark.read.text("/content/sample_movielens_ratings.txt").rdd

In [10]:
parts = lines.map(lambda row: row.value.split("::"))

In [14]:
ratingRDD = parts.map(lambda p: Row(
        userId=int(p[0]),
        movieId=int(p[1]),
        rating=float(p[2]),
        timestamp=int(p[3])
))

In [15]:
rating = spark.createDataFrame(ratingRDD)

In [16]:
rating.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|   3.0|1424380312|
|     0|      3|   1.0|1424380312|
|     0|      5|   2.0|1424380312|
|     0|      9|   4.0|1424380312|
|     0|     11|   1.0|1424380312|
|     0|     12|   2.0|1424380312|
|     0|     15|   1.0|1424380312|
|     0|     17|   1.0|1424380312|
|     0|     19|   1.0|1424380312|
|     0|     21|   1.0|1424380312|
|     0|     23|   1.0|1424380312|
|     0|     26|   3.0|1424380312|
|     0|     27|   1.0|1424380312|
|     0|     28|   1.0|1424380312|
|     0|     29|   1.0|1424380312|
|     0|     30|   1.0|1424380312|
|     0|     31|   1.0|1424380312|
|     0|     34|   1.0|1424380312|
|     0|     37|   1.0|1424380312|
|     0|     41|   2.0|1424380312|
+------+-------+------+----------+
only showing top 20 rows



In [19]:
(training, test) = rating.randomSplit([0.8, 0.2])

In [20]:
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop"
)

In [21]:
model = als.fit(training)

In [22]:
predictions = model.transform(test)

In [23]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

rmse = evaluator.evaluate(predictions)

print("Erro médio quadrático = " + str(rmse))

Erro médio quadrático = 1.7024024462585714


In [24]:
userRec = model.recommendForAllUsers(10)

In [25]:
userRec.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 4.633335}, ...|
|    10|[{46, 4.3338404},...|
|     0|[{90, 5.8650727},...|
|     1|[{29, 4.867915}, ...|
|    21|[{29, 5.1724653},...|
|    11|[{32, 5.757972}, ...|
|    12|[{46, 5.9621487},...|
|    22|[{4, 6.0828013}, ...|
|     2|[{8, 5.035456}, {...|
|    13|[{52, 3.4599023},...|
|     3|[{75, 5.966167}, ...|
|    23|[{27, 5.1779575},...|
|     4|[{53, 4.4732}, {5...|
|    24|[{52, 5.023619}, ...|
|    14|[{52, 5.4232106},...|
|     5|[{53, 5.9689717},...|
|    15|[{46, 4.7341228},...|
|    25|[{28, 4.375128}, ...|
|    26|[{32, 6.2474246},...|
|     6|[{25, 4.765902}, ...|
+------+--------------------+
only showing top 20 rows



In [26]:
movieRecs = model.recommendForAllItems(10)

In [27]:
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{5, 2.918556}, {...|
|     40|[{2, 3.8685434}, ...|
|     10|[{0, 2.7656345}, ...|
|     50|[{11, 4.548509}, ...|
|     80|[{3, 3.6778986}, ...|
|     70|[{4, 3.845504}, {...|
|     60|[{22, 3.3446133},...|
|     90|[{17, 5.90263}, {...|
|     30|[{26, 5.83772}, {...|
|      0|[{28, 2.9130337},...|
|     31|[{12, 3.8364975},...|
|     81|[{23, 4.2194037},...|
|     91|[{23, 4.0537505},...|
|      1|[{12, 4.3187284},...|
|     41|[{5, 4.6398935}, ...|
|     61|[{16, 3.2701192},...|
|     51|[{3, 5.086837}, {...|
|     21|[{22, 3.0220068},...|
|     11|[{18, 3.9367208},...|
|     71|[{25, 3.7561786},...|
+-------+--------------------+
only showing top 20 rows



In [28]:
users = rating.select(als.getUserCol()).distinct()

In [30]:
users.show()

+------+
|userId|
+------+
|    26|
|    29|
|    19|
|     0|
|    22|
|     7|
|    25|
|     6|
|     9|
|    27|
|    17|
|    28|
|     5|
|     1|
|    10|
|     3|
|    12|
|     8|
|    11|
|     2|
+------+
only showing top 20 rows



In [31]:
UserRecsOnlyItemId = userRec.select(userRec['userId'], userRec['recommendations']['movieId'])

In [33]:
UserRecsOnlyItemId.show(10, False)

+------+----------------------------------------+
|userId|recommendations.movieId                 |
+------+----------------------------------------+
|20    |[22, 75, 77, 98, 94, 51, 29, 90, 53, 88]|
|10    |[46, 23, 42, 90, 89, 49, 4, 55, 95, 26] |
|0     |[90, 92, 94, 58, 9, 63, 54, 68, 70, 2]  |
|1     |[29, 75, 53, 22, 62, 51, 69, 98, 24, 28]|
|21    |[29, 53, 52, 62, 87, 63, 74, 70, 2, 72] |
|11    |[32, 30, 65, 44, 27, 69, 18, 23, 50, 7] |
|12    |[46, 64, 17, 1, 16, 94, 31, 26, 42, 95] |
|22    |[4, 22, 75, 30, 51, 88, 77, 76, 52, 98] |
|2     |[8, 93, 39, 83, 37, 92, 19, 34, 40, 2]  |
|13    |[52, 87, 74, 32, 29, 53, 2, 70, 18, 83] |
+------+----------------------------------------+
only showing top 10 rows

