## Ao longo da execução, documente o código com comentários referentes ao seu entendimento. Utilize também a estrutura de Markdown do Jupyter para colocar informações textuais relevantes.

In [1]:
from __future__ import print_function

import sys
# Python 3 has no separate `long` type, so alias it to `int` for the parsing
# code that still calls long(). Compare the numeric version tuple rather than
# the version string: string comparison is lexicographic and would misorder a
# major version such as "10" against "3".
if sys.version_info[0] >= 3:
    long = int

from pyspark.sql import SparkSession

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [2]:
# Create (or reuse) the Spark session used throughout the notebook.
# The MongoDB connector package is requested through spark.jars.packages and
# the read/write connection URIs are set here so later cells can call
# .format("mongodb") without repeating them.
# NOTE(review): the URI path looks like <database>.<collection>
# (puc.recomendacoes) -- confirm against the connector documentation.
spark = (
    SparkSession.builder
    .appName("ALSExample")
    .config("spark.mongodb.read.connection.uri", "mongodb://172.19.0.2:27017/puc.recomendacoes")
    .config("spark.mongodb.write.connection.uri", "mongodb://172.19.0.2:27017/puc.recomendacoes")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0")
    .getOrCreate()
)

In [3]:
# Read the raw ratings file as text; each line is split on "::" into four
# fields: userId, movieId, rating, timestamp.
lines = spark.read.text("sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
# Convert every field to its numeric type and wrap it in a Row so that Spark
# can infer the DataFrame schema from the field names.
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=long(p[3])))
# Build the DataFrame directly from the RDD. Calling collect() first would
# pull the whole dataset into driver memory only to distribute it again.
ratings = spark.createDataFrame(ratingsRDD)

In [4]:
# Preview the parsed ratings (show() prints the first 20 rows by default).
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|   3.0|1424380312|
|     0|      3|   1.0|1424380312|
|     0|      5|   2.0|1424380312|
|     0|      9|   4.0|1424380312|
|     0|     11|   1.0|1424380312|
|     0|     12|   2.0|1424380312|
|     0|     15|   1.0|1424380312|
|     0|     17|   1.0|1424380312|
|     0|     19|   1.0|1424380312|
|     0|     21|   1.0|1424380312|
|     0|     23|   1.0|1424380312|
|     0|     26|   3.0|1424380312|
|     0|     27|   1.0|1424380312|
|     0|     28|   1.0|1424380312|
|     0|     29|   1.0|1424380312|
|     0|     30|   1.0|1424380312|
|     0|     31|   1.0|1424380312|
|     0|     34|   1.0|1424380312|
|     0|     37|   1.0|1424380312|
|     0|     41|   2.0|1424380312|
+------+-------+------+----------+
only showing top 20 rows



In [5]:
# Hold out roughly 20% of the ratings for evaluation and train on the rest.
# A fixed seed keeps the random split stable across runs, so that metrics
# computed from it can be compared between executions.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [6]:
# Configure the ALS (alternating least squares) recommender on the training
# ratings.
# - coldStartStrategy="drop" is set so that rows whose prediction would be NaN
#   do not make the evaluation metric NaN.
# - seed fixes the random initialisation of the factor matrices so repeated
#   fits on the same data give the same model.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [7]:
# Evaluate the fitted model on the held-out test set: compare the observed
# "rating" column with the model's "prediction" column using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.9321096909368045


In [8]:
# Ask the model for the 10 highest-scoring movies for every user.
userRecs = model.recommendForAllUsers(10)

In [9]:
# Print 10 users with their full, untruncated recommendation lists.
userRecs.show(n=10, truncate=False)

+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                           |
+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|20    |[{51, 4.7503405}, {38, 3.8885486}, {94, 3.8497345}, {75, 3.8363097}, {77, 3.6053252}, {98, 3.1625428}, {90, 3.0880547}, {63, 2.9872339}, {88, 2.9836533}, {31, 2.95345}]  |
|10    |[{2, 3.9842355}, {92, 3.8487735}, {47, 3.5541136}, {34, 3.3940427}, {25, 3.355951}, {32, 3.0925984}, {71, 2.927309}, {0, 2.8021085}, {89, 2.8007543}, {42, 2.731646}]     |
|0     |[{92, 3.810485}, {9, 3.4796143}, {32, 3.0601728}, {62, 2.8598676}, {49, 2.5119925}, {2, 2.46

In [10]:
# Ask the model for the 10 highest-scoring users for every movie.
movieRecs = model.recommendForAllItems(10)

In [11]:
# Preview the per-movie recommendations (default show(): 20 rows, long cells truncated).
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 4.5293865},...|
|     40|[{2, 3.7585113}, ...|
|     10|[{23, 3.7467911},...|
|     50|[{23, 4.0729065},...|
|     80|[{23, 4.1024942},...|
|     70|[{21, 3.5673985},...|
|     60|[{21, 3.021291}, ...|
|     90|[{12, 6.128367}, ...|
|     30|[{22, 4.923941}, ...|
|      0|[{28, 2.899958}, ...|
|     31|[{12, 3.4700475},...|
|     81|[{28, 4.7540236},...|
|     91|[{28, 3.7061825},...|
|      1|[{15, 3.8608859},...|
|     41|[{24, 5.153804}, ...|
|     61|[{25, 3.8745818},...|
|     51|[{22, 5.4619584},...|
|     21|[{22, 3.3067005},...|
|     11|[{18, 3.976976}, ...|
|     71|[{25, 3.8979485},...|
+-------+--------------------+
only showing top 20 rows



In [12]:
# Distinct user ids found in the ratings data. The column name is read from
# the ALS estimator (configured above as "userId") instead of being repeated.
users = ratings.select(als.getUserCol()).distinct()

In [13]:
# Preview the distinct user ids (first 20 rows).
users.show()

+------+
|userId|
+------+
|     0|
|     1|
|     3|
|     2|
|     7|
|     6|
|     5|
|     4|
|     9|
|    10|
|     8|
|    11|
|    12|
|    13|
|    14|
|    17|
|    18|
|    15|
|    16|
|    19|
+------+
only showing top 20 rows



In [14]:
# Keep only the recommended movie ids per user: extracting the "movieId"
# field from the array of recommendation structs yields an array of ids.
userRecsOnlyItemId = userRecs.select(
    userRecs.userId,
    userRecs.recommendations.getField("movieId"),
)

In [15]:
# Print 10 users with their untruncated lists of recommended movie ids.
userRecsOnlyItemId.show(n=10, truncate=False)

+------+----------------------------------------+
|userId|recommendations.movieId                 |
+------+----------------------------------------+
|20    |[51, 38, 94, 75, 77, 98, 90, 63, 88, 31]|
|10    |[2, 92, 47, 34, 25, 32, 71, 0, 89, 42]  |
|0     |[92, 9, 32, 62, 49, 2, 22, 26, 89, 28]  |
|1     |[62, 32, 51, 22, 30, 85, 9, 47, 28, 77] |
|21    |[93, 29, 53, 74, 70, 87, 96, 41, 58, 59]|
|11    |[2, 52, 32, 79, 18, 48, 30, 58, 92, 13] |
|12    |[90, 25, 27, 85, 64, 35, 55, 46, 68, 20]|
|22    |[51, 30, 74, 75, 88, 23, 69, 22, 68, 32]|
|2     |[83, 8, 37, 89, 92, 34, 19, 40, 81, 55] |
|13    |[93, 29, 58, 53, 74, 89, 18, 81, 88, 41]|
+------+----------------------------------------+
only showing top 10 rows



In [16]:
# Persist the user recommendations through the MongoDB connector: one row per
# user with two parallel arrays, the recommended movie ids and their predicted
# ratings (cast to array<double>).
# NOTE(review): mode("append") adds to the target on every execution, so
# re-running this cell presumably stores the same users again -- confirm that
# this is intended.
(
    userRecs
    .select(
        userRecs["userId"],
        userRecs["recommendations"]["movieId"].alias("movieId"),
        userRecs["recommendations"]["rating"].cast("array<double>").alias("rating"),
    )
    .write
    .format("mongodb")
    .mode("append")
    .save()
)

## Agora faça 50 recomendações para todos os usuários

## Recomende 50 usuários para os itens

## Como poderíamos armazenar as recomendações no MongoDB?

## Como podemos fazer isso em Python?

## Podemos utilizar outros datasets de teste, em específico do próprio MovieLens? Pesquise sobre esses datasets e sua estrutura básica.

## Armazene os dados no MongoDB e crie consultas para obter as recomendações para os usuários de ID 6 e 20.