## Problem

a) Predict what ratings users might have rated for movies

b) Recommend user 10 top movies to watch.

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 62 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 50.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=3eaa4258b6c73a8fedd72f23df85f493a21626c23546aec870bb19be432569dd
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [8]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('recommender').getOrCreate()
sc = spark.sparkContext
sc

In [9]:
df = spark.read.csv('movielens_ratings.csv', inferSchema = True, header = True)
df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [10]:
df.show(10)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
+-------+------+------+
only showing top 10 rows



In [11]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [12]:
df.groupBy('rating').count().show()

+------+-----+
|rating|count|
+------+-----+
|   1.0|  941|
|   4.0|   99|
|   3.0|  179|
|   2.0|  207|
|   5.0|   75|
+------+-----+



In [24]:
df.groupBy('userId').count().show()

+------+-----+
|userId|count|
+------+-----+
|    28|   50|
|    26|   49|
|    27|   46|
|    12|   55|
|    22|   56|
|     1|   49|
|    13|   48|
|     6|   57|
|    16|   45|
|     3|   48|
|    20|   47|
|     5|   49|
|    19|   49|
|    15|   48|
|     9|   53|
|    17|   46|
|     4|   55|
|     8|   49|
|    23|   52|
|     7|   54|
+------+-----+
only showing top 20 rows



In [15]:
train_data, test_data  = df.randomSplit([0.8, 0.2], 20)

print("Records for training: " + str(train_data.count()))
print("Records for evaluation: " + str(test_data.count()))

Records for training: 1188
Records for evaluation: 313


In [19]:
from pyspark.ml.recommendation import ALS

In [20]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")

In [21]:
model = als.fit(train_data)

In [22]:
predictions = model.transform(test_data)

In [23]:
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    26| 0.70589715|
|     31|   1.0|     5|  3.7538288|
|     31|   1.0|    24|  1.4627191|
|     85|   3.0|    21|  2.5371575|
|     65|   1.0|    19|  1.0504794|
|     65|   5.0|    23|  0.4818818|
|     65|   1.0|    24| 0.79530156|
|     53|   3.0|    20|  2.1144269|
|     53|   5.0|     8|  3.1592326|
|     78|   1.0|    13|  0.6788278|
|     78|   1.0|    20|  0.4729411|
|     78|   1.0|    17| 0.91470456|
|     34|   1.0|    28| -2.0583692|
|     34|   1.0|    19|  0.6273526|
|     34|   1.0|    17| 0.09805626|
|     81|   3.0|    26|  3.4195998|
|     81|   1.0|    22|-0.84257275|
|     81|   1.0|     6|  1.7715576|
|     81|   1.0|    19|  0.4393794|
|     28|   1.0|    27|  3.3084912|
+-------+------+------+-----------+
only showing top 20 rows



In [26]:
from pyspark.ml.evaluation import RegressionEvaluator

In [27]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.6717573971385071


In [29]:
single_user = test_data.filter(test_data['userId']==10).select(['movieId','userId'])

# movies that user 10 saw in test data

In [30]:
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      2|    10|
|     10|    10|
|     13|    10|
|     16|    10|
|     17|    10|
|     28|    10|
|     35|    10|
|     41|    10|
|     43|    10|
|     56|    10|
|     66|    10|
|     67|    10|
|     89|    10|
+-------+------+



In [32]:
recommendations = model.transform(single_user)

In [33]:
recommendations.orderBy('prediction', ascending=False).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     89|    10|  3.3979173|
|     41|    10|  2.9436579|
|     43|    10|  2.5312998|
|     13|    10|  2.1199603|
|      2|    10|  1.9320009|
|     67|    10|  1.7676947|
|     56|    10|  1.1662536|
|     16|    10|  1.1338801|
|     35|    10|  1.1091353|
|     66|    10|  0.8162328|
|     17|    10| 0.38492316|
|     28|    10|-0.13626468|
|     10|    10| -0.6826315|
+-------+------+-----------+

