In [33]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark import SparkConf
from pyspark.ml.tuning import CrossValidator

In [34]:
spark = SparkSession.builder.appName("recommender").getOrCreate()

df = spark.read.csv("final_ratings.csv", inferSchema=True, header=True)

In [35]:
df.show()

+--------------+------+------+------+
|          name|  tvId|userId|rating|
+--------------+------+------+------+
|Rick and Morty| 60625|   106|   9.0|
|Rick and Morty| 60625|  1576|  10.0|
|Grey's Anatomy|  1416|     3|   1.0|
|Grey's Anatomy|  1416|  1576|   1.0|
|  The Simpsons|   456|  1561|  10.0|
|  The Simpsons|   456|  1576|   5.0|
|    Invincible| 95557|   649|  10.0|
|          Loki| 84958|    11|  10.0|
|          Loki| 84958|   923|   8.0|
|          Loki| 84958|   727|   9.0|
|          Loki| 84958|   709|   6.0|
|          Loki| 84958|   728|  10.0|
|    Doctor Who| 57243|    24|  10.0|
|    Doctor Who| 57243|  1002|   7.0|
|  Supernatural|  1622|    23|   7.0|
|    Doctor Who| 57243|    24|  10.0|
|    Doctor Who| 57243|  1002|   7.0|
|     Wednesday|119051|    28|   1.0|
|     Wednesday|119051|    29|   9.0|
|     Wednesday|119051|    30|   4.0|
+--------------+------+------+------+
only showing top 20 rows



In [36]:
df.describe().show()

+-------+-----------------+-----------------+------------------+------------------+
|summary|             name|             tvId|            userId|            rating|
+-------+-----------------+-----------------+------------------+------------------+
|  count|             1352|             1352|              1352|              1352|
|   mean|         1665.625|72794.43121301776|1244.5029585798816|6.8076923076923075|
| stddev|663.4052709855632|51067.32025267716| 436.6018466017765|2.9877618625731066|
|    min| 1000 Ways to Die|               45|                 3|               1.0|
|    max|          iZombie|           237311|              1597|              10.0|
+-------+-----------------+-----------------+------------------+------------------+



In [37]:
(train, test) = df.randomSplit([0.8,0.2], seed = 42)

In [38]:
als = ALS(userCol="userId", itemCol="tvId", ratingCol="rating", maxIter=5, regParam=0.01)

In [39]:
model = als.fit(train)

In [40]:
pred = model.transform(test)

In [41]:
pred.show()

+--------------------+------+------+------+----------+
|                name|  tvId|userId|rating|prediction|
+--------------------+------+------+------+----------+
|Adventures of Sup...| 12662|  1576|   6.0|       NaN|
|        Almost Human| 51019|  1576|  10.0|-0.7976903|
|Battlestar Galactica| 71365|  1576|  10.0|       NaN|
|        All American| 82428|   673|  10.0|    9.9989|
|                  24|  1973|  1243|   8.0|       NaN|
|A Discovery of Wi...| 77236|  1592|   8.0|       NaN|
|               9-1-1| 75219|   190|  10.0| 2.0167692|
|             Ballers| 62704|  1542|   9.0|       NaN|
|      13 Reasons Why| 66788|   590|   5.0|       NaN|
|Anne Rice's Mayfa...|207863|  1576|   2.0|       NaN|
|Alfred Hitchcock ...|  5273|  1253|   6.0|       NaN|
|Alfred Hitchcock ...|  5273|  1576|  10.0|       NaN|
|Alias Smith and J...|  3275|  1243|   8.0|       NaN|
|All Creatures Gre...|108255|  1594|  10.0|       NaN|
|            Batwoman| 89247|  1542|   7.0|       NaN|
|    A Tou

In [42]:
eval = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [43]:
rmse = eval.evaluate(pred)
print(f"RMSE: {rmse}")

RMSE: nan
