In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.appName("Recommendation ALS").getOrCreate()

# do something to prove it works
movies_df = spark.read.option("header", "true").csv("data/movies.csv", inferSchema=True)
links_df = spark.read.option("header", "true").csv("data/links.csv", inferSchema=True)
ratings_df = spark.read.option("header", "true").csv("data/ratings.csv", inferSchema=True)
tags_df = spark.read.option("header", "true").csv("data/tags.csv", inferSchema=True)

(training, test) = ratings_df.randomSplit([0.8, 0.2])
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)


# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
predictions.printSchema()
predictions.orderBy('prediction').show(10)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- prediction: float (nullable = false)

+------+-------+------+----------+-----------+
|userId|movieId|rating| timestamp| prediction|
+------+-------+------+----------+-----------+
|   416|    830|   1.5|1187495328|  -2.556579|
|    34|   8957|   2.0|1162050052| -1.9150332|
|   261|   3146|   2.5|1404881024| -1.7352031|
|   288|   5048|   3.0|1136400440|  -1.672625|
|   392|   3004|   1.0|1027524319| -1.6521981|
|   358|  56715|   4.0|1339546338| -1.3864722|
|   111|  48322|   5.0|1516153342| -1.0412714|
|   160|    374|   1.0| 971619754|  -0.974728|
|   266|   2253|   1.0| 945044723|-0.96504146|
|   149|   2034|   1.0| 902085188|-0.72532916|
+------+-------+------+----------+-----------+
only showing top 10 rows

Root-mean-square error = 1.0599080348902534


Select the random userid, for example userid=1. Filter all movies that has not been watched by this user

In [33]:
userId = 1
ratings_df.filter('userId' == userId)

TypeError: condition should be string or Column