In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.appName("Recommendation ALS").getOrCreate()

# One seed for every stochastic step (train/test split, ALS initialisation) so
# that re-running the notebook on the same data gives comparable results.
SEED = 42

# Load the CSV files; each has a header row and Spark infers the column types.
movies_df = spark.read.option("header", "true").csv("data/movies.csv", inferSchema=True)
links_df = spark.read.option("header", "true").csv("data/links.csv", inferSchema=True)
# Attach the external ids from links.csv to every movie (inner join on movieId).
movies_df = movies_df.join(links_df, on=["movieId"])
ratings_df = spark.read.option("header", "true").csv("data/ratings.csv", inferSchema=True)
tags_df = spark.read.option("header", "true").csv("data/tags.csv", inferSchema=True)

# Quick sanity check that the data was loaded and joined.
movies_df.show()

# 80/20 train/test split; seeded so the split does not change between runs.
(training, test) = ratings_df.randomSplit([0.8, 0.2], seed=SEED)
# Build the recommendation model using ALS on the training data.
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=SEED,
)
model = als.fit(training)


# Evaluate the model by computing the RMSE on the test data.
predictions = model.transform(test)
predictions.printSchema()
# Show the 10 rows with the lowest predicted rating.
predictions.orderBy("prediction").show(10)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

+-------+--------------------+--------------------+------+------+
|movieId|               title|              genres|imdbId|tmdbId|
+-------+--------------------+--------------------+------+------+
|      1|    Toy Story (1995)|Adventure|Animati...|114709|   862|
|      2|      Jumanji (1995)|Adventure|Childre...|113497|  8844|
|      3|Grumpier Old Men ...|      Comedy|Romance|113228| 15602|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|114885| 31357|
|      5|Father of the Bri...|              Comedy|113041| 11862|
|      6|         Heat (1995)|Action|Crime|Thri...|113277|   949|
|      7|      Sabrina (1995)|      Comedy|Romance|114319| 11860|
|      8| Tom and Huck (1995)|  Adventure|Children|112302| 45325|
|      9| Sudden Death (1995)|              Action|114576|  9091|
|     10|    GoldenEye (1995)|Action|Adventure|...|113189|   710|
|     11|American Presiden...|Comedy|Drama|Romance|112346|  9087|
|     12|Dracula: Dead and...|       Comedy|Horror|112896| 12110|
|     13| 

Select a user id, for example userId = 1, and find movies to recommend that have not been watched by this user.

In [3]:
# User to generate recommendations for.
userId = 1
# DataFrame with the single distinct userId (empty if the user has no ratings).
users = ratings_df.filter(ratings_df["userId"] == userId).select("userId").distinct()
# Ask the model for the top 10 movies for this user.
# NOTE(review): nothing in this cell excludes movies the user has already
# rated — confirm whether recommendForUserSubset does that, or filter explicitly.
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show()

# The result may contain no rows (e.g. the user has no ratings, or presumably
# when the model never saw this user during training), so guard the indexing
# instead of failing with an IndexError.
recommendation_rows = userSubsetRecs.select("recommendations").collect()
recommendations = recommendation_rows[0][0] if recommendation_rows else []

# Each recommendation is a (movieId, rating) pair; keep only the movie id.
movieIds = [recommendation[0] for recommendation in recommendations]

print(movieIds)
# Look up the recommended movies' details (rows are not in ranking order).
movies_df.filter(movies_df["movieId"].isin(movieIds)).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[[1237, 6.137741]...|
+------+--------------------+

[1237, 522, 1104, 3451, 1262, 6283, 7587, 5650, 3910, 176371]
+-------+--------------------+--------------------+-------+------+
|movieId|               title|              genres| imdbId|tmdbId|
+-------+--------------------+--------------------+-------+------+
|    522|Romper Stomper (1...|        Action|Drama| 105275| 10412|
|   1104|Streetcar Named D...|               Drama|  44081|   702|
|   1237|Seventh Seal, The...|               Drama|  50976|   490|
|   1262|Great Escape, The...|Action|Adventure|...|  57115|  5925|
|   3451|Guess Who's Comin...|               Drama|  61735|  1879|
|   3910|Dancer in the Dar...|       Drama|Musical| 168629|    16|
|   5650| Strange Brew (1983)|              Comedy|  86373| 12921|
|   6283|Cowboy Bebop: The...|Action|Animation|...| 275277| 11299|
|   7587|Samouraï, Le (God...|Crime|Drama|Thriller