In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.appName("Recommendation ALS").getOrCreate()

# do something to prove it works
movies_df = spark.read.option("header", "true").csv("data/movies.csv", inferSchema=True)
links_df = spark.read.option("header", "true").csv("data/links.csv", inferSchema=True)
movies_df = movies_df.join(links_df, on = ['movieId'])
ratings_df = spark.read.option("header", "true").csv("data/ratings.csv", inferSchema=True)
tags_df = spark.read.option("header", "true").csv("data/tags.csv", inferSchema=True)

movies_df.show()

(training, test) = ratings_df.randomSplit([0.8, 0.2])
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)


# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
predictions.printSchema()
predictions.orderBy('prediction').show(10)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

+-------+--------------------+--------------------+------+------+
|movieId|               title|              genres|imdbId|tmdbId|
+-------+--------------------+--------------------+------+------+
|      1|    Toy Story (1995)|Adventure|Animati...|114709|   862|
|      2|      Jumanji (1995)|Adventure|Childre...|113497|  8844|
|      3|Grumpier Old Men ...|      Comedy|Romance|113228| 15602|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|114885| 31357|
|      5|Father of the Bri...|              Comedy|113041| 11862|
|      6|         Heat (1995)|Action|Crime|Thri...|113277|   949|
|      7|      Sabrina (1995)|      Comedy|Romance|114319| 11860|
|      8| Tom and Huck (1995)|  Adventure|Children|112302| 45325|
|      9| Sudden Death (1995)|              Action|114576|  9091|
|     10|    GoldenEye (1995)|Action|Adventure|...|113189|   710|
|     11|American Presiden...|Comedy|Drama|Romance|112346|  9087|
|     12|Dracula: Dead and...|       Comedy|Horror|112896| 12110|
|     13| 

Select the random userid, for example userid=1. Filter all movies that has not been watched by this user

In [5]:
from pyspark.sql import SQLContext

userId = 496
watched_movies = ratings_df.filter(ratings_df['userId'] == userId).select("movieId")

print("Watch movies {}".format(watched_movies.count()))
print("Total movies {}".format(movies_df.count()))

sqlContext = SQLContext(spark.sparkContext)
# watched_movies.registerTempTable('watchedMovies')
# movies_df.registerTempTable("movies")
movies_df.show()
unwatched_movies = movies_df.filter(~movies_df['movieId'].isin(watched_movies.select("movieId").rdd.map(lambda r:r[0]).collect()))
# unwatched_movies = sqlContext.sql("SELECT * FROM movies WHERE movies.movieId NOT IN (SELECT movieId FROM watchedMovies)")
# unwatched_movies.count()

unwatched_movies.registerTempTable('unwatchedMovies')
predictions.registerTempTable("predictions")

unwatched_movies_rating = sqlContext.sql("SELECT * FROM unwatchedMovies INNER JOIN predictions ON unwatchedMovies.movieId = predictions.movieId order by predictions.prediction DESC, predictions.rating DESC")
unwatched_movies_rating.show()

Watch movies 29
Total movies 9742
+-------+--------------------+--------------------+------+------+
|movieId|               title|              genres|imdbId|tmdbId|
+-------+--------------------+--------------------+------+------+
|      1|    Toy Story (1995)|Adventure|Animati...|114709|   862|
|      2|      Jumanji (1995)|Adventure|Childre...|113497|  8844|
|      3|Grumpier Old Men ...|      Comedy|Romance|113228| 15602|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|114885| 31357|
|      5|Father of the Bri...|              Comedy|113041| 11862|
|      6|         Heat (1995)|Action|Crime|Thri...|113277|   949|
|      7|      Sabrina (1995)|      Comedy|Romance|114319| 11860|
|      8| Tom and Huck (1995)|  Adventure|Children|112302| 45325|
|      9| Sudden Death (1995)|              Action|114576|  9091|
|     10|    GoldenEye (1995)|Action|Adventure|...|113189|   710|
|     11|American Presiden...|Comedy|Drama|Romance|112346|  9087|
|     12|Dracula: Dead and...|       Comed

In [3]:
unwatched_movies.show()

+-------+--------------------+--------------------+------+------+
|movieId|               title|              genres|imdbId|tmdbId|
+-------+--------------------+--------------------+------+------+
|      1|    Toy Story (1995)|Adventure|Animati...|114709|   862|
|      2|      Jumanji (1995)|Adventure|Childre...|113497|  8844|
|      3|Grumpier Old Men ...|      Comedy|Romance|113228| 15602|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|114885| 31357|
|      5|Father of the Bri...|              Comedy|113041| 11862|
|      6|         Heat (1995)|Action|Crime|Thri...|113277|   949|
|      7|      Sabrina (1995)|      Comedy|Romance|114319| 11860|
|      8| Tom and Huck (1995)|  Adventure|Children|112302| 45325|
|      9| Sudden Death (1995)|              Action|114576|  9091|
|     10|    GoldenEye (1995)|Action|Adventure|...|113189|   710|
|     11|American Presiden...|Comedy|Drama|Romance|112346|  9087|
|     12|Dracula: Dead and...|       Comedy|Horror|112896| 12110|
|     13| 

In [4]:
recommendedMovies = unwatched_movies_rating.take(5)
for r in recommendedMovies:
    print(r['movieId'], r['title'], r['prediction'])



6857 Ninja Scroll (Jûbei ninpûchô) (1995) 9.87397289276123
2840 Stigmata (1999) 8.627127647399902
27611 Battlestar Galactica (2003) 8.05093765258789
122916 Thor: Ragnarok (2017) 7.164710521697998
102125 Iron Man 3 (2013) 7.159113883972168
