## Movie Recommender project

This is the PySpark version of the Movie Recommender project which runs on a Databricks cluster (instead of on Hadoop!)

In [0]:
from pyspark.mllib.recommendation import ALS
from pyspark.mllib.recommendation import Rating
from pyspark.rdd import RDD
from pyspark.sql import SparkSession

In [0]:
# Display the SparkContext to confirm it is available.
# `sc` is not constructed in any visible cell of this notebook; it is
# auto-generated by the runtime (the notebook targets a Databricks cluster).
sc

In [0]:
# Load the raw ratings file (u.data) as an RDD with one string per line.
# The lines are split into fields in the following cell.
ratingsPath = "/FileStore/tables/u.data"
rawUD = sc.textFile(ratingsPath)

In [0]:
# Split each tab-separated line and keep only the first three fields
# (user id, movie id, rating); any further fields are discarded.
rawRatings = rawUD.map(lambda line: line.split("\t")[:3])

# Build Rating(user, product, rating) records. The fields come out of
# split() as strings, so convert them explicitly: ids to int, rating to float.
# This keeps ratingsRDD numerically typed for every downstream consumer
# (e.g. distinct user counts, comparisons against the model's integer ids).
ratingsRDD = rawRatings.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2]))
)

In [0]:
# Count how many distinct users appear in the ratings.
# Each element is a Rating namedtuple, so the user id is read by field name.
numUsers = (
    ratingsRDD
    .map(lambda r: r.user)
    .distinct()
    .count()
)
numUsers

Out[9]: 943

In [0]:
# Train the ALS matrix-factorisation model on the explicit ratings.
#   rank       - number of latent factors per user / product
#   iterations - number of ALS iterations to run
#   lambda_    - regularisation strength
#   seed       - fixes the random initialisation so that re-running the
#                notebook gives repeatable recommendations
# NOTE: outputs recorded in this notebook before the seed was introduced came
# from an unseeded run and will change once the notebook is re-executed.
model = ALS.train(ratingsRDD, rank=10, iterations=10, lambda_=0.01, seed=42)

In [0]:
# Ask the model for the 5 products (films) with the highest predicted
# rating for user 100; the result is a list of Rating objects.
films = model.recommendProducts(user=100, num=5)
films

Out[12]: [Rating(user=100, product=1169, rating=5.250575568573353),
 Rating(user=100, product=1335, rating=5.174906601425274),
 Rating(user=100, product=1315, rating=4.936108697422341),
 Rating(user=100, product=1217, rating=4.882182587035171),
 Rating(user=100, product=733, rating=4.860158378639019)]

In [0]:
# Predicted rating that user 100 would give to film 1311.
rating = model.predict(user=100, product=1311)
rating

Out[14]: 4.150251195723154

In [0]:
# Find the 5 users with the highest predicted rating for film 200.
users = model.recommendUsers(product=200, num=5)
users

Out[15]: [Rating(user=820, product=200, rating=7.614645145761803),
 Rating(user=93, product=200, rating=6.9649619387887665),
 Rating(user=550, product=200, rating=6.8861918702477265),
 Rating(user=651, product=200, rating=6.824342206147021),
 Rating(user=519, product=200, rating=6.776672259591825)]

In [0]:
# Next step: show movie names instead of bare movie ids.
# Start by loading the table that contains the movie names (u.item)
# as an RDD with one string per line.
itemPath = "/FileStore/tables/u.item"
itemRDD = sc.textFile(itemPath)

In [0]:
# Build a {movie id: movie name} dictionary on the driver.
# Each u.item line is "|"-separated: field 0 is the id, field 1 is the name.
# The id is parsed as int (not float) so the keys have the same type as the
# integer product ids carried by the Rating objects the model returns.
movieTitle = (
    itemRDD
    .map(lambda line: line.split("|"))
    .map(lambda fields: (int(fields[0]), fields[1]))
    .collectAsMap()
)

In [0]:
# Print the top-5 recommendations for user 100, translating each product id
# into its movie name via the movieTitle dictionary.
recommendP = model.recommendProducts(100, 5)
for rec in recommendP:
    title = movieTitle[rec.product]
    print("For user: {}, we recommend: {}, rating: {}".format(rec.user, title, rec.rating))

For user: 100, we recommend: Fresh (1994), rating: 5.250575568573353
For user: 100, we recommend: American Buffalo (1996), rating: 5.174906601425274
For user: 100, we recommend: Inventing the Abbotts (1997), rating: 4.936108697422341
For user: 100, we recommend: Assassins (1995), rating: 4.882182587035171
For user: 100, we recommend: Go Fish (1994), rating: 4.860158378639019
