# Recommendation systems scaled using Spark
Use Alternative Least Squares matrix factorization for making recommendations

In [14]:
import csv

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [3]:
# define spark session
spark = SparkSession.builder.appName("ALSExample").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/12 21:07:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# read file and retrieve
lines = spark.read.option("header", "true").csv("../data/ml-latest-small/ratings.csv").rdd
# define data structure and convert rdd into dataset
ratingsRDD = lines.map(
    lambda p: Row(
        userID=int(p[0]),
        movieID=int(p[1]),
        rating=float(p[2]),
        timestamp=int(p[3])
    )
)
ratings = spark.createDataFrame(ratingsRDD)

In [7]:
# define a training and test set
(training, test) = ratings.randomSplit([0.8, 0.2])

In [8]:
# define ALS model instance
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userID",
    itemCol="movieID",
    ratingCol="rating",
    coldStartStrategy="drop"
)
# fit model
model = als.fit(training)

24/08/12 21:17:18 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/12 21:17:19 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [11]:
# get preds and test
preds = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(preds)
print("Root-mean-square-error = "+str(rmse))



Root-mean-square-error = 1.0727645136635104


                                                                                

In [12]:
# get some recommendations for user
userRecs = model.recommendForAllUsers(10)
user85Recs = userRecs.filter(userRecs['userID']==85).collect()

                                                                                

In [15]:
# read movies data and get movies names
moviesPath = "../data/ml-latest-small/movies.csv"
# now parse movies dataset
movieID_to_name = {}
name_to_movieID = {}
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        name_to_movieID[movieName] = movieID

In [16]:
# funtion to ger movie name based on movie ID
def getMovieName(movieID):
    if movieID in movieID_to_name:
        return movieID_to_name[movieID]
    else:
        return ""

In [17]:
for row in user85Recs:
    for rec in row.recommendations:
        print(getMovieName(rec.movieID))

Angels and Insects (1995)
Hostel (2005)
Double Indemnity (1944)
Towering Inferno, The (1974)
Young Guns II (1990)
Vampires (1998)
Teenage Mutant Ninja Turtles II: The Secret of the Ooze (1991)
Michael Clayton (2007)
Glengarry Glen Ross (1992)
The Handmaiden (2016)
