# Recommendation systems scaled using Spark 20m Movie lens
20m Movie lens dataset for this one. </br>
Use Alternative Least Squares matrix factorization for making recommendations

In [1]:
import csv
import multiprocessing

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [2]:
# get number of available cores
#cores = str(multiprocessing.cpu_count() - 1)
# define spark session
spark = SparkSession.builder.appName("ALSExample").config("spark.executor.cores", '8').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/13 14:18:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# read file and retrieve
lines = spark.read.option("header", "true").csv("../ml-20m/ratings.csv").rdd
# define data structure and convert rdd into dataset
ratingsRDD = lines.map(
    lambda p: Row(
        userID=int(p[0]),
        movieID=int(p[1]),
        rating=float(p[2]),
        timestamp=int(p[3])
    )
)
ratings = spark.createDataFrame(ratingsRDD)

                                                                                

In [4]:
# define a training and test set
(training, test) = ratings.randomSplit([0.8, 0.2])

In [5]:
# define ALS model instance
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userID",
    itemCol="movieID",
    ratingCol="rating",
    coldStartStrategy="drop"
)
# fit model
model = als.fit(training)

24/08/13 14:20:50 WARN BlockManager: Block rdd_34_5 could not be removed as it was not found on disk or in memory
24/08/13 14:20:50 WARN BlockManager: Block rdd_35_5 could not be removed as it was not found on disk or in memory
24/08/13 14:20:50 ERROR Executor: Exception in task 5.0 in stage 5.0 (TID 24)
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3793)
	at scala.collection.mutable.ArrayBuilder$ofInt.mkArray(ArrayBuilder.scala:339)
	at scala.collection.mutable.ArrayBuilder$ofInt.result(ArrayBuilder.scala:383)
	at scala.collection.mutable.ArrayBuilder$ofInt.result(ArrayBuilder.scala:330)
	at org.apache.spark.ml.recommendation.ALS$UncompressedInBlockBuilder.build(ALS.scala:1452)
	at org.apache.spark.ml.recommendation.ALS$.$anonfun$makeBlocks$5(ALS.scala:1660)
	at org.apache.spark.ml.recommendation.ALS$$$Lambda$3784/0x0000000841502040.apply(Unknown Source)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$mapValues$3(PairRDDFunctions.scal

ConnectionRefusedError: [Errno 111] Connection refused

In [6]:
# get preds and test
preds = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(preds)
print("Root-mean-square-error = "+str(rmse))

NameError: name 'model' is not defined

In [12]:
# get some recommendations for user
userRecs = model.recommendForAllUsers(10)
user85Recs = userRecs.filter(userRecs['userID']==85).collect()

                                                                                

In [15]:
# read movies data and get movies names
moviesPath = "../data/ml-latest-small/movies.csv"
# now parse movies dataset
movieID_to_name = {}
name_to_movieID = {}
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        name_to_movieID[movieName] = movieID

In [16]:
# funtion to ger movie name based on movie ID
def getMovieName(movieID):
    if movieID in movieID_to_name:
        return movieID_to_name[movieID]
    else:
        return ""

In [17]:
for row in user85Recs:
    for rec in row.recommendations:
        print(getMovieName(rec.movieID))

Angels and Insects (1995)
Hostel (2005)
Double Indemnity (1944)
Towering Inferno, The (1974)
Young Guns II (1990)
Vampires (1998)
Teenage Mutant Ninja Turtles II: The Secret of the Ooze (1991)
Michael Clayton (2007)
Glengarry Glen Ross (1992)
The Handmaiden (2016)
