In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col,explode
# Reuse the active Spark session if one exists; otherwise start a new one
# under this application name.
spark = (
    SparkSession.builder
    .appName('Collaborative filtering')
    .getOrCreate()
)

In [None]:
# Read both CSV files, treating the first row as the header and letting
# Spark infer the column types.
movies_df = spark.read.csv('data/movies.csv', header='True', inferSchema='True')
ratings_df = spark.read.csv('data/ratings.csv', header='True', inferSchema='True')

# Preview the first rows of each frame.
movies_df.show(n=5)
ratings_df.show(n=5)

In [None]:
# Attach the movie columns to every rating. The left join keeps each rating
# row even when its movieId has no match in movies_df.
ratings = ratings_df.join(movies_df, on='movieId', how='left')

In [None]:
# Split data
# 80/20 train/test split. A fixed seed keeps the partition (and therefore
# every metric computed from it) stable across Restart & Run All.
train, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [None]:
# ALS model
# Explicit-feedback ALS estimator over the (userId, movieId, rating) columns.
# ALS is already imported in the notebook's first cell, so the duplicate
# import that used to live in this cell is not repeated.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,          # constrain the learned factors to be >= 0
    implicitPrefs=False,       # ratings are explicit scores, not implicit counts
    coldStartStrategy="drop",  # drop rows whose prediction would be NaN
    seed=42,                   # fixed seed so factor initialization is reproducible
)

In [None]:
# Hyperparameter tuning and cross validation

# 4 ranks x 4 regularization strengths = 16 candidate parameter maps.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.regParam, [.01, .05, .1, .15])
    .build()
)

# Candidates are scored by RMSE between the "rating" and "prediction" columns.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# 5-fold cross validation over the grid (16 candidates x 5 folds = 80 ALS fits,
# plus a final refit of the best candidate). The fixed seed makes the fold
# assignment, and hence the selected model, reproducible between runs.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)


In [None]:
# Run the cross-validated grid search on the training split.
model = cv.fit(dataset=train)

# Keep the best-scoring model found by the search.
best_model = model.bestModel

# Score the held-out split and report its RMSE.
test_predictions = best_model.transform(dataset=test)
RMSE = evaluator.evaluate(dataset=test_predictions)
print(RMSE)

In [None]:
# Top-5 item recommendations for every user known to the best model.
recommendations = best_model.recommendForAllUsers(numItems=5)

In [None]:
# Preview the first five rows of the recommendations frame.
recommendations.show(n=5)

In [None]:
# Explode the "recommendations" column into a new "movieid_rating" column,
# giving one row per element of the original column.
df2 = recommendations.withColumn(
    "movieid_rating",
    explode(col("recommendations")),
)

In [None]:
# Show the user id next to the movieId and rating fields of each exploded
# recommendation.
df2.select(
    "userId",
    "movieid_rating.movieId",
    "movieid_rating.rating",
).show(n=5)