In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Prepare data
# Obtain the active SparkSession (or create one) so this cell also runs on a
# fresh kernel that does not pre-define `spark`. getOrCreate() reuses an
# existing session when there is one.
spark = SparkSession.builder.getOrCreate()

# Load the ratings CSV from HDFS. The first row is treated as the header and
# column types are inferred by Spark. The cells below use the columns
# "userId", "movieId" and "rating".
final_data = spark.read.csv("hdfs://devenv/user/spark/spark_mllib_101/movies/ratings.csv",
                            inferSchema=True,
                            header=True)

In [3]:
# Split data into train and test sets (80% / 20%).
# A fixed seed is passed so that re-running the notebook is intended to give
# the same split, keeping the reported RMSE comparable between runs.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [4]:
# Model training
# Alternating Least Squares collaborative filtering on (userId, movieId, rating).
#   maxIter=5                 - number of ALS iterations
#   coldStartStrategy="drop"  - per the Spark ML docs, rows whose prediction is
#                               NaN are dropped at transform time, so the
#                               evaluation metric is not NaN
#   seed=42                   - fixes the random initialisation so training is
#                               repeatable between runs
als = ALS(
    maxIter=5,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(train_data)

In [5]:
# Score the held-out ratings with the fitted model; the result carries the
# "prediction" column that the evaluation cell reads.
predicted_test_data = model.transform(dataset=test_data)

In [6]:
# Evaluate model performance on the test set: root-mean-square error between
# the actual "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
test_rmse = evaluator.evaluate(predicted_test_data)
print("rmse: {}".format(test_rmse))

rmse: 0.9298523350195411


In [7]:
# Recommend movies for every user; the argument is the number of movies to
# recommend per user (3 here). Up to 100 rows are displayed, untruncated.
user_movies = model.recommendForAllUsers(numItems=3)
user_movies.show(n=100, truncate=False)

+------+------------------------------------------------------------+
|userId|recommendations                                             |
+------+------------------------------------------------------------+
|471   |[[83318, 5.1683464], [83411, 5.1683464], [83359, 5.1683464]]|
|463   |[[83318, 4.9093847], [83411, 4.9093847], [83359, 4.9093847]]|
|496   |[[106471, 5.415984], [4357, 5.293998], [8530, 5.2430234]]   |
|148   |[[83318, 5.3946414], [83411, 5.3946414], [83359, 5.3946414]]|
|540   |[[83318, 6.0467806], [83411, 6.0467806], [83359, 6.0467806]]|
|392   |[[83318, 4.9486527], [83411, 4.9486527], [83359, 4.9486527]]|
|243   |[[83318, 5.186041], [83411, 5.186041], [83359, 5.186041]]   |
|623   |[[54328, 5.3641315], [83411, 5.290846], [83318, 5.290846]]  |
|31    |[[83318, 5.381199], [83411, 5.381199], [83359, 5.381199]]   |
|516   |[[83318, 5.0995235], [83411, 5.0995235], [83359, 5.0995235]]|
|580   |[[83318, 5.0964403], [83411, 5.0964403], [83359, 5.0964403]]|
|251   |[[59684, 5.5

In [8]:
# For every movie, find the 3 users most likely to like it.
# Up to 100 rows are displayed, untruncated.
# NOTE(review): the variable name `movie_uers` (sic) is kept unchanged because
# cells outside this view may reference it.
movie_uers = model.recommendForAllItems(numUsers=3)
movie_uers.show(n=100, truncate=False)

+-------+------------------------------------------------------+
|movieId|recommendations                                       |
+-------+------------------------------------------------------+
|1580   |[[543, 4.8482385], [113, 4.8316417], [464, 4.804806]] |
|5300   |[[515, 4.9625607], [631, 4.8999248], [78, 4.8124523]] |
|6620   |[[357, 4.649997], [577, 4.6415224], [465, 4.4875]]    |
|7340   |[[113, 4.799348], [112, 4.7816877], [543, 4.733344]]  |
|32460  |[[357, 4.9377656], [298, 4.866643], [289, 4.7387853]] |
|54190  |[[477, 3.9632885], [123, 3.8679903], [465, 3.8302276]]|
|471    |[[401, 4.666645], [499, 4.637788], [422, 4.6010647]]  |
|1591   |[[261, 3.9497433], [592, 3.798002], [336, 3.7878623]] |
|1342   |[[46, 4.3928757], [473, 4.2079296], [483, 4.0112596]] |
|2122   |[[228, 3.8427644], [145, 3.7307177], [46, 3.7131634]] |
|2142   |[[46, 4.928414], [113, 4.861998], [517, 4.788576]]    |
|7982   |[[46, 4.8129334], [113, 4.7676578], [70, 4.632042]]   |
|44022  |[[145, 5.182537]