In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("recommender").getOrCreate()

In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [3]:
data = spark.read.csv("movielens_ratings.csv",header=True,inferSchema=True)

In [4]:
data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [5]:
data.show(n=10)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
+-------+------+------+
only showing top 10 rows



In [6]:
(training,test) = data.randomSplit([0.8,0.2])

In [8]:
#building the recommendation model
als = ALS(maxIter=5, regParam=0.01, userCol="userId",itemCol="movieId",ratingCol="rating")
model = als.fit(training)

In [9]:
predictions = model.transform(test)

In [10]:
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    27|   1.687046|
|     31|   1.0|     5| 0.82745796|
|     31|   1.0|    19| 0.63328236|
|     31|   2.0|    25|   4.033231|
|     85|   1.0|    13|  -2.676021|
|     85|   5.0|    16|  3.7168713|
|     85|   1.0|    15|  1.0126411|
|     85|   1.0|    23| -3.5453377|
|     85|   3.0|    21|-0.79443693|
|     65|   2.0|    15|  1.2645495|
|     53|   1.0|     9|  0.6668146|
|     53|   1.0|    23| -0.9935711|
|     53|   5.0|    21|  1.4965943|
|     78|   1.0|    28| 0.91738176|
|     78|   1.0|    19|  0.9404285|
|     78|   1.0|    11|   1.376937|
|     34|   1.0|    16|  2.3455975|
|     81|   1.0|     6| 0.69695026|
|     28|   1.0|    23|  0.7385907|
|     28|   1.0|    10| -1.5912343|
+-------+------+------+-----------+
only showing top 20 rows



In [11]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.9461591497691364


In [13]:
#predicting a recommendation to the user
single_user = test.filter(test['userId']==11).select(['movieId','userId'])

In [15]:
# User had 10 ratings in the test data set 
# Realistically this should be some sort of hold out set!
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      9|    11|
|     30|    11|
|     70|    11|
|     72|    11|
|     78|    11|
|     79|    11|
|     80|    11|
|     89|    11|
+-------+------+



In [16]:
recommendations = model.transform(single_user)

In [19]:
recommendations.orderBy('prediction',ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     89|    11| 3.8717446|
|      9|    11| 3.5054262|
|     30|    11| 2.9022236|
|     80|    11| 2.4120898|
|     70|    11| 1.9492218|
|     78|    11|  1.376937|
|     79|    11|  1.190164|
|     72|    11| 0.8949266|
+-------+------+----------+

