In [1]:
import findspark

# Placeholder for the local Spark installation directory; replace it with the
# real path before running. findspark.init() is called with this location so
# that the pyspark import below can be resolved from this kernel.
SPARK_INSTALL_DIR = 'Path-to_Spark_Installtion_Directory'
findspark.init(SPARK_INSTALL_DIR)

from pyspark.sql import SparkSession

# Reuse the currently active session if there is one, otherwise create it.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [2]:
# Load the ratings file: the first row holds the column names and the column
# types are inferred from the values.
csv_reader = spark.read.options(header = True, inferSchema = True)
data = csv_reader.csv('movie_ratings.csv')

# Show the column names and inferred types.
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [3]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = data.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|
+-------+------------------+------------------+------------------+
|  count|            100004|            100004|            100004|
|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|
| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|
|    min|                 1|                 1|               0.5|
|    max|               671|            163949|               5.0|
+-------+------------------+------------------+------------------+



In [4]:
# Splitting the data into a train set (80%) and a test set (20%).
# The seed is fixed so that the split -- and every model, metric and table
# derived from it -- is the same after "Restart Kernel -> Run All". Without a
# seed each run trains and evaluates on a different random subset of rows.
train, test = data.randomSplit([0.8, 0.2], seed = 42)

In [5]:
# Developing the recommendation system model with
# Alternating Least Squares (ALS), a matrix-factorisation method for
# collaborative filtering.
from pyspark.ml.recommendation import ALS
als = ALS(maxIter = 5,
          regParam = 0.01,
          userCol = 'userId',
          itemCol = 'movieId',
          ratingCol = 'rating',
          seed = 42)  # fixed seed so the fitted model is reproducible across runs

# Fitting the model with training data
model = als.fit(train)

# Checking the prediction with test data.
# NOTE: coldStartStrategy is left at its default ('nan'), so a user or movie
# that does not occur in the training split gets a NaN prediction. Such rows
# have to be removed before computing a metric (alternatively, pass
# coldStartStrategy = 'drop' to ALS so that transform() omits them).
pred = model.transform(test)
pred.show(10)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   242|    463|   4.0| 3.7814856|
|   350|    471|   3.0| 3.9126155|
|   306|    471|   3.0| 3.9589782|
|   607|    471|   4.0|  3.478189|
|   537|    471|   5.0|  4.008949|
|   380|    471|   4.0| 3.5639136|
|   487|    471|   4.0| 3.9232037|
|   574|    471|   3.5| 3.8216467|
|   105|    471|   4.0| 3.5162578|
|   521|    471|   3.5|  4.221143|
+------+-------+------+----------+
only showing top 10 rows



In [6]:
# Model evaluation: root-mean-square error between the actual ratings and the
# predicted ones.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol = 'rating',
                                predictionCol = 'prediction',
                                metricName = 'rmse')

# Rows containing null/NaN values are removed first, so they cannot turn the
# metric itself into NaN.
scored_rows = pred.dropna()
rmse = evaluator.evaluate(scored_rows)
print('RMSE: %.2f' % rmse)

RMSE: 1.12


In [7]:
# Using the model for a single user: take the test-set rows of userId 11 and
# keep only the (movieId, userId) pairs, i.e. the columns the model needs in
# order to produce a prediction.
user_rows = test.where(test['userId'] == 11)
single_user = user_rows.select(['movieId', 'userId'])
single_user.show(10)

+-------+------+
|movieId|userId|
+-------+------+
|    126|    11|
|    923|    11|
|   2596|    11|
|  48516|    11|
|  77455|    11|
|  79132|    11|
|  80489|    11|
|  91548|    11|
| 106487|    11|
+-------+------+



In [8]:
# Predict how much this user is going to like each of the movies listed above.
recommendations = model.transform(single_user)

# Highest movieId first, which makes the table easy to line up with the
# actual ratings when both are sorted the same way.
by_movie_desc = recommendations.sort(recommendations['movieId'].desc())
by_movie_desc.show(10)

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
| 106487|    11| 1.9842854|
|  91548|    11|       NaN|
|  80489|    11| 5.7499537|
|  79132|    11| 3.3195586|
|  77455|    11| 4.0956645|
|  48516|    11|  4.766561|
|   2596|    11|  4.890043|
|    923|    11| 3.3459005|
|    126|    11| 5.2579637|
+-------+------+----------+



In [9]:
# Compare the predictions with the actual ratings of userId 11 in the test set
# to see how well the model performs (highest movieId first).
actual_ratings = test.where(test['userId'] == 11)
actual_ratings.sort(actual_ratings['movieId'].desc()).show(10)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    11| 106487|   5.0|
|    11|  91548|   4.0|
|    11|  80489|   4.5|
|    11|  79132|   4.0|
|    11|  77455|   4.5|
|    11|  48516|   5.0|
|    11|   2596|   4.5|
|    11|    923|   5.0|
|    11|    126|   4.0|
+------+-------+------+

