In [1]:
# Point findspark at the local Spark installation before importing pyspark.
import findspark
findspark.init(spark_home='Path_to_Spark_Installation_Folder')

# Reuse the active Spark session if there is one, otherwise start a new one.
from pyspark.sql import SparkSession
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Load the ratings file; the first row holds the column names and Spark
# infers each column's type from the values it reads.
data = spark.read.csv(
    'movie_ratings.csv',
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types.
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [3]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = data.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|
+-------+------------------+------------------+------------------+
|  count|            100004|            100004|            100004|
|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|
| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|
|    min|                 1|                 1|               0.5|
|    max|               671|            163949|               5.0|
+-------+------------------+------------------+------------------+



In [4]:
# Split the ratings into a training set (roughly 80%) and a test set (roughly 20%).
# A fixed seed makes the split repeatable across "Restart & Run All", so the
# evaluation metric and the per-user examples further down stay comparable
# between runs instead of changing with every execution.
train, test = data.randomSplit([0.8, 0.2], seed = 42)

***When making predictions with the Alternating Least Squares (ALS) method, it is common to encounter users and/or items in the test dataset that were not present when the model was trained. This may cause 'NaN' predicted values, which in turn make the evaluation metrics 'NaN'. Spark provides a parameter called 'coldStartStrategy' to resolve this issue.***

In [5]:
# Developing the recommendation system model
# Alternating Least Squares (ALS) method
from pyspark.ml.recommendation import ALS
als = ALS(maxIter = 5, # number of iterations to run
          regParam = 0.01, # regularization parameter
          userCol = 'userId',
          itemCol = 'movieId',
          ratingCol = 'rating',
          # Spark's default cold-start behaviour, stated explicitly: users or
          # movies that were not seen during training get a NaN prediction.
          # Those rows are removed before the RMSE is computed. Use 'drop'
          # instead to have transform() omit such rows altogether.
          coldStartStrategy = 'nan',
          seed = 42, # fixed seed so the fitted model is repeatable between runs
         )

# Fitting the model with training data
model = als.fit(train)

# Checking the prediction with test data
pred = model.transform(test)
pred.show(10)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   575|    148|   4.0|       NaN|
|   232|    463|   4.0| 2.5138907|
|   242|    463|   4.0|  3.077727|
|   311|    463|   3.0| 1.7966607|
|   460|    471|   5.0|  3.472546|
|   491|    471|   3.0| 4.2521105|
|   607|    471|   4.0| 3.1809988|
|   358|    471|   5.0| 3.7823098|
|    23|    471|   3.5| 3.6827884|
|   105|    471|   4.0| 3.6581256|
+------+-------+------+----------+
only showing top 10 rows



In [6]:
# Evaluate the model with root-mean-square error (RMSE) on the test predictions.
from pyspark.ml.evaluation import RegressionEvaluator

# Rows containing null/NaN values (e.g. NaN predictions for cold-start
# users or movies) are removed first so they cannot turn the metric into NaN.
scored = pred.na.drop()

evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(scored)
print('RMSE: %.2f' % rmse)

RMSE: 1.12


In [7]:
# Using the model for a single user: take the (movie, user) pairs that
# user 11 has in the test set, without the rating column.
single_user = (
    test
    .where(test.userId == 11)
    .select('movieId', 'userId')
)
single_user.show(10)

+-------+------+
|movieId|userId|
+-------+------+
|    296|    11|
|    785|    11|
|   1201|    11|
|   1408|    11|
|   3424|    11|
|  48516|    11|
|  58295|    11|
|  79132|    11|
| 106487|    11|
+-------+------+



In [8]:
# Predict how this user is going to rate the movies listed above,
# shown from the highest movieId to the lowest.
recommendations = model.transform(single_user)
recommendations.sort(recommendations.movieId.desc()).show(10)

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
| 106487|    11| 3.6910338|
|  79132|    11| 4.2264347|
|  58295|    11| 3.4234211|
|  48516|    11| 4.2861857|
|   3424|    11|  5.153322|
|   1408|    11| 2.9022808|
|   1201|    11|  5.137532|
|    785|    11| 4.0600085|
|    296|    11| 4.9990373|
+-------+------+----------+



In [9]:
# Compare the predictions with the ratings this user actually gave,
# using the same movieId-descending order so the rows line up.
(
    test
    .where(test.userId == 11)
    .sort('movieId', ascending=False)
    .show(10)
)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    11| 106487|   5.0|
|    11|  79132|   4.0|
|    11|  58295|   4.5|
|    11|  48516|   5.0|
|    11|   3424|   3.0|
|    11|   1408|   5.0|
|    11|   1201|   5.0|
|    11|    785|   3.5|
|    11|    296|   5.0|
+------+-------+------+

