In [1]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder.getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://10.226.156.86:4040
SparkContext available as 'sc' (version = 2.1.1, master = local[*], app id = local-1503347694087)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@17bbbdf1


In [2]:
val data = (spark.read.option("header","true")
            .option("inferSchema","true")
            .csv("movie_ratings.csv"))
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



data: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 1 more field]


In [3]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|
+-------+------------------+------------------+------------------+
|  count|            100004|            100004|            100004|
|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|
| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|
|    min|                 1|                 1|               0.5|
|    max|               671|            163949|               5.0|
+-------+------------------+------------------+------------------+



In [4]:
// Splitting the data into train set and test set
val Array(training, test) = data.randomSplit(Array(0.8, 0.2))

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 1 more field]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 1 more field]


*** When making predictions using Alternative least square(ALS) method, it is common to encounter users and/or items in the test dataset that were not present during training the model. This amy cause 'NaN' predicted values in result for evaluation metrics. So, Spark has a feature called 'coldStartStrategy' to reslove this issue. ***

In [5]:
// Developing recommnedation system model
// Alternative least square(ALS) method
import org.apache.spark.ml.recommendation.ALS
val als = (new ALS()  
           .setMaxIter(5)  // number of iterations to run
           .setRegParam(0.01) // regularization parameter
           .setUserCol("userId")  
           .setItemCol("movieId")  
           .setRatingCol("rating")
//            .setColdStartStrategy("drop") // we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
          )

// Fitting the model with training data
val model = als.fit(training)

// Checking the prediction with test data
val pred = model.transform(test)
pred.show(10)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   575|    148|   4.0|       NaN|
|   452|    463|   2.0|  2.525254|
|   588|    471|   3.0| 3.9335432|
|   460|    471|   5.0| 3.2634168|
|   306|    471|   3.0| 3.5716166|
|   491|    471|   3.0| 3.3785546|
|   502|    471|   4.0|  4.505924|
|   241|    471|   4.0| 3.2869341|
|   487|    471|   4.0| 3.5859377|
|   399|    471|   5.0|  3.345561|
+------+-------+------+----------+
only showing top 10 rows



import org.apache.spark.ml.recommendation.ALS
als: org.apache.spark.ml.recommendation.ALS = als_746342d2fd7f
model: org.apache.spark.ml.recommendation.ALSModel = als_746342d2fd7f
pred: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]


In [6]:
// Evaluating the model
import org.apache.spark.ml.evaluation.RegressionEvaluator
val evaluator = (new RegressionEvaluator()
                 .setMetricName("rmse")
                 .setLabelCol("rating")
                 .setPredictionCol("prediction")
                )
val rmse = evaluator.evaluate(pred.na.drop())
println($"RMSE: $rmse")

RMSE: 1.109687797412326


import org.apache.spark.ml.evaluation.RegressionEvaluator
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_8f81fb87febe
rmse: Double = 1.109687797412326


In [7]:
// How can we use this model to recommend a movie to a new single user
val single_user = test.filter($"userId" === 11).select("movieId", "userId")
single_user.show(10)

+-------+------+
|movieId|userId|
+-------+------+
|    778|    11|
|   1027|    11|
|   6598|    11|
+-------+------+



single_user: org.apache.spark.sql.DataFrame = [movieId: int, userId: int]


In [8]:
// Let's predict how this user going to like the above mentioned movies
import org.apache.spark.sql.functions._
val recommendations = model.transform(single_user)
recommendations.orderBy($"movieId".desc).show(10)

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   6598|    11| 0.8916418|
|   1027|    11| 2.7821307|
|    778|    11|  4.510311|
+-------+------+----------+



import org.apache.spark.sql.functions._
recommendations: org.apache.spark.sql.DataFrame = [movieId: int, userId: int ... 1 more field]


In [9]:
// Let's check our prediction against the actual data to see how well our model perform
test.filter($"userId" === 11).orderBy($"movieId".desc).show(10)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    11|   6598|   5.0|
|    11|   1027|   4.5|
|    11|    778|   4.5|
+------+-------+------+

