In [1]:
// Obtain a SparkSession: reuse the one that already exists, or create it if there is none.
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://10.226.156.86:4040
SparkContext available as 'sc' (version = 2.1.1, master = local[*], app id = local-1503343022298)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@8bc6648


In [2]:
// Load the ratings CSV. The first row is treated as the header and the
// column types are inferred from the values instead of being read as strings.
val data = (spark.read
            .options(Map("header" -> "true", "inferSchema" -> "true"))
            .csv("movie_ratings.csv"))
// Display the column names and the inferred types.
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



data: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 1 more field]


In [3]:
// Print summary statistics (count, mean, stddev, min, max) for the columns of `data`.
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|
+-------+------------------+------------------+------------------+
|  count|            100004|            100004|            100004|
|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|
| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|
|    min|                 1|                 1|               0.5|
|    max|               671|            163949|               5.0|
+-------+------------------+------------------+------------------+



In [4]:
// Split the ratings into roughly 80% training data and 20% test data.
// A fixed seed keeps the split identical from run to run, so the metrics and
// sample rows produced by the later cells can be reproduced.
val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 42L)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 1 more field]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 1 more field]


In [5]:
// Build the recommendation model with Alternating Least Squares (ALS)
import org.apache.spark.ml.recommendation.ALS
val als = (new ALS()
           .setUserCol("userId")
           .setItemCol("movieId")
           .setRatingCol("rating")
           .setMaxIter(5)
           .setRegParam(0.01)
          )

// Learn the model from the training split
val model = als.fit(training)

// Score the held-out test split; this appends a `prediction` column.
// Some rows can come back as NaN (presumably the user or movie was not
// present in the training split), so filter those before computing metrics.
val pred = model.transform(test)
pred.show(10)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   575|    148|   4.0|       NaN|
|    30|    463|   4.0| 3.6407297|
|   588|    471|   3.0|  5.079941|
|    19|    471|   3.0| 3.5896082|
|    92|    471|   4.0|  4.560835|
|   299|    471|   4.5| 3.2854736|
|    23|    471|   3.5| 4.0470138|
|   624|    471|   4.0| 3.5342512|
|   649|    471|   3.0| 3.5394318|
|   508|    471|   4.0| 3.9685206|
+------+-------+------+----------+
only showing top 10 rows



import org.apache.spark.ml.recommendation.ALS
als: org.apache.spark.ml.recommendation.ALS = als_0889d4af2b8c
model: org.apache.spark.ml.recommendation.ALSModel = als_0889d4af2b8c
pred: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]


In [6]:
// Evaluate the model: root-mean-square error (RMSE) between the actual
// `rating` and the model's `prediction`.
import org.apache.spark.ml.evaluation.RegressionEvaluator
val evaluator = (new RegressionEvaluator()
                 .setMetricName("rmse")
                 .setLabelCol("rating")
                 .setPredictionCol("prediction")
                )
// Rows containing null/NaN values (e.g. NaN predictions) are dropped first
// so that they do not turn the metric itself into NaN.
val rmse = evaluator.evaluate(pred.na.drop())
// Use the standard string interpolator `s`; `$"..."` is Spark's Column
// interpolator and builds a Column rather than a String.
println(s"RMSE: $rmse")

RMSE: 1.109343388002812


import org.apache.spark.ml.evaluation.RegressionEvaluator
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_f1fbf5b1b6d6
rmse: Double = 1.109343388002812


In [7]:
// Using the model for one user: take the test-split rows of user 11 and keep
// only the movieId/userId pair that the model needs in order to score them
val single_user = test.where($"userId" === 11).select($"movieId", $"userId")
single_user.show(10)

+-------+------+
|movieId|userId|
+-------+------+
|    126|    11|
|  26614|    11|
|  51084|    11|
|  77455|    11|
|  91500|    11|
|  91548|    11|
|  96079|    11|
| 106487|    11|
+-------+------+



single_user: org.apache.spark.sql.DataFrame = [movieId: int, userId: int]


In [8]:
// Predict how this user is going to rate each of the movies listed above,
// showing the highest movieId first
import org.apache.spark.sql.functions._
val recommendations = model.transform(single_user)
recommendations.orderBy(desc("movieId")).show(10)

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
| 106487|    11| 3.3369894|
|  96079|    11| 3.1696131|
|  91548|    11|       NaN|
|  91500|    11| 3.9924564|
|  77455|    11|  5.065037|
|  51084|    11| 1.3344916|
|  26614|    11| 3.5799954|
|    126|    11|  1.008879|
+-------+------+----------+



import org.apache.spark.sql.functions._
recommendations: org.apache.spark.sql.DataFrame = [movieId: int, userId: int ... 1 more field]


In [9]:
// Show the ratings this user actually gave, ordered by movieId descending,
// to compare against the predicted values and see how well the model performs
test.where($"userId" === 11).sort($"movieId".desc).show(10)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    11| 106487|   5.0|
|    11|  96079|   4.0|
|    11|  91548|   4.0|
|    11|  91500|   4.5|
|    11|  77455|   4.5|
|    11|  51084|   4.0|
|    11|  26614|   5.0|
|    11|    126|   4.0|
+------+-------+------+

