In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.recommendation.ALS

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.recommendation.ALS


In [3]:
// Obtain the active SparkSession, creating one if none exists yet.
val spark = SparkSession.builder().getOrCreate()
// Load the ratings CSV: the first line is a header and column types are inferred.
val data = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("./datasets/movie_ratings.csv")

// Preview the first rows of the loaded DataFrame.
data.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1061|   3.0|
|     1|   1129|   2.0|
|     1|   1172|   4.0|
|     1|   1263|   2.0|
|     1|   1287|   2.0|
|     1|   1293|   2.0|
|     1|   1339|   3.5|
|     1|   1343|   2.0|
|     1|   1371|   2.5|
|     1|   1405|   1.0|
|     1|   1953|   4.0|
|     1|   2105|   4.0|
|     1|   2150|   3.0|
|     1|   2193|   2.0|
|     1|   2294|   2.0|
|     1|   2455|   2.5|
|     1|   2968|   1.0|
|     1|   3671|   3.0|
+------+-------+------+
only showing top 20 rows



spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2d9c858
data: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 1 more field]


In [4]:
// Print the column names, inferred types and nullability of the ratings DataFrame.
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [5]:
// Total number of rating rows loaded from the CSV.
data.count()

res3: Long = 100004


In [6]:
// Split the ratings roughly 80% / 20% into training and test sets.
// An explicit seed is passed so the split (and therefore the error statistics
// computed from it) is repeatable across runs on the same input; without it
// randomSplit picks a fresh random seed on every execution.
val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 42L)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 1 more field]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 1 more field]


In [7]:
// Configure the ALS collaborative-filtering estimator on the
// (userId, movieId, rating) columns with 5 iterations and regParam 0.01.
// coldStartStrategy = "drop" makes the fitted model omit rows for users or
// movies that were absent from the training split; with the default strategy
// those rows get a NaN prediction, which poisons any aggregate metric.
val als = new ALS()
  .setMaxIter(5)
  .setRegParam(0.01)
  .setUserCol("userId")
  .setItemCol("movieId")
  .setRatingCol("rating")
  .setColdStartStrategy("drop")

als: org.apache.spark.ml.recommendation.ALS = als_97d7ee71242e


In [8]:
// Fit the configured ALS estimator on the training split.
val model = als.fit(train)

model: org.apache.spark.ml.recommendation.ALSModel = ALSModel: uid=als_97d7ee71242e, rank=10


In [9]:
// Score the held-out test split; the result carries the original columns plus `prediction`.
val predictions = model.transform(test)

predictions: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]


In [10]:
// Preview actual ratings next to the model's predicted ratings.
predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   126|    471|   5.0| 3.8498173|
|   460|    471|   5.0|    3.8482|
|   292|    471|   3.5| 3.9373882|
|   309|    471|   4.0| 4.3008566|
|   537|    471|   5.0|  2.955943|
|   514|    471|   4.0| 3.9447765|
|   102|    471|   5.0| 3.4636588|
|    30|    471|   4.0| 3.2390635|
|   521|    471|   3.5| 3.8136914|
|   497|    496|   2.0| 1.9009589|
|   516|   1088|   3.0| 3.5277994|
|    57|   1088|   4.0| 3.9113593|
|    19|   1088|   3.0|  3.145015|
|   262|   1088|   2.0|  2.227397|
|   461|   1088|   3.0|   2.12214|
|   547|   1088|   5.0| 2.6412466|
|   344|   1088|   3.0|  3.438826|
|    30|   1088|   4.0| 4.0381246|
|   647|   1238|   4.0|  4.762541|
|   585|   1238|   5.0| 4.0878115|
+------+-------+------+----------+
only showing top 20 rows



In [11]:
import org.apache.spark.sql.functions._

import org.apache.spark.sql.functions._


In [14]:
// Per-row absolute error between the actual rating and the predicted rating.
val error = predictions.select(
  abs(col("rating") - col("prediction"))
)

error: org.apache.spark.sql.DataFrame = [abs((rating - prediction)): double]


In [17]:
// Discard rows whose error is null/NaN, then print summary statistics
// (count, mean, stddev, min, max) of the absolute error.
error.na.drop().describe().show()

+-------+--------------------------+
|summary|abs((rating - prediction))|
+-------+--------------------------+
|  count|                     19209|
|   mean|        0.8281573207619471|
| stddev|        0.7139848979552249|
|    min|      7.367134094238281E-5|
|    max|         6.011513710021973|
+-------+--------------------------+

