# Movie recommendation using ALS
https://medium.com/@navdeepsingh_2336/scala-machine-learning-projects-recommendation-systems-d41d9eebbb06

https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

https://bugra.github.io/work/notes/2014-04-19/alternating-least-squares-method-for-collaborative-filtering/

https://fr.wikipedia.org/wiki/Filtrage_collaboratif

## Import packages 

In [ ]:
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating

import scala.Tuple2
import org.apache.spark.rdd.RDD

import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD


## Read data 

In [ ]:
val pathRatings = "hdfs://hupi-factory-02-01-01-01/user/factory02/movieLens20m/ratings.csv"
val pathMovies = "hdfs://hupi-factory-02-01-01-01/user/factory02/movieLens20m/movies.csv"

pathRatings: String = hdfs://hupi-factory-02-01-01-01/user/factory02/movieLens20m/ratings.csv
pathMovies: String = hdfs://hupi-factory-02-01-01-01/user/factory02/movieLens20m/movies.csv


In [ ]:
val df1 = sparkSession.read.format("com.databricks.spark.csv").option("header", true).load(pathRatings)
val df2 = sparkSession.read.format("com.databricks.spark.csv").option("header", "true").load(pathMovies)

df1: org.apache.spark.sql.DataFrame = [userId: string, movieId: string ... 2 more fields]
df2: org.apache.spark.sql.DataFrame = [movieId: string, title: string ... 1 more field]


In [ ]:
val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))
ratingsDF.show(false)

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |2      |3.5   |1112486027|
|1     |29     |3.5   |1112484676|
|1     |32     |3.5   |1112484819|
|1     |47     |3.5   |1112484727|
|1     |50     |3.5   |1112484580|
|1     |112    |3.5   |1094785740|
|1     |151    |4.0   |1094785734|
|1     |223    |4.0   |1112485573|
|1     |253    |4.0   |1112484940|
|1     |260    |4.0   |1112484826|
|1     |293    |4.0   |1112484703|
|1     |296    |4.0   |1112484767|
|1     |318    |4.0   |1112484798|
|1     |337    |3.5   |1094785709|
|1     |367    |3.5   |1112485980|
|1     |541    |4.0   |1112484603|
|1     |589    |3.5   |1112485557|
|1     |593    |3.5   |1112484661|
|1     |653    |3.0   |1094785691|
|1     |919    |3.5   |1094785621|
+------+-------+------+----------+
only showing top 20 rows

ratingsDF: org.apache.spark.sql.DataFrame = [userId: string, movieId: string ... 2 more fields]


In [ ]:
val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres"))

moviesDF: org.apache.spark.sql.DataFrame = [movieId: string, title: string ... 1 more field]


In [ ]:
// Create temporary tables
ratingsDF.createOrReplaceTempView("ratings")
moviesDF.createOrReplaceTempView("movies")

## Explore data 

In [ ]:
val numRatings = ratingsDF.count()

numRatings: Long = 20000263


In [ ]:
val numUsers = ratingsDF.select(ratingsDF.col("userId")).distinct().count()

numUsers: Long = 138493


In [ ]:
val numMovies = ratingsDF.select(ratingsDF.col("movieId")).distinct().count() 

numMovies: Long = 26744


In [ ]:
println("Got " + numRatings + " ratings from " + numUsers + " users on " + numMovies + " movies.")

Got 20000263 ratings from 138493 users on 26744 movies.


Get the max, min ratings along with the count of users who have rated a movie

In [ ]:
val results = sparkSession.sql("""
  select movies.title, movierates.maxr, movierates.minr, movierates.cntu 
  from(SELECT ratings.movieId, max(ratings.rating) as maxr, min(ratings.rating) as minr, 
          count(distinct userId) as cntu
      FROM ratings group by ratings.movieId) movierates
  join movies on movierates.movieId=movies.movieId
""")

results: org.apache.spark.sql.DataFrame = [title: string, maxr: string ... 2 more fields]


In [ ]:
results.show(false)

+--------------------------------------------------+----+----+-----+
|title                                             |maxr|minr|cntu |
+--------------------------------------------------+----+----+-----+
|Battle of Los Angeles (2011)                      |5.0 |0.5 |22   |
|Frozen Planet (2011)                              |5.0 |0.5 |31   |
|One-Eyed Monster (2008)                           |4.0 |0.5 |4    |
|Deep in the Valley (American Hot Babes) (2009)    |3.0 |0.5 |5    |
|Only God Forgives (2013)                          |5.0 |0.5 |167  |
|Pyaar Ka Punchnama (2011)                         |4.5 |2.0 |4    |
|Love and Lemons (Sm? citroner gula) (2013)        |3.0 |3.0 |3    |
|Deathsport (1978)                                 |3.0 |1.0 |3    |
|Narrien illat (1970)                              |2.0 |2.0 |1    |
|Toy Story of Terror (2013)                        |5.0 |0.5 |60   |
|Tortured (2008)                                   |2.5 |2.5 |1    |
|World War II: When Lions Roared (

The 10 most active users and how many times they rated a movie

In [ ]:
val mostActiveUsersSchemaRDD = sparkSession.sql("""
  SELECT ratings.userId, count(*) as ct 
  from ratings 
  group by ratings.userId 
  order by ct desc 
  limit 10
""")

mostActiveUsersSchemaRDD: org.apache.spark.sql.DataFrame = [userId: string, ct: bigint]


In [ ]:
mostActiveUsersSchemaRDD.show(false)

+------+----+
|userId|ct  |
+------+----+
|118205|9254|
|8405  |7515|
|82418 |5646|
|121535|5520|
|125794|5491|
|74142 |5447|
|34576 |5356|
|131904|5330|
|83090 |5169|
|59477 |4988|
+------+----+



Let’s have a look at a particular user and find the movies that, say user, 668 rated higher than 4

In [ ]:
val results2 = sparkSession.sql("""
  SELECT ratings.userId, ratings.movieId, ratings.rating, movies.title 
  FROM ratings 
  JOIN movies ON movies.movieId=ratings.movieId
  where ratings.userId=668 and ratings.rating > 4
""") 

results2: org.apache.spark.sql.DataFrame = [userId: string, movieId: string ... 2 more fields]


In [ ]:
results2.show(false)

+------+-------+------+----------------------------------------------------+
|userId|movieId|rating|title                                               |
+------+-------+------+----------------------------------------------------+
|668   |175    |4.5   |Kids (1995)                                         |
|668   |296    |5.0   |Pulp Fiction (1994)                                 |
|668   |1258   |4.5   |Shining, The (1980)                                 |
|668   |1285   |4.5   |Heathers (1989)                                     |
|668   |2174   |4.5   |Beetlejuice (1988)                                  |
|668   |2395   |5.0   |Rushmore (1998)                                     |
|668   |2915   |4.5   |Risky Business (1983)                               |
|668   |2959   |4.5   |Fight Club (1999)                                   |
|668   |4973   |5.0   |Amelie (Fabuleux destin d'Am?lie Poulain, Le) (2001)|
|668   |4979   |5.0   |Royal Tenenbaums, The (2001)                        |

## Split data 

Split ratings RDD into training RDD (75%) & test RDD (25%)

In [ ]:
val splits = ratingsDF.randomSplit(Array(0.75, 0.25), seed = 12345L)
val (trainingData, testData) = (splits(0), splits(1))

val numTraining = trainingData.count()
val numTest = testData.count()

println("Training: " + numTraining + " test: " + numTest)

Training: 15002884 test: 4997379
splits: Array[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = Array([userId: string, movieId: string ... 2 more fields], [userId: string, movieId: string ... 2 more fields])
trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: string, movieId: string ... 2 more fields]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: string, movieId: string ... 2 more fields]
numTraining: Long = 15002884
numTest: Long = 4997379


## Prepare data for training model 

In [ ]:
// For trainingData
val ratingsRDD = trainingData.rdd.map(row => { 
  val userId = row.getString(0)
  val movieId = row.getString(1)
  val ratings = row.getString(2)
  Rating(userId.toInt, movieId.toInt, ratings.toDouble)
})

ratingsRDD: org.apache.spark.rdd.RDD[org.apache.spark.mllib.recommendation.Rating] = MapPartitionsRDD[76] at map at <console>:75


In [ ]:
// For testData
val testRDD = testData.rdd.map(row => {
  val userId = row.getString(0)
  val movieId = row.getString(1)
  val ratings = row.getString(2)
  Rating(userId.toInt, movieId.toInt, ratings.toDouble)
})

testRDD: org.apache.spark.rdd.RDD[org.apache.spark.mllib.recommendation.Rating] = MapPartitionsRDD[81] at map at <console>:75


## Build ALS model 

In [ ]:
// les hyper-paramètres du modèle
val rank = 20
val numIterations = 15
val lambda = 0.10
val alpha = 1.00 
val block = -1
val seed = 12345L
val implicitPrefs = false

rank: Int = 20
numIterations: Int = 15
lambda: Double = 0.1
alpha: Double = 1.0
block: Int = -1
seed: Long = 12345
implicitPrefs: Boolean = false


In [ ]:
val model = new ALS().setIterations(numIterations) .setBlocks(block).setAlpha(alpha)
  .setLambda(lambda)
  .setRank(rank) .setSeed(seed)
  .setImplicitPrefs(implicitPrefs)
  .run(ratingsRDD)

model: org.apache.spark.mllib.recommendation.MatrixFactorizationModel = org.apache.spark.mllib.recommendation.MatrixFactorizationModel@4fd89dd7


## Predict 

In [ ]:
println("Rating:(UserID, MovieID, Rating)")

println("----------------------------------")

val topRecsForUser = model.recommendProducts(668, 6) 

for (rating <- topRecsForUser) { println(rating.toString()) } 
println("----------------------------------")

Rating:(UserID, MovieID, Rating)
----------------------------------
Rating(668,98595,5.740206979348404)
Rating(668,117907,5.704248251716616)
Rating(668,112423,5.511863334836015)
Rating(668,104317,5.437270557417177)
Rating(668,89056,5.421782564053725)
Rating(668,128830,5.342475209729571)
----------------------------------
topRecsForUser: Array[org.apache.spark.mllib.recommendation.Rating] = Array(Rating(668,98595,5.740206979348404), Rating(668,117907,5.704248251716616), Rating(668,112423,5.511863334836015), Rating(668,104317,5.437270557417177), Rating(668,89056,5.421782564053725), Rating(668,128830,5.342475209729571))


## Evaluate model 

In [ ]:
def computeRmse(model: MatrixFactorizationModel, 
                data: RDD[Rating], 
                implicitPrefs: Boolean): Double = { 
  val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product))) 
  val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating) } 
        .join(data.map(x => ((x.user, x.product), x.rating))).values 
  if (implicitPrefs) { println("(Prediction, Rating)") 
                      println(predictionsAndRatings.take(5).mkString("n")) 
                     } 
  math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
}

computeRmse: (model: org.apache.spark.mllib.recommendation.MatrixFactorizationModel, data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.recommendation.Rating], implicitPrefs: Boolean)Double


In [ ]:
val rmseTest = computeRmse(model, testRDD, true)

println("Test RMSE: = " + rmseTest) //Less is better

(Prediction, Rating)
(4.104696463479858,5.0)n(4.149396123497152,4.5)n(3.9602610672116043,3.5)n(3.8000647397782297,3.5)n(3.813269417867101,5.0)
Test RMSE: = 0.8055232578571584
rmseTest: Double = 0.8055232578571584


In [ ]:
println("Recommendations: (MovieId => Rating)") 
println("----------------------------------") 
val recommendationsUser = model.recommendProducts(668, 6) 

recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println) 
println("----------------------------------")

Recommendations: (MovieId => Rating)
----------------------------------
(98595,5.740206979348404)
(117907,5.704248251716616)
(112423,5.511863334836015)
(104317,5.437270557417177)
(89056,5.421782564053725)
(128830,5.342475209729571)
----------------------------------
recommendationsUser: Array[org.apache.spark.mllib.recommendation.Rating] = Array(Rating(668,98595,5.740206979348404), Rating(668,117907,5.704248251716616), Rating(668,112423,5.511863334836015), Rating(668,104317,5.437270557417177), Rating(668,89056,5.421782564053725), Rating(668,128830,5.342475209729571))
