In [None]:
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ALS, Rating, MatrixFactorizationModel}

In [None]:
// Read the raw ratings file; each line is expected to be "user,item,rate".
// NOTE(review): a line with any other number of fields (e.g. a header or an extra
// column) fails the single-case match below with a MatchError — confirm the file layout.
val data = sc.textFile("/ratings.csv")
val ratings = data.map { line =>
  line.split(',') match {
    case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble)
  }
}

// Randomly divide the ratings into roughly 70% for training and 30% for testing.
val splits = ratings.randomSplit(Array(0.7, 0.3))
val trainingData = splits(0)
val testData = splits(1)

In [None]:
// Train an explicit-feedback ALS model on the training split.
// ALS.train takes (ratings, rank, iterations, lambda); alpha is only a parameter of
// ALS.trainImplicit. A fifth Int argument to ALS.train is `blocks` (the level of
// parallelism), not alpha, so it is left out here to let MLlib auto-configure it
// rather than pinning the computation to a single block.
val rank = 9             // number of latent factors
val numIterations = 10   // number of ALS iterations
val lambda = 0.01        // regularization parameter
val factorization = ALS.train(trainingData, rank, numIterations, lambda)

In [None]:
// Drop the rating from every test record, keeping only the (user, product) pairs to score.
val usersProducts = testData.map(r => (r.user, r.product))

In [None]:
// Predict a rating for every test (user, product) pair and key each prediction by that pair.
val predictions = factorization.predict(usersProducts).map(scored =>
  ((scored.user, scored.product), scored.rating)
)
predictions.take(5)

In [None]:
// Pair each held-out rating with the model's prediction for the same (user, product) key,
// giving ((user, product), (actualRating, predictedRating)).
// Only testData is joined: predictions exist solely for test pairs, so joining the full
// `ratings` set would shuffle the training rows for nothing and could match a training
// rating whenever a (user, product) pair occurs more than once in the data.
val ratesAndPreds = testData.map(r => ((r.user, r.product), r.rating)).join(predictions)
ratesAndPreds.take(5)

In [None]:
// Attach movie information to the rating/prediction pairs.
// movies.csv is parsed into (id, second field) pairs — presumably (movieId, title).
// NOTE(review): a plain split(",") cuts off any title that itself contains a comma — confirm the file format.
val movies = sc.textFile("/movies.csv").map { line =>
  val fields = line.split(",")
  (fields(0).toInt, fields(1))
}
// Re-key by product id so the pairs can be joined against the movies RDD.
val movieSet = ratesAndPreds.map { case ((user, product), (actual, predicted)) =>
  (product, (user, actual, predicted))
}.join(movies)
movieSet.take(10)

In [None]:
// Measure prediction error over the joined (actual, predicted) rating pairs.
// MSE is the mean of the squared differences, so it is in *squared* rating units;
// its square root (RMSE) is the figure expressed in the units of the rating scale,
// i.e. roughly how far off a prediction is on that scale.
val MSE = ratesAndPreds.map { case (_, (actual, predicted)) =>
  val err = actual - predicted
  err * err
}.mean()
val RMSE = math.sqrt(MSE)
println(s"Mean Squared Error = $MSE")
println(s"Root Mean Squared Error = $RMSE")