## Load Data

In [1]:
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [2]:
# The code was removed by DSX for sharing.

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1061|   3.0|
|     1|   1129|   2.0|
|     1|   1172|   4.0|
+------+-------+------+
only showing top 5 rows



In [3]:
small_rating.schema.names

['userId', 'movieId', 'rating']

In [4]:

small_movies = spark.read\
  .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
  .option('header', 'true')\
  .load(cos.url('movies.csv', 'moiverecommendationd136f4f785054be6861fc3dba4e8391e')).drop('genres')

small_movies.show(5)


+-------+--------------------+
|movieId|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|      Jumanji (1995)|
|      3|Grumpier Old Men ...|
|      4|Waiting to Exhale...|
|      5|Father of the Bri...|
+-------+--------------------+
only showing top 5 rows



## Collaborative Filtering

In [5]:
train, validation, test = small_rating.randomSplit([0.6, 0.2, 0.2])
validation_for_predict = validation.select(validation.columns[0:2])
test_for_predict = test.select(test.columns[0:2])

In [6]:
from pyspark.mllib.recommendation import ALS
import math

seed = 0
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1    

In [7]:
for rank in ranks:
    model = ALS.train(train, rank, seed=seed, iterations=iterations, lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict.rdd).map(lambda x: ((x[0], x[1]), x[2]))
    rates_and_preds = validation.rdd.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda x: (x[1][0] - x[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s'%(rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank
    
print('The best model was trained with rank %s' %best_rank)

For rank 4 the RMSE is 0.9393634103820316
For rank 8 the RMSE is 0.9514905390216329
For rank 12 the RMSE is 0.953502927598072
The best model was trained with rank 4


In [9]:

full_rating = spark.read\
  .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
  .option('header', 'true')\
  .load(cos.url('ratings_full.csv', 'moiverecommendationd136f4f785054be6861fc3dba4e8391e'))\
  .drop('timestamp')

full_rating.show(5)


+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    110|   1.0|
|     1|    147|   4.5|
|     1|    858|   5.0|
|     1|   1221|   5.0|
|     1|   1246|   5.0|
+------+-------+------+
only showing top 5 rows



In [11]:
train2, test2 = full_rating.randomSplit([0.7, 0.3])
model2 = ALS.train(train2, best_rank, seed=seed, iterations=iterations, lambda_=regularization_parameter)

test2_for_predict = test2.rdd.map(lambda x: (x[0], x[1]))
predictions2 = model.predictAll(test_for_predict.rdd).map(lambda x: ((x[0], x[1]), x[2]))
rates_and_preds2 = test2.rdd.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).join(predictions2)
error2 = math.sqrt(rates_and_preds2.map(lambda x: (x[1][0] - x[1][1])**2).mean())

In [12]:
print('For testing data the RMSE is %s' %error)

For testing data the RMSE is 0.953502927598072
