In [None]:
import findspark
# Run before the pyspark imports in the following cells.
# NOTE(review): findspark.init() is expected to locate the local Spark
# installation and make `pyspark` importable - confirm for this environment.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.fpm import FPGrowth
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [None]:
import pyspark.sql.functions as f

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('recommender_demo')
    .getOrCreate()
)

In [None]:
# Load the ratings CSV: the first row is the header and column types are inferred.
data = spark.read.csv(
    'Cung cap du lieu buoi 8/movielens_ratings.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the first 5 rows without truncating cell contents.
data.show(n=5, truncate=False)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|2      |3.0   |0     |
|3      |1.0   |0     |
|5      |2.0   |0     |
|9      |4.0   |0     |
|11     |1.0   |0     |
+-------+------+------+
only showing top 5 rows



In [None]:
# Size of the ratings data: total rating rows, plus distinct users and movies.
numerator = data.count()
users = data.select('userId').dropDuplicates().count()
movies = data.select('movieId').dropDuplicates().count()

In [None]:
# Show the total number of ratings, then the distinct user and movie counts.
for summary_value in (numerator, users, movies):
    display(summary_value)

1501

30

100

In [None]:
# Smaller dataset, so use an 80% / 20% train/test split.
# A fixed seed keeps the split (and every metric and table derived from it)
# the same across Restart & Run All; without it each run scores different rows.
training, test = data.randomSplit([0.8, 0.2], seed = 42)

In [None]:
# Fit an ALS collaborative-filtering model on the training split.
# - coldStartStrategy = 'drop': a user or movie that appears only in the test
#   split has no learned factors, so transform() would emit NaN predictions
#   for it and the RMSE would become NaN; 'drop' removes those rows instead.
# - seed: fixes the random initialisation so re-running the cell yields the
#   same model.
# Repeat with different parameters (maxIter, regParam) to tune the model.
als = ALS(maxIter = 10, regParam = 0.01,
          userCol = 'userId',
          itemCol = 'movieId',
          ratingCol = 'rating',
          coldStartStrategy = 'drop',
          seed = 42)
model = als.fit(training)

In [None]:
# Score the held-out test split with the fitted model; the result is `test`
# plus a `prediction` column (the RMSE is computed from it in a later cell).
predictions = model.transform(test)

In [None]:
# Preview the first 5 scored test rows.
predictions.show(n=5)

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   1.0|    13|  0.221611|
|     31|   3.0|     8| 3.6727962|
|     31|   1.0|    29|0.28979826|
|     31|   1.0|     0| 0.9264622|
|     85|   1.0|    13|  3.052084|
+-------+------+------+----------+
only showing top 5 rows



In [None]:
# RMSE between the actual `rating` and the model's `prediction` on the scored test rows.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
print(f'Root-mean-squared error = {rmse}')

Root-mean-squared error = 1.591313012769057


In [None]:
# On average, this model is ~1.6 rating points (the RMSE above) away from perfect recommendations

In [None]:
# For every user, get the 20 movies with the highest predicted rating.
user_recs = model.recommendForAllUsers(20)

In [None]:
# Print the recommendation rows of the first two users, each followed by two blank lines.
for rec_row in user_recs.head(2):
    print(rec_row, end='\n\n\n')

Row(userId=28, recommendations=[Row(movieId=47, rating=5.848161697387695), Row(movieId=22, rating=5.718774795532227), Row(movieId=71, rating=5.585036277770996), Row(movieId=12, rating=4.931039810180664), Row(movieId=25, rating=4.892429828643799), Row(movieId=92, rating=4.837496757507324), Row(movieId=81, rating=4.825254440307617), Row(movieId=1, rating=4.761500835418701), Row(movieId=91, rating=4.619905471801758), Row(movieId=33, rating=4.054471492767334), Row(movieId=89, rating=3.977895736694336), Row(movieId=49, rating=3.9064536094665527), Row(movieId=84, rating=3.7748336791992188), Row(movieId=17, rating=3.7146804332733154), Row(movieId=2, rating=3.682649612426758), Row(movieId=82, rating=3.268052339553833), Row(movieId=62, rating=3.1074442863464355), Row(movieId=40, rating=3.09523344039917), Row(movieId=19, rating=3.0653679370880127), Row(movieId=16, rating=2.979362964630127)])


Row(userId=26, recommendations=[Row(movieId=46, rating=5.350130558013916), Row(movieId=94, rating=5.096

In [None]:
# Held-out test ratings for one user, shown untruncated.
userID = 27
user_test_ratings = test.filter(test.userId == userID)
user_test_ratings.show(truncate=False)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|27     |3.0   |27    |
|40     |1.0   |27    |
|45     |1.0   |27    |
|59     |1.0   |27    |
|66     |3.0   |27    |
|72     |1.0   |27    |
|75     |3.0   |27    |
+-------+------+------+



In [None]:
# Held-out test ratings of user 27 (only the rows that landed in `test`,
# not the user's full rating history), highest rating first.
userID = 27
(test
 .where(test.userId == userID)
 .orderBy('rating', ascending=False)
 .show())

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|     66|   3.0|    27|
|     27|   3.0|    27|
|     75|   3.0|    27|
|     40|   1.0|    27|
|     45|   1.0|    27|
|     59|   1.0|    27|
|     72|   1.0|    27|
+-------+------+------+

