In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('recommender_demo').getOrCreate()

### Đọc dữ liệu

In [5]:
data = spark.read.csv('../../Data/movielens_ratings.csv', inferSchema=True, header=True)

In [6]:
data.show(5, False)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|2      |3.0   |0     |
|3      |1.0   |0     |
|5      |2.0   |0     |
|9      |4.0   |0     |
|11     |1.0   |0     |
+-------+------+------+
only showing top 5 rows



In [7]:
# Distinct users and movies
users = data.select("userId").distinct().count()
movies = data.select("movieId").distinct().count()
numerator = data.count()

In [8]:
display(users, movies, numerator)

30

100

1501

### Chuẩn hóa dữ liệu, chuyển đổi dữ liệu (nếu cần)
### Chia sẻ dữ liệu train-test

In [9]:
# Smaller dataset so we will use 0.8/0.2
(training, test) = data.randomSplit([0.8, 0.2])

### Xây dựng model

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

* maxIter is the maximum number of iterations to run (defaults to 10).
* regParam specifies the regularization parameter in ALS (defaults to 1.0).
* rank is the number of latent factors in the model (defaults to 10).
* numBlocks is the number of blocks the users and ittems will be partitioned into in order to parallelize computation (defaults to 10).
* implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
* alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0).
* nonnegative specifies whether or not to use nonnegative constraints for least squares (defaults to false).

In [11]:
als = ALS(maxIter=10, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating", alpha=0.01)
model = als.fit(training)

### Đánh giá kết quả

In [12]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [13]:
predictions.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   1.0|    27| 0.6368803|
|     31|   4.0|    12| 1.0867245|
|     31|   1.0|     4| 1.7705575|
|     85|   1.0|    12| 0.8865745|
|     85|   3.0|     6|  2.700335|
|     85|   1.0|     4| 2.6832724|
|     85|   1.0|    25|0.90189505|
|     65|   1.0|    22|  0.778532|
|     65|   1.0|     4|0.78153044|
|     53|   1.0|     6|  2.258929|
|     53|   3.0|    20| 1.0708176|
|     53|   2.0|    19| 0.9484483|
|     53|   5.0|    21| 2.9477472|
|     78|   1.0|     1| 0.7810516|
|     78|   1.0|    19| 0.7383895|
|     78|   1.0|     4|0.89909494|
|     34|   1.0|    16|0.99475443|
|     34|   1.0|    19| 1.7423106|
|     34|   3.0|    25| 1.0933553|
|     34|   4.0|     2| 1.6565886|
+-------+------+------+----------+
only showing top 20 rows



In [14]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error =', str(rmse))

Root-mean-square error = 1.0528745793022518


### Đưa ra đề xuất cho tất cả user

In [15]:
# Get 3 recommendations which have highest rating
user_recs = model.recommendForAllUsers(3)

In [18]:
for user in user_recs.head(10):
    print(user)
    print('\n')

Row(userId=28, recommendations=[Row(movieId=81, rating=4.012272834777832), Row(movieId=12, rating=3.9162325859069824), Row(movieId=2, rating=3.6629419326782227)])


Row(userId=26, recommendations=[Row(movieId=22, rating=4.6804962158203125), Row(movieId=94, rating=4.527807712554932), Row(movieId=24, rating=4.452171802520752)])


Row(userId=27, recommendations=[Row(movieId=32, rating=3.397430896759033), Row(movieId=18, rating=3.343568801879883), Row(movieId=38, rating=3.1188554763793945)])


Row(userId=12, recommendations=[Row(movieId=46, rating=5.055620193481445), Row(movieId=55, rating=4.19067907333374), Row(movieId=64, rating=4.030313968658447)])


Row(userId=22, recommendations=[Row(movieId=75, rating=4.6111907958984375), Row(movieId=74, rating=4.492098808288574), Row(movieId=88, rating=4.179080009460449)])


Row(userId=1, recommendations=[Row(movieId=62, rating=3.0914840698242188), Row(movieId=68, rating=3.079878568649292), Row(movieId=22, rating=2.9451892375946045)])


Row(userId=1

### Đưa ra đề xuất cho một user cụ thể

In [19]:
userId = 27
user_recs.filter(user_recs['userId']==27).show(truncate=False)

+------+--------------------------------------------------+
|userId|recommendations                                   |
+------+--------------------------------------------------+
|27    |[{32, 3.397431}, {18, 3.3435688}, {38, 3.1188555}]|
+------+--------------------------------------------------+

