In [26]:
import math
import os
from pyspark.mllib.recommendation import ALS
from pyspark import SparkConf,SparkContext

# 读取数据

In [18]:
dataset_path = os.path.join('file:///home/hadoop/workspaces/books_recommendation_system/datasets', 'BX-CSV-Dump')
ratings_file_path = os.path.join(dataset_path, 'BX-Book-Ratings.csv')
ratings_raw_RDD = sc.textFile(ratings_file_path)

# 处理数据

In [22]:
ratings_raw_data_header = ratings_raw_RDD.take(1)[0]
ratings_RDD = ratings_raw_RDD.filter(lambda line : line != ratings_raw_data_header)\
.map(lambda line : line.split(';'))\
.map(lambda tokens : (int(tokens[0][1:-1]),abs(hash(tokens[1][1:-1])) % (10 ** 8),int(tokens[2][1:-1]))).cache()

# 构造训练集，测试集和验证集

In [23]:
test,train,validate = ratings_RDD.randomSplit(weights = [0.3, 0.6, 0.1], seed = 1)

In [24]:
test.top(3)

[(278854, 89577233, 7), (278851, 82832139, 0), (278851, 76278149, 0)]

In [25]:
test = test.map(lambda token : (token[0],token[1]))

# 模型
## （1）参数设置

In [None]:
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12, 16, 20]
errors = [0, 0, 0, 0, 0]
err = 0
tolerance = 0.02
min_error = float('inf')
best_rank = -1
best_iteration = -1

## （2）模型训练

In [31]:
for rank in ranks:
    model = ALS.train(train,rank,seed = seed,iterations = iterations, lambda_ = regularization_parameter)
    predictions = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validate.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

For rank 4 the RMSE is 9.779722167495578
For rank 8 the RMSE is 1.9311406385604153
For rank 12 the RMSE is 4.978624280268749
For rank 16 the RMSE is 4.242691962481146
For rank 20 the RMSE is 2.961898227522224


In [32]:
print('The best model was trained with rank %s' % best_rank)

The best model was trained with rank 8


In [35]:
rates_and_preds.take(1)

[((98391, 16891830), (9.0, 6.038101772477776))]

In [36]:
predictions.take(1)

[((19378, 60135476), 4.464581323923139)]