In [1]:
from pyspark import SparkContext, SparkConf # 导入相关工具包
from pyspark.mllib.recommendation import ALS
from math import sqrt
from operator import add
import itertools

# 初始化Spark上下文
# local为本地调试模式，具体集群方式参照http://spark.apache.org/docs/latest/cluster-overview.html
conf = SparkConf().setAppName("CF").setMaster("local") 
sc = SparkContext(conf=conf)
print ("init complete：sc = ", sc)

('init complete\xef\xbc\x9asc = ', <SparkContext master=local appName=CF>)


In [2]:
# 导入数据，数据格式为：user_id::movies_id::rating::time
ratings = sc.textFile("file:///root/notebook/data/ratings.dat").map(lambda line: line.strip().split("::"))
print("data.count() = %d" % ratings.count())
# 对应的电影文件的格式为movieId::movieTitle
movies = sc.textFile("file:///root/notebook/data/movies.dat").map(lambda line: line.strip().split("::"))

# 数据预处理，根据评论时间戳最后一位把整个数据集分成训练集(60%), 交叉验证集(20%), 和评估集(20%)
ratingsData = ratings.map(lambda fields: (long(fields[3]) % 10, (int(fields[0]), int(fields[1]), float(fields[2]))))
trainingData = ratingsData.filter(lambda x: x[0] < 6).values()
validationData = ratingsData.filter(lambda x: x[0] >= 6 and x[0] < 8).values()
testData = ratingsData.filter(lambda x: x[0] >= 8).values()

numTraining, numValidation, numTest = trainingData.count(), validationData.count(), testData.count()
print("training.count()=%d,validation.count()=%d,test.count()=%d" % (numTraining, numValidation, numTest))

moviesData = movies.map(lambda fields: (int(fields[0]), fields[1]))
rawMoviesData = dict(moviesData.collect())


data.count() = 10000
training.count()=5932,validation.count()=2001,test.count()=2067


In [3]:
# 计算model在data数据集上的均方误差(Mean Squared Error)
def computeRmse(model, data):
    newData = data.map(lambda r: (r[0], r[1]))
    predictions = model.predictAll(newData).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = data.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    return ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()

In [4]:
# 训练模型，注意，为了更好的调整参数，每个参数都使用了两个值最为备选值，通过
# 使模型在用于调参的数据上的误差最小选取参数，这个可以参数表可以自己设置。
# train的参数有lambda_是正则化项，blocks表示分区数，设置为-1为使用默认配置
# iterations是迭代次数，rank是每一个user或movies的隐含因素的维数。注意，
# rank过大或lambda过小都可能导致过拟合，可能导致预测结果偏小
def train(training, validation, test, iterations=5, lambda_=0.01, blocks=-1):
    ranks = [8, 12]
    lambdas = [1.0, 10.0]
    numIters = [10, 20]
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1

    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        model = ALS.train(training, rank, numIter, lmbda)
        validationRmse = computeRmse(model, validation)
        print "RMSE (validation) = %f for the model trained with " % validationRmse + \
              "rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter)
        if (validationRmse < bestValidationRmse):
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter

    testRmse = computeRmse(bestModel, test)

    print "The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
          + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse)
    return bestModel

In [5]:
# 预测。注意使用ALS算法时预测的user_id和movies都必须在训练集中。
def predict(model, rating, user_id):
    myRateMovieIdsRDD = rating.filter(lambda x: int(x[0]) == user_id). \
        map(lambda x: x[1]).collect()
    myRateMovieIds = set(myRateMovieIdsRDD)
    candidates = sc.parallelize([m for m in rawMoviesData if m not in myRateMovieIds])
    predictions = model.predictAll(candidates.map(lambda x: (user_id, x))).collect()
    recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:50]
    print "Movies recommended for you:"
    for i in xrange(len(recommendations)):
        print ("%2d: %s" % (i + 1, rawMoviesData[recommendations[i][1]])).encode('ascii', 'ignore')


In [6]:
if __name__ == '__main__':
    model = train(training=trainingData, validation=validationData, test=testData)
    predict(model, testData, 1)


RMSE (validation) = 1.829934 for the model trained with rank = 8, lambda = 1.0, and numIter = 10.
RMSE (validation) = 1.828464 for the model trained with rank = 8, lambda = 1.0, and numIter = 20.
RMSE (validation) = 14.678594 for the model trained with rank = 8, lambda = 10.0, and numIter = 10.
RMSE (validation) = 14.678594 for the model trained with rank = 8, lambda = 10.0, and numIter = 20.
RMSE (validation) = 1.830608 for the model trained with rank = 12, lambda = 1.0, and numIter = 10.
RMSE (validation) = 1.828451 for the model trained with rank = 12, lambda = 1.0, and numIter = 20.
RMSE (validation) = 14.678594 for the model trained with rank = 12, lambda = 10.0, and numIter = 10.
RMSE (validation) = 14.678594 for the model trained with rank = 12, lambda = 10.0, and numIter = 20.
The best model was trained with rank = 12 and lambda = 1.0, and numIter = 20, and its RMSE on the test set is 1.799926.
Movies recommended for you:
 1: Love & Sex (2000)
 2: 13th Warrior, The (1999)
 3: E