In [24]:
from pyspark.mllib.linalg.distributed import MatrixEntry, CoordinateMatrix
from pyspark.sql import SparkSession
 
# Create (or reuse) the Spark session for this example.
spark = SparkSession.builder.appName("UserBasedExample").getOrCreate()

# The SparkContext gives access to the RDD API used below.
sc = spark.sparkContext

# Load the raw rating data, one text line per record.
# Each line has the form "i,j,value": i is the row coordinate, j is the
# column coordinate and value is the entry value.
# Example: "1,101,5.0" means user 1 rated item 101 with 5.0.
# Parsing into MatrixEntry objects is done further below by mapping func
# over this RDD.
data = sc.textFile("file:/home/hadoop/spark-exp/exp7/ratingdata.txt")



def func(x):
    """Parse one "i,j,value" text line into a MatrixEntry.

    A MatrixEntry is a single (row index, column index, value) entry of a
    distributed matrix, e.g. "1,101,5.0" becomes MatrixEntry(1, 101, 5.0).

    Returns None when the line does not split into exactly three
    comma-separated fields.
    """
    fields = x.split(",")
    if len(fields) != 3:
        return None
    row, col, value = fields
    return MatrixEntry(int(row), int(col), float(value))
# Parse every line into a MatrixEntry. func returns None for lines that do
# not have exactly three comma-separated fields (blank lines, truncated
# records, ...); drop those here, because CoordinateMatrix only accepts
# MatrixEntry objects or 3-tuples and would fail on a None element.
parseData = data.map(func).filter(lambda entry: entry is not None)

# CoordinateMatrix is the Spark MLlib structure for storing samples of the
# form (user, item, rating).
ratings = CoordinateMatrix(parseData)

# CoordinateMatrix has no columnSimilarities() method, so it is converted to
# a RowMatrix, whose columnSimilarities() computes the similarity between
# columns. The data is laid out as user x item, so it is transposed to
# item x user first: the columns then correspond to users and the result is
# the user-to-user similarity.
matrix = ratings.transpose().toRowMatrix()

# Users are now the columns, so this yields the similarities between users.
similarities = matrix.columnSimilarities()

print("用户相似性矩阵")
# Bring every similarity entry to the driver, ordered by the first user id,
# and print each one as "i->j->value".
# NOTE(review): the sort key is the i index only, so the relative order of
# entries that share the same i is not fixed by this sort.
li = similarities.entries.sortBy(lambda entry: entry.i).collect()
for entry in li:
    print("->".join([str(entry.i), str(entry.j), str(entry.value)]))
    
    
# Ratings given by user 1, as (item id, rating) pairs ordered by item id.
ratingOfUser1 = (
    ratings.entries
    .filter(lambda x: x.i == 1)
    .map(lambda x: (x.j, x.value))
    .sortBy(lambda x: x[0])
)
print("\n")
# Collect once and reuse the result for both the listing and the average,
# instead of running the same filter/map/sort job twice.
collectedRatingsOfUser1 = ratingOfUser1.collect()
for s in collectedRatingsOfUser1:
    print(s)

# Average of all ratings given by user 1.
allRatingValOfUser1 = [pair[1] for pair in collectedRatingsOfUser1]
if allRatingValOfUser1:
    avgRatingOfUser1 = sum(allRatingValOfUser1) / len(allRatingValOfUser1)
else:
    # User 1 has no ratings: the mean is undefined, so report NaN rather
    # than failing with ZeroDivisionError.
    avgRatingOfUser1 = float("nan")
print("\n" + str(avgRatingOfUser1))

print("\n")


# Ratings of item 101 given by every user other than user 1.
def _is_other_users_rating_of_101(entry):
    """Return True for an entry on item 101 that does not belong to user 1."""
    return entry.j == 101 and entry.i != 1


tmp = ratings.entries.filter(_is_other_users_rating_of_101)
# Convert to plain (user, item, rating) tuples ordered by user id.
otherRatingsToItem1 = (
    tmp.map(lambda entry: (entry.i, entry.j, entry.value))
    .sortBy(lambda triple: triple[0])
)
for row in otherRatingsToItem1.collect():
    print(row)
print("\n")

# Entries of the similarity matrix that belong to user 1, as
# (i, j, similarity) tuples in ascending order of similarity.
# NOTE(review): only entries whose i index is 1 are selected; entries with
# j == 1 (if any exist) are not included - confirm this is intended.
tmp2 = similarities.entries.filter(lambda entry: entry.i == 1)
byAscendingSimilarity = tmp2.sortBy(lambda entry: entry.value)
weights = byAscendingSimilarity.map(
    lambda entry: (entry.i, entry.j, entry.value)
).collect()
for triple in weights:
    print(triple)


用户相似性矩阵
1->4->0.6111914276294735
1->2->0.754776694478251
1->5->0.6261698609836083
1->3->0.23981435961206427
2->3->0.25256410947267416
2->4->0.647494259163196
2->5->0.6247714305641069
3->5->0.5937461121628563
3->4->0.4429194874175847
4->5->0.8364197355878605


(101, 5.0)
(102, 3.0)
(103, 2.5)

3.5


(2, 101, 2.0)
(3, 101, 2.5)
(4, 101, 5.0)
(5, 101, 4.0)


(1, 3, 0.23981435961206427)
(1, 4, 0.6111914276294735)
(1, 5, 0.6261698609836083)
(1, 2, 0.754776694478251)


用户相似性矩阵
1->4->0.6111914276294735
1->2->0.754776694478251
1->5->0.6261698609836083
1->3->0.23981435961206427
2->3->0.25256410947267416
2->4->0.647494259163196
2->5->0.6247714305641069
3->5->0.5937461121628563
3->4->0.4429194874175847
4->5->0.8364197355878605
