In [1]:
import numpy as np
import pandas as pd

In [6]:
# 1) Define the data set. Plain dictionaries hold the ratings here; real-world
#    rating data is far sparser and is rarely as complete as this toy example.
def loadData():
    """Return the toy rating data as two nested dictionaries.

    Returns:
        tuple: ``(items, users)`` where ``users`` maps
        user id -> {item id: rating} and ``items`` holds the same ratings
        inverted, item id -> {user id: rating}. User 1 has no rating for
        item 'E', so ``items['E']`` has no key ``1``.
    """
    users = {
        1: {'A': 5, 'B': 3, 'C': 4, 'D': 4},
        2: {'A': 3, 'B': 1, 'C': 2, 'D': 3, 'E': 3},
        3: {'A': 4, 'B': 3, 'C': 4, 'D': 3, 'E': 5},
        4: {'A': 3, 'B': 3, 'C': 1, 'D': 5, 'E': 4},
        5: {'A': 1, 'B': 5, 'C': 5, 'D': 2, 'E': 1},
    }
    # Build the item view by inverting the user view, so the two dictionaries
    # describe exactly the same ratings.
    items = {}
    for user_id, ratings in users.items():
        for item_id, rating in ratings.items():
            items.setdefault(item_id, {})[user_id] = rating
    return items, users
# Load the raw dictionaries, then build one DataFrame per orientation.
# DataFrame(dict-of-dicts) places the outer keys on the columns, so each frame
# is transposed to put the outer keys (users / items) on the rows instead.
items, users = loadData()
user_df = pd.DataFrame(users).transpose()
item_df = pd.DataFrame(items).transpose()

In [4]:
# Preview the user-by-item rating table (one row per user, one column per item).
user_df.head()

Unnamed: 0,A,B,C,D,E
1,5.0,3.0,4.0,4.0,
2,3.0,1.0,2.0,3.0,3.0
3,4.0,3.0,4.0,3.0,5.0
4,3.0,3.0,1.0,5.0,4.0
5,1.0,5.0,5.0,2.0,1.0


In [5]:
# Preview the item-by-user rating table (one row per item, one column per user).
item_df.head()

Unnamed: 0,1,2,3,4,5
A,5.0,3.0,4.0,3.0,1.0
B,3.0,1.0,3.0,3.0,5.0
C,4.0,2.0,4.0,1.0,5.0
D,4.0,3.0,3.0,5.0,2.0
E,,3.0,5.0,4.0,1.0


In [9]:
# 2) Build the user-user similarity matrix.
# Row/column labels come from `users` itself rather than a hard-coded id list,
# so the matrix stays aligned with the data if the rating set changes.
user_ids = list(users)
similarity_matrix = pd.DataFrame(
    np.zeros((len(user_ids), len(user_ids))), index=user_ids, columns=user_ids
)
for userId in user_ids:
    for otherUserId in user_ids:
        if userId == otherUserId:
            continue  # the diagonal stays 0
        # Collect the ratings of the items that BOTH users have rated.
        vec_user = []
        vec_otheruser = []
        for itemRating in items.values():
            if userId in itemRating and otherUserId in itemRating:
                vec_user.append(itemRating[userId])
                vec_otheruser.append(itemRating[otherUserId])
        # A correlation needs at least two co-rated items; with fewer,
        # np.corrcoef returns NaN, so the entry is left at 0 instead.
        if len(vec_user) < 2:
            continue
        # Pearson correlation is used as the similarity between two users;
        # cosine similarity would work as well.
        # Write with a single .loc[row, column] call: the chained form
        # similarity_matrix[col][row] = ... assigns through an intermediate
        # Series and is not applied under pandas Copy-on-Write.
        similarity_matrix.loc[otherUserId, userId] = np.corrcoef(vec_user, vec_otheruser)[0, 1]

In [10]:
# Display the user-user similarity matrix computed above.
similarity_matrix

Unnamed: 0,1,2,3,4,5
1,0.0,0.852803,0.707107,0.0,-0.792118
2,0.852803,0.0,0.467707,0.489956,-0.900149
3,0.707107,0.467707,0.0,-0.161165,-0.466569
4,0.0,0.489956,-0.161165,0.0,-0.641503
5,-0.792118,-0.900149,-0.466569,-0.641503,0.0


In [11]:
# 3) Recommend for user 1: pick the n most similar users; the final score is
#    computed from these neighbours.
n = 2  # number of most-similar users to keep
# Drop user 1's own entry before ranking: its self-similarity is stored as 0,
# which would otherwise outrank every negatively correlated user and make
# user 1 its own neighbour.
similarity_user = (
    similarity_matrix[1]
    .drop(index=1)
    .sort_values(ascending=False)
    .iloc[:n]
    .index.tolist()
)
similarity_user

[2, 3]

In [13]:
# Compute user 1's predicted score for an item, using item E as the example:
#   prediction = mean(user 1) + sum_u sim(1, u) * (r[u, E] - mean(u)) / sum_u |sim(1, u)|
base_score = np.mean(list(users[1].values()))  # user 1's average rating
weighted_scores = 0.
corr_values_sum = 0.
for user in similarity_user:
    if 'E' not in users[user]:
        continue  # a neighbour who has not rated E cannot contribute to the prediction
    corr_value = similarity_matrix.loc[user, 1]  # similarity between user 1 and this neighbour
    mean_user_score = np.mean(list(users[user].values()))  # the neighbour's average rating
    weighted_scores += corr_value * (users[user]['E'] - mean_user_score)
    # Normalise by |similarity| so that negative weights cannot cancel the
    # denominator or flip the sign of the adjustment.
    corr_values_sum += abs(corr_value)
if corr_values_sum:
    final_scores = base_score + weighted_scores / corr_values_sum
else:
    # No usable neighbour: fall back to user 1's own average.
    final_scores = base_score
print('用户1对物品E的打分:', final_scores)
# Assign with a single .loc[row, column] call; the chained form
# user_df.loc[1]['E'] = ... writes to a temporary row and is not applied
# under pandas Copy-on-Write.
user_df.loc[1, 'E'] = final_scores
user_df

用户1对物品E的打分: 4.871979899370592


Unnamed: 0,A,B,C,D,E
1,5.0,3.0,4.0,4.0,4.87198
2,3.0,1.0,2.0,3.0,3.0
3,4.0,3.0,4.0,3.0,5.0
4,3.0,3.0,1.0,5.0,4.0
5,1.0,5.0,5.0,2.0,1.0


## 基于物品的协同过滤

In [15]:
# 2) Build the item-item similarity matrix.
# Row/column labels come from `items` itself rather than a hard-coded id list,
# so the matrix stays aligned with the data if the rating set changes.
item_ids = list(items)
similarity_matrix = pd.DataFrame(
    np.zeros((len(item_ids), len(item_ids))), index=item_ids, columns=item_ids
)
for itemId in item_ids:
    for otherItemId in item_ids:
        if itemId == otherItemId:
            continue  # the diagonal stays 0
        # Collect the ratings from the users who have rated BOTH items.
        item_vec = []
        otherItem_vec = []
        for userRating in users.values():
            if itemId in userRating and otherItemId in userRating:
                item_vec.append(userRating[itemId])
                otherItem_vec.append(userRating[otherItemId])
        # A correlation needs at least two common raters; with fewer,
        # np.corrcoef returns NaN, so the entry is left at 0 instead.
        if len(item_vec) < 2:
            continue
        # Pearson correlation is used here; cosine similarity would work as well.
        # Write with a single .loc[row, column] call: the chained form
        # similarity_matrix[col][row] = ... assigns through an intermediate
        # Series and is not applied under pandas Copy-on-Write.
        similarity_matrix.loc[otherItemId, itemId] = np.corrcoef(item_vec, otherItem_vec)[0, 1]
similarity_matrix

Unnamed: 0,A,B,C,D,E
A,0.0,-0.476731,-0.123091,0.532181,0.969458
B,-0.476731,0.0,0.645497,-0.310087,-0.478091
C,-0.123091,0.645497,0.0,-0.720577,-0.427618
D,0.532181,-0.310087,-0.720577,0.0,0.581675
E,0.969458,-0.478091,-0.427618,0.581675,0.0


In [18]:
# 3) Recommend for user 1: predict user 1's rating for item E.
# Take the n items most similar to E as neighbours.
n = 2
# Only items that user 1 has actually rated can contribute to the prediction,
# and E itself (self-similarity stored as 0) must not be its own neighbour.
candidate_items = [
    item for item in similarity_matrix.index
    if item != 'E' and item in users[1]
]
similarity_item = (
    similarity_matrix.loc[candidate_items, 'E']
    .sort_values(ascending=False)
    .iloc[:n]
    .index.tolist()
)
# Compute user 1's predicted score for item E:
#   prediction = mean(E) + sum_i sim(E, i) * (r[1, i] - mean(i)) / sum_i |sim(E, i)|
base_score = np.mean(list(items['E'].values()))  # item E's average rating
weighted_scores = 0.
corr_values_sum = 0.
for item in similarity_item:
    corr_value = similarity_matrix.loc[item, 'E']  # similarity between E and this neighbour item
    mean_item_score = np.mean(list(items[item].values()))  # the neighbour item's average rating
    weighted_scores += corr_value * (users[1][item] - mean_item_score)
    # Normalise by |similarity| so that negative weights cannot cancel the
    # denominator or flip the sign of the adjustment.
    corr_values_sum += abs(corr_value)
if corr_values_sum:
    final_scores = base_score + weighted_scores / corr_values_sum
else:
    # No usable neighbour: fall back to item E's own average.
    final_scores = base_score
print('用户1对物品E的打分：', final_scores)
# Assign with a single .loc[row, column] call; the chained form
# user_df.loc[1]['E'] = ... writes to a temporary row and is not applied
# under pandas Copy-on-Write.
user_df.loc[1, 'E'] = final_scores
user_df

用户1对物品E的打分： 4.6


Unnamed: 0,A,B,C,D,E
1,5.0,3.0,4.0,4.0,4.6
2,3.0,1.0,2.0,3.0,3.0
3,4.0,3.0,4.0,3.0,5.0
4,3.0,3.0,1.0,5.0,4.0
5,1.0,5.0,5.0,2.0,1.0
