In [1]:
# 项亮《推荐系统实践》UserCF的实现
# 使用小型MovieLens数据集

In [2]:
import pandas as pd
import random
import math
from operator import itemgetter
from tqdm import tqdm

In [3]:
# Load the ratings table of the small MovieLens dataset.
# The path is relative, so it is resolved against the notebook's working directory.
rating_file = '../jupyter_files/ml-latest-small/ratings.csv'

# Parse the CSV into a DataFrame with pandas' default settings.
data = pd.read_csv(rating_file)

In [4]:
# Preview the first rows of the ratings table to check the column layout.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Containers for the train/test split.
train = {}                # dict: key is a user id, value is the list of movie ids that user rated
test = {}

train_test_ratio = 0.7      # share of ratings sent to the training set (each rating goes there with this probability)

In [6]:
# Randomly assign every rating to either the training set or the test set.
# Both dicts are emptied first so that re-running this cell rebuilds the split
# instead of appending a second copy of every rating to the existing lists.
train.clear()
test.clear()

random.seed(2020)                                     # fixed seed keeps the split reproducible
for row in data.itertuples():
    # Exactly one random draw per rating decides which side it lands on.
    target = train if random.random() < train_test_ratio else test
    # Columns are read by name rather than by tuple position, so the loop
    # does not depend on the column order of the ratings table.
    target.setdefault(row.userId, []).append(row.movieId)

In [7]:
# Inverted index: movie id -> set of users who rated that movie in the
# training data.  Walking this index lets the similarity step visit only
# user pairs that share at least one movie instead of every pair of users.
item_users = {}
for user, items in train.items():
    for item in items:
        # setdefault creates the empty set on the first sighting of a movie.
        item_users.setdefault(item, set()).add(user)

In [8]:
# Accumulate the numerator of the user-user similarity.
#   C[u][v]: sum, over every movie rated by both u and v, of
#            1 / log(1 + number of users who rated that movie),
#            so widely rated movies contribute less than rarely rated ones.
#   N[u]:    number of movies user u rated in the training data.
C = {}
N = {}
for item, users in tqdm(item_users.items()):
    # The popularity penalty depends only on the movie, so compute it once
    # per movie instead of once per user pair.
    weight = 1 / math.log(1 + len(users))
    for u in users:
        N[u] = N.get(u, 0) + 1
        # Every user gets an entry in C, even one with no co-raters.
        co_rated = C.setdefault(u, {})
        for v in users:
            if u == v:
                continue
            co_rated[v] = co_rated.get(v, 0) + weight

100%|████████████████████████████████████████████████████████████████████████████| 8551/8551 [00:02<00:00, 3921.65it/s]


In [9]:
# Spot check: the accumulated co-rating weight between users 1 and 2.
C[1][2]

0.4793914053771732

In [10]:
# Final user-user similarity: divide each accumulated weight C[u][v] by
# sqrt(N[u] * N[v]), the geometric mean of the two users' rating counts.
W = {}
for u, related_users in C.items():
    W[u] = {
        v: cuv / math.sqrt(N[u] * N[v])
        for v, cuv in related_users.items()
    }

In [11]:
# Recommendation settings: take the K most similar users and recommend n movies.
user = 5        # id of the user we generate recommendations for
K = 80          # number of most-similar users whose ratings are consulted
n = 20          # number of movies kept in the final recommendation list

In [12]:
# Score candidate movies for `user` from the K most similar users.
# A movie's score is the sum of the similarities of the neighbours who rated
# it; movies the user already rated in the training data are skipped.

# Set lookup replaces a linear scan of the user's rating list per candidate.
# A user missing from `train` or `W` simply gets no recommendations.
seen_items = set(train.get(user, ()))
neighbours = sorted(W.get(user, {}).items(), key=itemgetter(1), reverse=True)[:K]

# Scores are kept under their own name so that `rank` only ever refers to
# the final sorted list.
scores = {}
for sim_user, similarity in neighbours:
    for item in train[sim_user]:
        if item in seen_items:
            continue
        scores[item] = scores.get(item, 0) + similarity

# The n highest-scoring movies as (movie id, score) pairs, best first.
rank = sorted(scores.items(), key=itemgetter(1), reverse=True)[:n]

In [13]:
rank

[(380, 3.4565206038764344),
 (457, 3.07038319621229),
 (356, 3.0499815789812974),
 (454, 2.8222144683113406),
 (318, 2.800213715863799),
 (349, 2.7747817770859005),
 (480, 2.682590468662604),
 (377, 2.646399428528173),
 (434, 2.565767426706949),
 (161, 2.5074825955000515),
 (47, 2.4673246980845813),
 (329, 2.4489383268857225),
 (292, 2.403897972478507),
 (208, 2.399192771264493),
 (500, 2.298772255437218),
 (231, 2.29044309593399),
 (165, 2.213800417356548),
 (593, 2.20952037255619),
 (539, 2.206841584431416),
 (185, 2.176802767723239)]

In [14]:
# Precision: share of the recommended movies that appear in the user's test set.
# A user without test ratings is treated as having no relevant movies.
relevant = set(test.get(user, ()))
total = len(rank)
hit = sum(1 for item, _ in rank if item in relevant)

# An empty recommendation list is reported as 0 instead of dividing by zero.
precision = hit / total if total else 0.0
print("precision: %f" % precision)

pecision: 0.200000


In [15]:
# Recall: share of the user's test-set movies that were recommended.
# A user without test ratings is treated as having an empty test list.
test_items = test.get(user, [])
relevant = set(test_items)
hit = sum(1 for item, _ in rank if item in relevant)

# With no test ratings there is nothing to recall; report 0 instead of
# dividing by zero.
recall = hit / len(test_items) if test_items else 0.0
print("recall: %f" % recall)

recall: 0.307692
