In [1]:
# ItemCF implementation from Xiang Liang's "Recommender System Practice" (项亮《推荐系统实践》)
# Uses the small MovieLens dataset (ml-latest-small)

In [2]:
import pandas as pd
import random
import math
from operator import itemgetter
from tqdm import tqdm

In [3]:
# Load the MovieLens ratings CSV into a DataFrame.
# NOTE(review): the path is relative to the notebook's working directory —
# confirm the dataset is located there before running.
rating_file = '../jupyter_files/ml-latest-small/ratings.csv'

data = pd.read_csv(rating_file)

In [4]:
# Preview the first rows; the columns are userId, movieId, rating, timestamp.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Containers for the train/test split. Each one maps a userId to the list of
# movieIds that user has rated.
train, test = {}, {}

# Fraction of the ratings assigned to the training set; the rest form the test set.
train_test_ratio = 0.7

In [6]:
# Randomly assign every rating to the training set (with probability
# train_test_ratio) or to the test set.
# Both dicts are rebuilt here so that re-running this cell starts from a clean
# split instead of appending every rating a second time.
random.seed(2020)          # fixed seed keeps the split reproducible
train = {}
test = {}
for row in data.itertuples(index=False):
    # One random draw per rating, in row order, decides which side it lands on.
    target = train if random.random() < train_test_ratio else test
    # Read the fields by column name rather than by position in the tuple.
    target.setdefault(row.userId, []).append(row.movieId)

In [7]:
# At this point train and test are already user -> item lookup tables, so there is no need to build a separate inverted table.

In [8]:
# Count, for every pair of movies (i, j), the users who rated both: N(i) ∩ N(j).
# This is the numerator of the item-item similarity.
# Each shared user contributes 1 / log(1 + number of movies that user rated)
# instead of 1, so very active users influence the similarity less.
C = {}                                   # C[i][j]: weighted count of users who rated both i and j
N = {}                                   # N[i]: number of users who rated movie i
for user, items in tqdm(train.items()):
    # The penalty depends only on the user, so compute it once per user
    # rather than once per movie pair.
    weight = 1 / math.log(1 + len(items))
    for i in items:
        N[i] = N.get(i, 0) + 1
        co_counts = C.setdefault(i, {})
        for j in items:
            if i == j:
                continue
            co_counts[j] = co_counts.get(j, 0) + weight

100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [00:26<00:00, 23.18it/s]


In [9]:
# Spot-check one entry: the weighted co-occurrence of movieId 1 and movieId 2.
C[1][2]

6.576976840541445

In [10]:
# Turn the weighted co-occurrence counts into item-item similarities by
# dividing each C[i][j] by sqrt(N[i] * N[j]).
W = {
    movie: {
        other: overlap / math.sqrt(N[movie] * N[other])
        for other, overlap in neighbours.items()
    }
    for movie, neighbours in C.items()
}

In [11]:
# Recommendation settings: for every movie the user has rated, look at its K
# most similar movies, then keep the n best-scoring candidates.
K = 10          # similar movies considered per rated movie
n = 20          # number of movies to recommend
user = 5        # userId we generate recommendations for

In [12]:
# Score candidate movies for `user`: every movie the user rated in the training
# set votes for its K most similar movies, weighted by similarity.
watched = set(train[user])               # set gives constant-time "already rated" checks
scores = {}                              # candidate movieId -> accumulated similarity
for item in train[user]:
    neighbours = sorted(W[item].items(), key=itemgetter(1), reverse=True)[0:K]
    for sim_item, similarity in neighbours:
        if sim_item in watched:
            continue                     # never recommend something already rated
        scores[sim_item] = scores.get(sim_item, 0) + similarity

# Keep the n highest-scoring candidates as (movieId, score) pairs. The scores
# live in their own dict so `rank` is only ever bound to the final list.
rank = sorted(scores.items(), key=itemgetter(1), reverse=True)[0:n]

In [13]:
# Display the recommendations as (movieId, score) pairs, best first.
rank

[(380, 1.4153245148859999),
 (457, 0.9360971447765714),
 (356, 0.7953692854329181),
 (500, 0.6291878741767979),
 (377, 0.5838614047875813),
 (454, 0.5166995374168022),
 (208, 0.5003083308968814),
 (434, 0.41291160111119757),
 (47, 0.4057628700466144),
 (539, 0.3516563210832019),
 (292, 0.33745596670366834),
 (318, 0.3253689557022883),
 (329, 0.3096307055320706),
 (185, 0.29922875120548165),
 (480, 0.282250126986049),
 (593, 0.20799027477262205),
 (165, 0.20323492304216687),
 (440, 0.20077992622509738),
 (349, 0.19891317963632987),
 (587, 0.18826332203679574)]

In [14]:
# Precision: the share of recommended movies that the user actually rated in
# the test set.
relevant = set(test.get(user, []))       # empty if the user has no test ratings
hit = sum(1 for item, _score in rank if item in relevant)
total = len(rank)

# Guard the division so an empty recommendation list reports 0 instead of
# raising ZeroDivisionError.
precision = hit / total if total else 0.0
print("precision: %f" % precision)

pecision: 0.200000


In [15]:
# Recall: the share of the user's test-set movies that were recommended.
test_items = test.get(user, [])          # empty if the user has no test ratings
test_item_set = set(test_items)
hit = sum(1 for item, _score in rank if item in test_item_set)

# Guard the division so a user without test ratings reports 0 instead of
# raising KeyError / ZeroDivisionError.
recall = hit / len(test_items) if test_items else 0.0
print("recall: %f" % recall)

recall: 0.307692
