In [1]:
import os
import math
import random
import pandas as pd
import numpy as np

# 数据集导入、处理

In [2]:
# Dataset directory (the `ml-1m` folder), given relative to the working directory.
path = '../datasets/ml-1m'

`ratings.dat`每行的格式为`user_id::movie_id::rating_score::timestamp`（字段之间以`::`分隔）

- `user_id`介于 $1$ 到 $6040$ 之间
- `movie_id`介于 $1$ 到 $3952$ 之间
- 评分为5分制
- 每个用户有至少20个评分数据

In [3]:
# Full path of the ratings file that is passed to UserBasedCF further down.
rating_path = os.path.join(path, 'ratings.dat')

# 算法实现

In [48]:
class UserBasedCF:
    """User-based collaborative filtering on a ``user::item::rating::timestamp`` file.

    Attributes:
        train: ``{user: {item: rating}}`` for the training split.
        test: ``{user: {item: rating}}`` for the held-out split.
        item_user_table: inverted index ``{item: {users who rated it}}``,
            filled by :meth:`compute_user_similarity`.
        S: ``{u: {v: similarity}}``, filled by :meth:`compute_user_similarity`.

    User and item ids are kept as the strings read from the file; ratings are
    converted to ``int``.
    """

    def __init__(self, path, random_pivot=0.8, seed=None):
        """Load ``path`` and split it randomly into train and test sets.

        Args:
            path: ratings file to read.
            random_pivot: probability that a rating goes to the training split.
            seed: optional seed for a private RNG, making the split
                reproducible. ``None`` uses the global ``random`` module state.
        """
        self.train = {}
        self.test = {}
        self.item_user_table = {}
        self.S = {}
        self.generate_dataset(path, random_pivot, seed)

    def load_file(self, path):
        """Yield the lines of ``path`` lazily, without trailing CR/LF characters."""
        # Iterating the file object streams it; readlines() would build a full list.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                yield line.strip('\r\n')

    def generate_dataset(self, path, random_pivot=0.8, seed=None):
        """Parse the ratings file and randomly assign every rating to train or test.

        Args:
            path: ratings file with ``user::item::rating::timestamp`` lines.
            random_pivot: probability that a rating goes to the training split.
            seed: optional seed for a private RNG; ``None`` draws from the
                global ``random`` module.

        The first ten parsed records are printed as a sanity check.
        """
        rng = random if seed is None else random.Random(seed)
        i = 0
        for line in self.load_file(path):
            # A blank line (e.g. a trailing newline) cannot be unpacked into
            # four fields, so skip it instead of failing.
            if not line.strip():
                continue
            # user_id, movie_id, rating, timestamp
            user, movie, rating, _ = line.split('::')
            if i < 10:
                print('No.{}: - user_id: {}; - movie_id: {}; - rating: {}'.format(
                    i, user, movie, rating))
            i += 1
            split = self.train if rng.random() < random_pivot else self.test
            split.setdefault(user, {})[movie] = int(rating)

    def compute_user_similarity(self):
        """Compute the cosine similarity between users from the training split.

        Returns:
            ``(S, C, N)`` where ``S[u][v]`` is the similarity, ``C[u][v]`` is the
            number of items rated by both ``u`` and ``v``, and ``N[u]`` is the
            number of items rated by ``u``.
        """
        # Rebuild from scratch so that repeated calls never keep stale entries.
        self.item_user_table = {}
        for user, items in self.train.items():
            for item in items:
                self.item_user_table.setdefault(item, set()).add(user)

        # C: co-occurrence counts; N: number of items each user interacted with.
        C = {}
        N = {}
        for users in self.item_user_table.values():
            for u in users:
                N[u] = N.get(u, 0) + 1
                row = C.setdefault(u, {})
                for v in users:
                    if u != v:
                        row[v] = row.get(v, 0) + 1

        # Cosine similarity: shared items over the geometric mean of both counts.
        self.S = {
            u: {v: count / math.sqrt(N[u] * N[v]) for v, count in related.items()}
            for u, related in C.items()
        }
        return self.S, C, N

    def recommend(self, u, K=3, N=10):
        """Recommend up to ``N`` items for user ``u`` from the ``K`` most similar users.

        Each candidate's score is the sum of the neighbours' ratings weighted by
        their similarity, normalised over the selected neighbours. Items that
        ``u`` already rated in the training split are excluded.

        Returns:
            ``{item: score}`` ordered by descending score; an empty dict when
            ``u`` is unknown or has no neighbours; ``None`` (after printing a
            notice) when the similarity matrix has not been computed yet.
        """
        if not self.S:
            print('还没有计算相似度矩阵!')
            return
        # .get() keeps users that only appear in the test split from raising KeyError.
        rated = self.train.get(u, {})
        neighbours = sorted(self.S.get(u, {}).items(),
                            key=lambda x: x[1], reverse=True)[:K]
        total = sum(similarity for _, similarity in neighbours)
        if total <= 0:
            return {}

        scores = {}
        for v, similarity in neighbours:
            weight = similarity / total  # normalised similarity
            for i, r_vi in self.train[v].items():
                if i in rated:
                    continue
                scores[i] = scores.get(i, 0) + r_vi * weight

        # dict() preserves the sorted order and is JSON-friendly.
        return dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)[:N])

    def evaluate(self, K=8, N=10):
        """Compute precision and recall of top-``N`` recommendations on the test split.

        Every test user contributes ``N`` to the precision denominator and the
        number of their test items to the recall denominator.

        Returns:
            ``(precision, recall)``; either value is ``0.0`` when its denominator
            is zero. ``None`` (after printing a notice) when the similarity
            matrix has not been computed yet.
        """
        if not self.S:
            print('还没有计算相似度矩阵!')
            return
        hit = recommended_total = relevant_total = 0
        for user, items in self.test.items():
            recommended = self.recommend(user, K, N)
            hit += len(recommended.keys() & items.keys())
            recommended_total += N
            relevant_total += len(items)
        precision = hit / recommended_total if recommended_total else 0.0
        recall = hit / relevant_total if relevant_total else 0.0
        return precision, recall

In [49]:
# Seed the global RNG first: the train/test split draws from `random`, so
# without a fixed seed every run yields different splits and metrics.
random.seed(42)
user_cf = UserBasedCF(rating_path)

No.0: - user_id: 1; - movie_id: 1193; - rating: 5
No.1: - user_id: 1; - movie_id: 661; - rating: 3
No.2: - user_id: 1; - movie_id: 914; - rating: 3
No.3: - user_id: 1; - movie_id: 3408; - rating: 4
No.4: - user_id: 1; - movie_id: 2355; - rating: 5
No.5: - user_id: 1; - movie_id: 1197; - rating: 3
No.6: - user_id: 1; - movie_id: 1287; - rating: 5
No.7: - user_id: 1; - movie_id: 2804; - rating: 5
No.8: - user_id: 1; - movie_id: 594; - rating: 4
No.9: - user_id: 1; - movie_id: 919; - rating: 4


In [58]:
# Build the user-user similarity from the training split.
# S: similarity, C: co-rated item counts per user pair, N: rated-item count per user.
S, C, N = user_cf.compute_user_similarity()

# 推荐预测

In [53]:
# Recommendations for user '100' with the defaults (K=3 neighbours, top N=10 items).
# User ids are strings because they are kept as read from the file.
user_cf.recommend('100')

{'2872': 4.337111895172112,
 '1210': 4.0,
 '1220': 3.996186198305362,
 '2105': 3.6628881048278883,
 '2001': 3.6628881048278883,
 '376': 3.34092569686675,
 '653': 3.325776209655776,
 '3704': 3.325776209655776,
 '377': 3.3029991190555266,
 '1801': 3.295371515666251}

In [74]:
# Same user, but with 5 neighbours and the top 20 items.
user_cf.recommend('100', K=5, N=20)

{'1220': 3.9964167153425363,
 '2872': 3.6259916465489965,
 '1240': 3.5962303652904914,
 '733': 3.565695639052418,
 '1214': 3.5566428691877965,
 '1242': 3.344636733701157,
 '457': 3.344636733701157,
 '2105': 3.2178099473393598,
 '2571': 3.193377856456906,
 '1610': 3.1537903603542112,
 '2353': 3.147997786820545,
 '1196': 2.974662915653427,
 '2916': 2.953439072304082,
 '1222': 2.806545929921639,
 '1200': 2.7995790115806933,
 '3527': 2.7972350470329106,
 '10': 2.778548305953053,
 '377': 2.6062652363334387,
 '1801': 2.6052952675669174,
 '1377': 2.6032094160432413}

# 评估

In [70]:
# Precision and recall on the held-out test split with the defaults (K=8, N=10).
precision, recall = user_cf.evaluate()

In [73]:
# Report both metrics formatted as percentages.
print('- 准确率：{:%}\n- 召回率：{:%}'.format(precision, recall))

- 准确率：27.813121%
- 召回率：8.402024%


# 数据展示

In [63]:
def dic2dataframe(dic):
    """Turn a nested ``{row: {column: value}}`` dict into a DataFrame.

    Outer keys become the row index, inner keys become the columns, and
    combinations that are absent from the dict are filled with 0.
    """
    # The constructor puts the outer keys on the columns, so transpose afterwards.
    outer_as_columns = pd.DataFrame(dic)
    outer_as_rows = outer_as_columns.T
    return outer_as_rows.fillna(0)

In [65]:
# Wrap the raw results in pandas objects for inspection; user pairs that are
# missing from the nested dicts become 0 in the two matrices.
S_df = dic2dataframe(S)
C_df = dic2dataframe(C)
N_df = pd.Series(N)

In [83]:
# Shapes of the similarity matrix, the co-occurrence matrix and the per-user counts.
S_df.shape, C_df.shape, N_df.shape

((6040, 6040), (6040, 6040), (6040,))

In [69]:
# Preview the first rows of the similarity matrix.
S_df.head()

Unnamed: 0,5404,150,3550,3146,4950,227,3198,3281,5948,5636,...,46,4549,1341,3838,3893,2908,1615,986,3662,2204
4361,0.264979,0.211407,0.198507,0.223408,0.185058,0.103098,0.06574,0.245803,0.131534,0.217783,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5404,0.0,0.27103,0.257282,0.193382,0.266528,0.188982,0.142261,0.275344,0.196553,0.300569,...,0.013231,0.018185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.27103,0.0,0.213644,0.242447,0.281682,0.15257,0.113499,0.157625,0.236965,0.282706,...,0.025637,0.035234,0.035734,0.018128,0.02425,0.039952,0.054384,0.018128,0.008689,0.0
3550,0.257282,0.213644,0.0,0.149261,0.286759,0.121554,0.167934,0.246334,0.148337,0.335601,...,0.030638,0.042108,0.02847,0.028886,0.067621,0.047746,0.057771,0.014443,0.034612,0.011254
3146,0.193382,0.242447,0.149261,0.0,0.185219,0.030096,0.111946,0.203305,0.108514,0.167303,...,0.025286,0.034752,0.017623,0.07152,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# Preview the first rows of the co-occurrence matrix.
C_df.head()

Unnamed: 0,5404,150,3550,3146,4950,227,3198,3281,5948,5636,...,46,4549,1341,3838,3893,2908,1615,986,3662,2204
4361,34.0,28.0,33.0,15.0,35.0,5.0,6.0,30.0,23.0,57.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5404,0.0,47.0,56.0,17.0,66.0,12.0,17.0,44.0,45.0,103.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,47.0,0.0,48.0,22.0,72.0,10.0,14.0,26.0,56.0,100.0,...,2.0,2.0,4.0,1.0,2.0,2.0,3.0,1.0,1.0,0.0
3550,56.0,48.0,0.0,17.0,92.0,10.0,26.0,51.0,44.0,149.0,...,3.0,3.0,4.0,2.0,7.0,3.0,4.0,1.0,5.0,1.0
3146,17.0,22.0,17.0,0.0,24.0,1.0,7.0,17.0,13.0,30.0,...,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
# Preview the number of training items rated by the first few users.
N_df.head()

4361     98
5404    168
150     179
3550    282
3146     46
dtype: int64