# 推荐算法的实现与改进

第一部分：定义基于用户的协同过滤（UserCF）推荐函数和基于物品的协同过滤（ItemCF）推荐函数。

In [5]:
import random

import math
from operator import itemgetter


class CF():
    """Neighbourhood-based collaborative filtering over user -> movie ratings.

    Ratings are read from a ``user::movie::rating::timestamp`` text file and
    split at random into ``trainSet`` and ``testSet`` (both
    ``{user: {movie: rating}}`` with every value kept as the raw string read
    from the file).  ``calc_user_sim`` / ``calc_movie_sim`` build cosine-style
    co-occurrence similarity matrices from ``trainSet``;
    ``recommendByUser`` / ``recommendByItem`` rank unseen movies with them.
    """

    def __init__(self):
        """Set the default neighbourhood sizes and create empty containers."""
        # Neighbourhood sizes (K) and length of each recommendation list (N).
        self.n_sim_user = 20
        self.n_sim_movie = 20
        self.n_rec_movie = 10

        # {user: {movie: rating-string}}
        self.trainSet = {}
        self.testSet = {}

        # {user: {other_user: similarity}} / {movie: {other_movie: similarity}}
        self.user_sim_matrix = {}
        self.movie_sim_matrix = {}
        # {movie: number of training users who rated it}
        self.movie_popular = {}
        self.movie_count = 0
        print('相似用户数 = %d' % self.n_sim_user)
        print('相似电影数 = %d' % self.n_sim_movie)
        print('单人推荐电影数 = %d' % self.n_rec_movie)

    def get_dataset(self, filename, pivot=0.75, max_lines=2000000, seed=None,
                    skip_header=True):
        """Read ``filename`` and randomly split its ratings into train/test.

        Args:
            filename: Path of a ``user::movie::rating::timestamp`` file.
            pivot: Probability that a rating is stored in ``trainSet``;
                otherwise it is stored in ``testSet``.
            max_lines: Reading stops once the 0-based index of a line yielded
                by ``load_file`` exceeds this value.  ``None`` removes the cap.
            seed: When given, a private ``random.Random(seed)`` drives the
                split so it is reproducible.  ``None`` keeps using the
                module-level ``random`` generator.
            skip_header: Forwarded to ``load_file``.

        Raises:
            ValueError: If a non-blank line does not have exactly four
                ``::``-separated fields.
        """
        rng = random if seed is None else random.Random(seed)
        trainSet_len = 0
        testSet_len = 0
        for i, line in enumerate(self.load_file(filename, skip_header)):
            if max_lines is not None and i > max_lines:
                break
            # A blank line (e.g. a trailing newline at end of file) carries no
            # rating; unpacking it below would raise ValueError.
            if not line.strip():
                continue
            user, movie, rating, _timestamp = line.split('::')
            if rng.random() < pivot:
                self.trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                self.testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
        print('数据读取成功')
        print('训练集 = %s' % trainSet_len)
        print('测试集 = %s' % testSet_len)

    def load_file(self, filename, skip_header=True):
        """Yield the lines of ``filename`` with trailing CR/LF removed.

        Args:
            filename: Path of the text file to read.
            skip_header: When true (the default) the first line of the file is
                not yielded.
                NOTE(review): MovieLens ``ratings.dat`` files presumably have
                no header row, in which case the default drops one rating --
                confirm and pass ``skip_header=False`` if so.

        The success message is printed only when the generator is consumed to
        the end of the file.
        """
        with open(filename, 'r') as f:
            for i, line in enumerate(f):
                if skip_header and i == 0:
                    continue
                yield line.strip('\r\n')
        print('加载%s 成功!' % filename)

    def calc_user_sim(self):
        """Rebuild ``user_sim_matrix`` from ``trainSet``.

        The similarity of users ``u`` and ``v`` is the number of movies both
        rated divided by ``sqrt(len(trainSet[u]) * len(trainSet[v]))``.  Users
        that share no movie with anyone get no row.  Also sets
        ``movie_count`` to the number of distinct movies in ``trainSet``.

        The matrix is built in a fresh dict, so calling this method again
        recomputes the result instead of adding new counts on top of the
        already normalised similarities.
        """
        # key = movieID, value = set of userIDs who have seen this movie
        print('计算倒排索引。')
        movie_user = {}
        for user, movies in self.trainSet.items():
            for movie in movies:
                movie_user.setdefault(movie, set()).add(user)
        print('倒排索引建立成功')

        self.movie_count = len(movie_user)
        print('电影数量 = %d' % self.movie_count)

        print('建立用户相关矩阵')
        co_counts = {}
        for users in movie_user.values():
            # A movie seen by a single user contributes no (u, v) pair.
            if len(users) < 2:
                continue
            for u in users:
                row = co_counts.setdefault(u, {})
                for v in users:
                    if u != v:
                        row[v] = row.get(v, 0) + 1
        print('建立用户相关矩阵成功')

        print('计算相似矩阵')
        for u, related_users in co_counts.items():
            n_u = len(self.trainSet[u])
            for v, count in related_users.items():
                related_users[v] = count / math.sqrt(n_u * len(self.trainSet[v]))
        self.user_sim_matrix = co_counts
        print('计算相似矩阵成功！')

    def calc_movie_sim(self):
        """Rebuild ``movie_popular`` and ``movie_sim_matrix`` from ``trainSet``.

        The similarity of movies ``m1`` and ``m2`` is the number of users who
        rated both divided by ``sqrt(popular[m1] * popular[m2])``, where
        ``popular[m]`` is the number of training users who rated ``m``.
        Movies that never co-occur with another movie get no row.  Also sets
        ``movie_count`` to the number of distinct movies in ``trainSet``.

        Both structures are built in fresh dicts, so calling this method again
        recomputes the result instead of accumulating onto the previous run.
        """
        popularity = {}
        for movies in self.trainSet.values():
            for movie in movies:
                popularity[movie] = popularity.get(movie, 0) + 1
        self.movie_popular = popularity

        self.movie_count = len(popularity)
        print("电影数量 = %d" % self.movie_count)

        print('建立电影相关矩阵')
        co_counts = {}
        for movies in self.trainSet.values():
            # A user with a single movie contributes no (m1, m2) pair.
            if len(movies) < 2:
                continue
            for m1 in movies:
                row = co_counts.setdefault(m1, {})
                for m2 in movies:
                    if m1 != m2:
                        row[m2] = row.get(m2, 0) + 1
        print("计算电影相关矩阵成功")

        print("计算电影相似矩阵")
        # Every movie in co_counts was counted in `popularity` above, so both
        # popularity values are >= 1 and the division is always defined.
        for m1, related_movies in co_counts.items():
            pop_m1 = popularity[m1]
            for m2, count in related_movies.items():
                related_movies[m2] = count / math.sqrt(pop_m1 * popularity[m2])
        self.movie_sim_matrix = co_counts
        print('计算电影相似矩阵成功')

    def recommendByItem(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for ``user``.

        For each movie the user rated in ``trainSet``, its ``n_sim_movie``
        most similar movies each receive ``similarity * float(rating)``;
        movies the user already rated are excluded.  The result is sorted by
        descending score.  A rated movie without a row in
        ``movie_sim_matrix`` simply contributes nothing.

        Raises:
            KeyError: If ``user`` is not in ``trainSet``.
        """
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainSet[user]

        for movie, rating in watched_movies.items():
            weight = float(rating)
            neighbours = self.movie_sim_matrix.get(movie, {})
            for related_movie, w in sorted(neighbours.items(), key=itemgetter(1), reverse=True)[:K]:
                if related_movie in watched_movies:
                    continue
                rank[related_movie] = rank.get(related_movie, 0) + w * weight
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def recommendByUser(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for ``user``.

        Each of the ``n_sim_user`` most similar users adds its similarity to
        every movie it rated that ``user`` has not; the result is sorted by
        descending score.  A user without a row in ``user_sim_matrix`` gets
        an empty list.

        Raises:
            KeyError: If ``user`` is not in ``trainSet``.
        """
        K = self.n_sim_user
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainSet[user]

        # v = similar user, wuv = similarity between `user` and v
        neighbours = self.user_sim_matrix.get(user, {})
        for v, wuv in sorted(neighbours.items(), key=itemgetter(1), reverse=True)[0:K]:
            for movie in self.trainSet[v]:
                if movie in watched_movies:
                    continue
                rank[movie] = rank.get(movie, 0) + wuv
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

加载数据集

In [6]:
rating_file = '/home/aistudio/data/data26467/ml-1m/ratings.dat'
# The name ``CF`` is rebound from the class to an instance here, and later
# cells use that instance under the same name.  Resolve the class first so
# that re-running this cell creates a fresh instance instead of failing with
# "TypeError: 'CF' object is not callable".
cf_class = CF if isinstance(CF, type) else type(CF)
CF = cf_class()
CF.get_dataset(rating_file)

相似用户数 = 20
相似电影数 = 20
单人推荐电影数 = 10
加载/home/aistudio/data/data26467/ml-1m/ratings.dat 成功!
数据读取成功
训练集 = 749901
测试集 = 250307


定义评估函数

In [9]:
def _ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0."""
    return numerator / (1.0 * denominator) if denominator else 0.0


def _mixed_recommendations(cf, user):
    """Sum the user-based and item-based scores per movie and keep the top ``cf.n_rec_movie``."""
    merged = {}
    for movie, w in cf.recommendByUser(user):
        merged[movie] = merged.get(movie, 0) + w
    for movie, w in cf.recommendByItem(user):
        merged[movie] = merged.get(movie, 0) + w
    return sorted(merged.items(), key=itemgetter(1), reverse=True)[0:cf.n_rec_movie]


def evaluate(cf, function):
    """Print precision, recall and coverage of one recommendation strategy.

    Recommendations are generated for every user in ``cf.trainSet`` and
    compared with that user's movies in ``cf.testSet``:

    * precision = hits / number of recommended movies
    * recall    = hits / number of test-set movies
    * coverage  = distinct recommended movies / ``cf.movie_count``

    A metric whose denominator is zero is reported as 0 instead of raising
    ``ZeroDivisionError``.  Running totals are printed after every 100 users
    and the final metrics are printed once at the end.

    Args:
        cf: Object providing ``trainSet``, ``testSet``, ``movie_count``,
            ``n_rec_movie``, ``recommendByUser`` and ``recommendByItem``.
        function: ``'UserCF'``, ``'ItemCF'`` or ``'MixCF'`` (sum of both
            score lists per movie).

    Raises:
        ValueError: If ``function`` is not one of the three names above.
    """
    if function == 'UserCF':
        recommend = cf.recommendByUser
    elif function == 'ItemCF':
        recommend = cf.recommendByItem
    elif function == 'MixCF':
        def recommend(user):
            return _mixed_recommendations(cf, user)
    else:
        raise ValueError(
            "unknown function %r; expected 'UserCF', 'ItemCF' or 'MixCF'" % (function,))

    hit = 0
    rec_count = 0
    test_count = 0
    all_rec_movies = set()
    for i, user in enumerate(cf.trainSet):
        test_movie = cf.testSet.get(user, {})
        rec_movies = recommend(user)
        for movie, w in rec_movies:
            if movie in test_movie:
                hit += 1
            all_rec_movies.add(movie)
        rec_count += len(rec_movies)
        test_count += len(test_movie)
        # Periodic progress report with the running totals.
        if (i % 100 == 0) and (i != 0):
            print("num:{}\n".format(i))
            precision = _ratio(hit, rec_count)
            recall = _ratio(hit, test_count)
            coverage = _ratio(len(all_rec_movies), cf.movie_count)
            print('准确率=%.4f\t召回率=%.4f\t覆盖率=%.4f\t推荐电影数量=%.4f\t推荐成功数=%.4f' % (precision, recall, coverage, rec_count, hit))
    precision = _ratio(hit, rec_count)
    recall = _ratio(hit, test_count)
    coverage = _ratio(len(all_rec_movies), cf.movie_count)
    print('准确率=%.4f\t召回率=%.4f\t覆盖率=%.4f' % (precision, recall, coverage))

In [None]:
进行测试

In [None]:
function = 'UserCF'
# Build only the similarity matrices the selected strategy needs (movie
# similarities first, then user similarities), then run the evaluation.
if function in ('ItemCF', 'MixCF'):
    CF.calc_movie_sim()
if function in ('UserCF', 'MixCF'):
    CF.calc_user_sim()
if function in ('UserCF', 'ItemCF', 'MixCF'):
    evaluate(CF, function)

计算倒排索引。
倒排索引建立成功
电影数量 = 3668
建立用户相关矩阵
