In [17]:
import random
import math
from operator import itemgetter

## 初始化

In [18]:
class UserBasedCF():
    # 初始化相关参数
    def __init__(self):
        # 找到与目标用户兴趣相似的50个用户，为其推荐5部电影
        self.n_sim_user = 50
        self.n_rec_movie = 5

        # 将数据集划分为训练集和测试集
        self.trainSet = {}
        self.testSet = {}

        # 用户相似度矩阵
        self.user_sim_matrix = {}
        self.movie_count = 0

        print('Similar user number = %d' % self.n_sim_user)
        print('Recommneded movie number = %d' % self.n_rec_movie)


    # 读文件得到“用户-电影”数据
    def get_dataset(self, filename, pivot=0.75):
        trainSet_len = 0
        testSet_len = 0
        for line in self.load_file(filename):
            user, movie, rating, timestamp = line.split(',')
            if random.random() < pivot:
                self.trainSet.setdefault(user, {})
                self.trainSet[user][movie] = rating
                trainSet_len += 1
            else:
                self.testSet.setdefault(user, {})
                self.testSet[user][movie] = rating
                testSet_len += 1
        print('Split trainingSet and testSet success!')
        print('TrainSet = %s' % trainSet_len)
        print('TestSet = %s' % testSet_len)


    # 读文件，返回文件的每一行
    def load_file(self, filename):
        with open(filename, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:  # 去掉文件第一行的title
                    continue
                yield line.strip('\r\n')
        print('Load %s success!' % filename)

    # 针对目标用户U，找到其最相似的K个用户，产生N个推荐
    def recommend(self, user):
        K = self.n_sim_user
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainSet[user]

        # v=similar user, wuv=similar factor
        for v, wuv in sorted(self.user_sim_matrix[user].items(), key=itemgetter(1), reverse=True)[0:K]:
            for movie in self.trainSet[v]:
                if movie in watched_movies:
                    continue
                rank.setdefault(movie, 0)
                rank[movie] += wuv*float(self.trainSet[v][movie])
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]


    # 产生推荐并通过准确率、召回率和覆盖率进行评估
    def evaluate(self):
        print("Evaluation start ...")
        N = self.n_rec_movie
        # 准确率和召回率
        hit = 0
        rec_count = 0
        test_count = 0
        # 覆盖率
        all_rec_movies = set()

        for i, user, in enumerate(self.trainSet):
            test_movies = self.testSet.get(user, {})
            rec_movies = self.recommend(user)
            for movie, w in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
        print('precisioin=%.4f\trecall=%.4f\t  coverage=%.4f' % (precision, recall, coverage))


In [19]:
class Euclidean(UserBasedCF):
    # 欧几里得相似度
    def calc_user_sim_euclidean(self):

        # 构建“电影-用户”倒排索引
        # key = movieID, value = list of userIDs who have seen this movie
        print('Building movie-user table ...')
        movie_user = {}
        for user, movies in self.trainSet.items():
            for movie in movies:
                if movie not in movie_user:
                    movie_user[movie] = set()
                movie_user[movie].add(user)
        print('Build movie-user table success!')
        self.movie_count = len(movie_user)
        print('Total movie number = %d' % self.movie_count)

        self.user_count = len(self.trainSet)
        print('Total user  number = %d' % self.user_count)

        for a in self.trainSet:
            for b in self.trainSet:
                if a == b :
                    continue
                si = {}
                for item in self.trainSet[a]:
                    if item in self.trainSet[b]:
                        si[item] = 1

                if len(si) == 0:
                    self.user_sim_matrix.setdefault(a, {})
                    self.user_sim_matrix[a].setdefault(b,0)
                    continue

                result =sum(pow(float(self.trainSet[a][item])-float(self.trainSet[b][item]),2)for item in si)
                
                self.user_sim_matrix.setdefault(a, {})
                self.user_sim_matrix[a].setdefault(b, 0)
                self.user_sim_matrix[a][b] = 1/(1+math.sqrt(result))

        print('Calculate user similarity matrix success!')


## 欧氏距离
<br>
<br>
<br>
$\displaystyle d(x, y)=\sqrt{\left(\sum\left(x_{i}-y_{i}\right)^{2}\right)}$
<br>
<br>
<br>
$\displaystyle\operatorname{sim}(x, y)=\frac{1}{1+d(x, y)}$
<br>
<br>
<br>

In [20]:
class Pearson(UserBasedCF):
    # 皮尔逊相似度
    def calc_user_sim_pearson(self):
        # 构建“电影-用户”倒排索引
        # key = movieID, value = list of userIDs who have seen this movie
        print('Building movie-user table ...')
        movie_user = {}
        for user, movies in self.trainSet.items():
            for movie in movies:
                if movie not in movie_user:
                    movie_user[movie] = set()
                movie_user[movie].add(user)
        print('Build movie-user table success!')
        self.movie_count = len(movie_user)
        print('Total movie number = %d' % self.movie_count)

        self.user_count = len(self.trainSet)
        print('Total user  number = %d' % self.user_count)

        for a in self.trainSet:
            for b in self.trainSet:
                if a == b :
                    continue
                si = {}
                for item in self.trainSet[a]:
                    if item in self.trainSet[b]:
                        si[item] = 1
                #得到列表元素个数
                n = len(si)
                #二者无共同之处，则返回1
                if n == 0:
                    continue
                #对所有偏好求和
                sum1 = sum([float(self.trainSet[a][item]) for item in si])
                sum2 = sum([float(self.trainSet[b][item]) for item in si])

                #求平方和
                sum1Sq = sum([pow(float(self.trainSet[a][item]),2) for item in si])
                sum2Sq = sum([pow(float(self.trainSet[b][item]),2) for item in si])

                #求乘积之和
                pSum = sum([float(self.trainSet[a][item]) * float(self.trainSet[b][item]) for item in si])

                #计算皮尔逊评价值
                num = pSum - (sum1*sum2/n)
                den = math.sqrt((sum1Sq - pow(sum1, 2)/n) * (sum2Sq - pow(sum2, 2)/n))
                if den == 0: 
                    r = 0
                    self.user_sim_matrix.setdefault(a, {})
                    self.user_sim_matrix[a].setdefault(b, 0)
                    self.user_sim_matrix[a][b] = r
                    continue

                r = num/den

                self.user_sim_matrix.setdefault(a, {})
                self.user_sim_matrix[a].setdefault(b, 0)
                self.user_sim_matrix[a][b] = r

## 皮尔逊相关度
<br>
<br>
<br>
$\displaystyle r=\frac{\sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)\left(Y_{i}-\bar{Y}\right)}{\sqrt{\sum_{i=1}^{n}\left(X_{i}-\bar{X}\right)^{2}} \sqrt{\sum_{i=1}^{n}\left(Y_{i}-\bar{Y}\right)^{2}}}$
<br>
<br>
<br>
<br>
<br>

In [21]:
class Cosine(UserBasedCF):
    # 余弦相似度
    def calc_user_sim_cos(self):

        # 构建“电影-用户”倒排索引
        # key = movieID, value = list of userIDs who have seen this movie
        print('Building movie-user table ...')
        movie_user = {}
        for user, movies in self.trainSet.items():
            for movie in movies:
                if movie not in movie_user:
                    movie_user[movie] = set()
                movie_user[movie].add(user)
        print('Build movie-user table success!')
        self.movie_count = len(movie_user)
        print('Total movie number = %d' % self.movie_count)

        self.user_count = len(self.trainSet)
        print('Total user  number = %d' % self.user_count)

        for a in self.trainSet:
            for b in self.trainSet:
                if a == b :
                    continue
                si = {}
                for item in self.trainSet[a]:
                    if item in self.trainSet[b]:
                        si[item] = 1

                if len(si) == 0:
                    self.user_sim_matrix.setdefault(a, {})
                    self.user_sim_matrix[a].setdefault(b,0)
                    continue

                mol = sum( (float(self.trainSet[a][item]) * float(self.trainSet[b][item])) for item in si)
                den = math.sqrt(sum(pow(float(self.trainSet[a][item]),2)for item in si)) * math.sqrt(sum(pow(float(self.trainSet[b][item]),2)for item in si))
                result = mol/den

                self.user_sim_matrix.setdefault(a, {})
                self.user_sim_matrix[a].setdefault(b, 0)
                self.user_sim_matrix[a][b] = result

        print('Calculate user similarity matrix success!')

## 余弦相似度
<br>
<br>
<br>
<br>
$\displaystyle\cos (\theta)=\frac{A \cdot B}{\|A\|\|B\|}=\frac{\sum_{i=1}^{n} A_{i} \times B_{i}}{\sqrt{\sum_{i=1}^{n}\left(A_{i}\right)^{2} \times \sqrt{\sum_{i=1}^{n}\left(B_{i}\right)^{2}}}}$
<br>
<br>
<br>
<br>
<br>

In [22]:
if __name__ == '__main__':
    rating_file = 'C:\\Users\\韩茂洲\\OneDrive\\桌面\\数据集\\ml-latest-small\\ratings.csv'

### 欧几里得距离

In [23]:
    ecu = Euclidean()
    ecu.get_dataset(rating_file)
    print("Use Euclidean")
    ecu.calc_user_sim_euclidean()   ###欧几里得距离
    ecu.evaluate()

Similar user number = 50
Recommneded movie number = 5
Load C:\Users\韩茂洲\OneDrive\桌面\数据集\ml-latest-small\ratings.csv success!
Split trainingSet and testSet success!
TrainSet = 75531
TestSet = 25305
Use Euclidean
Building movie-user table ...
Build movie-user table success!
Total movie number = 8763
Total user  number = 610
Calculate user similarity matrix success!
Evaluation start ...
precisioin=0.1275	recall=0.0154	  coverage=0.0126


### 皮尔逊相似度

In [24]:
    pearson = Pearson()
    pearson.get_dataset(rating_file)
    print("Use Pearson")
    pearson.calc_user_sim_pearson()   ###皮尔逊相似度
    pearson.evaluate()

Similar user number = 50
Recommneded movie number = 5
Load C:\Users\韩茂洲\OneDrive\桌面\数据集\ml-latest-small\ratings.csv success!
Split trainingSet and testSet success!
TrainSet = 75463
TestSet = 25373
Use Pearson
Building movie-user table ...
Build movie-user table success!
Total movie number = 8787
Total user  number = 610
Evaluation start ...
precisioin=0.1682	recall=0.0202	  coverage=0.0112


### 余弦相似度

In [25]:
    cosine = Cosine()
    cosine.get_dataset(rating_file)
    print("Use Cosine")
    cosine.calc_user_sim_cos()           ###余弦相似度
    cosine.evaluate()

Similar user number = 50
Recommneded movie number = 5
Load C:\Users\韩茂洲\OneDrive\桌面\数据集\ml-latest-small\ratings.csv success!
Split trainingSet and testSet success!
TrainSet = 75683
TestSet = 25153
Use Cosine
Building movie-user table ...
Build movie-user table success!
Total movie number = 8721
Total user  number = 610
Calculate user similarity matrix success!
Evaluation start ...
precisioin=0.1328	recall=0.0161	  coverage=0.0110
