In [30]:
import numpy as np
import random
'''
author:huang
svd++ algorithm
'''
 
 
class SVDPP:
    def __init__(self,mat,K=20):
        self.mat=np.array(mat)
        self.K=K
        self.bi={}
        self.bu={}
        self.qi={}
        self.pu={}
        self.avg=np.mean(self.mat[:,2])
        self.y={}
        self.u_dict={}
        self.RR={}
        for i in range(self.mat.shape[0]):
            
            uid=self.mat[i,0]
            iid=self.mat[i,1]
            self.u_dict.setdefault(uid,[])
            self.u_dict[uid].append(iid)
            self.bi.setdefault(iid,0)
            self.bu.setdefault(uid,0)
            self.qi.setdefault(iid,np.random.random((self.K,1))/10*np.sqrt(self.K))
            self.pu.setdefault(uid,np.random.random((self.K,1))/10*np.sqrt(self.K))
            self.y.setdefault(iid,np.zeros((self.K,1))+.1)
    def predict(self,uid,iid):  #预测评分的函数
        #setdefault的作用是当该用户或者物品未出现过时，新建它的bi,bu,qi,pu及用户评价过的物品u_dict，并设置初始值为0
        self.bi.setdefault(iid,0)
        self.bu.setdefault(uid,0)
        self.qi.setdefault(iid,np.zeros((self.K,1)))
        self.pu.setdefault(uid,np.zeros((self.K,1)))
        self.y.setdefault(uid,np.zeros((self.K,1)))
        self.u_dict.setdefault(uid,[])
        u_impl_prf,sqrt_Nu=self.getY(uid, iid)
        rating=self.avg+self.bi[iid]+self.bu[uid]+np.sum(self.qi[iid]*(self.pu[uid]+u_impl_prf)) #预测评分公式
        #由于评分范围在1到5，所以当分数大于5或小于1时，返回5,1.
        if rating>5:
            rating=5
        if rating<1:
            rating=1
        return rating
    
    #计算sqrt_Nu和∑yj
    def getY(self,uid,iid):
        Nu=self.u_dict[uid]
        I_Nu=len(Nu)
        sqrt_Nu=np.sqrt(I_Nu)
        y_u=np.zeros((self.K,1))
        if I_Nu==0:
            u_impl_prf=y_u
        else:
            for i in Nu:
                y_u+=self.y[i]
            u_impl_prf = y_u / sqrt_Nu
        
        return u_impl_prf,sqrt_Nu
    
    def train(self,steps=30,gamma=0.04,Lambda=0.15):    #训练函数，step为迭代次数。
        print('train data size',self.mat.shape)     
        for step in range(steps):
            print('step',step+1,'is running')
            KK=np.random.permutation(self.mat.shape[0]) #随机梯度下降算法，kk为对矩阵进行随机洗牌
            rmse=0.0
            for i in range(self.mat.shape[0]):    
                j=KK[i]
                uid=self.mat[j,0]
                iid=self.mat[j,1]
                rating=self.mat[j,2]
                
                #
                uids='%d'%uid
                iids='%d'%iid
                self.RR.setdefault(uids,{})
                self.RR[uids].setdefault(iids,rating)
                
                predict=self.predict(uid, iid)
                u_impl_prf,sqrt_Nu=self.getY(uid, iid)
                eui=rating-predict            
                rmse+=eui**2
                self.bu[uid]+=gamma*(eui-Lambda*self.bu[uid])  
                self.bi[iid]+=gamma*(eui-Lambda*self.bi[iid])
                self.pu[uid]+=gamma*(eui*self.qi[iid]-Lambda*self.pu[uid])
                self.qi[iid]+=gamma*(eui*(self.pu[uid]+u_impl_prf)-Lambda*self.qi[iid])
                for j in self.u_dict[uid]:
                    self.y[j]+=gamma*(eui*self.qi[j]/sqrt_Nu-Lambda*self.y[j])
                                    
            gamma=0.93*gamma 
            print('rmse is',np.sqrt(rmse/self.mat.shape[0]))
            return self.RR

    
    def test(self,testSet):  #gamma以0.93的学习率递减
        
        testSet=np.array(testSet)
        print('test data size',testSet.shape)
        rmse=0.0
        for i in range(testSet.shape[0]):
            uid=testSet[i,0]
            iid=testSet[i,1]
            rating=testSet[i,2]
            eui=rating-self.predict(uid, iid)
            rmse+=eui**2
        print('rmse of test data is',np.sqrt(rmse/testSet.shape[0]))
    
    
def getMLData(): #获取训练集和测试集的函数
    import re
    f=open("u1.base",'r')
    lines=f.readlines()
    f.close()
    data=[]
    for line in lines:
        list=re.split('\t|\n',line)
        #
        (userId, itemId, rating, timestamp) = line.strip().split('\t') 
        
        if int(list[2]) !=0:
            data.append([int(i) for i in list[:3]])
        #
        movieUser.setdefault(itemId,[])
        movieUser[itemId].append(userId.strip())
        
    trainSet=data
    f=open("u1.test",'r')
    lines=f.readlines()
    f.close()
    data=[]
    for line in lines:
         #
        (userId, itemId, rating, timestamp) = line.strip().split('\t')   
        testSet2.setdefault(userId,{})
        testSet2[userId].setdefault(itemId,float(rating))
        
        list=re.split('\t|\n',line)
        if int(list[2]) !=0:
            data.append([int(i) for i in list[:3]])
    testSet=data
   
    
    #
    #生成用户用户共有电影矩阵
    for m in movieUser.keys():
        for u in movieUser[m]:
            u2u.setdefault(u,{})
            for n in movieUser[m]:
                if u!=n:
                    u2u[u].setdefault(n,[])
                    u2u[u][n].append(m)
    
    return trainSet,testSet,testSet2,u2u

#
#计算一个用户的平均评分  
def getAverageRating(user):  
    average = (sum(RR[user].values())*1.0) / len(RR[user].keys())  
    return average


#
#计算用户相似度  
def getUserSim(u2u,RR):
    userSim = {}
    # 计算用户的用户相似度  
    for u in u2u.keys(): #对每个用户u
        userSim.setdefault(u,{})  #将用户u加入userSim中设为key，该用户对应一个字典
        average_u_rate = getAverageRating(u)  #获取用户u对电影的平均评分
        for n in u2u[u].keys():  #对与用户u相关的每个用户n             
            userSim[u].setdefault(n,0)  #将用户n加入用户u的字典中
 
            average_n_rate = getAverageRating(n)  #获取用户n对电影的平均评分
              
            part1 = 0  #皮尔逊相关系数的分子部分
            part2 = 0  #皮尔逊相关系数的分母的一部分
            part3 = 0  #皮尔逊相关系数的分母的一部分
            for m in u2u[u][n]:  #对用户u和用户n的共有的每个电影  
                part1 += (RR[u][m]-average_u_rate)*(RR[n][m]-average_n_rate)*1.0  
                part2 += pow(RR[u][m]-average_u_rate, 2)*1.0  
                part3 += pow(RR[n][m]-average_n_rate, 2)*1.0  
                  
            part2 = sqrt(part2)  
            part3 = sqrt(part3)  
            if part2 == 0 or part3 == 0:  #若分母为0，相似度为0
                userSim[u][n] = 0
            else:
                userSim[u][n] = part1 / (part2 * part3)
    return userSim

#
#寻找用户最近邻并生成推荐结果
def getRecommendations(N,RR,userSim):
    pred = {}
    for user in RR.keys():    #对每个用户
        pred.setdefault(user,{})    #生成预测空列表
        interacted_items = RR[user].keys() #获取该用户评过分的电影  
        average_u_rate = getAverageRating(user)  #获取该用户的评分平均分
        userSimSum = 0
        simUser = sorted(userSim[user].items(),key = lambda x : x[1],reverse = True)[0:N]
        for n, sim in simUser:  
            average_n_rate = getAverageRating(n)
            userSimSum += sim   #对该用户近邻用户相似度求和
            for m, nrating in RR[n].items():  
                if m in interacted_items:  
                    continue  
                else:
                    pred[user].setdefault(m,0)
                    pred[user][m] += (sim * (nrating - average_n_rate))
        for m in pred[user].keys():  
                pred[user][m] = average_u_rate + (pred[user][m]*1.0) / userSimSum
    return pred

#
#计算预测分析准确度
def getRMSE(testSet2,pred):
    MAE = 0
    rSum = 0
    setSum = 0
 
    for user in pred.keys():    #对每一个用户
        for movie, rating in pred[user].items():    #对该用户预测的每一个电影    
            if user in testSet2.keys() and movie in testSet2[user].keys() :   #如果用户为该电影评过分
                setSum = setSum + 1     #预测准确数量+1
                rSum = rSum + abs(testSet2[user][movie]-rating)*abs(testSet2[user][movie]-rating)      #累计预测评分误差
    MSE = rSum / setSum
    RMSE=(MSE)**0.5
    return RMSE


movieUser = {}
testSet2={}
trainSet,testSet,testSet2,u2u=getMLData()
RR={}
a=SVDPP(trainSet,30) 
RR=a.train()
print(RR)
userSim = getUserSim(u2u,RR)
print('正在寻找最近邻...')
for N in (5,10,20,30,40,50,60,70,80,90,100):            #对不同的近邻数
        pred = getRecommendations(N,RR,userSim)   #获得推荐
        rmse = getRMSE(testSet2,pred)  #计算MAE
        print('邻居数为：N= %d 时 预测评分准确度为：RMSE=%f'%(N,rmse))
a.test(testSet)

train data size (80000, 3)
step 1 is running
rmse is 1.037133470005452
{'178': {'751': 4, '31': 4, '100': 4, '1315': 4, '588': 4, '222': 4, '153': 4, '156': 2, '993': 5, '249': 3, '226': 4, '654': 3, '472': 4, '331': 4, '881': 2, '300': 5, '823': 2, '1016': 4, '333': 3, '202': 5, '62': 4, '82': 5, '304': 4, '435': 4, '230': 4, '164': 3, '66': 4, '220': 3, '1033': 2, '298': 2, '64': 5, '658': 5, '173': 5, '316': 4, '238': 4, '281': 3, '95': 5, '157': 5, '792': 5, '51': 4, '143': 4, '744': 3, '506': 3, '729': 4, '313': 5, '180': 3, '763': 4, '405': 3, '293': 4, '69': 5, '724': 4, '168': 4, '511': 5, '483': 4, '97': 5, '764': 3, '79': 4, '756': 3, '625': 3, '790': 3, '591': 5, '607': 3, '1011': 3, '12': 5, '50': 5, '684': 5, '235': 1, '38': 3, '92': 3, '11': 5, '172': 4, '280': 4, '200': 3, '203': 4, '781': 4, '246': 4, '194': 4, '873': 3, '735': 5, '269': 4, '1047': 2, '354': 4, '233': 4, '535': 3, '846': 3, '265': 5, '628': 4, '423': 4, '1101': 4, '1197': 4, '127': 5, '215': 5, '1038': 

正在寻找最近邻...
邻居数为：N= 5 时 预测评分准确度为：RMSE=1.067058
邻居数为：N= 10 时 预测评分准确度为：RMSE=1.057628
邻居数为：N= 20 时 预测评分准确度为：RMSE=1.058863
邻居数为：N= 30 时 预测评分准确度为：RMSE=1.060133
邻居数为：N= 40 时 预测评分准确度为：RMSE=1.051688
邻居数为：N= 50 时 预测评分准确度为：RMSE=1.046364
邻居数为：N= 60 时 预测评分准确度为：RMSE=1.043824
邻居数为：N= 70 时 预测评分准确度为：RMSE=1.041982
邻居数为：N= 80 时 预测评分准确度为：RMSE=1.038821
邻居数为：N= 90 时 预测评分准确度为：RMSE=1.040284
邻居数为：N= 100 时 预测评分准确度为：RMSE=1.038841
test data size (20000, 3)
rmse of test data is 0.9890945654509211


In [18]:
from math import sqrt
from sklearn.metrics import mean_squared_error

    
def loadData():
    trainSet = {}
    testSet = {}
    movieUser = {}
    u2u = {}
    TrainFile = 'u1.base'   #指定训练集 
    TestFile = 'u1.test'    #指定测试集
    
    #加载训练集
    for line in open(TrainFile):
        (userId, itemId, rating, timestamp) = line.strip().split('\t')   
        trainSet.setdefault(userId,{})
        trainSet[userId].setdefault(itemId,float(rating))
 
        movieUser.setdefault(itemId,[])
        movieUser[itemId].append(userId.strip())
    #加载测试集
    for line in open(TestFile): 
        (userId, itemId, rating, timestamp) = line.strip().split('\t')   
        testSet.setdefault(userId,{})
        testSet[userId].setdefault(itemId,float(rating))
 
    #生成用户用户共有电影矩阵
    for m in movieUser.keys():
        for u in movieUser[m]:
            u2u.setdefault(u,{})
            for n in movieUser[m]:
                if u!=n:
                    u2u[u].setdefault(n,[])
                    u2u[u][n].append(m)
    return trainSet,testSet,u2u
      
  
 
#计算一个用户的平均评分  
def getAverageRating(user):  
    average = (sum(trainSet[user].values())*1.0) / len(trainSet[user].keys())  
    return average
 
#计算用户相似度  
def getUserSim(u2u,trainSet):
    userSim = {}
    # 计算用户的用户相似度  
    for u in u2u.keys(): #对每个用户u
        userSim.setdefault(u,{})  #将用户u加入userSim中设为key，该用户对应一个字典
        average_u_rate = getAverageRating(u)  #获取用户u对电影的平均评分
        for n in u2u[u].keys():  #对与用户u相关的每个用户n             
            userSim[u].setdefault(n,0)  #将用户n加入用户u的字典中
 
            average_n_rate = getAverageRating(n)  #获取用户n对电影的平均评分
              
            part1 = 0  #皮尔逊相关系数的分子部分
            part2 = 0  #皮尔逊相关系数的分母的一部分
            part3 = 0  #皮尔逊相关系数的分母的一部分
            for m in u2u[u][n]:  #对用户u和用户n的共有的每个电影  
                part1 += (trainSet[u][m]-average_u_rate)*(trainSet[n][m]-average_n_rate)*1.0  
                part2 += pow(trainSet[u][m]-average_u_rate, 2)*1.0  
                part3 += pow(trainSet[n][m]-average_n_rate, 2)*1.0  
                  
            part2 = sqrt(part2)  
            part3 = sqrt(part3)  
            if part2 == 0 or part3 == 0:  #若分母为0，相似度为0
                userSim[u][n] = 0
            else:
                userSim[u][n] = part1 / (part2 * part3)
    return userSim
  
 
#寻找用户最近邻并生成推荐结果
def getRecommendations(N,trainSet,userSim):
    pred = {}
    for user in trainSet.keys():    #对每个用户
        pred.setdefault(user,{})    #生成预测空列表
        interacted_items = trainSet[user].keys() #获取该用户评过分的电影  
        average_u_rate = getAverageRating(user)  #获取该用户的评分平均分
        userSimSum = 0
        simUser = sorted(userSim[user].items(),key = lambda x : x[1],reverse = True)[0:N]
        for n, sim in simUser:  
            average_n_rate = getAverageRating(n)
            userSimSum += sim   #对该用户近邻用户相似度求和
            for m, nrating in trainSet[n].items():  
                if m in interacted_items:  
                    continue  
                else:
                    pred[user].setdefault(m,0)
                    pred[user][m] += (sim * (nrating - average_n_rate))
        for m in pred[user].keys():  
                pred[user][m] = average_u_rate + (pred[user][m]*1.0) / userSimSum
    return pred
 
#计算预测分析准确度
def getRMSE(testSet,pred):
    MAE = 0
    rSum = 0
    setSum = 0
 
    for user in pred.keys():    #对每一个用户
        for movie, rating in pred[user].items():    #对该用户预测的每一个电影    
            if user in testSet.keys() and movie in testSet[user].keys() :   #如果用户为该电影评过分
                setSum = setSum + 1     #预测准确数量+1
                rSum = rSum + abs(testSet[user][movie]-rating)*abs(testSet[user][movie]-rating)      #累计预测评分误差
    MSE = rSum / setSum
    RMSE=(MSE)**0.5
    return RMSE

 
 
if __name__ == '__main__':
    
    print('正在加载数据...')
    trainSet,testSet,u2u = loadData()
    print(trainSet)
    print('正在计算用户间相似度...')
    userSim = getUserSim(u2u,trainSet)
    
    print('正在寻找最近邻...')
    for N in (5,10,20,30,40,50,60,70,80,90,100):            #对不同的近邻数
        pred = getRecommendations(N,trainSet,userSim)   #获得推荐
        rmse = getRMSE(testSet,pred)  #计算MAE
        print('邻居数为：N= %d 时 预测评分准确度为：RMSE=%f'%(N,rmse))


正在加载数据...
{'1': {'1': 5.0, '2': 3.0, '3': 4.0, '4': 3.0, '5': 3.0, '7': 4.0, '8': 1.0, '9': 5.0, '11': 2.0, '13': 5.0, '15': 5.0, '16': 5.0, '18': 4.0, '19': 5.0, '21': 1.0, '22': 4.0, '25': 4.0, '26': 3.0, '28': 4.0, '29': 1.0, '30': 3.0, '32': 5.0, '34': 2.0, '35': 1.0, '37': 2.0, '38': 3.0, '40': 3.0, '41': 2.0, '42': 5.0, '43': 4.0, '45': 5.0, '46': 4.0, '48': 5.0, '50': 5.0, '52': 4.0, '55': 5.0, '57': 5.0, '58': 4.0, '59': 5.0, '63': 2.0, '66': 4.0, '68': 4.0, '71': 3.0, '75': 4.0, '77': 4.0, '79': 4.0, '83': 3.0, '87': 5.0, '88': 4.0, '89': 5.0, '93': 5.0, '94': 2.0, '95': 4.0, '99': 3.0, '101': 2.0, '105': 2.0, '106': 4.0, '109': 5.0, '110': 1.0, '111': 5.0, '115': 5.0, '116': 3.0, '119': 5.0, '122': 3.0, '123': 4.0, '124': 5.0, '126': 2.0, '127': 5.0, '131': 1.0, '133': 4.0, '135': 4.0, '136': 3.0, '137': 5.0, '138': 1.0, '139': 3.0, '141': 3.0, '142': 2.0, '144': 4.0, '146': 4.0, '147': 3.0, '149': 2.0, '152': 5.0, '153': 3.0, '156': 4.0, '158': 3.0, '162': 4.0, '165': 5.0, '

KeyboardInterrupt: 