**用之前准备好的数据做协同过滤，最后将所有信息串起来生成新的训练测试数据**

In [68]:
# 将所有特征串联起来，构成RS_Train.csv
#RS_Test.csv
#为最后推荐系统做准备
import pickle
import numpy as np
import scipy.io as sio
import scipy.sparse as ss
from numpy.random import random  
from collections import defaultdict

In [69]:
train_dpath = "C:/Users/Lzg/Desktop/data/w4/train.csv"
test_dpath = "C:/Users/Lzg/Desktop/data/w4/test.csv"

**三种协同过滤代码补充**

In [88]:
class RecommonderSystem:
  def __init__(self):
    # 读入数据做初始化
    
    #用户和活动新的索引
    self.userIndex = pickle.load(open("PE_userIndex.pkl", 'rb'))
    self.eventIndex = pickle.load(open("PE_eventIndex.pkl", 'rb'))
    self.n_users = len(self.userIndex)
    self.n_items = len(self.eventIndex)
    
    #用户-活动关系矩阵R
    #在train_SVD会重新从文件中读取,二者要求的格式不同，来不及统一了:(
    self.userEventScores = sio.mmread("PE_userEventScores").todense()
    
    #用户平均打分
    self.nonzero_scores_index = np.transpose(np.nonzero(self.userEventScores))
    self.n_nonzero_scores = self.nonzero_scores_index.shape[0]
    self.mu = np.sum(self.userEventScores)/self.n_nonzero_scores
    print("n_nonzero_scores = "+str(self.n_nonzero_scores))
    print("self.mu = "+str(self.mu))
    
    #倒排表
    ##每个用户参加的事件
    self.itemsForUser = pickle.load(open("PE_eventsForUser.pkl", 'rb'))
    ##事件参加的用户
    self.usersForItem = pickle.load(open("PE_usersForEvent.pkl", 'rb'))
    
    #基于模型的协同过滤参数初始化,训练
    self.init_SVD()
    self.train_SVD(trainfile =train_dpath)
    
    #根据用户属性计算出的用户之间的相似度
    self.userSimMatrix = sio.mmread("US_userSimMatrix").todense()
    
    #根据活动属性计算出的活动之间的相似度
    self.eventPropSim = sio.mmread("EV_eventPropSim").todense()
    self.eventContSim = sio.mmread("EV_eventContSim").todense()
    
    #每个用户的朋友的数目
    self.numFriends = sio.mmread("UF_numFriends")
    #用户的每个朋友参加活动的分数对该用户的影响
    self.userFriends = sio.mmread("UF_userFriends").todense()
    
    #活动本身的热度
    self.eventPopularity = sio.mmread("EA_eventPopularity").todense()

  def init_SVD(self, K=20):
    #初始化模型参数（for 基于模型的协同过滤SVD_CF）
    self.K = K  
    
    #init parameters
    #bias
    self.bi = np.zeros(self.n_items)  
    self.bu = np.zeros(self.n_users)  
    
    #the small matrix
    self.P = random((self.n_users,self.K))/10*(np.sqrt(self.K))
    self.Q = random((self.K, self.n_items))/10*(np.sqrt(self.K))  
                  
          
  def train_SVD(self,trainfile = train_dpath, steps=100,gamma=0.04,Lambda=0.15):
    #训练SVD模型（for 基于模型的协同过滤SVD_CF）
    #gamma：为学习率
    #Lambda：正则参数
    
    #偷懒了，为了和原来的代码的输入接口一样，直接从训练文件中去读取数据
    print("SVD Train...")
    ftrain = open(trainfile, 'r')
    ftrain.readline()
    self.mu = 0.0
    n_records = 0
    uids = []  #每条记录的用户索引
    i_ids = [] #每条记录的item索引
    #用户-Item关系矩阵R（内容同userEventScores相同），临时变量，训练完了R不再需要
    R = np.zeros((self.n_users, self.n_items))
    
    for line in ftrain:
        cols = line.strip().split(",")
        u = self.userIndex[cols[0]]  #用户
        i = self.eventIndex[cols[1]] #活动
        
        uids.append(u)
        i_ids.append(i)
        
        R[u,i] = int(cols[4])  #interested
        self.mu += R[u,i]
        n_records += 1
    
    ftrain.close()
    self.mu /= n_records
    
    # 请补充完整SVD模型训练过程
    for step in range(steps):
        rmse_sum = 0.0
        kk = np.random.permutation(self.n_nonzero_scores)
        for j in range(self.n_nonzero_scores):
            b = kk[j]
            temp = self.nonzero_scores_index[b]
            u = temp[0]
            i = temp[1]
            
            #预测残差计算
            eui = self.userEventScores[u,i]-self.pred_SVD(u,i)
            #残差平方和
            rmse_sum += eui**2
            #开始进行梯度下降的更新,p,Q,bu,bi
            for k in range(self.K):
                self.P[u,k] += gamma*eui*self.Q[k,i]-Lambda*self.P[u,k]
                self.Q[k,i] += gamma*eui*self.P[u,k]-Lambda*self.Q[k,i]
                
                self.bu[u] += gamma*(eui-Lambda*self.bu[u])
                self.bi[i] += gamma*(eui-Lambda*self.bi[i])
        gamma =gamma*0.93  #学习率递减  
            
    print ("SVD trained")
    
  def pred_SVD(self, uid, i_id):
    #根据当前参数，预测用户uid对Item（i_id）的打分        
    ans=self.mu + self.bi[i_id] + self.bu[uid] + np.dot(self.P[uid,:],self.Q[:,i_id])  
        
    #将打分范围控制在0-1之间
    if ans>1:  
        return 1  
    elif ans<0:  
        return 0
    return ans  

  def sim_cal_UserCF(self, uid1, uid2 ):
    #请补充基于用户的协同过滤中的两个用户uid1和uid2之间的相似度（根据两个用户对item打分的相似度）
    similarity = 0.0
    
    #找到两个用户都打过分的事件
    item_common = {}   
    for item in self.itemsForUser[uid1]:
        if item in self.itemsForUser[uid2]:
            item_common[item] = 1
    n = len(item_common)
    if(n==0):
        similarity = 0
        return similarity
    #计算相似度合
    s1 = np.array([self.userEventScores[uid1,item] for item in item_common])
    s2 = np.array([self.userEventScores[uid2,item] for item in item_common]) 
  
    sum1 = np.sum(s1)
    sum2 = np.sum(s2)
    sum1sq = np.sum(s1**2)
    sum2sq = np.sum(s2**2)
    psum = np.sum(s1*s2)
    #分子
    num = psum-(sum1*sum2/n)
    #分母
    den = np.sqrt((sum1sq-sum1**2/n)*(sum2sq-sum2**2/n))
    if den == 0:
        similarity = 0
        return 0
    similarity = num/den
    return similarity  

  def userCFReco(self, userId, eventId):
    """
    根据User-based协同过滤，得到event的推荐度
    基本的伪代码思路如下：
    for item i
      for every other user v that has a preference for i
        compute similarity s between u and v
        incorporate v's preference for i weighted by s into running aversge
    return top items ranked by weighted average
    """
    #请补充完整代码
    ans = 0.0
    u = self.userIndex[userId]
    i = self.eventIndex[eventId]
    sim_acc = 0.0
    rat_acc = 0.0
    for user in self.usersForItem[i]:
        sim = self.sim_cal_UserCF(uid1 = user, uid2 = u )  #算一下当前用户与其他用户的相似度，在参加过同个item的用户中计算
        if sim == 0:
            continue
        rat_acc += sim*userEventScores[user,i]  #打分加上用户相似度的权重
        sim_acc += sim 
    if sim_acc == 0:   #没有用户关联
        return self.mu
    ans = rat_acc/sim_acc 
    if ans > 2:      
        return 2
    elif ans < -1:
        return -1
    return ans


  def sim_cal_ItemCF(self, i_id1, i_id2):
    #计算Item i_id1和i_id2之间的相似性
    #请补充完整代码
    similarity = 0.0
    si = {}
    for user in self.usersForItem[i_id1]:   #找到参加相同时间的用户
        if user in self.usersForItem[i_id2]:
            si[user] = 1
    n = len(si)
    if (n == 0):
        return 0
    s1 = np.array([self.userEventScores[u,i_id1] for u in si])
    s2 = np.array([self.userEventScores[u,i_id2] for u in si])
  
    sum1 = np.sum(s1)
    sum2 = np.sum(s2)
    sum1sq = np.sum(s1**2)
    sum2sq = np.sum(s2**2)
    psum = np.sum(s1*s2)
    #分子
    num = psum-(sum1*sum2/n)
    #分母
    den = np.sqrt((sum1sq-sum1**2/n)*(sum2sq-sum2**2/n))
    if den == 0:
        return 0
    simularity =  num/den   
    return simularity      
            
  def eventCFReco(self, userId, eventId):    
    """
    根据基于物品的协同过滤，得到Event的推荐度
    基本的伪代码思路如下：
    for item i 
        for every item j tht u has a preference for
            compute similarity s between i and j
            add u's preference for j weighted by s to a running average
    return top items, ranked by weighted average
    """
    #请补充完整代码
    ans = 0.0
    u = self.userIndex[userId]
    i = self.eventIndex[eventId]
    sim_acc = 0.0
    rat_acc = 0.0
    for item in self.itemsForUser[u]:
        sim = self.sim_cal_ItemCF(item,i)
        rat_acc += sim*self.userEventScores[u,item]   #根据相关item相似度对感兴趣程度进行加权
        sim_acc += sim
    if sim_acc == 0:     
        return self.mu
    ans = rat_acc/sim_acc
    if ans > 2:    #取值归一到-1到2之间
        return 2
    elif ans <-1:
        return -1
    return ans
    
  def svdCFReco(self, userId, eventId):
    #基于模型的协同过滤, SVD++/LFM
    u = self.userIndex[userId]
    i = self.eventIndex[eventId]

    return self.pred_SVD(u,i)

  def userReco(self, userId, eventId):
    """
    类似基于User-based协同过滤，只是用户之间的相似度由用户本身的属性得到，计算event的推荐度
    基本的伪代码思路如下：
    for item i
      for every other user v that has a preference for i
        compute similarity s between u and v
        incorporate v's preference for i weighted by s into running aversge
    return top items ranked by weighted average
    """
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]

    vs = self.userEventScores[:, j]
    sims = self.userSimMatrix[i, :]

    prod = sims * vs

    try:
      return prod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      return 0

  def eventReco(self, userId, eventId):
    """
    类似基于Item-based协同过滤，只是item之间的相似度由item本身的属性得到，计算Event的推荐度
    基本的伪代码思路如下：
    for item i 
      for every item j that u has a preference for
        compute similarity s between i and j
        add u's preference for j weighted by s to a running average
    return top items, ranked by weighted average
    """
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]
    js = self.userEventScores[i, :]
    psim = self.eventPropSim[:, j]
    csim = self.eventContSim[:, j]
    pprod = js * psim
    cprod = js * csim
    
    pscore = 0
    cscore = 0
    try:
      pscore = pprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    try:
      cscore = cprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    return pscore, cscore

  def userPop(self, userId):
    """
    基于用户的朋友个数来推断用户的社交程度
    主要的考量是如果用户的朋友非常多，可能会更倾向于参加各种社交活动
    """
    if userId in self.userIndex:
      i = self.userIndex[userId]
      try:
        return self.numFriends[0, i]
      except IndexError:
        return 0
    else:
      return 0

  def friendInfluence(self, userId):
    """
    朋友对用户的影响
    主要考虑用户所有的朋友中，有多少是非常喜欢参加各种社交活动/event的
    用户的朋友圈如果都积极参与各种event，可能会对当前用户有一定的影响
    """
    nusers = np.shape(self.userFriends)[1]
    i = self.userIndex[userId]
    return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]

  def eventPop(self, eventId):
    """
    本活动本身的热度
    主要是通过参与的人数来界定的
    """
    i = self.eventIndex[eventId]
    return self.eventPopularity[i, 0]


**产生新的训练数据和测试数据**

In [91]:
def generateRSData(RS, train=True, header=True):
    """
    把前面user-based协同过滤 和 item-based协同过滤，以及各种热度和影响度作为特征组合在一起
    生成新的训练数据，用于分类器分类使用
    """
    
    fn = train_dpath if train else test_dpath
    fin = open(fn, 'r')
    fname = "train.csv" if train else "test.csv"
    fout = open("RS_"+fname, 'w')
    
    #忽略第一行（列名字）
    fin.readline().strip().split(",")
    
    # write output header
    if header:
      ocolnames = ["invited", "userCF_reco", "evtCF_reco","svdCF_reco","user_reco", "evt_p_reco",
        "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]
      if train:
        ocolnames.append("interested")
        ocolnames.append("not_interested")
      fout.write(",".join(ocolnames) + "\n")
    
    ln = 0
    for line in fin:
      ln += 1
      if ln%500 == 0:
          print ("%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId))
          #break;
      
      cols = line.strip().split(",")
      userId = cols[0]
      eventId = cols[1]
      invited = cols[2]
      
      userCF_reco = RS.userCFReco(userId, eventId)
      itemCF_reco = RS.eventCFReco(userId, eventId)
      svdCF_reco = RS.svdCFReco(userId, eventId)
        
      user_reco = RS.userReco(userId, eventId)
      evt_p_reco, evt_c_reco = RS.eventReco(userId, eventId)
      user_pop = RS.userPop(userId)
     
      frnd_infl = RS.friendInfluence(userId)
      evt_pop = RS.eventPop(eventId)
      ocols = [invited, userCF_reco, itemCF_reco, svdCF_reco,user_reco, evt_p_reco,
        evt_c_reco, user_pop, frnd_infl, evt_pop]
      
      if train:
        ocols.append(cols[4]) # interested
        ocols.append(cols[5]) # not_interested
      fout.write(",".join(map(lambda x: str(x), ocols)) + "\n")
    
    
    fin.close()
    fout.close()


**生成实例调用**

In [92]:
RS = RecommonderSystem()
print ("生成训练数据...\n")
generateRSData(RS,train=True,  header=True)

print ("生成预测数据...\n")
generateRSData(RS, train=False, header=True)

n_nonzero_scores = 4131
self.mu = 1.0
SVD Train...
SVD trained
生成训练数据...

C:/Users/Lzg/Desktop/data/w4/train.csv:500 (userId, eventId)=(123290209, 1887085024)
C:/Users/Lzg/Desktop/data/w4/train.csv:1000 (userId, eventId)=(272886293, 199858305)
C:/Users/Lzg/Desktop/data/w4/train.csv:1500 (userId, eventId)=(395305791, 1582270949)
C:/Users/Lzg/Desktop/data/w4/train.csv:2000 (userId, eventId)=(527523423, 3272728211)
C:/Users/Lzg/Desktop/data/w4/train.csv:2500 (userId, eventId)=(651258472, 792632006)
C:/Users/Lzg/Desktop/data/w4/train.csv:3000 (userId, eventId)=(811791433, 524756826)
C:/Users/Lzg/Desktop/data/w4/train.csv:3500 (userId, eventId)=(985547042, 1269035551)
C:/Users/Lzg/Desktop/data/w4/train.csv:4000 (userId, eventId)=(1107615001, 173949238)
C:/Users/Lzg/Desktop/data/w4/train.csv:4500 (userId, eventId)=(1236336671, 3849306291)
C:/Users/Lzg/Desktop/data/w4/train.csv:5000 (userId, eventId)=(1414301782, 2652356640)
C:/Users/Lzg/Desktop/data/w4/train.csv:5500 (userId, eventId)=(15954

**看一下写好的新数据**

In [95]:
import pandas as pd
train = pd.read_csv("RS_train.csv")
train.head()

Unnamed: 0,invited,userCF_reco,evtCF_reco,svdCF_reco,user_reco,evt_p_reco,evt_c_reco,user_pop,frnd_infl,evt_pop,interested,not_interested
0,0,0.0,0.268282,0.60911,0.0,0.8259265,0.8259265,0.000231,0.0,-3.9e-05,0,0
1,0,0.0,0.268282,0.60911,0.0,0.1649779,0.1649779,0.000231,0.0,1.8e-05,0,0
2,0,1.2,0.591891,0.948998,122.134697,-1.0,-1.0,0.000231,0.0,0.000173,1,0
3,0,0.0,0.268282,0.60911,0.0,1.024117,1.024117,0.000231,0.0,1.6e-05,0,0
4,0,-0.102564,0.168161,0.949058,27.177369,2.443664e-07,2.443664e-07,0.000231,0.0,6.4e-05,0,0


In [97]:
train.shape

(15398, 12)

In [98]:
test = pd.read_csv("RS_test.csv")
test.head()

Unnamed: 0,invited,userCF_reco,evtCF_reco,svdCF_reco,user_reco,evt_p_reco,evt_c_reco,user_pop,frnd_infl,evt_pop
0,0,0.268282,0.268282,1.0,0.0,0.0,0.0,0.000118,0.0,0.000138
1,0,0.268282,0.268282,1.0,0.0,0.0,0.0,0.000118,0.0,3e-05
2,0,0.268282,0.268282,1.0,0.0,0.0,0.0,0.000118,0.0,5.8e-05
3,0,0.268282,0.268282,1.0,0.0,0.0,0.0,0.000118,0.0,6.9e-05
4,0,0.268282,0.268282,1.0,0.0,0.0,0.0,0.000118,0.0,4.6e-05


In [99]:
test.shape

(10237, 10)

**本次数据量很大，对用户、事件的处理虽然有大部分代码，理解还是用了很长时间，在随机梯度下降算法中开始没有用R矩阵中的用户事件索引，直接遍历train文件中的用户和事件，效率非常低，还是要尽量使用稀疏矩阵，用有效数据。特征工程方面比较难，需要积累经验。**