In [1]:
path='../../event_recommendation_engine_challenge_data/'

In [18]:
# 将所有特征串联起来，构成RS_Train.csv
#RS_Test.csv
#为最后推荐系统做准备
from __future__ import division

import pickle
import numpy as np
import scipy.io as sio
import scipy.sparse as ss
from numpy.random import random  
from collections import defaultdict

class RecommonderSystem:
    def __init__(self,X):
        # 读入数据做初始化
        self.X=X
    
        #用户和活动新的索引
        self.userIndex = pickle.load(open(path+"PE_userIndex.pkl", 'rb'))
        self.eventIndex = pickle.load(open(path+"PE_eventIndex.pkl", 'rb'))
        self.n_users = len(self.userIndex)
        self.n_items = len(self.eventIndex)
    
        #用户-活动关系矩阵R
        #在train_SVD会重新从文件中读取,二者要求的格式不同，来不及统一了:(
        self.userEventScores = sio.mmread(path+"PE_userEventScores").todense()
    
        #倒排表
        ##每个用户参加的事件
        self.itemsForUser = pickle.load(open(path+"PE_eventsForUser.pkl", 'rb'))
        ##事件参加的用户
        self.usersForItem = pickle.load(open(path+"PE_usersForEvent.pkl", 'rb'))
    
        #基于模型的协同过滤参数初始化,训练
        self.init_SVD()
        self.train_SVD(self.X)
    
        #根据用户属性计算出的用户之间的相似度
        self.userSimMatrix = sio.mmread(path+"US_userSimMatrix").todense()
    
        #根据活动属性计算出的活动之间的相似度
        self.eventPropSim = sio.mmread(path+"EV_eventPropSim").todense()
        self.eventContSim = sio.mmread(path+"EV_eventContSim").todense()
    
        #每个用户的朋友的数目
        self.numFriends = sio.mmread(path+"UF_numFriends")
        #用户的每个朋友参加活动的分数对该用户的影响
        self.userFriends = sio.mmread(path+"UF_userFriends").todense()
    
        #活动本身的热度
        self.eventPopularity = sio.mmread(path+"EA_eventPopularity").todense()
        
        self.n_Items = len(self.usersForItem)+1 #数组的索引从0开始，浪费第0个元素
        self.similarity = np.zeros((self.n_Items, self.n_Items), dtype=np.float)
        self.similarity[:,:] = -1


    def init_SVD(self, K=20):
        #初始化模型参数（for 基于模型的协同过滤SVD_CF）
        self.K = K  
    
        #init parameters
        #bias
        self.bi = np.zeros(self.n_items)  
        self.bu = np.zeros(self.n_users)  
        self.mu = 0.0
    
        #the small matrix
        self.P = random((self.n_users,self.K))/10*(np.sqrt(self.K))
        self.Q = random((self.K, self.n_items))/10*(np.sqrt(self.K))  
                  
          
    def train_SVD(self,X, steps=100,gamma=0.04,Lambda=0.15):
        #训练SVD模型（for 基于模型的协同过滤SVD_CF）
        #gamma：为学习率
        #Lambda：正则参数
    
        #偷懒了，为了和原来的代码的输入接口一样，直接从训练文件中去读取数据
        print ("SVD Train...")
        self.mu = np.mean(self.X[:,4])
        for step in range(steps): 
            print ('the ',step,'-th  step is running')  

            uids = []  #每条记录的用户索引
            i_ids = [] #每条记录的item索引
            #用户-Item关系矩阵R（内容同userEventScores相同），临时变量，训练完了R不再需要
            R = np.zeros((self.n_users, self.n_items))
            
            #将训练样本打散顺序
            kk = np.random.permutation(self.X.shape[0])
            for j in range(self.X.shape[0]):
                #每次一个训练样本
                k=kk[j]
                u = self.userIndex[self.X[k][0]]  #用户
                i = self.eventIndex[self.X[k][1]] #活动
        
                uids.append(u)
                i_ids.append(i)
        
                R[u,i] = int(self.X[k][4])  #interested

            
                #随机梯度下降，更新
                self.bu[u]+=gamma*(eui-Lambda*self.bu[u])  
                self.bi[i]+=gamma*(eui-Lambda*self.bi[i]) 
                
                temp=self.Q[i]  
                self.Q[i]+=gamma*(eui*self.P[uid]-Lambda*self.Q[i])  
                self.P[u]+=gamma*(eui*temp-Lambda*self.P[u])
                #学习率递减
            gamma=gamma*0.93
        # 请补充完整SVD模型训练过程
        print ("SVD trained")
    
    def pred_SVD(self, uid, i_id):
        #根据当前参数，预测用户uid对Item（i_id）的打分 
        self.bi.setdefault(i_id,0)  
        self.bu.setdefault(uid,0)  
        
        self.Q.setdefault(i_id,np.zeros((self.k,1)))  
        self.P.setdefault(uid,np.zeros((self.k,1)))  
        
        if (self.Q[i_id].all()==None):  
            self.Q[i_id]=np.zeros((self.k,1))  
        if (self.P[uid].all()==None):  
            self.P[uid]=np.zeros((self.k,1))  
        
        #ans=self.mu + self.bi[i_id] + self.bu[uid] + np.sum(self.Q[i_id]*self.P[uid])  
        
        ans=self.mu + self.bi[i_id] + self.bu[uid] + np.dot(self.P[uid,:],self.Q[:,i_id])  
        
        #将打分范围控制在0-1之间
        if ans>1:  
            return 1  
        elif ans<0:  
            return 0
        return ans  

    def sim_cal_UserCF(self, uid1, uid2 ):
        #请补充基于用户的协同过滤中的两个用户uid1和uid2之间的相似度（根据两个用户对item打分的相似度）
        if self.similarity[uid1][uid2]!=-1:  #如果已经计算好
            return self.similarity[uid1][uid2]  
        
        si={}  
        for item in self.itemsForUser[uid1]:  #uid1所有打过分的Item1
            if item in self.itemsForUser[uid2]:  #如果uid2也对该Item打过分
                si[item]=1  #item为一个有效item
        
        #print si
        n=len(si)   #有效item数，有效item为即对uid对Item打过分，uid2也对Item打过分
        if (n==0):  #没有共同打过分的item，相似度设为1.因为最低打分为1？
            self.similarity[uid1][uid2]=0  
            self.similarity[uid1][uid2]=0  
            return 0  
        
        #用户uid1打过分的所有有效的item
        s1=np.array([self.itemsForUser[uid1][item] for item in si])  
        
        #用户uid2打过分的所有有效的Item
        s2=np.array([self.itemsForUser[uid2][item] for item in si])  
        
        sum1=np.sum(s1)  
        sum2=np.sum(s2)  
        sum1Sq=np.sum(s1**2)  
        sum2Sq=np.sum(s2**2)  
        pSum=np.sum(s1*s2)  
        
        #分子
        num=pSum-(sum1*sum2/n)  
        
        #分母
        den=np.sqrt((sum1Sq-sum1**2/n)*(sum2Sq-sum2**2/n))  
        if den==0:  
            self.similarity[uid1][uid2]=0  
            self.similarity[uid2][uid1]=0  
            return 0  
        similarity=num/den
        self.similarity[uid1][uid2]= similarity 
        self.similarity[uid2][uid1]= similarity 
    
        return similarity  

    def userCFReco(self, userId, eventId):
        """
        根据User-based协同过滤，得到event的推荐度
        基本的伪代码思路如下：
        for item i
            for every other user v that has a preference for i
            compute similarity s between u and v
            incorporate v's preference for i weighted by s into running aversge
            return top items ranked by weighted average
        """
        #请补充完整代码
        sim_accumulate=0.0
        rat_acc=0.0
        
        for user in self.usersForItem[i_id]:  #对i_id打过分的所有用户
            sim = self.sim_cal_UserCF(user,uid)    #该user与uid之间的相似度
            if sim<=0:continue              
            rat_acc += sim * self.usersForItem[i_id][user] 
            sim_accumulate += sim  
        
        #print rat_acc,sim_accumulate  
        if sim_accumulate==0: #no same user rated,return average rates of the data  
            return  self.mu  
        ans=rat_acc/sim_accumulate
        return ans


        
    def sim_cal_ItemCF(self, i_id1, i_id2):
        #计算Item i_id1和i_id2之间的相似性
        if self.similarity[i_id1][i_id2]!=-1:  #如果已经计算好
            return self.similarity[i_id1][i_id2]  
        
        si={}  
        for user in self.usersForItem[i_id1]:  #所有对Item1打过分的的user
            if user in self.usersForItem[i_id2]:  #如果该用户对Item2也打过分
                #print self.UsersForItem[i_id2]
                si[user]=1  #user为一个有效用用户
        
        #print si
        n=len(si)   #有效用户数，有效用户为即对Item1打过分，也对Item2打过分
        if (n==0):  #没有共同打过分的用户，相似度设为1.因为最低打分为1？
            self.similarity[i_id1][i_id2]=0  
            self.similarity[i_id1][i_id1]=0  
            return 0  
        
        #所有有效用户对Item1的打分
        s1=np.array([self.usersForItem[i_id1][u] for u in si])  
        
        #所有有效用户对Item2的打分
        s2=np.array([self.usersForItem[i_id2][u] for u in si])  
        
        sum1=np.sum(s1)  
        sum2=np.sum(s2)  
        sum1Sq=np.sum(s1**2)  
        sum2Sq=np.sum(s2**2)  
        pSum=np.sum(s1*s2)  
        
        #分子
        num=pSum-(sum1*sum2/n)  
        
        #分母
        den=np.sqrt((sum1Sq-sum1**2/n)*(sum2Sq-sum2**2/n))  
        if den==0:  
            self.similarity[i_id1][i_id2]=0  
            self.similarity[i_id2][i_id1]=0  
            return 0  
        similarity=num/den
        
        self.similarity[i_id1][i_id2]=similarity  
        self.similarity[i_id2][i_id1]=similarity  
        return similarity
        #请补充完整代码
    
    def eventCFReco(self, userId, eventId):    
        """
        根据基于物品的协同过滤，得到Event的推荐度
        基本的伪代码思路如下：
        for item i 
            for every item j tht u has a preference for
                compute similarity s between i and j
                add u's preference for j weighted by s to a running average
        return top items, ranked by weighted average
        """
        sim_accumulate=0.0  
        rat_acc=0.0  
            
        for item in self.itemsForUser[uid]:  #用户uid打过分的所有Item
            sim = self.sim_cal(item,i_id)    #该Item与i_id之间的相似度
            if sim<0:continue  
            #print sim,self.user_movie[uid][item],sim*self.user_movie[uid][item]  
            
            rat_acc += sim * self.itemsForUser[uid][item]  
            sim_accumulate += sim  
        
        #print rat_acc,sim_accumulate  
        if sim_accumulate==0: #no same user rated,return average rates of the data  
            return  self.mu  
        ans = rat_acc/sim_accumulate
        return ans
        #请补充完整代码

    
    def svdCFReco(self, userId, eventId):
        #基于模型的协同过滤, SVD++/LFM
        u = self.userIndex[userId]
        i = self.eventIndex[eventId]

        return self.pred_SVD(u,i)

    def userReco(self, userId, eventId):
        """
        类似基于User-based协同过滤，只是用户之间的相似度由用户本身的属性得到，计算event的推荐度
        基本的伪代码思路如下：
        for item i
            for every other user v that has a preference for i
                compute similarity s between u and v
                incorporate v's preference for i weighted by s into running aversge
        return top items ranked by weighted average
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]

        vs = self.userEventScores[:, j]
        sims = self.userSimMatrix[i, :]

        prod = sims * vs

        try:
            return prod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            return 0

    def eventReco(self, userId, eventId):
        """
        类似基于Item-based协同过滤，只是item之间的相似度由item本身的属性得到，计算Event的推荐度
        基本的伪代码思路如下：
        for item i 
            for every item j that u has a preference for
                compute similarity s between i and j
                add u's preference for j weighted by s to a running average
        return top items, ranked by weighted average
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        js = self.userEventScores[i, :]
        psim = self.eventPropSim[:, j]
        csim = self.eventContSim[:, j]
        pprod = js * psim
        cprod = js * csim
        
        pscore = 0
        cscore = 0
        try:
            pscore = pprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        try:
            cscore = cprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        return pscore, cscore

    def userPop(self, userId):
        """
        基于用户的朋友个数来推断用户的社交程度
        主要的考量是如果用户的朋友非常多，可能会更倾向于参加各种社交活动
        """
        if self.userIndex.has_key(userId):
            i = self.userIndex[userId]
            try:
                return self.numFriends[0, i]
            except IndexError:
                return 0
        else:
            return 0

    def friendInfluence(self, userId):
        """
        朋友对用户的影响
        主要考虑用户所有的朋友中，有多少是非常喜欢参加各种社交活动/event的
        用户的朋友圈如果都积极参与各种event，可能会对当前用户有一定的影响
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]

    def eventPop(self, eventId):
        """
        本活动本身的热度
        主要是通过参与的人数来界定的
        """
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]



In [19]:
def generateRSData(RS, train=True, header=True):
    """
    把前面user-based协同过滤 和 item-based协同过滤，以及各种热度和影响度作为特征组合在一起
    生成新的训练数据，用于分类器分类使用
    """
    fn = "train.csv" if train else "test.csv"
    fin = open(path+fn, 'rb')
    fout = open(path+"RS_" + fn, 'wb')
    
    #忽略第一行（列名字）
    fin.readline().strip().split(b",")
    
    # write output header
    if header:
        ocolnames = ["invited", "userCF_reco", "evtCF_reco","svdCF_reco","user_reco", "evt_p_reco",
        "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]
    if train:
        ocolnames.append("interested")
        ocolnames.append("not_interested")
    fout.write(",".join(ocolnames) + "\n")
    
    ln = 0
    for line in fin:
        ln += 1
        if ln%500 == 0:
            print("%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId))
          #break;
      
        cols = line.strip().split(b",")
        userId = cols[0]
        eventId = cols[1]
        invited = cols[2]
      
        userCF_reco = RS.userCFReco(userId, eventId)
        itemCF_reco = RS.eventCFReco(userId, eventId)
        svdCF_reco = RS.svdCFReco(userId, eventId)
        
        user_reco = RS.userReco(userId, eventId)
        evt_p_reco, evt_c_reco = RS.eventReco(userId, eventId)
        user_pop = RS.userPop(userId)
     
        frnd_infl = RS.friendInfluence(userId)
        evt_pop = RS.eventPop(eventId)
        ocols = [invited, userCF_reco, itemCF_reco, svdCF_reco,user_reco, evt_p_reco,
        evt_c_reco, user_pop, frnd_infl, evt_pop]
      
    if train:
        ocols.append(cols[4]) # interested
        ocols.append(cols[5]) # not_interested
        fout.write(",".join(map(lambda x: str(x), ocols)) + "\n")
    
    fin.close()
    fout.close()


In [20]:
import pandas as pd 
df_train = pd.read_csv(path+'train.csv')
train_data = df_train.values

In [21]:
import pandas as pd 
df_test = pd.read_csv(path+'test.csv')
test_data = df_test.values

In [23]:
RS = RecommonderSystem(train_data)
print ("生成训练数据...\n")
generateRSData(RS,train=True,  header=True)

print ("生成预测数据...\n")
generateRSData(RS, train=False, header=True)

SVD Train...
the  0 -th  step is running


KeyError: 3166414361

时间、地点等特征都没有处理了，可以考虑用户看到event的时间与event开始时间的差、用户地点和event地点的差异。。。