In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset = np.loadtxt('../ml-1m/ratings.dat', delimiter='::', usecols=[0,1,3], dtype=int)
N_USER = np.max(dataset[:,0])
N_ITEM = np.max(dataset[:,1])

In [3]:
def generate_train_from_local(path, n_user, n_item, n_neg=4):
    data = np.loadtxt(fname=path, delimiter="\t", skiprows=1, dtype=int)
    train_matrix = np.zeros((n_user, n_item), dtype = np.int8)
    for line in data:
        train_matrix[line[0],line[1]] = 1
    return train_matrix

def generate_test_from_local(path):
    data = np.loadtxt(fname=path, delimiter="\t", skiprows=1, dtype=int)
    return data

In [4]:
class UserCF():
    def __init__(self, train_matrix):
        self.train_matrix = train_matrix
        self.invert = self.get_Item_User_invert()
        self.W = self.getSimilaryMatrix()
        return
    
    def get_Item_User_invert(self):
        invert = list()
        for item in range(self.train_matrix.shape[1]):
            invert.append(np.nonzero(self.train_matrix[:,item])[0])
        return invert
    
    def getSimilaryMatrix(self):
        n_user = self.train_matrix.shape[0]
        matrix = np.zeros((n_user, n_user))
        vector = np.zeros(n_user)
        for i, users in enumerate(self.invert):
            if len(users) == 0:
                continue
            for u1 in users:
                vector[u1] += 1
                for u2 in users: 
                    if u1 == u2:
                        continue
                    matrix[u1][u2] += 1
        W = np.zeros((n_user, n_user))
        for u1, vec in enumerate(matrix):
            for u2, v  in enumerate(vec):
                if np.sqrt(vector[u1]*vector[u2]) == 0:
                    W[u1][u2] = 0
                else:
                    W[u1][u2] = v / np.sqrt(vector[u1]*vector[u2])
        return W

    def recommend(self, K):
        allres = list()
        for u1, vlist in enumerate(self.W):
            rank = dict()
            index = np.argsort(-1 * vlist)[:K]
            for item, vec in enumerate(self.invert):
                if u1 in vec:
                    continue
                pui = 0
                for u2 in index:
                    if u2 in vec:
                        pui += self.W[u1][u2]
                rank[item] = pui
            rank = sorted(rank.items(), key = lambda x:x[1],reverse=True)
            res = list()
            for item, score in rank:
                res.append(item)
            allres.append(res)
        return allres

In [5]:
def getHitRatio(ranklist, gtItem):
    #HR击中率，如果topk中有正例ID即认为正确
    if gtItem in ranklist:
        return 1
    return 0

def getNDCG(ranklist, gtItem):
    #NDCG归一化折损累计增益
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return np.log(2) / np.log(i+2)
    return 0

def movieEval(rank_all_users, test, topK):
    hit_list = list()
    undcg_list = list()
    for u, rlist in enumerate(rank_all_users):
        pos_item = test[u][1]
        hit_list.append(getHitRatio(rlist[:topK], pos_item))
        undcg_list.append(getNDCG(rlist[:topK], pos_item))
    hr = np.mean(hit_list)
    ndcg = np.mean(undcg_list)
    print('HR@', topK, ' = %.4f' %  hr)
    print('NDCG@', topK, ' = %.4f' % ndcg)
    return hr, ndcg, rank_all_users

In [6]:
train_matrix = generate_train_from_local("../ml-1m/ml.train.txt", n_user=N_USER, n_item=N_ITEM)
test = generate_test_from_local("../ml-1m/ml.test.txt")
time_start, time_end = 0, 0
userCF = UserCF(train_matrix)
time_end=time.time()
print('time cost:', time_end-time_start)
np.savetxt("./evalres/usercf/time.txt", [time_end-time_start]) 

time cost: 1616680893.4384754


In [10]:
def eval_userK(userCF, test, userK, topK):
    hr_list = list()
    ndcg_list = list()
    for k in userK:
        rank_all_users = userCF.recommend(userK)
        hr, ndcg, rank_all_users = movieEval(rank_all_users, test, topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
    np.savetxt("./evalres/usercf/hr_list_knn.txt", hr_list)
    np.savetxt("./evalres/usercf/ndcg_list_knn.txt", ndcg_list)    
    print('------Finished------')
userK = [5, 10, 20, 40, 80, 160]
topK  = 100
eval_userK(userCF, test, userK, topK)

HR@ 100  = 0.3190
NDCG@ 100  = 0.0778
HR@ 100  = 0.3571
NDCG@ 100  = 0.0885
HR@ 100  = 0.3831
NDCG@ 100  = 0.0963
HR@ 100  = 0.3969
NDCG@ 100  = 0.1017
HR@ 100  = 0.3980
NDCG@ 100  = 0.1039
HR@ 100  = 0.3896
NDCG@ 100  = 0.1040
------Finished------


In [None]:
def eval_topK(userCF, test, userK, topK):
    hr_list = list()
    ndcg_list = list()
    rank_all_users = userCF.recommend(userK)
    for k in topK:
        hr, ndcg, rank_all_users = movieEval(rank_all_users, test, topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
    np.savetxt("./evalres/usercf/hr_list_topk.txt", hr_list)
    np.savetxt("./evalres/usercf/ndcg_list_topk.txt", ndcg_list)    
    print('------Finished------')
userK = 80
topK  = [50,100,200]
eval_topK(userCF, test, userK, topK)