In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset = np.loadtxt('../ml-1m/ratings.dat', delimiter='::', usecols=[0,1,3], dtype=int)
N_USER = np.max(dataset[:,0])
N_ITEM = np.max(dataset[:,1])

In [3]:
def generate_train_from_local(path, n_user, n_item):
    data = np.loadtxt(fname=path, delimiter="\t", skiprows=1, dtype=int)
    train_matrix = np.zeros((n_user, n_item), dtype = np.int8)
    for line in data:
        train_matrix[line[0],line[1]] = 1
    return train_matrix

def generate_test_from_local(path):
    data = np.loadtxt(fname=path, delimiter="\t", skiprows=1, dtype=int)
    return data

In [4]:
class ItemCF():
    def __init__(self, train_matrix):
        self.train_matrix = train_matrix
        self.invert = self.get_User_Item_invert()
        self.W = self.getSimilaryMatrix()
        return
    
    def get_User_Item_invert(self):
        invert = list()
        for user in range(self.train_matrix.shape[0]):
            invert.append(list(np.nonzero(self.train_matrix[user])[0]))
        return invert
    
    def getSimilaryMatrix(self):
        n_item = self.train_matrix.shape[1]
        matrix = np.zeros((n_item, n_item))
        vector = np.zeros(n_item)
        for u, items in enumerate(self.invert):
            if len(items) == 0:
                continue
            for i1 in items:
                vector[i1] += 1
                for i2 in items: 
                    if i1 == i2:
                        continue
                    matrix[i1][i2] += 1
        W = np.zeros((n_item, n_item))
        for i1, vec in enumerate(matrix):
            for i2, v  in enumerate(vec):
                if np.sqrt(vector[i1]*vector[i2]) == 0:
                    W[i1][i2] = 0
                else:
                    W[i1][i2] = v / np.sqrt(vector[i1]*vector[i2])
        return W

    def recommend(self, K):
        allres = list()
        for u, record in enumerate(self.train_matrix):
            items = np.nonzero(record-1)[0]
            rank = dict()
            for i in items:
                relate_items = np.argsort(-1*self.W[i])[:K]
                for j in relate_items:
                    if j not in items:
                        continue
                    if j not in rank.keys():
                        rank[j] = 0
                    rank[j] += self.W[i][j]*1
            rank = sorted(rank.items(), key = lambda x:x[1],reverse=True)
            res = list()
            for item, score in rank:
                res.append(item)
            allres.append(res)
        return allres

In [5]:
def getHitRatio(ranklist, gtItem):
    #HR击中率，如果topk中有正例ID即认为正确
    if gtItem in ranklist:
        return 1
    return 0

def getNDCG(ranklist, gtItem):
    #NDCG归一化折损累计增益
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return np.log(2) / np.log(i+2)
    return 0

def movieEval(rank_all_users, test, topK):
    hit_list = list()
    undcg_list = list()
    for u, rlist in enumerate(rank_all_users):
        pos_item = test[u][1]
        hit_list.append(getHitRatio(rlist[:topK], pos_item))
        undcg_list.append(getNDCG(rlist[:topK], pos_item))
    hr = np.mean(hit_list)
    ndcg = np.mean(undcg_list)
    print('HR@', topK, ' = %.4f' %  hr)
    print('NDCG@', topK, ' = %.4f' % ndcg)
    return hr, ndcg, rank_all_users

In [6]:
train_matrix = generate_train_from_local("../ml-1m/ml.train.txt", n_user=N_USER, n_item=N_ITEM)
test = generate_test_from_local("../ml-1m/ml.test.txt")
time_start, time_end = 0, 0
itemCF = ItemCF(train_matrix)
time_end=time.time()
print('time cost:', time_end-time_start)
np.savetxt("./evalres/itemcf/time.txt", [time_end-time_start]) 

time cost: 1616681311.3638217


In [13]:
def eval_itemK(itemCF, test, itemK, topK):
    hr_list = list()
    ndcg_list = list()
    for k in itemK:
        rank_all_users = itemCF.recommend(k)
        hr, ndcg, rank_all_users = movieEval(rank_all_users, test, topK)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
    np.savetxt("./evalres/itemcf/hr_list_knn.txt", hr_list)
    np.savetxt("./evalres/itemcf/ndcg_list_knn.txt", ndcg_list)    
    print('------Finished------')
itemK = [5, 10, 20, 40, 80, 160]
topK  = 100
eval_itemK(itemCF, test, itemK, topK)

HR@ 100  = 0.0343
NDCG@ 100  = 0.0058
HR@ 100  = 0.0384
NDCG@ 100  = 0.0065
HR@ 100  = 0.0536
NDCG@ 100  = 0.0092
HR@ 100  = 0.1233
NDCG@ 100  = 0.0218
HR@ 100  = 0.2409
NDCG@ 100  = 0.0562
HR@ 100  = 0.2586
NDCG@ 100  = 0.0654
------Finished------


In [7]:
def eval_topk(itemCF, test, itemK, topK):
    hr_list = list()
    ndcg_list = list()
    rank_all_users = itemCF.recommend(itemK)
    for k in topK:
        hr, ndcg, rank_all_users = movieEval(rank_all_users, test, k)
        hr_list.append(hr)
        ndcg_list.append(ndcg)
    np.savetxt("./evalres/itemcf/hr_list_topk.txt", hr_list)
    np.savetxt("./evalres/itemcf/ndcg_list_topk.txt", ndcg_list)    
    print('------Finished------')
itemK = 80
topK  = [50,100,200]
eval_topk(itemCF, test, itemK, topK)

HR@ 50  = 0.1651
NDCG@ 50  = 0.0438
HR@ 100  = 0.2409
NDCG@ 100  = 0.0562
HR@ 200  = 0.3053
NDCG@ 200  = 0.0651
------Finished------
