In [1]:
import torch 
from torch import nn
from torch.nn import init
import torch.utils.data as data_utils
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
SEED = 2019
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [2]:
dataset = np.loadtxt('../ml-1m/ratings.dat', delimiter='::', usecols=[0,1,3], dtype=int)
#dataset = np.loadtxt('../Yelp/yelp.rating', usecols=[0,1,3], dtype=int)

In [6]:
def generate_train_from_local(path, n_user, n_item):
    data = np.loadtxt(fname=path, delimiter="\t", skiprows=1, dtype=int)
    train_matrix = np.zeros((n_user, n_item), dtype = np.int8)
    for line in data:
        train_matrix[line[0],line[1]] = 1
    user_pos = dict()
    max_item_id = train_matrix.shape[1]
    max_item_num = 0
    for u, i in enumerate(train_matrix):
        pos_item = list(np.nonzero(i)[0])
        pos_item_num = len(pos_item)
        if  pos_item_num > max_item_num:
            max_item_num = pos_item_num
        user_pos[u] = pos_item
    train_user = list()
    train_item = list()
    for k in user_pos.keys():
        while len(user_pos[k]) < max_item_num:
            user_pos[k].append(max_item_id)
        train_user.append(k)
        train_item.append(user_pos[k])
    return np.array(train_user), np.array(train_item), train_matrix

def generate_test_from_local(path):
    data = np.loadtxt(fname=path, delimiter="\t", skiprows=1, dtype=int)
    return data

train_user, train_item, train_matrix = generate_train_from_local(path="../ml-1m/ml.train.txt",n_user=N_USER,n_item=N_ITEM)
test = generate_test_from_local(path="../ml-1m/ml.test.txt")

In [8]:
N_USER = np.max(dataset[:,0])
N_ITEM = np.max(dataset[:,1])
EMB_SIZE = 64
NEG_WEIGHT = 0.1
DROP_RATIO = 0.3
LEARNING_RATE = 0.05
BATCH_SIZE = 128
EPOCH = 200

In [10]:
class ENMF(nn.Module):
    def __init__(self, emb_size, n_user, n_item, neg_weight, drop_out, count, c0=512, x=0.6):
        super().__init__()
        self.c0 = c0
        self.x  = x
        self.count = count
        self.n_user = n_user
        self.n_item = n_item
        self.neg_weight = neg_weight
        self.emb_size   = emb_size
        self.user_embs = nn.Embedding(n_user, emb_size)
        self.item_embs = nn.Embedding(n_item+1, emb_size)
        self.h = nn.Parameter(torch.randn(emb_size, 1))
        self.dropout = nn.Dropout(p=drop_out)
        self.freq = self.calcu_freq()
        self._reset_para()
        return
    
    def _reset_para(self):
        nn.init.xavier_normal_(self.user_embs.weight)
        nn.init.xavier_normal_(self.item_embs.weight)
        nn.init.constant_(self.h, 0.01)
        return
    
    def calcu_freq(self):
        freq_items = sorted(self.count.keys())
        freq_count = [self.count[k] for k in freq_items]
        freq = np.zeros(self.item_embs.weight.shape[0])
        freq[freq_items] = freq_count       
        #freq = freq/np.sum(freq)
        freq = np.power(freq, self.x)
        freq = self.c0 * freq/np.sum(freq)
        freq = torch.from_numpy(freq).type(torch.float).cuda()
        return freq
    
    def forward(self, uids, pos_iids):
        '''
        uids: B
        u_iids: B * L
        '''
        u_emb = self.dropout(self.user_embs(uids))
        pos_embs = self.item_embs(pos_iids)

        # torch.einsum("ab,abc->abc")
        # B * L * D
        mask = (~(pos_iids.eq(self.n_item))).float()
        pos_embs = pos_embs * mask.unsqueeze(2)

        # torch.einsum("ac,abc->abc")
        # B * L * D
        pq = u_emb.unsqueeze(1) * pos_embs
        # torch.einsum("ajk,kl->ajl")
        # B * L
        hpq = pq.matmul(self.h).squeeze(2)
        
        
        
        # loss
        pos_data_loss = torch.sum((1 - self.neg_weight) * hpq.square() - 2.0 * hpq)

        # torch.einsum("ab,ac->abc")
        part_1 = self.item_embs.weight.unsqueeze(2).bmm(self.item_embs.weight.unsqueeze(1))
        part_2 = u_emb.unsqueeze(2).bmm(u_emb.unsqueeze(1))

        # D * D
        part_1 = part_1.sum(0)
        part_2 = part_2.sum(0)
        part_3 = self.h.mm(self.h.t())
        all_data_loss = torch.sum(part_1 * part_2 * part_3)

        loss = self.neg_weight * all_data_loss + pos_data_loss
        return loss
    
    def rank(self, uid):
        '''
        uid: Batch_size
        '''
        uid_embs = self.user_embs(uid)
        user_all_items = uid_embs.unsqueeze(1) * self.item_embs.weight
        items_score = user_all_items.matmul(self.h).squeeze(2)
        return items_score
    
'''    def rank(self, user):
        res = self.user_embs(user).unsqueeze(0)
        res = res * self.item_embs.weight
        res = res.matmul(self.h).squeeze(1)
        return res'''

In [11]:
def getHitRatio(ranklist, gtItem):
    #HR击中率，如果topk中有正例ID即认为正确
    if gtItem in ranklist:
        return 1
    return 0

def getNDCG(ranklist, gtItem):
    #NDCG归一化折损累计增益
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return np.log(2) / np.log(i+2)
    return 0

def getH(ranklist1, ranklist2):
    L = len(ranklist1)
    common = len(list(set(ranklist1).intersection(set(ranklist2))))
    return 1-common/L

def movieEval_1(model, test_loader, train_matrix, topK = 100):
    n_users = train_matrix.shape[0]
    hit_list = list()
    undcg_list = list()
    rank_all_users = list()
    model.eval()
    with torch.no_grad(): 
        for step, (batch_x, batch_y) in enumerate(test_loader):
            if torch.cuda.is_available():
                batch_x = batch_x.cuda()  
            prediction = model.rank(batch_x)
            pred_vector = -1 * (prediction.cpu().data.numpy())
            ranklist = np.argsort(pred_vector)
            for j, r in enumerate(ranklist):
                real_r = list()
                u = batch_x[j].cpu().data.numpy()
                i = 0
                while len(real_r) < topK:
                    if r[i]==train_matrix.shape[1]:
                        continue
                    if train_matrix[u][r[i]] == 0:
                        real_r.append(r[i])
                    i += 1     
                rank_all_users.append(real_r)
                pos_item = batch_y[j].cpu().data.numpy()
                hit_list.append(getHitRatio(real_r, pos_item))
                undcg_list.append(getNDCG(real_r, pos_item))
    model.train()
    hr = np.mean(hit_list)
    ndcg = np.mean(undcg_list)
    print('HR@', topK, ' = %.4f' %  hr)
    print('NDCG@', topK, ' = %.4f' % ndcg)
    return hr, ndcg, rank_all_users

In [12]:
def createLoader(train_user, train_item, test, batch_size):
    torch_x1 = torch.from_numpy(train_user).type(torch.LongTensor)
    torch_x2 = torch.from_numpy(train_item).type(torch.LongTensor)
    torch_test = torch.from_numpy(test).type(torch.LongTensor)
    torch_dataset = data_utils.TensorDataset(torch_x1, torch_x2)
    train_loader = data_utils.DataLoader(dataset = torch_dataset, batch_size = batch_size, shuffle = True, num_workers = 0)
    torch_testset = data_utils.TensorDataset(torch_test[:,0],torch_test[:,1])
    test_loader = data_utils.DataLoader(dataset = torch_testset, batch_size = batch_size, num_workers = 0)
    return train_loader, test_loader

freq_model ENMF(
  (user_embs): Embedding(6041, 64)
  (item_embs): Embedding(3954, 64)
  (dropout): Dropout(p=0.5, inplace=False)
)
tail_model ENMF(
  (user_embs): Embedding(6041, 64)
  (item_embs): Embedding(3954, 64)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [None]:
def createModel(emb_size, lr, n_user, n_item, neg_weight, drop_out, hhDict):
    model = ENMF(emb_size=emb_size, n_user=n_user, n_item=n_item, neg_weight=neg_weight, drop_out = drop_out, count=hhDict)
    if(torch.cuda.is_available()):
        model = model.cuda()
    optimizer = torch.optim.Adagrad(model.parameters(), lr = lr)
    print(model)
    return model, optimizer

In [None]:
train_user, train_item, train_label, train_matrix = generate_train_from_local(path="../ml-1m/ml.train.txt",n_user=N_USER, n_item=N_ITEM)
test = generate_test_from_local(path="../ml-1m/ml.test.txt", n_user=N_USER, n_item=N_ITEM)

def train(train_user, train_item, train_label, test, train_matrix, epoch, batch_size, n_factors, layers, lr, topK, n_user, n_item):    
    loader = createLoader(train_user, train_item, train_label, batch_size)
    model, loss_func, optimizer = createModel(n_factors, layers, lr, n_user, n_item)
    train_loss_list = list()
    hr_list = [0.0]
    ndcg_list = [0.0]
    for e in range(epoch):
        train_loss = list()
        for step, (batch_x1, batch_x2, batch_y) in enumerate(loader):
            if torch.cuda.is_available():
                batch_x1, batch_x2, batch_y = batch_x1.cuda(), batch_x2.cuda(), batch_y.cuda()
            optimizer.zero_grad()
            prediction = model(batch_x1, batch_x2)
            loss = loss_func(prediction, batch_y) 
            loss.backward()        
            train_loss.append(loss.cpu().item())
            optimizer.step()
        print('------第'+str(e+1)+'个epoch------')
        mean_train_loss = np.mean(train_loss)
        print('train_loss', '= %.4f' % mean_train_loss)
        train_loss_list.append(mean_train_loss)  
    '''
        if (e+1)%5==0:
            hr, ndcg, rank_all_users = movieEval_1(model, loss_func, test, train_matrix, n_user=n_user, n_item=n_item, topK=topK)
            hr_list.append(hr)
            ndcg_list.append(ndcg)
    np.savetxt("./evalres/ncf/train_loss_list_"+str(epoch)+"epoch.txt", train_loss_list)    
    np.savetxt("./evalres/ncf/hr_list_"+str(epoch)+"epoch.txt", hr_list)
    np.savetxt("./evalres/ncf/ndcg_list_"+str(epoch)+"epoch.txt", ndcg_list) 
    '''
    movieEval_1(model, loss_func, test, train_matrix, n_user=n_user, n_item=n_item, topK=topK)
    torch.cuda.empty_cache()
    print('------Finished------')
    return model

# Hyper parameters
ACTIVATION = torch.relu
TOPK = 100
BATCH_SIZE = 256
LEARNING_RATE = 0.001
EPOCH = 200
LAYERS = [128, 64, 32, 16, 8]    # MLP  0层为输入层  0层/2为嵌入层  
GMF_N_FACTORS  = 64          # GMF隐层size  
#train(train_user, train_item, train_label, test, train_matrix, epoch=EPOCH, batch_size=BATCH_SIZE, n_factors=GMF_N_FACTORS, layers=LAYERS, lr=LEARNING_RATE, topK=TOPK, n_user = N_USER, n_item = N_ITEM)
model = train(train_user, train_item, train_label, test, train_matrix, epoch=6, batch_size=BATCH_SIZE, n_factors=GMF_N_FACTORS, layers=LAYERS, lr=LEARNING_RATE, topK=TOPK, n_user = N_USER, n_item = N_ITEM)

------第1个epoch------
train_loss: 0.07700866061107565
HR@ 100  = 0.0387
NDCG@ 100  = 0.0096
------第2个epoch------
train_loss: -26.551939477523167
HR@ 100  = 0.0720
NDCG@ 100  = 0.0177
------第3个epoch------
train_loss: -467.6944483121236
HR@ 100  = 0.0998
NDCG@ 100  = 0.0241
------第4个epoch------
train_loss: -2683.3884785970054
HR@ 100  = 0.1131
NDCG@ 100  = 0.0271
------第5个epoch------
train_loss: -7152.594375610352
HR@ 100  = 0.1219
NDCG@ 100  = 0.0293
------第6个epoch------
train_loss: -11161.950764973959
HR@ 100  = 0.1263
NDCG@ 100  = 0.0294
------第7个epoch------
train_loss: -13850.70258585612
HR@ 100  = 0.1250
NDCG@ 100  = 0.0292
------第8个epoch------
train_loss: -14911.8918355306
HR@ 100  = 0.1250
NDCG@ 100  = 0.0297
------第9个epoch------
train_loss: -15480.157491048178
HR@ 100  = 0.1361
NDCG@ 100  = 0.0326
------第10个epoch------
train_loss: -15909.849416097006
HR@ 100  = 0.1401
NDCG@ 100  = 0.0338
------第11个epoch------
train_loss: -16193.700480143229
HR@ 100  = 0.1454
NDCG@ 100  = 0.0352
--