In [123]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import faiss
from collections import defaultdict




In [124]:
# Hyper-parameters and file locations shared by every cell below.
config = {
    'embedding_dim': 64,   # width of the item / interest embeddings
    'lr': 0.001,           # Adam learning rate
    'batch_size': 256,
    'Epoch': 100,          # number of training epochs
    'max_length': 20,      # history length fed to the model (padded with 0)
    # Use the GPU when one is present, otherwise fall back to the CPU so the
    # notebook still runs end-to-end on machines without CUDA.
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'K': 4,                # number of interest vectors per user
    # Size of the item embedding table. The training file reports 15405
    # distinct item ids; presumably the extra slot is padding id 0 — TODO confirm.
    'n_items': 15406,
    'train_path': 'train_enc.csv',
    "val_path": 'valid_enc.csv',
    "test_path": 'test_enc.csv'
}


In [125]:
# Load the training interactions for a quick look at the data.
df = pd.read_csv(config['train_path'])


In [126]:
# Number of distinct item ids in the training file (compare with config['n_items']).
df['item_id'].nunique()
# df.groupby('user_id')['item_id'].apply(list).to_dict()


15405

In [127]:
class SeqnenceDataset(Dataset):
    """Per-user interaction sequences for next-item recall.

    ``df`` must provide ``user_id``, ``item_id`` and ``timestamp`` columns.
    Rows are sorted by ``(user_id, timestamp)`` and grouped into one item list
    per user; one dataset element corresponds to one user.

    * ``phase == 'train'``: a random position ``k`` is drawn, the item at ``k``
      is the target and the (up to ``max_length``) items before it are the
      history. Returns ``(hist_items, hist_mask, target)`` as long tensors of
      shape ``(max_length,)``, ``(max_length,)`` and ``(1,)``.
    * any other phase: the first 80% of the sequence is the history source and
      the remaining items are the ground truth. Returns
      ``(hist_items, hist_mask, target_list)`` where ``target_list`` is a plain
      Python list of variable length.

    Histories shorter than ``max_length`` are right-padded with item id 0 and
    the mask is 1 for real items, 0 for padding.
    """

    # Smallest target position sampled during training, i.e. the minimum number
    # of history items a training sample normally has.
    MIN_HISTORY = 4

    def __init__(self, config, df, phase='train'):
        self.config = config
        self.max_length = self.config['max_length']
        self.df = df.sort_values(by=['user_id', 'timestamp'])
        self.user2item = self.df.groupby('user_id')['item_id'].apply(list).to_dict()
        self.user_list = self.df['user_id'].unique()
        self.phase = phase

    def __len__(self):
        return len(self.user2item)

    def _pad_history(self, item_list, k):
        """Return ``(items, mask)`` for the ``max_length`` items preceding position ``k``."""
        history = item_list[max(0, k - self.max_length):k]
        n_valid = len(history)
        n_pad = self.max_length - n_valid
        # Build integer tensors directly: going through torch.Tensor (float32)
        # silently rounds ids larger than 2**24.
        hist_items = torch.tensor(history + [0] * n_pad, dtype=torch.long)
        hist_mask = torch.tensor([1] * n_valid + [0] * n_pad, dtype=torch.long)
        return hist_items, hist_mask

    def __getitem__(self, index):
        user_id = self.user_list[index]
        item_list = self.user2item[user_id]

        if self.phase == 'train':
            n_items = len(item_list)
            # Users with MIN_HISTORY or fewer interactions would make
            # randint(4, n) raise; for them the last item is the target.
            low = min(self.MIN_HISTORY, n_items - 1)
            k = np.random.randint(low, n_items)
            hist_items, hist_mask = self._pad_history(item_list, k)
            # The item at the sampled position is the prediction target.
            target = torch.tensor([item_list[k]], dtype=torch.long)
            return hist_items, hist_mask, target

        # Evaluation: history comes from the first 80%, the rest is ground truth.
        k = int(0.8 * len(item_list))
        hist_items, hist_mask = self._pad_history(item_list, k)
        return hist_items, hist_mask, item_list[k:]


In [128]:
def my_collate(batch):
    """Collate samples whose third element is a variable-length target list.

    Args:
        batch: sequence of ``(hist_items, hist_mask, targets)`` tuples, where
            the first two entries are equally sized 1-D tensors.

    Returns:
        ``(hist_item, hist_mask, item_list)``: two long tensors of shape
        ``(len(batch), seq_len)`` and a list holding each sample's targets
        untouched.
    """
    # Stack directly instead of copying into a random float buffer: this keeps
    # integer ids exact (float32 cannot represent every id above 2**24).
    hist_item = torch.stack([sample[0] for sample in batch]).long()
    hist_mask = torch.stack([sample[1] for sample in batch]).long()
    item_list = [sample[2] for sample in batch]
    return hist_item, hist_mask, item_list


In [129]:
# Device used for the model and for every batch moved to it below.
device = config['device']
# One CSV per split.
train_df = pd.read_csv(config['train_path'])
valid_df = pd.read_csv(config['val_path'])
test_df = pd.read_csv(config['test_path'])
# Training samples a random target per user; the validation and test datasets
# both use the non-train branch (80% history / remaining items as ground truth).
train_dataset = SeqnenceDataset(config, train_df, phase='train')
valid_dataset = SeqnenceDataset(config, valid_df, phase='test')
test_dataset = SeqnenceDataset(config, test_df, phase='test')


In [130]:
# Training batches are shuffled and use the default collate (all three
# elements of a training sample are fixed-size tensors).
trainloader = DataLoader(train_dataset,batch_size=config['batch_size'], shuffle=True)
# Evaluation samples carry a variable-length target list, so they need the
# custom collate function that keeps those lists as-is.
testloader = DataLoader(test_dataset,batch_size=config['batch_size'], collate_fn=my_collate)



In [131]:
# Number of distinct users in the training split (= number of training samples per epoch).
train_df['user_id'].nunique()


110120

In [132]:
# next(iter(trainloader))


## ComiRec-SA


In [133]:
class MultiInterest_SA(nn.Module):
    """Self-attentive multi-interest extractor.

    Computes ``A = softmax(tanh(X @ W1) @ W2)`` over the sequence axis and
    returns ``A^T @ X``: ``K`` weighted sums of the sequence embeddings.

    Args:
        embedding_dim: size of each sequence embedding.
        K: number of interest vectors to produce.
        d: hidden size of the attention projection; defaults to
            ``4 * embedding_dim``.
    """

    def __init__(self, embedding_dim, K, d=None) -> None:
        super(MultiInterest_SA, self).__init__()
        self.embedding_dim = embedding_dim
        self.K = K

        self.d = d if d is not None else embedding_dim * 4
        # Uniform [0, 1) start values; callers may re-initialise these
        # (Comirec_SA.reset_parameter does).
        self.W1 = nn.Parameter(torch.rand(self.embedding_dim, self.d), requires_grad=True)
        self.W2 = nn.Parameter(torch.rand(self.d, self.K), requires_grad=True)

    def forward(self, seq_emb, mask=None):
        """Extract ``K`` interest vectors from a batch of sequences.

        Args:
            seq_emb: tensor of shape ``(batch_size, seq_len, embedding_dim)``.
            mask: optional tensor with 1 for real positions and 0 for padding,
                shaped ``(batch_size, seq_len, 1)`` or ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, K, embedding_dim)``.
        """
        H = torch.einsum('bse,ed->bsd', seq_emb, self.W1).tanh()
        A = torch.einsum('bsd,dk->bsk', H, self.W2)
        if mask is not None:
            mask = mask.float()
            if mask.dim() == A.dim() - 1:
                # Accept a (batch, seq_len) mask by adding the interest axis.
                mask = mask.unsqueeze(-1)
            # Push padded positions to a very negative logit so they receive
            # (numerically) zero attention weight.
            A = A + -1.e9 * (1 - mask)
        A = F.softmax(A, dim=1)   # normalise over sequence positions
        A = A.permute(0, 2, 1)    # (batch_size, K, seq_len)
        multi_interest_emb = torch.matmul(A, seq_emb)
        return multi_interest_emb


In [134]:
class Comirec_SA(nn.Module):
    """Multi-interest recall model built on self-attentive interest extraction.

    Reads from ``config``: ``embedding_dim``, ``max_length``, ``device``,
    ``n_items`` (size of the item embedding table, index 0 is padding) and
    ``K`` (number of interests per user).
    """

    def __init__(self, config) -> None:
        super().__init__()
        self.config = config
        self.embedding_dim = self.config['embedding_dim']
        self.max_length = self.config['max_length']
        self.device = self.config['device']
        self.n_items = self.config['n_items']

        self.item_emb = nn.Embedding(self.n_items, self.embedding_dim, padding_idx=0)
        self.multi_interest_sa = MultiInterest_SA(self.embedding_dim, K=self.config['K'])
        self.loss_func = nn.CrossEntropyLoss()
        self.reset_parameter()

    def output_items(self):
        """Return the full item embedding matrix ``(n_items, embedding_dim)``."""
        return self.item_emb.weight

    def calculate_score(self, user_emb):
        """Inner-product scores of ``user_emb`` ``(b, e)`` against every item: ``(b, n_items)``."""
        all_items = self.item_emb.weight
        scores = torch.matmul(user_emb, all_items.transpose(1, 0))  # [b, n]
        return scores

    def calculate_loss(self, user_emb, item):
        """Full-softmax cross entropy; ``item`` has shape ``(b, 1)``."""
        scores = self.calculate_score(user_emb)
        return self.loss_func(scores, item.squeeze(1))

    def reset_parameter(self):
        """Kaiming-normal initialise every weight matrix, keeping the padding row at zero."""
        for weight in self.parameters():
            # kaiming_normal_ needs fan-in/fan-out, i.e. at least 2 dimensions.
            if weight.dim() >= 2:
                torch.nn.init.kaiming_normal_(weight)
        # The loop above also overwrote the embedding's padding row, which
        # nn.Embedding initialises to zeros; restore it.
        # NOTE(review): calculate_score uses the whole table, so this row can
        # still receive gradient through the scoring path during training.
        padding_idx = self.item_emb.padding_idx
        if padding_idx is not None:
            with torch.no_grad():
                self.item_emb.weight[padding_idx].zero_()

    def forward(self, item_seq, mask, item, train=True):
        """Compute the user's interest vectors and, when training, the loss.

        Args:
            item_seq: long tensor ``(b, max_length)`` of history item ids.
            mask: tensor ``(b, max_length)``, 1 for real items and 0 for padding.
            item: long tensor ``(b, 1)`` with the target item id; ignored when
                ``train`` is False.
            train: when True the loss is computed as well.

        Returns:
            dict with ``'user_emb'`` ``(b, K, embedding_dim)``; in training mode
            additionally ``'loss'`` and the placeholder ``'acc_50'`` (always -1).
        """
        seq_emb = self.item_emb(item_seq)
        mask = mask.unsqueeze(-1).float()
        multi_interest_emb = self.multi_interest_sa(seq_emb, mask)  # (batch_size, K, embedding_dim)

        output_dict = {'user_emb': multi_interest_emb}
        if train:
            item_emb = self.item_emb(item).squeeze(1)                               # (b, e)
            cos_res = torch.bmm(multi_interest_emb, item_emb.unsqueeze(-1))         # (b, K, 1)
            k_index = torch.argmax(cos_res, dim=1).squeeze(-1)                      # (b,)

            # For every sample, use the interest vector with the highest
            # inner product with the target item to compute the loss.
            # Indexing replaces a per-sample Python loop and stays on whatever
            # device the inputs live on.
            batch_index = torch.arange(multi_interest_emb.shape[0], device=multi_interest_emb.device)
            best_interest_emb = multi_interest_emb[batch_index, k_index]            # (b, e)

            output_dict['loss'] = self.calculate_loss(best_interest_emb, item)
            output_dict['acc_50'] = -1

        return output_dict


In [135]:
# Build the model, move it to the configured device and create its optimizer.
model = Comirec_SA(config)
model = model.to(device)
# Adam with non-default betas (0.5, 0.99).
optimizer = torch.optim.Adam(model.parameters(), lr = config['lr'], betas=(0.5,0.99))


In [136]:
# Training: one pass over trainloader per epoch, full-softmax loss from the model.
for epoch in range(1, 1 + config['Epoch']):
    pbar = tqdm(trainloader)
    model.train()
    loss_list = []  # per-batch losses of the current epoch
    for batch_data in pbar:
        # Each batch is (history item ids, history mask, target item id).
        (item_seq, mask, item) = batch_data
        item_seq = item_seq.to(device)
        mask = mask.to(device)
        item = item.to(device)

        output_dict = model(item_seq, mask, item)
        loss = output_dict['loss']

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_list.append(loss.item())

        # Show the running mean loss of this epoch in the progress bar.
        pbar.set_description('Epoch :{} Loss:{:7.3f}'.format(epoch, np.mean(loss_list)))


Epoch :1 Loss:  9.526:   2%|▏         | 9/431 [00:00<00:36, 11.46it/s]

Epoch :1 Loss:  8.259: 100%|██████████| 431/431 [00:31<00:00, 13.57it/s]
Epoch :2 Loss:  7.049: 100%|██████████| 431/431 [00:32<00:00, 13.42it/s]
Epoch :3 Loss:  6.788: 100%|██████████| 431/431 [00:31<00:00, 13.57it/s]
Epoch :4 Loss:  6.605: 100%|██████████| 431/431 [00:30<00:00, 14.14it/s]
Epoch :5 Loss:  6.448: 100%|██████████| 431/431 [00:29<00:00, 14.44it/s]
Epoch :6 Loss:  6.337: 100%|██████████| 431/431 [00:30<00:00, 14.00it/s]
Epoch :7 Loss:  6.232: 100%|██████████| 431/431 [00:31<00:00, 13.89it/s]
Epoch :8 Loss:  6.165: 100%|██████████| 431/431 [00:30<00:00, 14.05it/s]
Epoch :9 Loss:  6.112: 100%|██████████| 431/431 [00:30<00:00, 14.29it/s]
Epoch :10 Loss:  6.061: 100%|██████████| 431/431 [00:31<00:00, 13.53it/s]
Epoch :11 Loss:  6.018: 100%|██████████| 431/431 [00:32<00:00, 13.13it/s]
Epoch :12 Loss:  5.984: 100%|██████████| 431/431 [00:29<00:00, 14.45it/s]
Epoch :13 Loss:  5.954: 100%|██████████| 431/431 [00:31<00:00, 13.57it/s]
Epoch :14 Loss:  5.920: 100%|██████████| 431/43

## 进行召回，得到召回结果

In [137]:
def _merge_interest_candidates(item_ids, scores, topN):
    """Merge the candidates of one user's interests into a single top-N list.

    Candidates are ordered by descending score, an item retrieved by several
    interests is kept once (at its best score), and at most ``topN`` ids are
    returned.
    """
    item_ids = np.asarray(item_ids).reshape(-1)
    scores = np.asarray(scores).reshape(-1)
    ranked = item_ids[np.argsort(-scores, kind='stable')]
    # FAISS pads with id -1 when the index holds fewer than the requested
    # number of vectors; those are not items.
    ranked = ranked[ranked >= 0]
    # np.unique reports the first position of every id in the ranked array,
    # i.e. the occurrence with the highest score.
    _, first_pos = np.unique(ranked, return_index=True)
    return list(ranked[np.sort(first_pos)][:topN])


def get_recall_result(model, data, embedding_dim, device, topN):
    """Retrieve the top-N items for every user in ``data``.

    All item embeddings are placed in an exact inner-product FAISS index. Each
    of a user's interest vectors retrieves ``topN`` candidates; the candidates
    are merged by score, de-duplicated and cut to ``topN``.

    Args:
        model: provides ``output_items()`` and is callable as
            ``model(item_seq, mask, None, train=False)['user_emb']`` returning
            ``(batch, num_interest, embedding_dim)``.
        data: iterable of ``(item_seq, mask, targets)`` batches where
            ``targets`` holds one ground-truth item list per user.
        embedding_dim: dimensionality of item and interest embeddings.
        device: device the input batches are moved to.
        topN: number of items to recall per user.

    Returns:
        ``(preds, test_gd)``: two dicts keyed by a running user counter
        (0, 1, 2, ... in iteration order) with the recalled item ids and the
        ground-truth item list of each user.
    """
    item_embs = np.ascontiguousarray(model.output_items().detach().cpu().numpy(), dtype=np.float32)

    index = faiss.IndexFlatIP(embedding_dim)
    index.add(item_embs)

    preds = defaultdict(list)
    test_gd = defaultdict(list)
    dummy_user_id = 0

    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for (item_seq, mask, targets) in tqdm(data):
            item_seq = item_seq.to(device)
            mask = mask.to(device)

            user_embs = model(item_seq, mask, None, train=False)['user_emb']
            user_embs = user_embs.detach().cpu().numpy()

            num_interest = user_embs.shape[1]
            # After the reshape, every num_interest consecutive rows belong to
            # the same user.
            user_embs = np.ascontiguousarray(user_embs.reshape(-1, embedding_dim), dtype=np.float32)

            D, I = index.search(user_embs, topN)

            for i, item_gd_list in enumerate(targets):
                rows = slice(i * num_interest, (i + 1) * num_interest)
                preds[dummy_user_id] = _merge_interest_candidates(I[rows], D[rows], topN)
                test_gd[dummy_user_id] = item_gd_list
                dummy_user_id += 1

    return preds, test_gd


In [138]:
# Recall the top 100 items for every user in the test loader.
preds, test_gd = get_recall_result(model,testloader,config['embedding_dim'],device,topN=100)


  2%|▏         | 1/54 [00:00<00:44,  1.19it/s]

100%|██████████| 54/54 [00:29<00:00,  1.82it/s]


In [139]:
def hitrate(preds, test_gd):
    """Fraction of ground-truth items that appear in the recalled lists.

    Args:
        preds: mapping user -> list of recalled item ids.
        test_gd: mapping user -> list of ground-truth item ids.

    Returns:
        ``hits / total ground-truth items`` accumulated over all users in
        ``preds``, or ``0.0`` when there is no ground truth at all.
    """
    hit_num = 0
    total_num = 0

    for user in tqdm(preds.keys()):
        truth = test_gd[user]
        hit_num += len(set(preds[user]) & set(truth))
        total_num += len(truth)

    # No users, or only users with empty target lists: avoid dividing by zero.
    if total_num == 0:
        return 0.0
    return hit_num / total_num


计算召回的100个物品的命中率

In [140]:
# Hit rate of the 100 recalled items against the held-out ground truth.
hitrate(preds,test_gd)


100%|██████████| 13600/13600 [00:00<00:00, 130977.20it/s]


0.1949968109412971

In [141]:
# Inspect the learned item embedding table (row 0 is the padding index).
model.item_emb.weight


Parameter containing:
tensor([[ 3.0468e-01,  3.2177e+00,  9.1495e-01,  ...,  3.1341e-01,
          3.0869e-04,  3.7267e-01],
        [ 1.3094e-01,  4.7279e-01,  1.5771e-01,  ..., -2.8948e-01,
         -4.7956e-02, -1.1491e-01],
        [-3.1506e-01, -4.9621e-01, -7.7427e-01,  ..., -1.4341e-02,
         -3.3446e-01, -1.2319e-01],
        ...,
        [ 2.7224e-01,  3.1711e+00,  9.4916e-01,  ...,  2.8645e-01,
          1.9601e-02,  3.3171e-01],
        [ 3.0403e-01,  3.2188e+00,  9.1516e-01,  ...,  3.1349e-01,
          1.2202e-03,  3.7368e-01],
        [ 3.0533e-01,  3.2176e+00,  9.1509e-01,  ...,  3.1431e-01,
         -1.4495e-05,  3.7236e-01]], device='cuda:0', requires_grad=True)