In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pickle
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import defaultdict, Counter
from torch.cuda.amp import autocast, GradScaler
from torch.optim import *
from torch.utils.data import Dataset, DataLoader
from lion import Lion
from sklearn.model_selection import train_test_split
os.environ["CUDA_VISIBLE_DEVICES"] = str(3)

# model

In [2]:
class BaseModel(torch.nn.Module):
    """Shared scoring logic for session-to-item recall models.

    Subclasses are expected to provide ``self.item_emb`` (an embedding table)
    and ``self.mlp`` (a projection applied to the flattened embeddings of the
    last ``seq_len`` items); this base class only creates the dropout layer
    and the default ``seq_len``.
    """

    def __init__(self):
        super().__init__()
        self.max_ = None
        self.emb_dropout = torch.nn.Dropout(p=0.2)
        self.seq_len = 2

    def seq_embed(self, input_ids, seq_len=2):
        """Encode each session from its last ``seq_len`` item ids.

        The trailing item embeddings are flattened per row, projected by
        ``self.mlp`` and L2-normalised along dim 1.
        """
        batch_size = input_ids.shape[0]
        tail_ids = input_ids[:, -seq_len:]
        tail_embed = self.emb_dropout(self.item_emb(tail_ids))
        flattened = tail_embed.reshape(batch_size, -1)
        projected = self.mlp(flattened)
        return F.normalize(projected, p=2, dim=1)

    def recall_embed(self, recall_ids):
        """Return L2-normalised (dim 1) embeddings of the candidate item ids."""
        candidates = self.item_emb(recall_ids)
        candidates = self.emb_dropout(candidates)
        return F.normalize(candidates, p=2, dim=1)

    def forward(self,
                input_ids,
                recall_items,
                number = 0,
               ):
        """Score every candidate in ``recall_items`` for every session row.

        Returns the matrix product of the session encodings with the
        transposed candidate embeddings. ``number`` is accepted but unused.
        """
        session_vec = self.seq_embed(input_ids, self.seq_len)
        candidate_vec = self.recall_embed(recall_items)
        return session_vec.mm(candidate_vec.T)

    def reset_para(self):
        """Xavier-initialise parameters with more than one dimension, then
        re-draw the item embedding table from N(0, 0.01^2)."""
        for weight in self.parameters():
            if weight.dim() <= 1:
                continue
            nn.init.xavier_uniform_(weight)
        nn.init.normal_(self.item_emb.weight, std=0.01)
                
class DoubleSeq(BaseModel):
    """Recall model that encodes a session from its last two items.

    The embeddings of the final ``seq_len`` (= 2) items are concatenated and
    projected back to the embedding width by a two-layer GELU MLP.

    Args:
        item_num: number of rows in the item embedding table. Defaults to the
            value that was previously hard-coded (1410676).
        emb_dim: embedding width. Defaults to the previously hard-coded 128.
    """

    def __init__(self, item_num=1410676, emb_dim=128):
        super(DoubleSeq, self).__init__()

        self.seq_len = 2
        self.item_emb = torch.nn.Embedding(item_num, emb_dim, padding_idx=0)
        # Replaces the p=0.2 dropout created by the base class with a no-op.
        self.emb_dropout = torch.nn.Dropout(p=0.0)
        # The MLP consumes the flattened embeddings of the last `seq_len`
        # items, so its input width is tied to `seq_len` rather than spelled
        # out as a second magic number.
        self.mlp = nn.Sequential(
                torch.nn.Linear(emb_dim * self.seq_len, emb_dim),
                torch.nn.GELU(),
                torch.nn.Linear(emb_dim, emb_dim)
            )
        self.reset_para()

In [3]:
class WeightSeq(torch.nn.Module):
    """Recall model that encodes a session from four views of its item ids.

    The session vector is an MLP over the concatenation of:
      * ``a`` - mean embedding of the non-zero ids before the last four positions,
      * ``b`` - mean embedding of the non-zero ids at positions -4 and -3,
      * ``c`` - embedding of the item at position -2,
      * ``d`` - embedding of the item at position -1.
    When a segment contains no non-zero id, the embedding of id 0 is used for
    that segment instead.

    Args:
        path: optional file handed to ``torch.load``; the loaded object
            replaces ``self.item_emb``.
        item_num: number of rows in the item embedding table (default is the
            previously hard-coded 1410676).
        emb_dim: embedding width (default is the previously hard-coded 128).
    """

    def __init__(self, path=None, item_num=1410676, emb_dim=128):
        super(WeightSeq, self).__init__()

        self.item_num = item_num
        self.item_emb = torch.nn.Embedding(item_num, emb_dim, padding_idx=0)
        self.emb_dropout = torch.nn.Dropout(p=0.0)
        self.max_ = None
        self.seq_len = 2
        self.mlp = nn.Sequential(
                torch.nn.Linear(emb_dim * 4, emb_dim * 4),
                torch.nn.GELU(),
                torch.nn.Linear(emb_dim * 4, emb_dim)
            )
        self.reset_para()
        if path is not None:
            # NOTE(review): the loaded object is assigned as the embedding
            # module itself, so the file presumably holds a pickled
            # nn.Embedding rather than a state dict - confirm with the
            # producer of that file. Only load trusted files (pickle-based).
            self.item_emb = torch.load(path)

    def _masked_mean(self, embeds, ids):
        """Mean of ``embeds`` over positions whose id is non-zero.

        Rows without any non-zero id receive the embedding of id 0 instead.
        The id-0 lookup is built on ``ids.device`` so the model works on
        whichever device the inputs live on (the previous code forced CUDA).
        """
        non_pad = ids != 0
        count = non_pad.sum(dim=-1, keepdim=True)
        summed = (embeds * non_pad.unsqueeze(-1)).sum(dim=-2)
        # clamp avoids dividing by zero for all-padding rows; their sum is 0.
        mean = summed / count.clamp(min=1)
        pad_id = torch.zeros(1, dtype=torch.long, device=ids.device)
        fallback = self.emb_dropout(self.item_emb(pad_id))
        return mean + fallback * (count == 0)

    def seq_embed(self, input_ids, all_ids =None, index=None, seq_len=2):
        """Encode each row of ``input_ids`` into an L2-normalised vector.

        ``all_ids``, ``index`` and ``seq_len`` are accepted for interface
        compatibility but are not used by the computation.
        """
        input_embed = self.emb_dropout(self.item_emb(input_ids))

        a = self._masked_mean(input_embed[:, :-4, :], input_ids[:, :-4])
        b = self._masked_mean(input_embed[:, -4:-2, :], input_ids[:, -4:-2])
        c = input_embed[:, -2, :]
        d = input_embed[:, -1, :]

        input_embed = self.mlp(torch.cat([a, b, c, d], dim=-1))
        input_embed = F.normalize(input_embed, p=2, dim=1)
        return input_embed

    def recall_embed(self, recall_ids):
        """Return L2-normalised (dim 1) embeddings of the candidate item ids."""
        item_embed = self.emb_dropout(self.item_emb(recall_ids))
        item_embed = F.normalize(item_embed, p=2, dim=1)
        return item_embed

    def forward(self,
                input_ids,
                recall_items,
                all_ids = None,
                index = None,
                number = 0,
               ):
        """Score every candidate in ``recall_items`` for every session row.

        Returns the matrix product of the session encodings with the
        transposed candidate embeddings. ``number`` is accepted but unused.
        """
        input_embed = self.seq_embed(input_ids, all_ids, index, self.seq_len)
        item_embed = self.recall_embed(recall_items)

        logits = input_embed.mm(item_embed.T)

        return logits

    def reset_para(self):
        """Xavier-initialise parameters with more than one dimension, then
        re-draw the item embedding table from N(0, 0.01^2)."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        nn.init.normal_(self.item_emb.weight, std=0.01)

# main

In [15]:
# Build the DoubleSeq scorer on the GPU, restore its trained weights and put
# it in inference mode. The trailing expression displays the module summary.
CKPT_PATH = 'ckpt/0.3084.pt'
model = DoubleSeq()
model = model.cuda()
model.load_state_dict(torch.load(CKPT_PATH))
# Alternative scorer kept for reference: WeightSeq().cuda() restored from
# 'ckpt/0.3144.pt'.
model.eval()

DoubleSeq(
  (emb_dropout): Dropout(p=0.0, inplace=False)
  (item_emb): Embedding(1410676, 128, padding_idx=0)
  (mlp): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): GELU(approximate='none')
    (2): Linear(in_features=128, out_features=128, bias=True)
  )
)

In [16]:
# Stage-2 recall: for every session, score all items of the session's locale
# with the loaded model and keep the best TOP_K ids together with their scores.
TOP_K = 200

for name in ['test']:
    if name == 'test':
        data = pd.read_pickle('../data/test_data2_p2.dataset')
    elif name == 'train':
        data = pd.read_pickle('../data/train_data2_005.dataset')
    elif name == 'valid':
        data = pd.read_pickle('../data/valid_data2_005.dataset')

    session = data['session'].tolist()
    locale = data['locale'].tolist()
    # NOTE: pickle can execute arbitrary code on load - only read trusted
    # project files. The context manager closes the handle that the previous
    # bare open() call leaked.
    with open('../data/local2ids.pkl', 'rb') as locale_file:
        locale2ids = pickle.load(locale_file)

    recall_items = None
    recall2 = []
    recall2_logits = []
    for i in tqdm(range(len(session))):
        # Rebuild the candidate tensor only when the locale changes between
        # consecutive rows; this also covers the first row, so an empty split
        # no longer fails on locale[0].
        if i == 0 or locale[i] != locale[i-1]:
            recall_items = torch.tensor(locale2ids[locale[i]]).cuda()
        row = session[i]
        if name != 'test':
            # Non-test rows carry the label as their last element; drop it.
            row = row[:-1]
        # De-duplicate while keeping the LAST occurrence of each item.
        row = list(dict.fromkeys(row[::-1]))[::-1]
        if len(row) == 1:
            # The model consumes two positions; repeat a lone item.
            row = row + row
        input_ids = torch.tensor([row[-2:]]).cuda()
        with torch.no_grad(), autocast():
            pred = model(input_ids, recall_items)[0]
        # topk raises when k exceeds the number of candidates, so cap it.
        top = pred.topk(min(TOP_K, pred.shape[0]))
        # Index on-device; no CPU/numpy round-trip is needed.
        recall2.append(recall_items[top.indices].tolist())
        recall2_logits.append(top.values.tolist())
    recall2_data = pd.DataFrame({'next_item_prediction':recall2,'nn_logits':recall2_logits})
    recall2_data.to_pickle('../data/recall2_'+name+'2_200_p2.dataset')

100%|██████████| 34690/34690 [01:26<00:00, 401.51it/s] 


In [17]:
# Merge stage-1 and stage-2 candidates:
#   1. score every stage-1 candidate with the loaded model;
#   2. keep the first 200 stage-1 candidates, then append stage-2 candidates
#      that are neither already kept nor part of the session history, until
#      250 candidates are collected;
#   3. store ids, model scores and each candidate's stage-1 position
#      (-1 when it does not occur in the stage-1 list).
for name in ['test']:
    if name == 'test':
        data = pd.read_pickle('../data/test_data2_p2.dataset')
        recall_data = pd.read_pickle('../data/recall1_test2_200_p2.dataset')
    elif name == 'train':
        data = pd.read_pickle('../data/train_data2_005.dataset')
        recall_data = pd.read_pickle('../data/recall1_train2_200_p2.dataset')
    elif name == 'valid':
        data = pd.read_pickle('../data/valid_data2_005.dataset')
        recall_data = pd.read_pickle('../data/recall1_valid2_200_p2.dataset')
    session = data['session'].tolist()
    locale = data['locale'].tolist()
    recall1 = recall_data['next_item_prediction']
    # NOTE: pickle can execute arbitrary code on load - only read trusted
    # project files. The context manager closes the handle that the previous
    # bare open() call leaked.
    with open('../data/local2ids.pkl', 'rb') as locale_file:
        locale2ids = pickle.load(locale_file)

    recall1_logits = []
    for i in tqdm(range(len(session))):
        recall_items = torch.tensor(recall1[i]).cuda()
        if recall_items.shape[0] == 0:
            # Nothing to score for this session.
            recall1_logits.append([])
            continue
        row = session[i]
        if name != 'test':
            # Non-test rows carry the label as their last element; drop it.
            row = row[:-1]
        # De-duplicate while keeping the LAST occurrence of each item.
        row = list(dict.fromkeys(row[::-1]))[::-1]
        if len(row) == 1:
            row = row + row
        input_ids = torch.tensor([row]).cuda()
        with torch.no_grad(), autocast():
            pred = model(input_ids, recall_items)[0].tolist()
        recall1_logits.append(pred)

    recall2_data = pd.read_pickle('../data/recall2_'+name+'2_200_p2.dataset')
    recall2 = recall2_data['next_item_prediction'].tolist()
    recall2_logits = recall2_data['nn_logits'].tolist()
    real_recall = []
    real_logits = []
    rank = []
    for i in tqdm(range(len(recall2))):
        stage1_items = recall1[i]
        recall_temp = list(stage1_items[:200])
        logits_temp = list(recall1_logits[i][:200])
        rank_temp = list(range(len(logits_temp)))
        row = session[i]
        if name != 'test':
            row = row[:-1]

        # Hash-based lookups replace the repeated list scans (`in` / `.index`)
        # that made the inner loop quadratic in the list lengths.
        kept = set(recall_temp)
        history = set(row)
        first_position = {}
        for position, item in enumerate(stage1_items):
            # setdefault keeps the first occurrence, matching list.index().
            first_position.setdefault(item, position)

        for item, logit in zip(recall2[i], recall2_logits[i]):
            if len(rank_temp) >= 250:
                break
            if item in kept or item in history:
                continue
            recall_temp.append(item)
            logits_temp.append(logit)
            rank_temp.append(first_position.get(item, -1))
            kept.add(item)
        real_recall.append(recall_temp)
        real_logits.append(logits_temp)
        rank.append(rank_temp)
    df = pd.DataFrame({'next_item_prediction':real_recall,'nn_logits':real_logits,'rank':rank})
    df.to_pickle('../xgb-task2/recall_'+name+'2_250_with_nn_p2.dataset')

100%|██████████| 34690/34690 [01:07<00:00, 516.80it/s]
100%|██████████| 34690/34690 [03:26<00:00, 167.59it/s]


# Validation results

In [16]:
# Mean reciprocal rank of the candidate lists in `recall2`, counting only hits
# within the first 100 positions. Each row of `session` is read as
# history + label (last element); candidates that already occur in the history
# are skipped and do not consume a rank position.
mrr = 0
recall_at = [0] * 200
for i, full_row in enumerate(tqdm(session)):
    label, row = full_row[-1], full_row[:-1]
    candidates = recall2[i]
    if label not in candidates:
        continue
    hit_rank = 9999  # sentinel: label never reached
    position = 0
    for candidate in candidates:
        if candidate in row:
            continue
        position += 1
        if candidate == label:
            hit_rank = position
            break
    if hit_rank > 100:
        continue
    recall_at[hit_rank-1] += 1
    mrr += 1/hit_rank
print(mrr/len(session))

100%|██████████| 1668/1668 [00:00<00:00, 83529.53it/s]

0.34533651210096367





# update

In [9]:
# Re-create the DoubleSeq scorer on the GPU from its checkpoint and switch it
# to inference mode; the trailing expression displays the module summary.
state_path = 'ckpt/0.3084.pt'
model = DoubleSeq().cuda()
model.load_state_dict(torch.load(state_path))
model.eval()

DoubleSeq(
  (emb_dropout): Dropout(p=0.0, inplace=False)
  (item_emb): Embedding(1410676, 128, padding_idx=0)
  (mlp): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): GELU(approximate='none')
    (2): Linear(in_features=128, out_features=128, bias=True)
  )
)

In [14]:
# Select the split to re-score (index 1 selects 'test') and load its sessions
# together with the candidate lists to be scored.
name = ['train','test','valid'][1]
if name == 'test':
    data = pd.read_pickle('../data/test_data2_p2.dataset')
    recall_data = pd.read_pickle('../XGBOOST/recall_'+name+'2_1050_with_nn_p2_2.dataset')
elif name == 'train':
    data = pd.read_pickle('../data/train_data2_005.dataset')
    recall_data = pd.read_pickle('../XGBOOST/recall_'+name+'2_250_with_nn_p2_2.dataset')
elif name == 'valid':
    data = pd.read_pickle('../data/valid_data2_005.dataset')
    recall_data = pd.read_pickle('../XGBOOST/recall_'+name+'2_250_with_nn_p2_2.dataset')
session = data['session'].tolist()
locale = data['locale'].tolist()
recall1 = recall_data['next_item_prediction']
# NOTE: pickle can execute arbitrary code on load - only read trusted project
# files. The context manager closes the handle that the previous bare open()
# call leaked.
with open('../data/local2ids.pkl', 'rb') as locale_file:
    locale2ids = pickle.load(locale_file)

FileNotFoundError: [Errno 2] No such file or directory: '../XGBOOST/recall_test2_1050_with_nn_p4.dataset'

In [11]:
# Score every session's candidate list (`recall1`) with the loaded model.
# Sessions without candidates get an empty score list.
recall1_logits = []
for i, full_row in enumerate(tqdm(session)):
    recall_items = torch.tensor(recall1[i]).cuda()
    if recall_items.shape[0] == 0:
        recall1_logits.append([])
        continue
    row = full_row
    if name != 'test':
        # Non-test rows end with the label; split it off the history.
        label, row = row[-1], row[:-1]
    # De-duplicate while keeping the LAST occurrence of each item.
    row = list(dict.fromkeys(reversed(row)))
    row.reverse()
    if len(row) == 1:
        # Repeat a lone item so the model sees two positions.
        row = row * 2
    input_ids = torch.tensor([row]).cuda()
    with torch.no_grad(), autocast():
        scores = model(input_ids, recall_items)[0]
    recall1_logits.append(scores.tolist())

100%|██████████| 1668/1668 [00:03<00:00, 534.36it/s]


In [12]:
# Persist the per-session candidate scores for the selected split, then
# display the resulting frame as the cell output.
logits_path = '../XGBOOST/logits_'+name+'2_p2.dataset'
df = pd.DataFrame({'nn_logits':recall1_logits})
df.to_pickle(logits_path)
df

Unnamed: 0,nn_logits
0,"[0.2978515625, 0.65673828125, 0.46435546875, 0..."
1,"[0.36669921875, 0.41650390625, 0.414794921875,..."
2,"[0.90283203125, 0.923828125, 0.82177734375, 0...."
3,"[-0.002193450927734375, 0.3876953125, 0.485839..."
4,"[0.55908203125, 0.5615234375, 0.4404296875, 0...."
...,...
1663,"[0.80224609375, 0.75390625, 0.83203125, 0.5502..."
1664,"[0.78173828125, 0.6279296875, 0.5234375, 0.655..."
1665,"[0.88134765625, 0.83251953125, 0.81201171875, ..."
1666,"[0.6845703125, 0.68359375, 0.6240234375, 0.615..."
