In [1]:
import torch
import random
import torch.nn as nn
import pandas as pd
import numpy as np
import copy
import os
from tqdm import tqdm
from torch.optim import *
from torch.utils.data import Dataset, DataLoader
from typing import Optional, Callable, Any, Tuple
from transformers import AutoTokenizer, AutoModelForMaskedLM,BertModel,get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, precision_score, mean_squared_error
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F
# Restrict the CUDA devices visible to this process to device index 0.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Device string used for every `.to(device)` call below: GPU when one is
# available, CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
def seed_everything(seed=2001):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and the current CUDA
    device), exports ``PYTHONHASHSEED``, and configures cuDNN for
    deterministic kernels with benchmarking disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every seeding function takes the seed as its only argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # cuDNN stays enabled, but with deterministic algorithms and no autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = True


seed_everything()

In [3]:
# Load the product table from CSV; its `title` column is the text that gets
# cleaned, tokenized and embedded further down.
pro_df = pd.read_csv("./data/products_train.csv", sep=',')

In [3]:
# Tokenizer and masked-LM checkpoint for "xlm-roberta-base". Only the encoder
# sub-module (`model.roberta`) is called later, to extract sentence embeddings.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# The model is moved to `device` once here; batches are moved per step.
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base").to(device)

In [4]:
class MyDataset(Dataset):
    """Map-style dataset that serves pre-computed samples by position."""

    def __init__(self, data):
        """Store a NumPy copy of ``data`` (any array-like of samples)."""
        self.data = np.array(data)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]

In [5]:
def collate_fn(data, max_len=512, pad_id=0):
    """Pad or truncate a batch of token-id sequences into dense tensors.

    Sequences longer than the batch length are shortened by keeping their
    head and their tail, so the leading and trailing special tokens survive.
    Shorter sequences are right-padded with ``pad_id``.

    Args:
        data: Non-empty batch of token-id sequences. Lists, tuples and 1-D
            arrays are all accepted.
        max_len: Upper bound on the padded sequence length (default 512).
        pad_id: Token id used for padding. The default 0 preserves the
            original behaviour. NOTE(review): confirm against
            ``tokenizer.pad_token_id``; padded positions are zeroed in the
            returned attention mask either way.

    Returns:
        Tuple ``(input_ids, attention_mask, weight)``. ``input_ids`` and
        ``attention_mask`` have shape ``(batch, seq_len)``. ``weight`` has
        shape ``(batch, 1)`` and is 0 for sequences of exactly two tokens
        and 1 otherwise.

    Raises:
        ValueError: If ``data`` is empty.
    """
    # Coerce to plain lists: `+` on an ndarray is arithmetic and on a tuple
    # it raises, so neither can be concatenated with the padding list.
    sequences = [list(seq) for seq in data]
    if not sequences:
        raise ValueError("collate_fn received an empty batch")

    batch_len = min(max_len, max(len(seq) for seq in sequences))
    tail_len = batch_len // 2
    # Head takes the extra token when batch_len is odd, so a truncated
    # sequence is exactly batch_len long. For even lengths (including the
    # default 512) this is the same head/tail split as before.
    head_len = batch_len - tail_len

    input_ids, attention_mask, weight = [], [], []
    for seq in sequences:
        if len(seq) > batch_len:
            # Slice from an explicit start index: seq[-0:] would be the whole list.
            seq = seq[:head_len] + seq[len(seq) - tail_len:]
        n_pad = batch_len - len(seq)
        attention_mask.append([1] * len(seq) + [0] * n_pad)
        # Two tokens means nothing but the two special tokens (empty text).
        weight.append(0 if len(seq) == 2 else 1)
        input_ids.append(seq + [pad_id] * n_pad)

    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    weight = torch.tensor(weight).unsqueeze(1)

    return input_ids, attention_mask, weight

# BERT whitening

In [4]:
def compute_kernel_bias(vecs, vec_dim):
    """Compute the whitening kernel and bias for a matrix of row vectors.

    With the returned values, ``(vecs + bias) @ kernel`` is centred and has
    an identity covariance in the ``vec_dim`` retained directions.

    Args:
        vecs: 2-D array with one sample per row.
        vec_dim: Number of leading components (kernel columns) to keep.

    Returns:
        Tuple ``(kernel, bias)``: ``kernel`` is the first ``vec_dim`` columns
        of ``U @ diag(1 / sqrt(S))`` from the SVD of the covariance matrix,
        and ``bias`` is the negated column-wise mean with shape ``(1, dim)``.
    """
    mean_row = vecs.mean(axis=0, keepdims=True)
    covariance = np.cov(vecs.T)

    # SVD of the covariance matrix; only the left vectors and the singular
    # values are needed.
    left_vectors, singular_values, _ = np.linalg.svd(covariance)
    inv_sqrt_scale = np.diag(1 / np.sqrt(singular_values))
    kernel = np.dot(left_vectors, inv_sqrt_scale)

    return kernel[:, :vec_dim], -mean_row
def transform_and_normalize(vecs, kernel=None, bias=None):
    """Optionally whiten ``vecs``, then scale every row to unit L2 norm.

    Args:
        vecs: 2-D NumPy array with one vector per row.
        kernel: Whitening matrix applied as ``(vecs + bias).dot(kernel)``.
        bias: Offset added to ``vecs`` before the kernel is applied.
            The linear transform is applied only when both ``kernel`` and
            ``bias`` are given.

    Returns:
        Array of row vectors with unit L2 norm. Rows whose norm is exactly
        zero are returned as zeros instead of NaN.
    """
    if not (kernel is None or bias is None):
        vecs = (vecs + bias).dot(kernel)
    norms = (vecs**2).sum(axis=1, keepdims=True)**0.5
    # A zero row would give 0 / 0 = NaN plus a RuntimeWarning. Dividing by 1
    # leaves it at zero and does not change any row with a non-zero norm.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return vecs / safe_norms
def white(vecs, dim=256):
    """Whiten ``vecs``, keep ``dim`` components and L2-normalize each row.

    The kernel and bias are estimated from ``vecs`` itself and then applied
    to the same matrix.

    Args:
        vecs: 2-D array with one vector per row.
        dim: Number of whitened components to keep.

    Returns:
        Array with ``dim`` columns holding the transformed, normalized rows.
    """
    return transform_and_normalize(vecs, *compute_kernel_bias(vecs, dim))

# Title preprocessing

In [6]:
# Characters replaced by a single space in every title. The raw string keeps
# the backslash literal without relying on the invalid "\|" escape sequence,
# which is a SyntaxWarning on recent Python versions.
TITLE_NOISE_CHARS = r'@#$%^&*()_+-=~`?><:;}{[]\|"'
# One translation table replaces one `str.replace` pass per character per title.
TITLE_NOISE_TABLE = str.maketrans(TITLE_NOISE_CHARS, ' ' * len(TITLE_NOISE_CHARS))

title = np.array(pro_df['title'].tolist())
for k in range(len(title)):
    text = title[k].translate(TITLE_NOISE_TABLE)
    # Blank out titles that are digits only, alphabetic with fewer than three
    # characters, or the literal string 'nan'.
    # NOTE(review): 'nan' is presumably what missing values turn into when the
    # column is converted to a NumPy string array - confirm against the data.
    if text.isdigit() or (text.isalpha() and len(text) < 3) or text == 'nan':
        text = ''
    title[k] = text

In [7]:
# Tokenize each cleaned title on its own and keep only the token ids.
# No padding is applied here, so the id lists have different lengths.
title_ids = [tokenizer(item).input_ids for item in tqdm(title)]

100%|██████████| 1551057/1551057 [05:08<00:00, 5025.05it/s]


In [8]:
# The token-id lists are ragged. `np.array(title_ids)` would have to guess
# dtype=object, which is deprecated and raises ValueError on newer NumPy.
# Filling an explicit 1-D object array stores every element as a plain list,
# even if all the lists happen to share one length.
title_ids_array = np.empty(len(title_ids), dtype=object)
for row, ids in enumerate(title_ids):
    title_ids_array[row] = ids
# np.save appends the ".npy" suffix, so the file is ./title_ids.npy.
np.save('./title_ids', title_ids_array)

  """Entry point for launching an IPython kernel.


In [6]:
# Reload the cached token ids written by the np.save cell above.
# NOTE(security): allow_pickle=True unpickles the file contents, which can run
# arbitrary code - only load a file produced by this notebook.
title_ids = np.load('./title_ids.npy',allow_pickle=True)

In [7]:
# Wrap the cached token ids. Batches are served in file order (no shuffling),
# so row i of the resulting embeddings corresponds to title i.
data_set = MyDataset(title_ids)
dataloader = DataLoader(
    dataset=data_set,
    batch_size=128,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_fn,
)

In [8]:
# Collect per-batch results and concatenate once at the end. Calling
# torch.cat inside the loop recopies the whole accumulated tensor on every
# batch, which is quadratic in the number of batches.
embedding_chunks = []
with torch.no_grad():
    for input_ids, attention_mask, weight in tqdm(dataloader):
        with autocast():
            hidden = model.roberta(input_ids=input_ids.to(device),
                                   attention_mask=attention_mask.to(device)
                                   ).last_hidden_state
        # Take the vector at the first token position as the title embedding.
        # The cast to float32 matches the dtype the original concatenation
        # with an empty float32 tensor produced. `weight` (0 or 1 per row)
        # zeroes the rows that collate_fn flagged as two-token sequences.
        embedding_chunks.append(hidden[:, 0, :].cpu().float() * weight)

# Fall back to an empty tensor when the loader yields no batches, as before.
title_embeddings = torch.cat(embedding_chunks, dim=0) if embedding_chunks else torch.tensor([])
print(title_embeddings.shape)

# The whitening cell further down loads this path, but nothing visible in the
# notebook writes it. Save the result when the file is missing, so a fresh
# Restart & Run All works; an existing file is never overwritten.
# NOTE(review): assumes that file is meant to hold exactly this tensor - confirm.
embeddings_768_path = './title_embeddings_768.dataset'
if not os.path.exists(embeddings_768_path):
    torch.save(title_embeddings, embeddings_768_path)

100%|██████████| 12118/12118 [1:04:56<00:00,  3.11it/s]

torch.Size([1551057, 768])





In [8]:
# Target dimensionality of the whitened embeddings.
dim = 128

# Load the stored 768-d embeddings and convert them to a NumPy array on CPU.
title_embeddings = torch.load('./title_embeddings_768.dataset')
title_embeddings = title_embeddings.cpu().numpy()

# Whiten, keep `dim` components, L2-normalize, and convert back to float32.
title_embeddings = white(title_embeddings, dim)
title_embeddings = torch.tensor(title_embeddings, dtype=torch.float32).cpu()

# The reduced embeddings are saved next to the originals, keyed by dimension.
torch.save(title_embeddings, './title_embeddings_' + str(dim) + '.dataset')