In [1]:
import torch.optim as optim
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from transformers import BertTokenizer
from utils import NERTagger, get_dataloader
from model import CustomBERT, ContrastiveLossCosine

device = torch.device('cuda')


def set_seed(seed=42):
    """
    랜덤 시드 고정
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # GPU 사용 시에도 시드 고정

    # CUDNN 설정 (연산 속도 vs 재현성 선택)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
set_seed(0)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ner_tagger = NERTagger()
MODEL_SAVE_PATH = f"./save/toxigen/best_model.pth"
criterion = {"lambda_loss":0.5, "cross-entropy": nn.CrossEntropyLoss(), "contrastive-learning":ContrastiveLossCosine()}

train_loader = get_dataloader(f"./data/toxigen/train.csv", tokenizer, ner_tagger=ner_tagger, use_ner=True,  batch_size=16)
# valid_loader = get_dataloader(f"./data/toxigen/valid.csv", tokenizer, ner_tagger=None, use_ner=False, batch_size=16)

model = CustomBERT("bert-base-uncased", hidden_dim=768).to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

num_epochs = 6

dataloader = train_loader
loss_type = "cross-entropy"

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


---Start dataload---
---End dataload---


In [3]:
batch = next(iter(dataloader))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [4]:
print("---Start train!---")
model.train()
total_loss = 0

input_ids = batch["input_ids"].to(device)
head_token_idx = batch["head_token_idx"].to(device)
labels = batch["labels"].to(device)

optimizer.zero_grad()

---Start train!---


In [5]:
import torch.nn.functional as F

class HeadAttention(nn.Module):
    def __init__(self, hidden_dim, head_dim):
        super(HeadAttention, self).__init__()
        self.hidden_dim = hidden_dim
        self.head_dim = head_dim

        self.W_q = nn.Linear(hidden_dim, head_dim, bias=False)
        self.W_k = nn.Linear(hidden_dim, head_dim, bias=False)
        self.W_v = nn.Linear(hidden_dim, head_dim, bias=False)

    def forward(self, cls_embedding, head_token_embedding):
        Q_h = self.W_q(cls_embedding)   # [CLS]의 Query
        K_h = self.W_k(head_token_embedding)  # 특정 토큰의 Key
        V_h = self.W_v(head_token_embedding)  # 특정 토큰의 Value

        attention_scores = torch.matmul(Q_h, K_h.T) / (self.head_dim ** 0.5)
        attention_weights = F.softmax(attention_scores, dim=-1)

        output = torch.matmul(attention_weights, V_h)
        return output

In [6]:
from transformers import BertModel

bert = BertModel.from_pretrained("bert-base-uncased", output_attentions=True).to(device)
head_attention = HeadAttention(768, 768).to(device)
classifier = nn.Linear(768, 2).to(device)

In [7]:
outputs = bert(input_ids)
cls_embedding = outputs.last_hidden_state[:, 0, :]
cls_embedding

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[-0.2144,  0.6464, -0.0300,  ..., -0.6731,  0.3182, -1.0252],
        [-0.3088,  0.6115,  0.1277,  ..., -0.6242,  0.5552, -0.7398],
        [-0.3368,  0.5849,  0.5805,  ..., -0.4178,  0.4864, -0.4485],
        ...,
        [-0.3106,  0.5797,  0.1688,  ..., -0.6051,  0.5062, -0.7638],
        [-0.3192,  0.4904,  0.0245,  ..., -0.4990,  0.4545, -0.8290],
        [-0.3000,  0.5989,  0.2211,  ..., -0.5376,  0.5293, -0.7566]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [8]:
batch_size = input_ids.shape[0]

In [None]:
# head_token_idx = torch.tensor([[1], [1], [0], [0], [0], [3], [0], [0], [6], [0], [0], [0], [0], [0], [0], [1]])

In [9]:
head_token_idx.to(device)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [10]:
binary_tensor = (head_token_idx != 0).int()  # True -> 1, False -> 0

print(binary_tensor)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0',
       dtype=torch.int32)


In [11]:
cls_embedding

tensor([[-0.2144,  0.6464, -0.0300,  ..., -0.6731,  0.3182, -1.0252],
        [-0.3088,  0.6115,  0.1277,  ..., -0.6242,  0.5552, -0.7398],
        [-0.3368,  0.5849,  0.5805,  ..., -0.4178,  0.4864, -0.4485],
        ...,
        [-0.3106,  0.5797,  0.1688,  ..., -0.6051,  0.5062, -0.7638],
        [-0.3192,  0.4904,  0.0245,  ..., -0.4990,  0.4545, -0.8290],
        [-0.3000,  0.5989,  0.2211,  ..., -0.5376,  0.5293, -0.7566]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [20]:
cls_embedding.size()

torch.Size([16, 768])

In [12]:
head_token_embeddings = outputs.last_hidden_state[torch.arange(batch_size).unsqueeze(1), head_token_idx]  
head_token_embeddings = head_token_embeddings.mean(dim=1) 
head_token_embeddings

tensor([[-0.2144,  0.6464, -0.0300,  ..., -0.6731,  0.3182, -1.0252],
        [-0.3088,  0.6115,  0.1277,  ..., -0.6242,  0.5552, -0.7398],
        [-0.3368,  0.5849,  0.5805,  ..., -0.4178,  0.4864, -0.4485],
        ...,
        [-0.3106,  0.5797,  0.1688,  ..., -0.6051,  0.5062, -0.7638],
        [-0.3192,  0.4904,  0.0245,  ..., -0.4990,  0.4545, -0.8290],
        [-0.3000,  0.5989,  0.2211,  ..., -0.5376,  0.5293, -0.7566]],
       device='cuda:0', grad_fn=<MeanBackward1>)

In [18]:
head_token_embeddings.size()

torch.Size([16, 768])

In [13]:
head_attention_output = head_attention(cls_embedding, head_token_embeddings)
head_attention_output


tensor([[ 0.1664,  0.1229, -0.1659,  ..., -0.1194,  0.7561, -0.0572],
        [ 0.1666,  0.1229, -0.1659,  ..., -0.1193,  0.7562, -0.0572],
        [ 0.1667,  0.1229, -0.1659,  ..., -0.1193,  0.7564, -0.0572],
        ...,
        [ 0.1666,  0.1229, -0.1659,  ..., -0.1193,  0.7562, -0.0572],
        [ 0.1665,  0.1229, -0.1659,  ..., -0.1193,  0.7562, -0.0573],
        [ 0.1666,  0.1229, -0.1659,  ..., -0.1193,  0.7562, -0.0572]],
       device='cuda:0', grad_fn=<MmBackward0>)

In [17]:
head_attention_output.size()

torch.Size([16, 768])

In [33]:
outputs.attentions[0][:, 0, :, :].size()

torch.Size([16, 512, 512])

In [None]:
head_attention_output = head_attention_output * binary_tensor.to(device)

In [None]:
head_attention_output

In [None]:
final_embedding = cls_embedding + head_attention_output
final_embedding