<div align="center">
    <img src='https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdna%2FcK7IvP%2FbtrY7MoNroA%2FAAAAAAAAAAAAAAAAAAAAANRlYs3-AmJlv2XO44yQLk4xgPYAAnEOG6O2_zT5j1oP%2Fimg.png%3Fcredential%3DyqXZFxpELC7KVnFOS48ylbz2pIh7yKj8%26expires%3D1753973999%26allow_ip%3D%26allow_referer%3D%26signature%3DvIusSJvEQZ7Ds%252Bi13VnJMEZLoHQ%253D' width = 300 >
</div>

## BERT(Bidirectional Encoder Representations from Transformers)

Google의 자연어 처리 사전학습 모델(PLM)로 문맥을 **양방향**으로 이해할 수 있도록 설계된 모델이다. 
- `BERT-base` 기준 총 12개의 transformer encoder layer로 구성되어있다.
    - Hidden size = 768
    - Attention heads: 12
    - Parameters: 약 110M(Million)

### Pre-training 목표

1. **Masked Language Model(MLM)**: 입력 문장의 전체 토큰 중 15%를 무작위로 선택하고, 선택된 토큰의 원래 값을 맞추도록 학습하는 언어 모델링이다. 선택된 토큰 중 80%는 [MASK]로 대체하고, 10%는 랜덤한 다른 토큰으로 바꾸며, 나머지 10%는 원래 토큰을 그대로 유지한 채 예측한다.

2. **Next Sentence Prediction (NSP)**: 두 개의 문장을 입력으로 받아, 두 번째 문장이 실제로 첫 번째 문장 뒤에 오는지 아닌지를 맞추는 이진 분류 문제이다. 전체 입력의 의미를 담는 [CLS] 토큰의 출력을 활용하여 NSP를 판단한다. 

### Embedding layer 

In [84]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import math 

class BertTokenEmbedding(nn.Module):
    """Sum of token, segment and position embeddings, then LayerNorm and dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Size of every embedding vector.
        max_position_embeddings: Number of rows in the position embedding
            table, i.e. the longest sequence this module accepts.
        type_vocab_size: Number of distinct segment ids (2 for a sentence pair).
        dropout_p: Dropout probability applied to the normalized sum.
    """

    def __init__(self,
                 vocab_size: int,
                 embedding_dim: int = 768,
                 max_position_embeddings: int = 512,
                 type_vocab_size: int = 2,  # the input holds up to two sentences
                 dropout_p: float = 0.1):
        super().__init__()

        self.token = nn.Embedding(vocab_size, embedding_dim)  # embeds the input token ids
        self.segment = nn.Embedding(type_vocab_size, embedding_dim)
        self.position = nn.Embedding(max_position_embeddings, embedding_dim)
        self.layer_norm = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, token_ids, segment_ids=None):
        """Embed a batch of token ids.

        Args:
            token_ids: Integer tensor whose last dimension is the sequence
                length (e.g. ``(batch, seq_len)``).
            segment_ids: Integer tensor with the same shape as ``token_ids``.
                When omitted, every token is assigned segment 0.

        Returns:
            Tensor of shape ``token_ids.shape + (embedding_dim,)``.

        Raises:
            ValueError: If the sequence is longer than the position table.
        """
        seq_len = token_ids.size(-1)
        max_positions = self.position.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_position_embeddings {max_positions}"
            )

        if segment_ids is None:
            segment_ids = torch.zeros_like(token_ids)

        # Positions are 0..seq_len-1 and independent of the token values; the
        # resulting (seq_len, dim) tensor broadcasts over the batch dimension.
        position_ids = torch.arange(seq_len, device=token_ids.device)

        x = self.token(token_ids) + self.segment(segment_ids) + self.position(position_ids)
        x = self.layer_norm(x)
        return self.dropout(x)

### Multi-Head Self-Attention

In [85]:
from math import sqrt 

def scaled_dot_product_attention(querys: torch.Tensor,
                                 keys: torch.Tensor,
                                 values: torch.Tensor,
                                 attention_mask: torch.Tensor = None,  # padding mask
                                 is_casual: bool = False):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    The last two dimensions of each input are treated as
    ``(sequence, feature)``; any leading dimensions are batch dimensions.

    Args:
        querys: Query tensor ``(..., q_len, dim_k)``.
        keys: Key tensor ``(..., k_len, dim_k)``.
        values: Value tensor ``(..., k_len, dim_v)``.
        attention_mask: Optional tensor broadcastable to the score shape
            ``(..., q_len, k_len)``. Positions where it equals 0 are excluded.
        is_casual: When True, query position ``i`` may only attend to key
            positions ``<= i`` (causal mask). The parameter name is kept
            as-is for backward compatibility.

    Returns:
        Tuple ``(output, attention_weights)``; the weights sum to 1 over the
        key dimension.

    Note:
        A query row whose keys are all masked produces NaN weights, because
        the softmax of a row of ``-inf`` is undefined.
    """
    dim_k = querys.size(-1)
    scores = querys @ keys.transpose(-2, -1) / sqrt(dim_k)

    # Masked positions are set to -inf so that softmax assigns them weight 0.
    if attention_mask is not None:
        scores = scores.masked_fill(attention_mask == 0, float('-inf'))

    if is_casual:
        q_len, k_len = scores.size(-2), scores.size(-1)
        allowed = torch.ones(q_len, k_len, dtype=torch.bool, device=scores.device).tril()
        scores = scores.masked_fill(~allowed, float('-inf'))

    # Normalize over the key dimension (the last one), not over dim 1, which
    # is the head dimension for (batch, heads, q_len, k_len) scores.
    attention_weights = F.softmax(scores, dim=-1)
    output = attention_weights @ values
    return output, attention_weights

In [86]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head attention with learned Q/K/V projections and an output projection.

    Args:
        d_model: Model (embedding) dimension; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, d_model: int, num_heads: int):
        super().__init__()
        assert d_model % num_heads == 0, "embedding_dim must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        # Per-head dimension: the model dimension split evenly across heads.
        self.dim_k = d_model // num_heads

        self.weight_q = nn.Linear(d_model, d_model)
        self.weight_k = nn.Linear(d_model, d_model)
        self.weight_v = nn.Linear(d_model, d_model)

        # The concatenated heads are mixed by a final linear projection.
        self.concat_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Reshape (batch, seq_len, d_model) into (batch, num_heads, seq_len, dim_k)."""
        batch_size, seq_len = x.size(0), x.size(1)
        return x.view(batch_size, seq_len, self.num_heads, self.dim_k).transpose(1, 2)

    def forward(self,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                attention_mask: torch.Tensor = None) -> torch.Tensor:
        """Apply multi-head attention.

        Args:
            query: Tensor of shape (batch_size, q_len, d_model).
            key: Tensor of shape (batch_size, k_len, d_model).
            value: Tensor of shape (batch_size, k_len, d_model).
            attention_mask: Optional mask forwarded unchanged to
                ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (batch_size, q_len, d_model).
        """
        batch_size, query_len = query.size(0), query.size(1)

        # Each tensor is split using its OWN sequence length, so key/value may
        # be longer or shorter than the query.
        query = self._split_heads(self.weight_q(query))
        key = self._split_heads(self.weight_k(key))
        value = self._split_heads(self.weight_v(value))

        # (batch_size, num_heads, q_len, dim_k) -> (batch_size, q_len, d_model)
        attn_output, _ = scaled_dot_product_attention(query, key, value, attention_mask)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, query_len, self.d_model)

        return self.concat_linear(attn_output)

### Position-wise Feed-Forward Network(FFN)

In [87]:
class PositionwiseFeedForward(nn.Module):
    """Feed-forward sub-layer: LayerNorm -> Linear -> GELU -> Linear, plus a residual.

    Dropout is applied after the activation and again after the second
    linear layer. The residual is taken from the un-normalized input.

    Args:
        d_model: Input and output feature size.
        dim_feedforward: Hidden size of the inner linear layer.
        dropout_p: Dropout probability.
    """

    def __init__(self, d_model: int, dim_feedforward: int, dropout_p: float):
        super().__init__()

        # Expand to the hidden size, then project back to the model size.
        self.linear1 = nn.Linear(in_features=d_model, out_features=dim_feedforward)
        self.linear2 = nn.Linear(in_features=dim_feedforward, out_features=d_model)

        self.activation = nn.GELU()

        self.dropout = nn.Dropout(p=dropout_p)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the feed-forward transform of the normalized ``x``."""
        hidden = self.dropout(self.activation(self.linear1(self.norm(x))))
        projected = self.dropout(self.linear2(hidden))

        # Residual connection around the whole sub-layer.
        return x + projected

### Transformer Encoder

In [88]:
class TransformerEncoder(nn.Module):
    """One encoder layer: normalized self-attention with a residual, then the FFN.

    Args:
        d_model: Feature size of the layer input and output.
        num_heads: Number of attention heads.
        dim_feedforward: Hidden size passed to the feed-forward sub-layer.
        dropout_p: Dropout probability used after attention and inside the FFN.
    """

    def __init__(self, d_model: int, num_heads: int, dim_feedforward: int, dropout_p: float):
        super().__init__()
        self.attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout_p)
        self.feed_forward = PositionwiseFeedForward(d_model, dim_feedforward, dropout_p)

    def forward(self, src, attention_mask = None) -> torch.Tensor:
        """Run the layer on ``src``; ``attention_mask`` is forwarded to the attention module."""
        # LayerNorm is applied before attention; the same normalized tensor
        # serves as query, key and value.
        normalized = self.norm(src)
        attended = self.attn(
            query=normalized,
            key=normalized,
            value=normalized,
            attention_mask=attention_mask,
        )

        # Residual connection from the un-normalized input.
        residual = src + self.dropout1(attended)
        return self.feed_forward(residual)

### BERT Encoder (N Layer Transformer Encoder)

In [89]:
class BERTEncoder(nn.Module):
    """A stack of ``num_layers`` identically configured ``TransformerEncoder`` layers.

    Args:
        num_layers: Number of encoder layers to stack.
        d_model: Feature size passed to every layer.
        num_heads: Number of attention heads per layer.
        dim_feedforward: Feed-forward hidden size per layer.
        dropout_p: Dropout probability per layer.
    """

    def __init__(self,
                 num_layers: int,
                 d_model: int,
                 num_heads: int,
                 dim_feedforward: int,
                 dropout_p: float):
        super().__init__()

        # Build the layers one by one and register them through ModuleList so
        # their parameters are tracked by this module.
        stack = []
        for _ in range(num_layers):
            stack.append(TransformerEncoder(d_model, num_heads, dim_feedforward, dropout_p))
        self.layers = nn.ModuleList(stack)

    def forward(self, x, attention_mask = None):
        """Pass ``x`` through every layer in order, giving each the same mask."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, attention_mask)
        return hidden

### Sample Data

In [90]:
# Smoke test: push a random batch through a small encoder stack and check
# that the output keeps the input shape.
batch_size, seq_len = 2, 10
d_model, num_heads, d_ff = 768, 12, 3072
num_layers = 2
dropout_p = 0.2

x = torch.rand(batch_size, seq_len, d_model)
# All ones: every position may be attended to.
attention_mask = torch.ones(batch_size, 1, 1, seq_len)

encoder = BERTEncoder(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dim_feedforward=d_ff,
    dropout_p=dropout_p,
)
out = encoder(x, attention_mask=attention_mask)
print(out.shape)  # (2, 10, 768)

torch.Size([2, 10, 768])


### Classifier Layer 추가

BERT는 원래 Encoder 모델로, MLM과 NSP로 사전 학습되어 있는 모델이다. 따라서 대부분의 downstream task에서는 기존 BERT 구조에 **task-specific head**를 추가하여 해당 태스크를 해결한다. 

In [91]:
class BertClassifier(nn.Module):
    """Sequence classifier: token embedding -> encoder -> mean pooling -> linear head.

    Args:
        encoder: Module called as ``encoder(x, attention_mask)`` that maps
            (B, L, D) to (B, L, D).
        d_model: Feature size D of the embeddings and encoder output.
        num_classes: Number of output logits.
        vocab_size: Number of rows in the token embedding table.
    """

    def __init__(self, encoder, d_model, num_classes, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoder = encoder
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(d_model, num_classes)

    def _mean_pool(self, hidden, attention_mask):
        """Average ``hidden`` (B, L, D) over the sequence, skipping masked-out tokens."""
        batch_size, seq_len, _ = hidden.shape

        # Without a per-token mask (none given, or a shape such as
        # (B, 1, L, L) that does not map one-to-one onto tokens), fall back to
        # the plain average over all positions.
        if attention_mask is None or attention_mask.numel() != batch_size * seq_len:
            return self.pool(hidden.transpose(1, 2)).squeeze(-1)  # (B, D, L) -> (B, D)

        # A (B, 1, 1, L) or (B, L) mask: 0 marks tokens to ignore.
        keep = (attention_mask.reshape(batch_size, seq_len, 1) != 0).to(hidden.dtype)
        # Clamp so that a fully masked sequence yields zeros instead of NaN.
        token_count = keep.sum(dim=1).clamp(min=1.0)
        return (hidden * keep).sum(dim=1) / token_count

    def forward(self, x, attention_mask=None):
        """Return class logits of shape (B, num_classes) for token ids ``x`` of shape (B, L).

        When ``attention_mask`` identifies one value per token, positions where
        it is 0 are excluded from the pooled average so padding does not dilute
        the sequence representation.
        """
        x = self.embedding(x)                 # (B, L) -> (B, L, D)
        x = self.encoder(x, attention_mask)   # (B, L, D)
        x = self._mean_pool(x, attention_mask)  # (B, D)
        return self.fc(x)


In [93]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# 임의의 데이터셋 정의 
class FakeDataset(Dataset):
    """Synthetic dataset of random token-id sequences with random class labels.

    Args:
        num_samples: Number of (sequence, label) pairs to generate.
        seq_len: Length of every token-id sequence.
        vocab_size: Exclusive upper bound for the generated token ids.
        num_classes: Exclusive upper bound for the generated labels.
    """

    def __init__(self, num_samples=200, seq_len=16, vocab_size=100, num_classes=2):
        # Inputs are drawn before labels; both are fixed at construction time.
        self.inputs = torch.randint(low=0, high=vocab_size, size=(num_samples, seq_len))
        self.labels = torch.randint(low=0, high=num_classes, size=(num_samples,))

    def __len__(self):
        """Number of samples."""
        return self.inputs.size(0)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at ``idx``."""
        sample = self.inputs[idx]
        target = self.labels[idx]
        return sample, target


### `train()` 정의

In [94]:
def train(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader`` and report accuracy.

    Args:
        model: Module called as ``model(inputs)`` that returns per-class scores
            with classes along dimension 1.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Training accuracy in percent over all samples seen, or ``0.0`` when
        the dataloader yields no samples.
    """
    model.train()
    total, correct = 0, 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # The predicted class is the index of the largest score per sample.
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Guard against an empty dataloader, which would otherwise divide by zero.
    acc = correct / total * 100 if total else 0.0
    print(f"Train Accuracy: {acc:.2f}%")
    return acc


In [95]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configure a small BERT encoder (2 layers, d_model=32, 4 heads).
encoder = BERTEncoder(
    num_layers=2,
    d_model=32,
    num_heads=4,
    dim_feedforward=128,
    dropout_p=0.1
).to(device)

# d_model must equal the encoder's d_model; vocab_size must cover the token
# ids produced by the dataset below.
model = BertClassifier(encoder, d_model=32, num_classes=2, vocab_size=100).to(device)

# 300 random sequences of length 16 with random binary labels.
dataset = FakeDataset(num_samples=300, seq_len=16, vocab_size=100, num_classes=2)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training loop: at most 20 epochs, stopping early once the training accuracy
# returned by train() reaches exactly 100%.
for epoch in range(20):
    acc = train(model, loader, optimizer, criterion, device)
    if acc == 100.0:
        print(f"Accuracy at Epoch {epoch + 1}")
        break

Train Accuracy: 48.67%
Train Accuracy: 58.67%
Train Accuracy: 65.33%
Train Accuracy: 66.33%
Train Accuracy: 65.67%
Train Accuracy: 69.00%
Train Accuracy: 69.67%
Train Accuracy: 68.33%
Train Accuracy: 74.00%
Train Accuracy: 75.33%
Train Accuracy: 75.67%
Train Accuracy: 80.00%
Train Accuracy: 82.33%
Train Accuracy: 82.67%
Train Accuracy: 80.33%
Train Accuracy: 87.00%
Train Accuracy: 89.33%
Train Accuracy: 93.67%
Train Accuracy: 92.00%
Train Accuracy: 96.33%
