<a href="https://colab.research.google.com/github/haegomm/ai_practice/blob/master/Multi_head_Attention%EC%9C%BC%EB%A1%9C_%EA%B0%90%EC%A0%95_%EB%B6%84%EC%84%9D_%EB%AA%A8%EB%8D%B8_%EA%B5%AC%ED%98%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install datasets sacremoses



In [13]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from torch import nn
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt

# Load the IMDB dataset and the uncased BERT tokenizer.
# `tokenizer` is read as a module-level global by collate_fn and TextClassifier.
ds = load_dataset("stanfordnlp/imdb")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def collate_fn(batch):
    """Turn a list of dataset rows into a token-id tensor and a label tensor.

    Every row is read through its ``'text'`` and ``'label'`` keys. The texts
    are tokenized together with the module-level ``tokenizer``, truncated to
    at most 400 tokens and padded so the batch forms one rectangular tensor.

    Returns:
        A ``(input_ids, labels)`` pair where ``labels`` is a
        ``torch.LongTensor``. Only the input ids of the encoding are kept.
    """
    token_limit = 400
    targets = [row['label'] for row in batch]
    raw_texts = [row['text'] for row in batch]

    encoded = tokenizer(
        raw_texts,
        padding=True,
        truncation=True,
        max_length=token_limit,
        return_tensors='pt',
    )

    return encoded.input_ids, torch.LongTensor(targets)

# Batches of 64 built by collate_fn; only the training split is shuffled.
train_loader = DataLoader(
    ds['train'], batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    ds['test'], batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [14]:
# Positional-encoding helpers
def get_angles(pos, i, d_model):
    """Return the sinusoid arguments ``pos / 10000 ** (2 * (i // 2) / d_model)``.

    ``pos`` and ``i`` may be scalars or NumPy arrays; the result follows the
    NumPy broadcasting of ``pos`` against ``i``.
    """
    # Columns 2k and 2k + 1 share the same exponent because of the floor division.
    exponent = (2 * (i // 2)) / np.float32(d_model)
    inverse_wavelength = 1 / np.power(10000, exponent)
    return pos * inverse_wavelength

def positional_encoding(position, d_model):
    """Build the fixed sinusoidal position table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Width of each position vector (columns).

    Returns:
        A ``torch.FloatTensor`` of shape ``(1, position, d_model)`` holding
        sine values in the even columns and cosine values in the odd columns.
    """
    row_idx = np.arange(position)[:, np.newaxis]
    col_idx = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_idx, col_idx, d_model)

    # Overwrite the raw angles in place; column parity selects sin vs. cos.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Leading axis of size 1 so the table broadcasts over the batch dimension.
    return torch.FloatTensor(table[np.newaxis, ...])

## [MY CODE] Multi-Head Attention 구현

In [15]:
# Multi-head self-attention
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input ``x``.
    ``d_model`` is split evenly across ``n_heads`` heads, each of width
    ``depth = d_model // n_heads``.

    Args:
        input_dim: Feature width of the incoming tensor.
        d_model: Total width of the projected queries/keys/values and of the output.
        n_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, input_dim, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.d_model = d_model
        self.n_heads = n_heads
        self.depth = d_model // n_heads

        # Input projections for queries, keys and values.
        self.wq = nn.Linear(input_dim, d_model)
        self.wk = nn.Linear(input_dim, d_model)
        self.wv = nn.Linear(input_dim, d_model)

        # Output projection applied after the heads are merged back together.
        self.dense = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

    def split_heads(self, x):
        """Reshape ``(batch, seq_len, d_model)`` to ``(batch, n_heads, seq_len, depth)``."""
        n_batch, n_tokens, _ = x.size()
        per_head = x.view(n_batch, n_tokens, self.n_heads, self.depth)
        return per_head.permute(0, 2, 1, 3)

    def forward(self, x, mask):
        """Attend over ``x`` and return a ``(batch, seq_len, d_model)`` tensor.

        Args:
            x: Input of shape ``(batch, seq_len, input_dim)``.
            mask: ``None`` or a ``(batch, seq_len)`` tensor; key positions
                where the mask equals 0 are excluded from attention.
        """
        n_batch, n_tokens, _ = x.size()

        # Each projection is split into heads: (batch, n_heads, seq_len, depth).
        queries, keys, values = (
            self.split_heads(projection(x))
            for projection in (self.wq, self.wk, self.wv)
        )

        # Scaled dot-product logits: (batch, n_heads, seq_len, seq_len).
        logits = torch.matmul(queries, keys.transpose(-2, -1)) / sqrt(self.depth)

        if mask is not None:
            # (batch, seq_len) -> (batch, 1, 1, seq_len), broadcast over heads and queries.
            key_mask = mask[:, None, None]
            logits = logits.masked_fill(key_mask == 0, -1e9)

        weights = self.softmax(logits)
        context = torch.matmul(weights, values)  # (batch, n_heads, seq_len, depth)

        # Merge the heads back into one feature axis before the output projection.
        merged = context.transpose(1, 2).contiguous().view(n_batch, n_tokens, self.d_model)
        return self.dense(merged)

## [MY CODE] Layer Normalization, Dropout, Residual Connection 추가

In [16]:
# Transformer encoder layer
class TransformerLayer(nn.Module):
    """Self-attention followed by a position-wise feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(input + Dropout(sublayer(input)))``.
    Because of the residual additions, the incoming tensor must already have
    ``d_model`` features.

    Args:
        input_dim: Feature width handed to the attention projections.
        d_model: Width of the attention output and of the layer output.
        dff: Hidden width of the feed-forward network.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, input_dim, d_model, dff, n_heads, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(input_dim, d_model, n_heads)

        feed_forward = [nn.Linear(d_model, dff), nn.ReLU(), nn.Linear(dff, d_model)]
        self.ffn = nn.Sequential(*feed_forward)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Return the ``(batch, seq_len, d_model)`` output of the layer."""
        # Attention sub-layer: dropout, residual connection, then LayerNorm.
        attended = self.layernorm1(x + self.dropout1(self.mha(x, mask)))

        # Feed-forward sub-layer, wrapped the same way.
        return self.layernorm2(attended + self.dropout2(self.ffn(attended)))

In [17]:
# Text classifier built on the Transformer layers
class TextClassifier(nn.Module):
    """Transformer encoder that emits one logit per input sequence.

    Token ids are embedded, scaled by ``sqrt(d_model)``, combined with a fixed
    sinusoidal positional encoding and passed through ``n_layers``
    ``TransformerLayer`` blocks. The representation at position 0 is fed to a
    linear head that produces a single logit.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Model width.
        n_layers: Number of stacked ``TransformerLayer`` blocks.
        dff: Hidden width of each feed-forward network.
        n_heads: Number of attention heads per layer.
        max_len: Number of positions in the positional table; inputs must not
            be longer than this.
        dropout: Dropout probability.
        pad_token_id: Token id treated as padding when building the attention
            mask. When ``None`` (the default) the module-level
            ``tokenizer.pad_token_id`` is looked up at call time, which is the
            previous behaviour.
    """

    def __init__(self, vocab_size, d_model, n_layers, dff, n_heads, max_len=400, dropout=0.1,
                 pad_token_id=None):
        super().__init__()

        self.pad_token_id = pad_token_id

        self.embedding = nn.Embedding(vocab_size, d_model)
        # The positional table is a constant, not a weight: a buffer follows
        # .to(device) and is saved in state_dict under the same key, but stays
        # out of model.parameters() and therefore out of the optimizer.
        self.register_buffer('pos_encoding', positional_encoding(max_len, d_model))
        self.dropout = nn.Dropout(dropout)

        self.layers = nn.ModuleList([
            TransformerLayer(d_model, d_model, dff, n_heads, dropout) for _ in range(n_layers)
        ])
        self.classification = nn.Linear(d_model, 1)

    def forward(self, x):
        """Map ``(batch, seq_len)`` token ids to ``(batch,)`` logits."""
        pad_id = self.pad_token_id
        if pad_id is None:
            # Backward-compatible fallback to the notebook's global tokenizer.
            pad_id = tokenizer.pad_token_id
        mask = (x != pad_id).float()  # (batch_size, seq_len); 0 marks padding

        x = self.embedding(x)  # (batch_size, seq_len, d_model)
        x = x * sqrt(self.embedding.embedding_dim)
        x = x + self.pos_encoding[:, :x.size(1), :]
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, mask)

        # Use the representation at position 0 (the [CLS] token for BERT-style inputs).
        x = x[:, 0, :]
        x = self.classification(x).squeeze(-1)  # (batch_size)

        return x

## [MY CODE] 5-layer 4-head Transformer 모델 구성

In [18]:
# Model initialization
vocab_size = len(tokenizer)
d_model = 256  # model width 256 (with n_heads=4 each head has width D' = 64)
n_layers = 5
dff = 512
n_heads = 4
dropout = 0.1

# max_len=400 matches the truncation length used in collate_fn.
model = TextClassifier(vocab_size, d_model, n_layers, dff, n_heads, max_len=400, dropout=dropout)
model = model.to('cuda')

In [19]:
# Loss function and optimizer setup
# The model emits a single raw logit per sample, so the sigmoid is folded into the loss.
loss_fn = nn.BCEWithLogitsLoss()
# NOTE(review): the run logged at the end of this file stays at ~0.50 accuracy
# from epoch 3 onward; lr=0.001 without warmup may be too high for this
# model - TODO confirm with a smaller learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [20]:
# Accuracy computation
def accuracy_fn(model, dataloader):
    """Return the fraction of samples in ``dataloader`` classified correctly.

    The model is switched to eval mode and left there. Its outputs are treated
    as logits for a binary task: a sample is predicted positive when
    ``sigmoid(logit) > 0.5``.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to one logit per sample.
        dataloader: Iterable of ``(inputs, labels)`` tensor pairs.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when ``dataloader`` yields
        no samples.
    """
    model.eval()

    # Run on whatever device the model lives on instead of assuming CUDA;
    # 'cuda' stays the fallback for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = (torch.sigmoid(outputs) > 0.5).long()
            correct += (preds == labels.long()).sum().item()
            total += labels.size(0)

    # An empty dataloader has no accuracy to report; avoid ZeroDivisionError.
    return correct / total if total else 0.0

In [None]:
# Training loop
n_epochs = 50

for epoch in range(n_epochs):
    # accuracy_fn leaves the model in eval mode, so training mode is
    # re-enabled at the start of every epoch.
    model.train()
    # Sum of the per-batch losses over the epoch (not averaged).
    total_loss = 0.0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # Labels are cast to float to match the float logits passed to loss_fn.
        inputs, labels = inputs.to('cuda'), labels.to('cuda').float()

        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Full passes over both splits after each epoch.
    train_acc = accuracy_fn(model, train_loader)
    test_acc = accuracy_fn(model, test_loader)

    print(f"Epoch {epoch+1:2d} | Train Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

Epoch  1 | Train Loss: 273.6025 | Train Acc: 0.5057 | Test Acc: 0.5016
Epoch  2 | Train Loss: 270.7406 | Train Acc: 0.6027 | Test Acc: 0.6013
Epoch  3 | Train Loss: 270.2100 | Train Acc: 0.5000 | Test Acc: 0.5000
Epoch  4 | Train Loss: 270.7163 | Train Acc: 0.5000 | Test Acc: 0.5000
Epoch  5 | Train Loss: 271.6340 | Train Acc: 0.5000 | Test Acc: 0.5000
Epoch  6 | Train Loss: 271.2083 | Train Acc: 0.5002 | Test Acc: 0.5000
Epoch  7 | Train Loss: 271.3338 | Train Acc: 0.5000 | Test Acc: 0.5000
Epoch  8 | Train Loss: 271.4624 | Train Acc: 0.5000 | Test Acc: 0.5000
Epoch  9 | Train Loss: 271.3127 | Train Acc: 0.5000 | Test Acc: 0.5000
Epoch 10 | Train Loss: 271.2302 | Train Acc: 0.5000 | Test Acc: 0.5000
