## Step 1: 기본 준비

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [2]:
# 데이터셋 로드
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast

# Load the dataset published on the Hugging Face Hub as "stanfordnlp/imdb".
ds = load_dataset("stanfordnlp/imdb")
# Switched to BertTokenizerFast.from_pretrained to follow the current library API and for better performance
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Batch collation used by the data loaders
def collate_fn(batch, max_len=400):
    """Convert a list of dataset rows into ``(input_ids, labels)`` tensors.

    Args:
        batch: Sequence of rows, each indexable by ``'text'`` and ``'label'``.
        max_len: Maximum number of tokens requested from the tokenizer
            (``max_length``) with truncation enabled. Defaults to 400, the
            value that was previously hard-coded.

    Returns:
        A tuple ``(texts, labels)`` where ``texts`` is the ``'input_ids'``
        entry of the tokenizer output (requested with ``padding=True`` and
        ``return_tensors='pt'``) and ``labels`` is an int64 tensor holding
        one label per row.

    Note:
        Only ``'input_ids'`` is returned; any other field the tokenizer
        produces is discarded, so a caller that needs a padding mask has to
        rebuild it from the token ids.
    """
    texts = [row['text'] for row in batch]
    labels = [row['label'] for row in batch]

    # Uses the module-level `tokenizer`.
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    return encoded['input_ids'], torch.tensor(labels, dtype=torch.long)

# Both loaders draw batches of 64 rows and collate them with collate_fn.
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(ds['train'], batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(ds['test'], batch_size=64, shuffle=False, collate_fn=collate_fn)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



## Step 2: Multi-head Attention 구현

In [3]:
import torch.nn as nn
import math

# Multi-head self-attention module
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input ``x``.

    Args:
        d_model: Size of the last dimension of the input and output.
        n_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.

    Raises:
        ValueError: If ``n_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        # Fail early: an indivisible d_model would otherwise only surface as
        # an opaque reshape error inside forward().
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads  # per-head width: D = H * D'

        # Linear projections for Q, K and V
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        # Output projection applied to the concatenated heads
        self.wo = nn.Linear(d_model, d_model)

        # Softmax over the key dimension
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch_size, n_heads, seq_len, seq_len)``. Positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size = x.size(0)

        # 1. Project to Q, K, V: (batch_size, seq_len, d_model)
        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        # 2. Split into H heads: (batch_size, seq_len, d_model) -> (batch_size, H, seq_len, D')
        q = q.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        k = k.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        v = v.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

        # 3. Scaled dot-product scores: (batch_size, H, seq_len, seq_len)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)

        # 4. Apply the mask. The fill value is the most negative finite number
        #    of the scores dtype, so it stays representable in reduced
        #    precision (a literal -1e9 does not fit in float16).
        if mask is not None:
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # 5. Attention weights, then weighted sum of V: (batch_size, H, seq_len, D')
        attention = self.softmax(scores)
        context = torch.matmul(attention, v)

        # 6. Merge the heads back: (batch_size, seq_len, d_model)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        # 7. Final output projection
        output = self.wo(context)

        return output


## Step 3: Layer Normalization, Dropout, Residual Connection 구현

In [4]:
# Transformer encoder layer
class TransformerLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward network.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection), and the sum is layer-normalised.

    Args:
        d_model: Feature size of the input and output.
        n_heads: Number of attention heads passed to ``MultiHeadAttention``.
        dff: Hidden size of the feed-forward network.
        dropout_rate: Dropout probability used after both sub-layers.
    """

    def __init__(self, d_model, n_heads, dff, dropout_rate=0.1):
        super(TransformerLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, n_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x, mask=None):
        """Run the layer on ``x``.

        Args:
            x: Input tensor whose last dimension is ``d_model``.
            mask: Optional attention mask forwarded unchanged to the
                attention sub-layer. Defaults to ``None`` (no masking),
                matching the attention module's own default.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # 1. Attention -> dropout -> residual add -> layer norm
        attn_output = self.mha(x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.norm1(attn_output + x)

        # 2. Feed-forward -> dropout -> residual add -> layer norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.norm2(ffn_output + out1)

        return out2


## Step 4: Text Classifier 정의

In [5]:
# Positional encoding 구현
import numpy as np

def get_angles(pos, i, d_model):
    """Return the sinusoid arguments ``pos / 10000 ** (2 * (i // 2) / d_model)``.

    ``pos`` and ``i`` are combined with ordinary NumPy broadcasting, so the
    result has their broadcast shape.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    inverse_wavelength = 1 / np.power(10000, exponent)
    return pos * inverse_wavelength

def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Even feature columns hold the sine and odd columns the cosine of the
    angles produced by ``get_angles``.

    Returns:
        A float tensor of shape ``(1, position, d_model)``.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    feature_indices = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, feature_indices, d_model)

    # Overwrite the angles in place: sine on even columns, cosine on odd ones.
    even_cols = slice(0, None, 2)
    odd_cols = slice(1, None, 2)
    table[:, even_cols] = np.sin(table[:, even_cols])
    table[:, odd_cols] = np.cos(table[:, odd_cols])

    # Leading singleton axis added before converting to a tensor.
    return torch.FloatTensor(table[np.newaxis, ...])

# Transformer-encoder text classifier
class TextClassifier(nn.Module):
    """Token embedding + positional encoding + encoder stack + linear head.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding / hidden size.
        n_layers: Number of ``TransformerLayer`` blocks.
        n_heads: Attention heads per layer.
        dff: Feed-forward hidden size per layer.
        max_len: Number of positions in the positional-encoding table; inputs
            may not be longer than this.
        dropout_rate: Dropout probability forwarded to every layer.
        pad_token_id: Optional id of the padding token. When given and
            ``forward`` receives no explicit mask, a key-padding mask is
            derived from the input ids so padded positions are not attended
            to. With the default ``None`` no mask is derived, i.e. padded
            positions take part in attention unless the caller passes a mask.
    """

    def __init__(self, vocab_size, d_model, n_layers, n_heads, dff, max_len, dropout_rate=0.1,
                 pad_token_id=None):
        super(TextClassifier, self).__init__()
        self.pad_token_id = pad_token_id
        self.embedding = nn.Embedding(vocab_size, d_model)
        # The table is fixed, so it is registered as a buffer rather than a
        # frozen parameter: it still follows .to()/.cuda() and is stored in
        # state_dict under the same key, but is no longer listed by
        # parameters().
        self.register_buffer('pos_encoding', positional_encoding(max_len, d_model))
        self.layers = nn.ModuleList([TransformerLayer(d_model, n_heads, dff, dropout_rate) for _ in range(n_layers)])
        self.classifier = nn.Linear(d_model, 1)

    def forward(self, x, mask=None):
        """Compute one logit per sequence.

        Args:
            x: Integer token ids of shape ``(batch_size, seq_len)``.
            mask: Optional attention mask passed to every layer. If omitted
                and ``pad_token_id`` was configured, a boolean mask of shape
                ``(batch_size, 1, 1, seq_len)`` that is ``False`` at padding
                positions is used instead.

        Returns:
            Tensor of shape ``(batch_size, 1)`` with raw (pre-sigmoid) logits.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional-encoding length.
        """
        seq_len = x.shape[1]
        table_len = self.pos_encoding.size(1)
        if seq_len > table_len:
            # Without this check the addition below fails with an opaque
            # broadcasting error (or silently broadcasts when the table has
            # length 1).
            raise ValueError(
                f"sequence length {seq_len} exceeds positional encoding length {table_len}"
            )

        if mask is None and self.pad_token_id is not None:
            mask = (x != self.pad_token_id)[:, None, None, :]

        # Scale embeddings by sqrt(d_model) before adding positional encodings.
        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        x = x + self.pos_encoding[:, :seq_len]

        for layer in self.layers:
            x = layer(x, mask)

        # Classify from the first position's representation
        # (presumably the [CLS] token when a BERT tokenizer produced the ids).
        x = x[:, 0]
        logits = self.classifier(x)
        return logits


## Step 5: 학습 설정 및 정확도 계산

In [6]:
# Optimizer 및 손실 함수 설정
import torch.optim as optim

# Model configuration: 5 encoder layers with 4 attention heads each
model = TextClassifier(
    vocab_size=len(tokenizer.vocab),
    d_model=128,
    n_layers=5,
    n_heads=4,
    dff=512,
    max_len=400,
    dropout_rate=0.1,
)
model = model.to('cuda')

# Adam on all model parameters; binary cross-entropy on the raw logits
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

# Training loop
def train_model(model, train_loader, test_loader, n_epochs):
    """Train ``model`` and print loss and accuracy after every epoch.

    Relies on the module-level ``optimizer`` and ``loss_fn`` and on
    ``calculate_accuracy``.

    Args:
        model: Module mapping a batch of inputs to logits whose trailing
            singleton dimension is squeezed before the loss is computed.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` evaluation batches.
        n_epochs: Number of passes over ``train_loader``.

    Side effects:
        Updates the model parameters, prints one loss line and one accuracy
        line per epoch, and leaves the model in eval mode after each
        epoch's evaluation.
    """
    # Send batches to wherever the model lives instead of assuming CUDA, so
    # the loop also works for a model kept on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            if device is not None:
                inputs, labels = inputs.to(device), labels.to(device)
            # The loss is computed against float targets.
            labels = labels.float()

            optimizer.zero_grad()
            preds = model(inputs)
            loss = loss_fn(preds.squeeze(-1), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # total_loss is the sum of per-batch losses, not their mean.
        print(f"Epoch {epoch} | Loss: {total_loss:.4f}")

        # Accuracy on both splits, measured in eval mode
        model.eval()
        train_acc = calculate_accuracy(model, train_loader)
        test_acc = calculate_accuracy(model, test_loader)
        print(f"Train Accuracy: {train_acc:.4f} | Test Accuracy: {test_acc:.4f}")

# Accuracy evaluation helper
def calculate_accuracy(model, dataloader):
    """Return the fraction of samples whose thresholded logit equals the label.

    A sample is predicted as class 1 when its logit is greater than 0 and as
    class 0 otherwise.

    The model is switched to eval mode for the duration of the call, so the
    result does not depend on the mode the caller left it in; the previous
    mode is restored afterwards. Gradients are disabled.

    Args:
        model: Module mapping a batch of inputs to logits with a trailing
            singleton dimension.
        dataloader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        ``correct / total`` as a float.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples.
    """
    # Send batches to wherever the model lives instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    was_training = model.training
    model.eval()

    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                if device is not None:
                    inputs, labels = inputs.to(device), labels.to(device)
                preds = model(inputs)
                predicted = (preds > 0).long().squeeze(-1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)
    finally:
        # Put the model back into whatever mode the caller had it in.
        model.train(was_training)
    return correct / total

# Run training for 50 epochs; per-epoch loss and accuracy are printed by train_model
train_model(model, train_loader, test_loader, n_epochs=50)


Epoch 0 | Loss: 266.8517
Train Accuracy: 0.7068 | Test Accuracy: 0.6994
Epoch 1 | Loss: 211.1117
Train Accuracy: 0.7672 | Test Accuracy: 0.7564
Epoch 2 | Loss: 182.2322
Train Accuracy: 0.7969 | Test Accuracy: 0.7816
Epoch 3 | Loss: 160.2649
Train Accuracy: 0.8441 | Test Accuracy: 0.8121
Epoch 4 | Loss: 146.7743
Train Accuracy: 0.8446 | Test Accuracy: 0.8072
Epoch 5 | Loss: 136.1413
Train Accuracy: 0.8444 | Test Accuracy: 0.8002
Epoch 6 | Loss: 127.2619
Train Accuracy: 0.8720 | Test Accuracy: 0.8262
Epoch 7 | Loss: 119.4705
Train Accuracy: 0.8772 | Test Accuracy: 0.8331
Epoch 8 | Loss: 117.0781
Train Accuracy: 0.9108 | Test Accuracy: 0.8496
Epoch 9 | Loss: 103.7663
Train Accuracy: 0.9218 | Test Accuracy: 0.8505
Epoch 10 | Loss: 99.3887
Train Accuracy: 0.9254 | Test Accuracy: 0.8447
Epoch 11 | Loss: 89.2755
Train Accuracy: 0.9300 | Test Accuracy: 0.8399
Epoch 12 | Loss: 85.8176
Train Accuracy: 0.9412 | Test Accuracy: 0.8526
Epoch 13 | Loss: 80.3079
Train Accuracy: 0.9309 | Test Accuracy: