## Step 1: 데이터셋 준비 및 Data Loader 설정

In [5]:
!pip install datasets



In [6]:
# 데이터셋 로드 및 토크나이저 설정
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from torch.nn.utils.rnn import pad_sequence

# 데이터셋 로드
ds = load_dataset("stanfordnlp/imdb")

# BERT Tokenizer 로드
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# collate_fn 함수 수정
def collate_fn(batch):
    max_len = 400
    texts, labels = [], []

    for row in batch:
        tokens = tokenizer(row['text'], padding=True, truncation=True, max_length=max_len, return_tensors='pt')['input_ids'].squeeze(0)
        # 마지막 두 토큰 제외한 입력으로 설정
        inputs = tokens[:-2]
        # 마지막에서 두 번째 토큰을 레이블로 설정
        label = tokens[-2]
        texts.append(inputs)
        labels.append(label)

    texts = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = torch.tensor(labels)

    return texts, labels

train_loader = DataLoader(ds['train'], batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(ds['test'], batch_size=64, shuffle=False, collate_fn=collate_fn)


## Step 2: Transformer 구성 요소 구현

In [7]:
import torch.nn as nn
import math

# Multi-head Attention 클래스 정의
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads  # D = H * D'

        # Q, K, V의 Linear layer
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        # 최종적으로 결합한 결과를 위한 output layer
        self.wo = nn.Linear(d_model, d_model)

        # Softmax 함수
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        batch_size = x.size(0)

        # 1. Q, K, V 계산 (batch_size, seq_len, d_model)
        q = self.wq(x)
        k = self.wk(x)
        v = self.wv(x)

        # 2. H개의 head로 나누기 (batch_size, seq_len, d_model) -> (batch_size, H, seq_len, D')
        q = q.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        k = k.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        v = v.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

        # 3. Scaled dot-product attention 계산
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)

        # 4. Mask 적용
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # 5. Attention 계산 후 V와 곱하기
        attention = self.softmax(scores)
        context = torch.matmul(attention, v)

        # 6. H개의 head 결합하기
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        # 7. Output layer 통과
        output = self.wo(context)

        return output

# Positional encoding 정의
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, None], np.arange(d_model)[None, :], d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[None, ...]

    return torch.FloatTensor(pos_encoding)

# Transformer Layer 구현
class TransformerLayer(nn.Module):
    def __init__(self, d_model, n_heads, dff, dropout_rate=0.1):
        super(TransformerLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, n_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        # MHA -> Dropout -> Residual -> Layer Norm
        attn_output = self.mha(x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.norm1(attn_output + x)

        # Feed Forward -> Dropout -> Residual -> Layer Norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.norm2(ffn_output + out1)

        return out2


## Step 3: TextClassifier 및 학습 설정

In [9]:
import numpy as np

# TextClassifier 정의: 마지막 토큰을 예측하는 모델
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, dff, max_len, dropout_rate=0.1):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(positional_encoding(max_len, d_model), requires_grad=False)
        self.layers = nn.ModuleList([TransformerLayer(d_model, n_heads, dff, dropout_rate) for _ in range(n_layers)])
        self.classifier = nn.Linear(d_model, vocab_size)  # 마지막 토큰의 ID를 예측하기 위해 vocab_size로 조정

    def forward(self, x, mask=None):
        seq_len = x.shape[1]
        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        x = x + self.pos_encoding[:, :seq_len]

        for layer in self.layers:
            x = layer(x, mask)

        x = x[:, -1]  # 마지막 토큰에 대한 예측
        logits = self.classifier(x)
        return logits

# Optimizer 및 손실 함수 설정
import torch.optim as optim

model = TextClassifier(len(tokenizer.vocab), 128, 5, 4, 512, max_len=400, dropout_rate=0.1).to('cuda')
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# 학습 루프
def train_model(model, train_loader, test_loader, n_epochs):
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for data in train_loader:
            inputs, labels = data
            inputs, labels = inputs.to('cuda'), labels.to('cuda')

            optimizer.zero_grad()
            preds = model(inputs)
            loss = loss_fn(preds, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

        model.eval()
        train_acc = calculate_accuracy(model, train_loader)
        test_acc = calculate_accuracy(model, test_loader)
        print(f"Train Accuracy: {train_acc:.4f} | Test Accuracy: {test_acc:.4f}")

# 정확도 계산 함수
def calculate_accuracy(model, dataloader):
    correct = 0
    total = 0
    with torch.no_grad():
        for data in dataloader:
            inputs, labels = data
            inputs, labels = inputs.to('cuda'), labels.to('cuda')
            preds = model(inputs)
            predicted = torch.argmax(preds, dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return correct / total

# 모델 학습 실행
train_model(model, train_loader, test_loader, n_epochs=50)

Epoch 1 | Loss: 1228.4641
Train Accuracy: 0.5742 | Test Accuracy: 0.5767
Epoch 2 | Loss: 1002.9105
Train Accuracy: 0.5775 | Test Accuracy: 0.5796
Epoch 3 | Loss: 963.6414
Train Accuracy: 0.5814 | Test Accuracy: 0.5813
Epoch 4 | Loss: 931.7001
Train Accuracy: 0.5886 | Test Accuracy: 0.5833
Epoch 5 | Loss: 896.8537
Train Accuracy: 0.5963 | Test Accuracy: 0.5836
Epoch 6 | Loss: 837.9196
Train Accuracy: 0.6169 | Test Accuracy: 0.5833
Epoch 7 | Loss: 779.2275
Train Accuracy: 0.6356 | Test Accuracy: 0.5821
Epoch 8 | Loss: 726.8249
Train Accuracy: 0.6557 | Test Accuracy: 0.5840
Epoch 9 | Loss: 688.7736
Train Accuracy: 0.6629 | Test Accuracy: 0.5863
Epoch 10 | Loss: 657.6734
Train Accuracy: 0.6648 | Test Accuracy: 0.5825
Epoch 11 | Loss: 631.7366
Train Accuracy: 0.6744 | Test Accuracy: 0.5835
Epoch 12 | Loss: 609.4695
Train Accuracy: 0.6794 | Test Accuracy: 0.5828
Epoch 13 | Loss: 591.0056
Train Accuracy: 0.6931 | Test Accuracy: 0.5856
Epoch 14 | Loss: 568.5691
Train Accuracy: 0.7019 | Test Ac

In [10]:
import random

def test_predictions(model, test_loader, tokenizer, num_samples=10):
    model.eval()
    samples = random.sample(list(test_loader), num_samples)

    for i, (inputs, labels) in enumerate(samples):
        inputs, labels = inputs.to('cuda'), labels.to('cuda')
        preds = model(inputs)
        predicted_tokens = torch.argmax(preds, dim=1).cpu().numpy()

        print(f"Sample {i+1}:")
        print(f"Input Tokens: {tokenizer.decode(inputs[0].cpu().numpy())}")
        print(f"Predicted Token: {tokenizer.decode([predicted_tokens[0]])}")
        print(f"Actual Token: {tokenizer.decode([labels[0].cpu().numpy()])}")
        print("-----")

# 임의의 10개 문장으로 예측 결과 확인
test_predictions(model, test_loader, tokenizer, num_samples=10)


Sample 1:
Input Tokens: [CLS] people with an aversion to gore may find some scenes hard going, but the thing is far from being simply a horror classic. the fact that the extraordinary special effects stand up against most modern day cgi is only a small part of why this movie is, finally, rightfully regarded as a masterpiece. technically brilliant in its camera - work and editing, superbly scripted and acted, one of the best openings, one of the best endings, tension and paranoia sustained throughout ( with countless viewings ), an excellent soundtrack, and open to multiple readings and analogy, there simply aren't enough superlatives to do this film justice. absolutely essential viewing [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

#### Train에서는 지속적으로 정확도가 좋아지는 모습을 보이지만, Test에서는 과적합(Overfitting) 문제 때문인지 일정 구간부터 정확도가 더 이상 오르지 않고 오히려 내려가는 모습까지 보입니다.