In [1]:
import torch
print(f"PyTorch 버전: {torch.__version__}")
print(f"CUDA 사용 가능 여부: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU 이름: {torch.cuda.get_device_name(0)}")

PyTorch 버전: 2.5.1+cu121
CUDA 사용 가능 여부: True
GPU 이름: NVIDIA GeForce RTX 4060 Ti


### 임베딩 레이어
- 토큰 임베딩 + 위치 임베딩

In [2]:
# 위치 인코딩
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # 짝수에는 sin,홀수 차원에는 cos
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # 배치 차원을 맨 앞에 추가
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]
        
# 토큰 임베딩 = 파이토치 라이브러리 내부 모듈 (nn.Embedding)

class InputEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embed = PositionalEncoding(d_model, max_len)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.token_embedding(x)
        x = self.pos_embed(x)
        return self.dropout(x)

### 인코더

In [7]:
# 상수로 전달해서 인코더 레이어 생성
encoder_layer = nn.TransformerEncoderLayer(
    d_model=512,
    nhead=8,
    dim_feedforward=2048,
    dropout=0.1,
    batch_first=True
)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)

### 분류기

In [4]:
classifier = nn.Linear(512,2) # num_classes = 2

In [8]:
# Encoder only Model

class EncoderOnlyModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, num_classes, max_len=5000, dropout=0.1):
        super().__init__()
        self.embedding = InputEmbedding(vocab_size, d_model, max_len, dropout)
        # 인코더 
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.encoder(x)
        cls_output = x[:, 0, :] # (batch_size, d_model)
        x = self.classifier(cls_output)
        return x

### 데이터셋 로드

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Hugging Face의 AutoTokenizer를 사용합니다.
from datasets import load_dataset
from transformers import AutoTokenizer

# BERT 모델이 사용했던 토크나이저를 로드합니다.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# 데이터셋 로드
dataset = load_dataset("stanfordnlp/imdb")

# 이제 직접 사전을 만들 필요 없이, 로드한 토크나이저를 바로 사용합니다.
def text_pipeline(text):
    return tokenizer.encode(text, truncation=True, max_length=512)

class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        processed_text = torch.tensor(text_pipeline(item['text']), dtype=torch.int64)
        label = torch.tensor(item['label'], dtype=torch.int64)
        return processed_text, label

def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        text_list.append(_text)
    
    labels = torch.tensor(label_list, dtype=torch.int64)
    # 패딩을 할 때, 토크나이저에 정의된 pad 토큰의 ID를 사용합니다.
    texts = pad_sequence(text_list, batch_first=True, padding_value=tokenizer.pad_token_id)
    return texts, labels

train_dataset = MyDataset(dataset['train'])
test_dataset = MyDataset(dataset['test'])

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_batch) # 배치 사이즈 조정
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_batch)

### Encoder only model 학습

In [18]:
Vocab_size = tokenizer.vocab_size  # BERT 토크나이저의 실제 vocab_size 사용
d_model = 512
nhead = 4
num_layers = 2
dim_feedforward = 2048
num_classes = 2

encoder_only_model = EncoderOnlyModel(
    vocab_size=Vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
    dim_feedforward=dim_feedforward,
    num_classes=num_classes
)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(encoder_only_model.parameters(), lr=0.0001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder_only_model.to(device)

def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss, total_acc = 0, 0  # 변수 초기화 추가
    
    for i, (texts, labels) in enumerate(dataloader):
        texts, labels = texts.to(device), labels.to(device)
        
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += (predictions.argmax(1) == labels).sum().item()
        
    return total_loss / len(dataloader.dataset), total_acc / len(dataloader.dataset)

def evaluate(model, dataloader, criterion, device):  # 함수 시그니처 수정
    model.eval()
    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for texts, labels in dataloader:
            texts, labels = texts.to(device), labels.to(device)  # 변수명 통일
            predictions = model(texts)
            loss = criterion(predictions, labels)

            total_loss += loss.item()
            total_acc += (predictions.argmax(1) == labels).sum().item()

    return total_loss / len(dataloader.dataset), total_acc / len(dataloader.dataset)

In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    train_loss, train_acc = train(encoder_only_model, train_dataloader, optimizer, criterion, device)
    test_loss, test_acc = evaluate(encoder_only_model, test_dataloader, criterion, device)
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

Epoch 1/10
Train Loss: 0.0338, Train Acc: 0.7157
Test Loss: 0.0294, Test Acc: 0.7773
Epoch 2/10
Train Loss: 0.0278, Train Acc: 0.7907
Test Loss: 0.0270, Test Acc: 0.8042
Epoch 2/10
Train Loss: 0.0278, Train Acc: 0.7907
Test Loss: 0.0270, Test Acc: 0.8042
Epoch 3/10
Train Loss: 0.0250, Train Acc: 0.8194
Test Loss: 0.0251, Test Acc: 0.8186
Epoch 3/10
Train Loss: 0.0250, Train Acc: 0.8194
Test Loss: 0.0251, Test Acc: 0.8186
Epoch 4/10
Train Loss: 0.0232, Train Acc: 0.8392
Test Loss: 0.0253, Test Acc: 0.8228
Epoch 4/10
Train Loss: 0.0232, Train Acc: 0.8392
Test Loss: 0.0253, Test Acc: 0.8228
Epoch 5/10
Train Loss: 0.0215, Train Acc: 0.8536
Test Loss: 0.0235, Test Acc: 0.8328
Epoch 5/10
Train Loss: 0.0215, Train Acc: 0.8536
Test Loss: 0.0235, Test Acc: 0.8328
Epoch 6/10
Train Loss: 0.0197, Train Acc: 0.8671
Test Loss: 0.0256, Test Acc: 0.8343
Epoch 6/10
Train Loss: 0.0197, Train Acc: 0.8671
Test Loss: 0.0256, Test Acc: 0.8343
Epoch 7/10
Train Loss: 0.0183, Train Acc: 0.8784
Test Loss: 0.024