<a href="https://colab.research.google.com/github/haegomm/ai_practice/blob/master/%EC%A3%BC%EC%96%B4%EC%A7%84_%EB%AC%B8%EC%9E%A5%EC%97%90%EC%84%9C_%EB%82%98%EC%98%AC_%EB%8B%A4%EC%9D%8C_%EB%8B%A8%EC%96%B4_%EC%98%88%EC%B8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer 실습

이번 실습에서는 감정 분석 task에 RNN 대신 Transformer를 구현하여 적용해 볼 것입니다.
Library import나 dataloader 생성은 RNN 실습 때와 똑같기 때문에 설명은 넘어가도록 하겠습니다.

In [1]:
!pip install datasets sacremoses

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[

##[MY CODE] Last word prediction dataset 준비

In [2]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from torch.nn.utils.rnn import pad_sequence

# 데이터 셋 및 토크나이저 준비
ds = load_dataset("stanfordnlp/imdb")
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')

def collate_fn(batch):
  max_len = 400
  texts, labels = [], []
  for row in batch:
    labels.append(tokenizer(row['text'], truncation=True, max_length=max_len).input_ids[-2]) # 마지막에서 두 번째 토큰을 라벨로 사용(불필요한 토큰([SEP])은 제외)
    texts.append(torch.LongTensor(tokenizer(row['text'], truncation=True, max_length=max_len).input_ids[:-2])) # 마지막 두 개 토큰 제외

  texts = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
  labels = torch.LongTensor(labels)

  return texts, labels


train_loader = DataLoader(
    ds['train'], batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    ds['test'], batch_size=64, shuffle=False, collate_fn=collate_fn
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

##Self-attention

In [3]:
from torch import nn
from math import sqrt


class SelfAttention(nn.Module):
  def __init__(self, input_dim, d_model):
    super().__init__()

    self.input_dim = input_dim
    self.d_model = d_model

    self.wq = nn.Linear(input_dim, d_model)
    self.wk = nn.Linear(input_dim, d_model)
    self.wv = nn.Linear(input_dim, d_model)
    self.dense = nn.Linear(d_model, d_model)

    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x, mask):
    q, k, v = self.wq(x), self.wk(x), self.wv(x)
    score = torch.matmul(q, k.transpose(-1, -2)) # (B, S, D) * (B, D, S) = (B, S, S)
    score = score / sqrt(self.d_model)

    if mask is not None:
      score = score + (mask * -1e9)

    score = self.softmax(score)
    result = torch.matmul(score, v)
    result = self.dense(result)

    return result

In [4]:
class TransformerLayer(nn.Module):
  def __init__(self, input_dim, d_model, dff):
    super().__init__()

    self.input_dim = input_dim
    self.d_model = d_model
    self.dff = dff

    self.sa = SelfAttention(input_dim, d_model)
    self.ffn = nn.Sequential(
      nn.Linear(d_model, dff),
      nn.ReLU(),
      nn.Linear(dff, d_model)
    )

  def forward(self, x, mask):
    x = self.sa(x, mask)
    x = self.ffn(x)

    return x

In [5]:
import numpy as np

# Positional Encoding
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, None], np.arange(d_model)[None, :], d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[None, ...]

    return torch.FloatTensor(pos_encoding)


max_len = 400
print(positional_encoding(max_len, 256).shape)

torch.Size([1, 400, 256])


##[MY CODE] TextClassifier 모델 수정 (출력 차원을 vocab_size로 변경)

In [6]:
class TextClassifier(nn.Module):
  def __init__(self, vocab_size, d_model, n_layers, dff):
    super().__init__()

    self.vocab_size = vocab_size
    self.d_model = d_model
    self.n_layers = n_layers
    self.dff = dff

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = nn.parameter.Parameter(positional_encoding(max_len, d_model), requires_grad=False)
    self.layers = nn.ModuleList([TransformerLayer(d_model, d_model, dff) for _ in range(n_layers)])
    self.dropout = nn.Dropout(0.3)  # 드롭아웃 추가
    self.classification = nn.Linear(d_model, vocab_size)  # 출력 차원을 vocab_size로 수정

  def forward(self, x):
        mask = (x == tokenizer.pad_token_id)[:, None, :]
        seq_len = x.shape[1]
        x = self.embedding(x) * sqrt(self.embedding.embedding_dim)
        x = x + self.pos_encoding[:, :seq_len]

        for layer in self.layers:
            x = layer(x, mask)

        x = torch.mean(x, dim=1)  # 문장 평균 풀링
        x = self.dropout(x)  # 드롭아웃 적용
        x = self.classification(x)  # [batch_size, vocab_size]

        return x


model = TextClassifier(len(tokenizer), 32, 2, 32)

##[MY CODE] Optimizer 및 Loss 설정

In [7]:
from torch.optim import Adam

lr = 1e-4
model = model.to('cuda')
criterion = nn.CrossEntropyLoss()  # 다중 클래스 분류를 위한 CrossEntropyLoss

optimizer = Adam(model.parameters(), lr=lr)

In [8]:
# 가중치 초기화 함수
def init_weights(module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)
model.apply(init_weights)

TextClassifier(
  (embedding): Embedding(30522, 32)
  (layers): ModuleList(
    (0-1): 2 x TransformerLayer(
      (sa): SelfAttention(
        (wq): Linear(in_features=32, out_features=32, bias=True)
        (wk): Linear(in_features=32, out_features=32, bias=True)
        (wv): Linear(in_features=32, out_features=32, bias=True)
        (dense): Linear(in_features=32, out_features=32, bias=True)
        (softmax): Softmax(dim=-1)
      )
      (ffn): Sequential(
        (0): Linear(in_features=32, out_features=32, bias=True)
        (1): ReLU()
        (2): Linear(in_features=32, out_features=32, bias=True)
      )
    )
  )
  (dropout): Dropout(p=0.3, inplace=False)
  (classification): Linear(in_features=32, out_features=30522, bias=True)
)

In [9]:
import numpy as np
import matplotlib.pyplot as plt


def accuracy(model, dataloader):
    correct = 0
    total = 0

    with torch.no_grad():
        for data in dataloader:
            inputs, labels = data
            inputs, labels = inputs.to('cuda'), labels.to('cuda')

            outputs = model(inputs)
            predictions = torch.argmax(outputs, dim=-1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    return correct / total

In [10]:
n_epochs = 50

for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    for data in train_loader:
        inputs, labels = data
        inputs, labels = inputs.to('cuda'), labels.to('cuda')
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
    print(f"Epoch {epoch+1:3d} | Train Loss: {total_loss:.4f} | Train Acc: {train_acc:.3f} | Test Acc: {test_acc:.3f}")

Epoch   1 | Train Loss: 1859.9452 | Train Acc: 0.558 | Test Acc: 0.564
Epoch   2 | Train Loss: 1214.2523 | Train Acc: 0.558 | Test Acc: 0.564
Epoch   3 | Train Loss: 1167.5249 | Train Acc: 0.558 | Test Acc: 0.564
Epoch   4 | Train Loss: 1143.7276 | Train Acc: 0.558 | Test Acc: 0.564
Epoch   5 | Train Loss: 1127.4509 | Train Acc: 0.558 | Test Acc: 0.563
Epoch   6 | Train Loss: 1114.4632 | Train Acc: 0.558 | Test Acc: 0.563
Epoch   7 | Train Loss: 1106.2906 | Train Acc: 0.558 | Test Acc: 0.563
Epoch   8 | Train Loss: 1100.3815 | Train Acc: 0.558 | Test Acc: 0.563
Epoch   9 | Train Loss: 1093.3520 | Train Acc: 0.558 | Test Acc: 0.563
Epoch  10 | Train Loss: 1088.7144 | Train Acc: 0.558 | Test Acc: 0.563
Epoch  11 | Train Loss: 1082.5994 | Train Acc: 0.558 | Test Acc: 0.563
Epoch  12 | Train Loss: 1078.4666 | Train Acc: 0.558 | Test Acc: 0.563
Epoch  13 | Train Loss: 1074.2391 | Train Acc: 0.558 | Test Acc: 0.563
Epoch  14 | Train Loss: 1068.7040 | Train Acc: 0.558 | Test Acc: 0.563
Epoch 

##[FEEDBACK] 드랍아웃 적용해보았는데 test정확도가 계속 떨어집니다. 학습이 잘 되지 않은 걸까요? test 정확도를 높이기 위해 어떤 해결방법을 적용할 수 있는지 궁금합니다.