<a href="https://colab.research.google.com/github/hanghae-plus-AI/AI-1-ssungz789/blob/main/w2/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install sacremoses

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [2]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


ds = load_dataset("stanfordnlp/imdb")
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')


def collate_fn(batch):
  max_len = 400
  texts, labels = [], []
  for row in batch:
    labels.append(row['label'])
    texts.append(row['text'])

  texts = torch.LongTensor(tokenizer(texts, padding=True, truncation=True, max_length=max_len).input_ids)
  labels = torch.LongTensor(labels)

  return texts, labels


train_loader = DataLoader(
    ds['train'], batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    ds['test'], batch_size=64, shuffle=False, collate_fn=collate_fn
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



## Self-attention

이번에는 self-attention을 구현해보겠습니다.
Self-attention은 shape이 (B, S, D)인 embedding이 들어왔을 때 attention을 적용하여 새로운 representation을 만들어내는 module입니다.
여기서 B는 batch size, S는 sequence length, D는 embedding 차원입니다.
구현은 다음과 같습니다.

In [3]:
from torch import nn
from math import sqrt


class MultiHeadAttention(nn.Module):
  def __init__(self, input_dim, d_model, n_heads):
    super().__init__()

    # d_model : 전체 모델의 차원
    # n_heads : Head의 갯수
    # head_dim : head에서 접근할 차원의 수

    self.input_dim = input_dim
    self.d_model = d_model
    self.n_heads = n_heads
    self.head_dim = d_model // n_heads

    self.wq = nn.Linear(input_dim, d_model)
    self.wk = nn.Linear(input_dim, d_model)
    self.wv = nn.Linear(input_dim, d_model)
    self.dense = nn.Linear(d_model, d_model)

    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x, mask):
    batch_size, seq_len, _ = x.size()

    # Q, K, V 생성 및 reshape
    q = self.wq(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1,2)
    k = self.wk(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1,2)
    v = self.wv(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1,2)

    score = torch.matmul(q, k.transpose(-1, -2)) # (B, S, D) * (B, D, S) = (B, S, S)
    score = score / sqrt(self.head_dim)

    if mask is not None:
      mask = mask.unsqueeze(1)
      score = score + (mask * -1e9)

    # softmax를 사용해 점수의 합을 1로 맞춤
    score = self.softmax(score)
    # 각 헤드의 value에 점수를 곱해 결과값을 구함
    result = torch.matmul(score, v)
    # n_heads * head_dim로 d_models 차원으로 다시 되돌림 FNN 처리를 위해 reshape
    result = result.transpose(1,2).contiguous().view(batch_size, seq_len, self.d_model)
    # 최종 변환
    result = self.dense(result)

    return result

In [4]:
class TransformerLayer(nn.Module):
  def __init__(self, input_dim, d_model, n_heads, dff, dropout=0.1):
    super().__init__()

    self.input_dim = input_dim
    self.d_model = d_model
    self.n_heads = n_heads
    self.dff = dff

    # Multi Head Attention 적용
    self.mha = MultiHeadAttention(input_dim, d_model, n_heads)

    # 확장 및 압축
    self.ffn = nn.Sequential(
      nn.Linear(d_model, dff),
      nn.ReLU(),
      nn.Linear(dff, d_model)
    )

    self.layer_norm1 = nn.LayerNorm(d_model)
    self.layer_norm2 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, mask):
    x1 = self.mha(x, mask)
    x1 = self.dropout(x1)
    x = self.layer_norm1(x1 + x)

    x2 = self.ffn(x)
    x2 = self.dropout(x2)
    x = self.layer_norm2(x2 + x)

    return x

In [5]:
import numpy as np


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, None], np.arange(d_model)[None, :], d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[None, ...]

    return torch.FloatTensor(pos_encoding)


max_len = 400
print(positional_encoding(max_len, 256).shape)

torch.Size([1, 400, 256])


In [6]:
class TextClassifier(nn.Module):
  def __init__(self, vocab_size, d_model, n_layers, dff, n_heads):
    super().__init__()

    self.vocab_size = vocab_size
    self.d_model = d_model
    self.n_layers = n_layers
    self.dff = dff

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = nn.parameter.Parameter(positional_encoding(max_len, d_model), requires_grad=False)
    self.layers = nn.ModuleList([TransformerLayer(d_model, d_model, dff, n_heads) for _ in range(n_layers)])
    self.classification = nn.Linear(d_model, 1)

  def forward(self, x):
    mask = (x == tokenizer.pad_token_id)
    mask = mask[:, None, :]
    seq_len = x.shape[1]

    x = self.embedding(x)
    x = x * sqrt(self.d_model)
    x = x + self.pos_encoding[:, :seq_len]

    for layer in self.layers:
      x = layer(x, mask)

    x = x[:, 0]
    x = self.classification(x)

    return x


model = TextClassifier(len(tokenizer), 32, 5, 32, 4)

## 학습

학습하는 코드는 기존 실습들과 동일하기 때문에 마지막 결과만 살펴보도록 하겠습니다.

In [7]:
from torch.optim import Adam

lr = 0.001
model = model.to('cuda')
loss_fn = nn.BCEWithLogitsLoss()

optimizer = Adam(model.parameters(), lr=lr)

In [8]:
import numpy as np
import matplotlib.pyplot as plt


def accuracy(model, dataloader):
  cnt = 0
  acc = 0

  for data in dataloader:
    inputs, labels = data
    inputs, labels = inputs.to('cuda'), labels.to('cuda')

    preds = model(inputs)
    # preds = torch.argmax(preds, dim=-1)
    preds = (preds > 0).long()[..., 0]

    cnt += labels.shape[0]
    acc += (labels == preds).sum().item()

  return acc / cnt

In [9]:
from tqdm import tqdm
n_epochs = 50

for epoch in range(n_epochs):
  total_loss = 0.
  model.train()
  progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{n_epochs}")

  for data in progress_bar:
    model.zero_grad()
    inputs, labels = data
    inputs, labels = inputs.to('cuda'), labels.to('cuda').float()

    preds = model(inputs)[..., 0]
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

  with torch.no_grad():
    model.eval()
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

Epoch 1/50: 100%|██████████| 391/391 [00:49<00:00,  7.96it/s, loss=0.5247]


Epoch   0 | Train Loss: 208.95720621943474


Epoch 2/50: 100%|██████████| 391/391 [00:47<00:00,  8.25it/s, loss=0.2579]


Epoch   1 | Train Loss: 137.21809431910515


Epoch 3/50: 100%|██████████| 391/391 [00:47<00:00,  8.26it/s, loss=0.6690]


Epoch   2 | Train Loss: 107.64877858012915


Epoch 4/50: 100%|██████████| 391/391 [00:47<00:00,  8.25it/s, loss=0.2351]


Epoch   3 | Train Loss: 85.33837175369263


Epoch 5/50: 100%|██████████| 391/391 [00:47<00:00,  8.25it/s, loss=0.1740]


Epoch   4 | Train Loss: 66.97415829077363


Epoch 6/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.1221]


Epoch   5 | Train Loss: 51.49184278585017


Epoch 7/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.1753]


Epoch   6 | Train Loss: 40.120860105380416


Epoch 8/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0897]


Epoch   7 | Train Loss: 32.79415053408593


Epoch 9/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.1228]


Epoch   8 | Train Loss: 24.787614832166582


Epoch 10/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0726]


Epoch   9 | Train Loss: 24.296151806600392


Epoch 11/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0315]


Epoch  10 | Train Loss: 21.072692821267992


Epoch 12/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0188]


Epoch  11 | Train Loss: 19.524569215020165


Epoch 13/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0139]


Epoch  12 | Train Loss: 16.177500697551295


Epoch 14/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0133]


Epoch  13 | Train Loss: 15.46159195760265


Epoch 15/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.1528]


Epoch  14 | Train Loss: 12.916283504106104


Epoch 16/50: 100%|██████████| 391/391 [00:47<00:00,  8.22it/s, loss=0.0090]


Epoch  15 | Train Loss: 13.840005638077855


Epoch 17/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0079]


Epoch  16 | Train Loss: 13.997127682669088


Epoch 18/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0153]


Epoch  17 | Train Loss: 11.135119722690433


Epoch 19/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0638]


Epoch  18 | Train Loss: 12.531750052818097


Epoch 20/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0527]


Epoch  19 | Train Loss: 10.707856510998681


Epoch 21/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.1309]


Epoch  20 | Train Loss: 10.477567336056381


Epoch 22/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0021]


Epoch  21 | Train Loss: 11.223405818454921


Epoch 23/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0035]


Epoch  22 | Train Loss: 9.644502219511196


Epoch 24/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0825]


Epoch  23 | Train Loss: 9.641610530321486


Epoch 25/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.1245]


Epoch  24 | Train Loss: 9.877907198970206


Epoch 26/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0066]


Epoch  25 | Train Loss: 8.14481005212292


Epoch 27/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0028]


Epoch  26 | Train Loss: 9.853633951861411


Epoch 28/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0477]


Epoch  27 | Train Loss: 8.459131102310494


Epoch 29/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0033]


Epoch  28 | Train Loss: 8.145062212599441


Epoch 30/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0334]


Epoch  29 | Train Loss: 8.429485631233547


Epoch 31/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0034]


Epoch  30 | Train Loss: 8.333585560438223


Epoch 32/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0023]


Epoch  31 | Train Loss: 8.196931593352929


Epoch 33/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0146]


Epoch  32 | Train Loss: 7.2951610937016085


Epoch 34/50: 100%|██████████| 391/391 [00:47<00:00,  8.22it/s, loss=0.0387]


Epoch  33 | Train Loss: 7.191075348760933


Epoch 35/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0073]


Epoch  34 | Train Loss: 7.2967087181750685


Epoch 36/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0079]


Epoch  35 | Train Loss: 7.379126816696953


Epoch 37/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.1086]


Epoch  36 | Train Loss: 6.481560533226002


Epoch 38/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0234]


Epoch  37 | Train Loss: 6.806011596519966


Epoch 39/50: 100%|██████████| 391/391 [00:47<00:00,  8.22it/s, loss=0.0724]


Epoch  38 | Train Loss: 6.821018839953467


Epoch 40/50: 100%|██████████| 391/391 [00:47<00:00,  8.22it/s, loss=0.0045]


Epoch  39 | Train Loss: 6.546074671554379


Epoch 41/50: 100%|██████████| 391/391 [00:47<00:00,  8.24it/s, loss=0.0040]


Epoch  40 | Train Loss: 5.916746824310394


Epoch 42/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0154]


Epoch  41 | Train Loss: 5.181086948228767


Epoch 43/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0939]


Epoch  42 | Train Loss: 5.308306066275691


Epoch 44/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0060]


Epoch  43 | Train Loss: 6.196098576619988


Epoch 45/50: 100%|██████████| 391/391 [00:47<00:00,  8.22it/s, loss=0.0073]


Epoch  44 | Train Loss: 4.248166912497254


Epoch 46/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0005]


Epoch  45 | Train Loss: 4.893800156722136


Epoch 47/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0022]


Epoch  46 | Train Loss: 4.294915258433321


Epoch 48/50: 100%|██████████| 391/391 [00:47<00:00,  8.22it/s, loss=0.0038]


Epoch  47 | Train Loss: 5.752033362463408


Epoch 49/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0058]


Epoch  48 | Train Loss: 4.627261932888359


Epoch 50/50: 100%|██████████| 391/391 [00:47<00:00,  8.23it/s, loss=0.0161]


Epoch  49 | Train Loss: 5.149438716907753
