<a href="https://colab.research.google.com/github/hanghae-plus-AI/AI-1-jhtwiz/blob/main/Chapter1-2_%EA%B3%BC%EC%A0%9CB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer 실습

이번 실습에서는 감정 분석 task에 RNN 대신 Transformer를 구현하여 적용해 볼 것입니다.
Library import나 dataloader 생성은 RNN 실습 때와 똑같기 때문에 설명은 넘어가도록 하겠습니다.

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [2]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [3]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


ds = load_dataset("stanfordnlp/imdb")
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')


def new_collate_fn(batch):
  max_len = 400
  texts, labels = [], []

  for row in batch:
    tokens = tokenizer(row['text'], truncation=True, max_length=max_len).input_ids
    labels.append(tokens[-2]) # 토큰화 하게 되면 [CLS], ... , 마지막 token_id,[SEP] 일거니 마지막 token_id 추출
    texts.append(tokens[:-2]) # labels 앞까지 추출.

  # (B, S)에서 S를 동일하게 맞추기 위해 가장 긴 Sequence를 기준으로 padding 추가
  texts = pad_sequence([torch.LongTensor(text) for text in texts], batch_first=True, padding_value=tokenizer.pad_token_id)
  labels = torch.LongTensor(labels)

  return texts, labels


new_train_loader = DataLoader(
    ds['train'], batch_size=64, shuffle=True, collate_fn=new_collate_fn
)
new_test_loader = DataLoader(
    ds['test'], batch_size=64, shuffle=False, collate_fn=new_collate_fn
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [4]:
dataiter = iter(new_train_loader)
texts, labels = next(dataiter)
print(tokenizer.convert_ids_to_tokens(labels))
print(labels)

['willing', '.', 'cox', 'really', '.', '.', 'good', '.', '.', 'place', '.', '.', '.', 'fl', '.', 's', 'was', '.', '.', ')', '.', '##cker', '.', 'f', 'done', 'snow', 'easy', ']', '.', 'by', '.', '.', '##able', '.', '.', 'ha', '.', '.', '5', '.', '.', '?', 'make', '.', '.', 'moving', 'all', '.', '.', '.', '.', '.', '.', 'stack', '.', '.', '.', '.', '.', 'i', '.', '.', '.', '(']
tensor([ 5627,  1012,  9574,  2428,  1012,  1012,  2204,  1012,  1012,  2173,
         1012,  1012,  1012, 13109,  1012,  1055,  2001,  1012,  1012,  1007,
         1012,  9102,  1012,  1042,  2589,  4586,  3733,  1033,  1012,  2011,
         1012,  1012,  3085,  1012,  1012,  5292,  1012,  1012,  1019,  1012,
         1012,  1029,  2191,  1012,  1012,  3048,  2035,  1012,  1012,  1012,
         1012,  1012,  1012,  9991,  1012,  1012,  1012,  1012,  1012,  1045,
         1012,  1012,  1012,  1006])


In [5]:
from torch import nn
from math import sqrt


class MultiHeadAttention(nn.Module):
  def __init__(self, input_dim, d_model, n_heads):
    super().__init__()
    assert d_model % n_heads == 0

    self.input_dim = input_dim
    self.d_model = d_model # D
    self.n_heads = n_heads # H
    self.head_dim = d_model // n_heads # D/H


    self.wq = nn.Linear(input_dim, d_model)
    self.wk = nn.Linear(input_dim, d_model)
    self.wv = nn.Linear(input_dim, d_model)
    self.dense = nn.Linear(d_model, d_model)

    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x, mask):
    batch_size = x.shape[0] # B
    seq_length = x.shape[1] # S

    q, k, v = self.wq(x), self.wk(x), self.wv(x) # (B, S, D)
    q = q.view(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2) # (B, S, D) -> (B, S, H, D/H) -> (B, H, S, D/H)
    k = k.view(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2) # (B, S, D) -> (B, S, H, D/H) -> (B, H, S, D/H)
    v = v.view(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2) # (B, S, D) -> (B, S, H, D/H) -> (B, H, S, D/H)
    score = torch.matmul(q, k.transpose(-1, -2)) # (B, H, S, D/H) * (B, H, D/H, S) = (B, H, S, S)
    score = score / sqrt(self.head_dim) # sqrt(D/H)

    if mask is not None:
      # mask 차원 추가는 TextClassifier에 적용했습니다
      score = score + (mask * -1e9)

    score = self.softmax(score)
    result = torch.matmul(score, v) # (B, H, S, S) * (B, H, S, D/H) = (B, H, S, D/H)
    result = result.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model) # (B, H, S, D/H) -> (B, S, D)
    result = self.dense(result)

    return result

In [6]:
class TransformerLayer(nn.Module):
  def __init__(self, input_dim, d_model, n_heads, dff, p_dropout):
    super().__init__()

    self.input_dim = input_dim
    self.d_model = d_model
    self.dff = dff

    self.MHA = MultiHeadAttention(input_dim, d_model, n_heads)
    self.FFN = nn.Sequential(
      nn.Linear(d_model, dff),
      nn.ReLU(),
      nn.Linear(dff, d_model)
    )
    self.dropout = nn.Dropout(p_dropout)
    self.layerNorm1 = nn.LayerNorm(d_model)
    self.layerNorm2 = nn.LayerNorm(d_model)

  def forward(self, x, mask):
    MHA = self.dropout(self.MHA(x, mask)) # MHA with dropout
    x = self.layerNorm1(x + MHA) # residual connection(Add+Norm)

    FFN = self.dropout(self.FFN(x)) # FFN with dropout
    x = self.layerNorm2(x + FFN) # residual connection(Add+Norm)

    return x

In [7]:
import numpy as np


def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, None], np.arange(d_model)[None, :], d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[None, ...]

    return torch.FloatTensor(pos_encoding)


max_len = 400
print(positional_encoding(max_len, 256).shape)

torch.Size([1, 400, 256])


In [8]:
class NewTextClassifier(nn.Module):
  def __init__(self, vocab_size, d_model, n_heads, n_layers, dff, p_dropout = 0.1):
    super().__init__()

    self.vocab_size = vocab_size
    self.d_model = d_model
    self.n_layers = n_layers
    self.dff = dff

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = nn.parameter.Parameter(positional_encoding(max_len, d_model), requires_grad=False)
    self.layers = nn.ModuleList([TransformerLayer(d_model, d_model, n_heads, dff, p_dropout) for _ in range(n_layers)])
    self.classification = nn.Linear(d_model, vocab_size)

  def forward(self, x):
    mask = (x == tokenizer.pad_token_id)
    mask = mask[:, None, None, :]
    batch_size = x.shape[0]
    seq_len = x.shape[1]

    x = self.embedding(x)
    x = x * sqrt(self.d_model)
    x = x + self.pos_encoding[:, :seq_len]

    for layer in self.layers:
      x = layer(x, mask)

    # padding이 아닌 부분 찾기
    non_pad_mask = (x != tokenizer.pad_token_id).any(dim=-1)
    valid_seq_index = non_pad_mask.sum(dim=1) - 1 # 각 시퀀스에서 마지막 실제 단어의 index

    # 배치의 각 시퀀스에서 마지막 단어 위치에 있는 임베딩 벡터를 추출
    # 이전의 x[:, -1]은 모든 sequence들의 마지막 것만 갖고 왔는데 padding이 잡히는걸 방지하기 위해 아래 방법 시도
    x = x[torch.arange(batch_size), valid_seq_index]  # [batch_size, d_model]

    x = self.classification(x)

    return x


In [9]:
from torch.optim import Adam

model_new = NewTextClassifier(len(tokenizer), 128, 4, 5, 256) # 5-layer-4-head, 기본과제에서 가장 성능이 좋았던 d_model=128, dff=256로
lr = 0.001
model_new = model_new.to('cuda')
loss_fn = nn.CrossEntropyLoss()

optimizer_new = Adam(model_new.parameters(), lr=lr)

In [10]:
import numpy as np
import matplotlib.pyplot as plt


def accuracy(model, dataloader):
  cnt = 0
  acc = 0

  for data in dataloader:
    inputs, labels = data
    inputs, labels = inputs.to('cuda'), labels.to('cuda')

    preds = model(inputs)
    preds = torch.argmax(preds, dim=-1)

    cnt += labels.shape[0]
    acc += (labels == preds).sum().item()

  return acc / cnt

In [None]:
n_epochs = 15

for epoch in range(n_epochs):
  total_loss = 0.
  model_new.train()
  for data in new_train_loader:
    model_new.zero_grad()
    inputs, labels = data
    inputs, labels = inputs.to('cuda').long(), labels.to('cuda').long()

    preds = model_new(inputs)
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer_new.step()

    total_loss += loss.item()

  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

  with torch.no_grad():
    model_new.eval()
    train_acc = accuracy(model_new, new_train_loader)
    test_acc = accuracy(model_new, new_test_loader)
    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

Epoch   0 | Train Loss: 1207.7502924203873
Epoch   1 | Train Loss: 967.0211309194565
Epoch   2 | Train Loss: 898.0276993513107
Epoch   3 | Train Loss: 828.9818677902222
Epoch   4 | Train Loss: 758.3847177028656
Epoch   5 | Train Loss: 697.6253930926323
Epoch   6 | Train Loss: 651.6636628508568
Epoch   7 | Train Loss: 613.0377010703087
Epoch   8 | Train Loss: 581.9323758482933
Epoch   9 | Train Loss: 547.5546559691429
Epoch  10 | Train Loss: 514.8803787231445
Epoch  11 | Train Loss: 479.25154423713684
Epoch  12 | Train Loss: 442.5511848330498
Epoch  13 | Train Loss: 404.9776583313942
Epoch  14 | Train Loss: 362.36754950881004


**train 데이터의 정확도는 올라가지만 test데이터는 변동이 없거나 떨어지는 느낌. 학습한 문장은 학습한대로 단어를 예측하지만 새로운 문장에 대해서는 잘 예측하지 못하는 듯**

In [None]:
def accuracyWithPrint(model, dataloader, needPrint=False):
  cnt = 0
  acc = 0

  for i, data in enumerate(dataloader):
    inputs, labels = data
    inputs, labels = inputs.to('cuda'), labels.to('cuda')

    preds = model(inputs)
    preds = torch.argmax(preds, dim=-1)
    if needPrint and i % 100 == 0:
      print(f'{"="*50}predict{"="*50}')
      print(tokenizer.convert_ids_to_tokens(preds))
      print(preds)
      print(f'{"="*50}label{"="*50}')
      print(tokenizer.convert_ids_to_tokens(labels))
      print(labels)
      print('Is Correct?')
      print(labels == preds)
      print(f'{(labels == preds).sum().item()} / 64')

    cnt += labels.shape[0]
    acc += (labels == preds).sum().item()

  return acc / cnt

In [None]:
with torch.no_grad():
    model_new.eval()
    result = accuracyWithPrint(model_new, new_test_loader, True)
    print(f"=========> Acc: {result:.3f}")

['.', '.', '.', 'for', '.', '.', '.', '.', '.', '10', '.', '.', '.', 't', '.', '.', '.', '.', '.', '.', '.', 'horse', ')', 'this', '.', '.', '.', '!', '.', '.', '.', '.', 'sexual', '.', '.', '.', 's', '.', '*', '.', '.', ')', '.', '.', '.', '.', 'themes', ')', '.', '.', 'and', '.', '.', '*', 'evil', 'ways', 'off', '##ge', '.', '"', ',', '.', '.', '10']
tensor([1012, 1012, 1012, 2005, 1012, 1012, 1012, 1012, 1012, 2184, 1012, 1012,
        1012, 1056, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 3586, 1007, 2023,
        1012, 1012, 1012,  999, 1012, 1012, 1012, 1012, 4424, 1012, 1012, 1012,
        1055, 1012, 1008, 1012, 1012, 1007, 1012, 1012, 1012, 1012, 6991, 1007,
        1012, 1012, 1998, 1012, 1012, 1008, 4763, 3971, 2125, 3351, 1012, 1000,
        1010, 1012, 1012, 2184], device='cuda:0')
['.', '.', 'quality', 'that', '!', '.', '.', '.', '.', '.', '.', '.', '!', 's', '.', '!', '.', '.', '.', '!', '.', 'the', '.', 'giving', '.', '.', '.', 'road', '.', '>', '!', '.', 'actors', '.', 

**문장이 끝나는 .들을 찍은건지 맞춘건지는 모르겠지만 어쨌든 정확도가 아쉽다. 그래도 test와 동일한 문장이 train에 있었는지 있었는지 'evil', 'off', '##ge', 이것들 맞춘거를 보면 학습된 문장은 잘 나오게하는듯? 그래도 문장이 중간에 끝난 경우 마침표가 아닌 단어를 뱉는 경우가 많은거 보면 문장의 비완결성은 인지하는듯... 추후에 더 공부해 봐야 겠다**

In [None]:
#위에 15 epoch 즈음부터 test가 떨어지는 추세가 보이니 더 학습시키면 overfitting이 더 심화되는지 해봄
n_epochs = 10

for epoch in range(n_epochs):
  total_loss = 0.
  model_new.train()
  for data in new_train_loader:
    model_new.zero_grad()
    inputs, labels = data
    inputs, labels = inputs.to('cuda').long(), labels.to('cuda').long()

    preds = model_new(inputs)
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer_new.step()

    total_loss += loss.item()
  #앞에 15에폭했으니 더해줌
  print(f"Epoch {15+epoch:3d} | Train Loss: {total_loss}")

  with torch.no_grad():
    model_new.eval()
    train_acc = accuracy(model_new, new_train_loader)
    test_acc = accuracy(model_new, new_test_loader)
    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

Epoch  15 | Train Loss: 329.4386035203934
Epoch  16 | Train Loss: 289.3931440412998
Epoch  17 | Train Loss: 258.0846316218376
Epoch  18 | Train Loss: 230.9261675029993
Epoch  19 | Train Loss: 207.31783908605576
Epoch  20 | Train Loss: 187.2510266751051
Epoch  21 | Train Loss: 171.73167544603348
Epoch  22 | Train Loss: 153.3127126097679
Epoch  23 | Train Loss: 142.6881023272872
Epoch  24 | Train Loss: 130.12048771977425


역시 테스트에서는 계속 떨어지는 추세. 학습한 문장에 과하게 학습되어서 이외의 문제에 더 취약해지는 것 처럼 보임. 세상의 모든 문장을 학습 시킬 수 있는 상황이 아니라면
일정량 데이터의 반복 학습을 많이 하는 것 보다 다양한 데이터를 적당히 학습하는게 더 좋지 않을까? sequence 다른 위치에 구멍을 뚫어서 여러 케이스로 만들고 학습 시키는게 더 좋을듯

In [None]:
with torch.no_grad():
    model_new.eval()
    result = accuracyWithPrint(model_new, new_train_loader, True)
    print(f"=========> Acc: {result:.3f}")

['with', '.', '?', '.', '.', '.', '?', '!', '##ffi', '.', '.', ',', '!', 'there', ':', '.', '.', '.', 'with', '.', '.', 'good', '.', '.', '.', '.', '10', '.', 'inferior', '.', '.', '.', '.', '.', '.', '.', 'from', 'thing', '.', '.', 'she', '?', '!', '.', '!', '.', '.', '!', 'sorry', 'imagination', 'pretty', '.', 'to', '.', '.', '!', '.', '.', '.', '.', '.', '!', 'invariably', 'love']
tensor([ 2007,  1012,  1029,  1012,  1012,  1012,  1029,   999, 26989,  1012,
         1012,  1010,   999,  2045,  1024,  1012,  1012,  1012,  2007,  1012,
         1012,  2204,  1012,  1012,  1012,  1012,  2184,  1012, 14092,  1012,
         1012,  1012,  1012,  1012,  1012,  1012,  2013,  2518,  1012,  1012,
         2016,  1029,   999,  1012,   999,  1012,  1012,   999,  3374,  9647,
         3492,  1012,  2000,  1012,  1012,   999,  1012,  1012,  1012,  1012,
         1012,   999, 26597,  2293], device='cuda:0')
['with', '.', '?', '.', '.', '.', '?', '!', '##ffi', '!', ')', ',', '!', 'there', 'a', '.',

train data는 거의 답을 아예 외워버리고 test는 계속 떨어지는 전형적인 안 좋은.........

In [13]:
#학습이 좀 아쉬워서 마지막 테스트
model_last = NewTextClassifier(len(tokenizer), 256, 4, 5, 256) # 5-layer-4-head, d_model=256, dff=256 단어를 좀 더 고차원으로
lr = 0.001
model_last = model_last.to('cuda')
loss_fn = nn.CrossEntropyLoss()

optimizer_last = Adam(model_last.parameters(), lr=lr)

In [14]:
n_epochs = 10

for epoch in range(n_epochs):
  total_loss = 0.
  model_last.train()
  for data in new_train_loader:
    model_last.zero_grad()
    inputs, labels = data
    inputs, labels = inputs.to('cuda').long(), labels.to('cuda').long()

    preds = model_last(inputs)
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer_last.step()

    total_loss += loss.item()

  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

  with torch.no_grad():
    model_last.eval()
    train_acc = accuracy(model_last, new_train_loader)
    test_acc = accuracy(model_last, new_test_loader)
    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

Epoch   0 | Train Loss: 1180.28378033638
Epoch   1 | Train Loss: 993.0656158924103
Epoch   2 | Train Loss: 971.2966527938843
Epoch   3 | Train Loss: 1139.0431813001633
Epoch   4 | Train Loss: 1175.849408864975
Epoch   5 | Train Loss: 1151.7452158927917
Epoch   6 | Train Loss: 1143.401223063469
Epoch   7 | Train Loss: 1137.0073949098587
Epoch   8 | Train Loss: 1172.5368282794952
Epoch   9 | Train Loss: 1162.3838226795197


신기하게도 d_model=256일때 학습이 안되는 듯한 모습..;

In [15]:
model_last = NewTextClassifier(len(tokenizer), 128, 4, 5, 256)
lr = 0.001
model_last = model_last.to('cuda')
loss_fn = nn.CrossEntropyLoss()

optimizer_last = Adam(model_last.parameters(), lr=lr)

In [16]:
n_epochs = 10

for epoch in range(n_epochs):
  total_loss = 0.
  model_last.train()
  for data in new_train_loader:
    model_last.zero_grad()
    inputs, labels = data
    inputs, labels = inputs.to('cuda').long(), labels.to('cuda').long()

    preds = model_last(inputs)
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer_last.step()

    total_loss += loss.item()

  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

  with torch.no_grad():
    model_last.eval()
    train_acc = accuracy(model_last, new_train_loader)
    test_acc = accuracy(model_last, new_test_loader)
    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

Epoch   0 | Train Loss: 1210.2313394546509
Epoch   1 | Train Loss: 965.7652459144592
Epoch   2 | Train Loss: 894.5033469200134
Epoch   3 | Train Loss: 819.7876272201538
Epoch   4 | Train Loss: 751.9395959377289
Epoch   5 | Train Loss: 691.6853846311569
Epoch   6 | Train Loss: 643.6103574037552
Epoch   7 | Train Loss: 613.2135090827942
Epoch   8 | Train Loss: 584.8563593626022
Epoch   9 | Train Loss: 557.4770264029503


기존의 d_model=128은 여전히 loss를 잘 줄임. 신기함.