<a href="https://colab.research.google.com/github/jo1jun/Transformer/blob/main/Transformer_module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, device & dtype

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Integer dtype used throughout the notebook for token-id and position-index tensors.
dtype = torch.long

% pip install einops

from einops import rearrange, repeat, reduce
# Import the einops module to simplify tensor operations.
# reference & tutorial : http://einops.rocks/pytorch-examples.html

Collecting einops
  Downloading https://files.pythonhosted.org/packages/5d/a0/9935e030634bf60ecd572c775f64ace82ceddf2f504a5fd3902438f07090/einops-0.3.0-py2.py3-none-any.whl
Installing collected packages: einops
Successfully installed einops-0.3.0


# Token & Positional Embedding

In [None]:
class TokPosEmbedding(nn.Module):
    """Token embedding plus a learnable positional embedding.

    A learnable position table is used instead of the sinusoidal
    (cos & sin) positional encoding.

    Args:
        input_dim: Vocabulary size of the token ids passed to ``forward``.
        d_model: Width of the embedding vectors.
        dropout_ratio: Dropout probability applied to the summed embedding.
        max_len: Maximum input sequence length the position table can index
            (defaults to 100).
    """

    def __init__(self, input_dim, d_model, dropout_ratio, max_len=100):
        super().__init__()
        self.tokEmbedding = nn.Embedding(input_dim, d_model)
        self.posEmbedding = nn.Embedding(max_len, d_model)
        self.d_model = d_model
        self.max_len = max_len
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, src):
        """Embed a batch of token ids.

        Args:
            src: Integer tensor of token ids, shape [batch_size, src_len].

        Returns:
            Float tensor of shape [batch_size, src_len, d_model].

        Raises:
            ValueError: If ``src_len`` exceeds ``max_len``.
        """
        batch_size, src_len = src.shape[0], src.shape[1]
        if src_len > self.max_len:
            # Fail early with a readable message instead of an out-of-range
            # embedding lookup.
            raise ValueError(
                f'sequence length {src_len} exceeds max_len {self.max_len}'
            )

        # Positions 0 .. src_len-1, created directly on the input's device and
        # broadcast to every row of the batch.
        pos = torch.arange(src_len, dtype=torch.long, device=src.device)
        pos = pos.unsqueeze(0).expand(batch_size, src_len)  # [batch_size, src_len]

        # Token embeddings are scaled by sqrt(d_model) before the position
        # embedding is added.
        scaled_tok = self.tokEmbedding(src) * (self.d_model ** 0.5)

        # [batch_size, src_len, d_model]
        return self.dropout(scaled_tok + self.posEmbedding(pos))

# Transformer


In [None]:
class Transformer(nn.Module):
    """Sequence-to-sequence Transformer built on PyTorch's encoder/decoder stacks.

    Token/positional embedding and the final projection are done here; the
    attention stacks are the ready-made ``nn.TransformerEncoder`` and
    ``nn.TransformerDecoder`` modules.

    Args:
        input_dim: Source vocabulary size.
        output_dim: Target vocabulary size.
        d_model: Embedding / model width.
        n_layers: Number of layers in both the encoder and the decoder stack.
        nhead: Number of attention heads.
        ff_dim: Width of the feed-forward sub-layer.
        dropout_ratio: Dropout probability.
    """

    def __init__(self, input_dim, output_dim, d_model, n_layers, nhead, ff_dim, dropout_ratio):
        super().__init__()

        # Embedding is handled outside the nn.Transformer* modules.
        self.encEmbedding = TokPosEmbedding(input_dim, d_model, dropout_ratio)
        # NOTE(review): per the PyTorch docs the stacks work on copies of the
        # layer passed in; the prototype layers stay registered as attributes
        # so existing state_dict keys do not change.
        self.encoderLayer = nn.TransformerEncoderLayer(d_model, nhead, ff_dim, dropout_ratio)
        self.encoder = nn.TransformerEncoder(self.encoderLayer, n_layers)
        self.decEmbedding = TokPosEmbedding(output_dim, d_model, dropout_ratio)
        self.decoderLayer = nn.TransformerDecoderLayer(d_model, nhead, ff_dim, dropout_ratio)
        self.decoder = nn.TransformerDecoder(self.decoderLayer, n_layers)
        # Final fully connected layer projecting to target-vocabulary logits.
        self.linear = nn.Linear(d_model, output_dim)

    def make_pad_mask(self, src, pad):
        """Return a bool mask that is True wherever ``src`` holds the pad id.

        Args:
            src: Token ids, shape [batch_size, src_len].
            pad: Pad token id (int, or a tensor broadcastable against ``src``).

        Returns:
            Bool tensor of shape [batch_size, src_len].
        """
        return src.eq(pad)

    @staticmethod
    def _causal_mask(size, device):
        """Return a [size, size] bool mask, True above the diagonal.

        True marks positions that may NOT be attended to, so each target
        position only sees itself and earlier positions. Building it locally
        avoids calling ``nn.Transformer.generate_square_subsequent_mask`` on a
        foreign ``self``, and keeps the mask dtype equal to the padding masks.
        """
        return torch.triu(
            torch.ones(size, size, dtype=torch.bool, device=device), diagonal=1
        )

    def _encode(self, src, src_pad_mask):
        """Embed ``src`` and run the encoder; returns [src_len, batch_size, d_model]."""
        embedded = self.encEmbedding(src)
        # The encoder is used in its default sequence-first layout, hence the transpose.
        return self.encoder(embedded.transpose(0, 1), src_key_padding_mask=src_pad_mask)

    def _decode(self, tgt, enc_src, src_pad_mask, pad):
        """Decode target ids against ``enc_src``; returns [batch_size, tgt_len, output_dim]."""
        tgt_pad_mask = self.make_pad_mask(tgt, pad)                 # [batch_size, tgt_len]
        tgt_sub_mask = self._causal_mask(tgt.shape[1], tgt.device)  # [tgt_len, tgt_len]

        embedded = self.decEmbedding(tgt)
        output = self.decoder(
            embedded.transpose(0, 1),
            enc_src,
            tgt_mask=tgt_sub_mask,
            memory_mask=None,
            tgt_key_padding_mask=tgt_pad_mask,
            memory_key_padding_mask=src_pad_mask,
        )
        # output: [tgt_len, batch_size, d_model] -> back to batch-first for the projection.
        return self.linear(output.transpose(0, 1))

    def forward(self, src, tgt, pad):
        """Compute next-token logits for every target position (teacher forcing).

        Args:
            src: Source token ids, shape [batch_size, src_len].
            tgt: Target token ids fed to the decoder, shape [batch_size, tgt_len].
            pad: Pad token id.

        Returns:
            Logits of shape [batch_size, tgt_len, output_dim].
        """
        src_pad_mask = self.make_pad_mask(src, pad)
        enc_src = self._encode(src, src_pad_mask)
        return self._decode(tgt, enc_src, src_pad_mask, pad)

    def generate(self, src, start_id, sample_size, pad):
        """Greedily decode ``sample_size`` tokens for each source sequence.

        Args:
            src: Source token ids, shape [batch_size, src_len].
            start_id: Id of the start token (int or single-element tensor).
            sample_size: Number of tokens to generate after the start token.
            pad: Pad token id.

        Returns:
            Integer tensor of shape [batch_size, sample_size + 1]; column 0 is
            the start token.
        """
        batch_size = src.shape[0]

        src_pad_mask = self.make_pad_mask(src, pad)
        enc_src = self._encode(src, src_pad_mask)

        # sampled_tensor: [batch_size, 1], every row seeded with the start token.
        sampled_tensor = torch.full(
            (batch_size, 1), int(start_id), dtype=torch.long, device=src.device
        )
        for _ in range(sample_size):
            # The same causal mask as in training is applied. Without it the
            # earlier positions would attend to later ones, which no longer
            # matches training once the decoder has more than one layer.
            output = self._decode(sampled_tensor, enc_src, src_pad_mask, pad)

            # Keep only the prediction for the newest position.
            pred_token = output[:, -1].argmax(dim=-1, keepdim=True)

            # Append the predicted token to the generated sequence.
            sampled_tensor = torch.cat((sampled_tensor, pred_token), dim=1)

        return sampled_tensor

# Date format Dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset cell below changes into a folder under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Change into the dataset folder first: the `sequence` helper module is imported
# from the current working directory.
os.chdir('/content/drive/MyDrive/dataset/dateformat')
import sequence

# After mounting Google Drive, save the dateformat folder under
# '/content/drive/MyDrive/dataset' before running this cell.

(x_train, t_train), (x_test, t_test) = sequence.load_data('date.txt')
# char -> id & id -> char dictionary
char_to_id, id_to_char = sequence.get_vocab()

print(x_train.shape)
print(t_train.shape)
print(x_test.shape)
print(t_test.shape)
print()

# The data is already converted to ids.
print('question(id) : ', x_train[0])
print('correct(id)  : ',t_train[0])
print()

# Inspect one sequence as characters (' ' is the pad char, '_' is the start char).
print('question(char) : ', ' '.join([id_to_char[int(c)] for c in x_train[0]]))
print('correct(char)  : ', ' '.join([id_to_char[int(c)] for c in t_train[0]]))

vocab_size = len(char_to_id)
# Convert the id arrays to integer tensors (done after the prints above).
x_train = torch.tensor(x_train, dtype=dtype)
t_train = torch.tensor(t_train, dtype=dtype)
x_test = torch.tensor(x_test, dtype=dtype)
t_test = torch.tensor(t_test, dtype=dtype)
pad = 7 # pad token id; NOTE(review): hard-coded, presumably the id of ' ' in this vocabulary — confirm if the data changes

(45000, 29)
(45000, 11)
(5000, 29)
(5000, 11)

question(id) :  [ 8 22  9 22  9  8  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7]
correct(id)  :  [14 11 12  9  8 15 16  8 15 16  9]

question(char) :  2 / 7 / 7 2                                              
correct(char)  :  _ 1 9 7 2 - 0 2 - 0 7


# hyperparameter

In [None]:
batch_size = 128                     # samples per mini-batch for every DataLoader
epoch = 20                           # number of passes over the training set
input_dim = output_dim = vocab_size  # source and target share one vocabulary
d_model = 32                         # embedding / model width
n_layers = 1                         # layers in both the encoder and decoder stacks
nhead = 2                            # attention heads
ff_dim = 1024                        # feed-forward sub-layer width
dropout_ratio = 0.1                  # dropout probability
learning_rate = 0.0025               # Adam learning rate

In [None]:
# Build the model from the hyperparameters above (moved to `device` later, inside trainer()).
model = Transformer(input_dim, output_dim, d_model, n_layers, nhead, ff_dim, dropout_ratio)

In [None]:
import torch.optim as optim

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

# Cross-entropy over target-vocabulary logits; read as a global by trainer().
criterion = nn.CrossEntropyLoss()

# Loader

In [None]:
from torch.utils.data import DataLoader

# Inputs and targets are batched by separate loaders that are zipped together
# downstream, so every loader must iterate in the same (unshuffled) order to
# keep question/answer pairs aligned.
(loader_x_train, loader_t_train,
 loader_x_test, loader_t_test) = (
    DataLoader(data, batch_size=batch_size)
    for data in (x_train, t_train, x_test, t_test)
)

# Trainer

In [None]:
def trainer(model, optimizer, epochs, pad):
    """Train ``model`` with teacher forcing on the module-level training loaders.

    Reads the globals ``loader_x_train``, ``loader_t_train``, ``criterion``,
    ``device`` and ``dtype``. Moves the model to ``device``, puts it in
    training mode and prints the loss every 100 iterations.

    Args:
        model: Model called as ``model(batch_x, decoder_input, pad)``.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over the training loaders.
        pad: Pad token id.
    """
    pad = torch.tensor([pad], dtype=dtype).to(device)

    model.train()
    model = model.to(device)
    for epoch_idx in range(1, epochs + 1):
        paired_batches = zip(loader_x_train, loader_t_train)
        for step, (inputs, targets) in enumerate(paired_batches):
            inputs, targets = inputs.to(device), targets.to(device)
            # targets: [batch_size, tgt_len]

            # The decoder input drops the last token (there is no end token, so
            # nothing has to be predicted after it); the labels drop the first token.
            decoder_input = targets[:, :-1]
            labels = rearrange(targets[:, 1:], 'b l -> (b l)')
            # labels: [batch_size * (tgt_len - 1)]

            logits = model(inputs, decoder_input, pad)
            # logits: [batch_size, tgt_len - 1, output_dim]
            logits = rearrange(logits, 'b l d -> (b l) d')
            # logits: [batch_size * (tgt_len - 1), output_dim]

            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            # Gradient clipping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()

            if step % 100 == 0:
                print('epoch[%d/%d] loss = %.4f' % (epoch_idx, epochs, loss.item()))

# Train

In [None]:
# Run training for `epoch` passes; the loss is printed every 100 iterations.
trainer(model, optimizer, epoch, pad)

epoch[1/20] loss = 4.2610
epoch[1/20] loss = 0.9519
epoch[1/20] loss = 0.6491
epoch[1/20] loss = 0.4814
epoch[2/20] loss = 0.4297
epoch[2/20] loss = 0.3528
epoch[2/20] loss = 0.2833
epoch[2/20] loss = 0.2240
epoch[3/20] loss = 0.2180
epoch[3/20] loss = 0.2269
epoch[3/20] loss = 0.1697
epoch[3/20] loss = 0.1589
epoch[4/20] loss = 0.1417
epoch[4/20] loss = 0.1569
epoch[4/20] loss = 0.1564
epoch[4/20] loss = 0.1080
epoch[5/20] loss = 0.1201
epoch[5/20] loss = 0.1165
epoch[5/20] loss = 0.1089
epoch[5/20] loss = 0.1020
epoch[6/20] loss = 0.0864
epoch[6/20] loss = 0.0868
epoch[6/20] loss = 0.0885
epoch[6/20] loss = 0.0656
epoch[7/20] loss = 0.0668
epoch[7/20] loss = 0.0690
epoch[7/20] loss = 0.0802
epoch[7/20] loss = 0.0639
epoch[8/20] loss = 0.0513
epoch[8/20] loss = 0.0560
epoch[8/20] loss = 0.0627
epoch[8/20] loss = 0.0661
epoch[9/20] loss = 0.0440
epoch[9/20] loss = 0.0610
epoch[9/20] loss = 0.0636
epoch[9/20] loss = 0.0511
epoch[10/20] loss = 0.0355
epoch[10/20] loss = 0.0418
epoch[10/2

# Checker

In [None]:
def checker(loader_x, loader_t, model, pad):
    """Count how many generated target tokens match the reference.

    Puts ``model`` in eval mode and greedily generates each batch under
    ``torch.no_grad()``. Reads the globals ``device`` and ``dtype``.

    Args:
        loader_x: Iterable of input-id batches.
        loader_t: Iterable of target-id batches, aligned with ``loader_x``.
        model: Model exposing ``generate(src, start_id, sample_size, pad)``.
        pad: Pad token id.

    Returns:
        Total number of matching tokens (start token excluded), accumulated
        from tensor sums; the int 0 if the loaders yield nothing.
    """
    pad = torch.tensor([pad], dtype=dtype).to(device)

    model.eval()
    with torch.no_grad():
        n_correct = 0
        for inputs, targets in zip(loader_x, loader_t):
            inputs = inputs.to(device)
            targets = targets.to(device)

            # The start token is taken from the first sample of the batch; the
            # remaining columns are the reference to compare against.
            first_token, expected = targets[0, 0], targets[:, 1:]

            generated = model.generate(inputs, first_token, expected.shape[1], pad)

            # Drop the start-token column before comparing.
            n_correct += (generated[:, 1:] == expected).sum()

    return n_correct

# Accuracy

In [None]:
# Token-level accuracy: matching tokens divided by the number of target tokens,
# excluding the start token of each sequence (hence shape[1] - 1).
correct_num = checker(loader_x_train, loader_t_train, model, pad)
acc = float(correct_num) / (t_train.shape[0] * (t_train.shape[1] - 1))
print('train accuracy %.3f%%' % (acc * 100))

correct_num = checker(loader_x_test, loader_t_test,model, pad)
acc = float(correct_num) / (t_test.shape[0] * (t_test.shape[1] - 1))
print('test accuracy %.3f%%' % (acc * 100))

train accuracy 99.811%
test accuracy 99.820%


# Sampling

In [None]:
# Wrap the pad id once, under its own name, so the global `pad` is not
# overwritten with a re-wrapped tensor on every loop iteration. as_tensor +
# reshape accepts both a plain int and an already-wrapped one-element tensor.
pad_tensor = torch.as_tensor(pad, dtype=dtype).reshape(1).to(device)

# Make sure dropout is disabled even if this cell runs without the checker cell.
model.eval()

# Show the first 10 test samples next to the model's greedy predictions.
for i in range(10):
  idx = [i]

  question = x_test[idx].to(device)  # [1, src_len]
  correct = t_test[idx].to(device).flatten()

  # The first target token is the start token; the rest is the reference answer.
  start_id, correct = correct[0], correct[1:]

  with torch.no_grad():
    predict = model.generate(question, start_id, len(correct), pad_tensor)
  # Drop the start-token column.
  predict = predict[:, 1:]

  # Convert ids back to characters.
  question = [id_to_char[int(c)] for c in question.flatten()]
  correct = [id_to_char[int(c)] for c in correct.flatten()]
  predict = [id_to_char[int(c)] for c in predict.flatten()]

  print(f'question {i+1} : ', ' '.join(question))
  print(f'correct {i+1}  : ', ' '.join(correct))
  print(f'predict {i+1}  : ', ' '.join(predict))
  print()

question 1 :  1 0 / 1 5 / 9 4                                          
correct 1  :  1 9 9 4 - 1 0 - 1 5
predict 1  :  1 9 9 4 - 1 0 - 1 5

question 2 :  t h u r s d a y ,   n o v e m b e r   1 3 ,   2 0 0 8    
correct 2  :  2 0 0 8 - 1 1 - 1 3
predict 2  :  2 0 0 8 - 1 1 - 1 3

question 3 :  M a r   2 5 ,   2 0 0 3                                  
correct 3  :  2 0 0 3 - 0 3 - 2 5
predict 3  :  2 0 0 3 - 0 3 - 2 5

question 4 :  T u e s d a y ,   N o v e m b e r   2 2 ,   2 0 1 6      
correct 4  :  2 0 1 6 - 1 1 - 2 2
predict 4  :  2 0 1 6 - 1 1 - 2 2

question 5 :  S a t u r d a y ,   J u l y   1 8 ,   1 9 7 0            
correct 5  :  1 9 7 0 - 0 7 - 1 8
predict 5  :  1 9 7 0 - 0 7 - 1 8

question 6 :  o c t o b e r   6 ,   1 9 9 2                            
correct 6  :  1 9 9 2 - 1 0 - 0 6
predict 6  :  1 9 9 2 - 1 0 - 0 6

question 7 :  8 / 2 3 / 0 8                                            
correct 7  :  2 0 0 8 - 0 8 - 2 3
predict 7  :  2 0 0 8 - 0 8 - 2 3

question 8 : 