In [34]:
import torch
import torchvision
import torchtext

import torch.nn as nn

# Print the installed library versions so the run can be reproduced
# (the captured output below shows this notebook was run with torchtext 0.6.0).
print(f'torch version: {torch.__version__}')
print(f'torchvision version: {torchvision.__version__}')
print(f'torchtext version: {torchtext.__version__}')

torch version: 1.13.1
torchvision version: 0.14.1
torchtext version: 0.6.0


In [35]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [36]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download de_core_news_sm
# !pip install torchtext==0.6.0
# !pip install spacy

In [37]:
import spacy

spacy_en = spacy.load("en_core_web_sm") # pipeline used for English tokenization
spacy_de = spacy.load("de_core_news_sm") # pipeline used for German tokenization

In [38]:
# Quick demo of the spaCy tokenizer on a single English sentence
tokenized = spacy_en.tokenizer("I am a graduate student.")

# Print every token together with its position in the sentence
for i, token in enumerate(tokenized):
    print(f"인덱스 {i}: {token.text}")
     

인덱스 0: I
인덱스 1: am
인덱스 2: a
인덱스 3: graduate
인덱스 4: student
인덱스 5: .


In [39]:

# Tokenizer for German (Deutsch) sentences; token order is kept as-is (not reversed).
def tokenize_de(text):
    """Split ``text`` into a list of token strings using the ``spacy_de`` tokenizer."""
    words = []
    for tok in spacy_de.tokenizer(text):
        words.append(tok.text)
    return words

# Tokenizer for English sentences.
def tokenize_en(text):
    """Split ``text`` into a list of token strings using the ``spacy_en`` tokenizer."""
    words = []
    for tok in spacy_en.tokenizer(text):
        words.append(tok.text)
    return words
     

In [40]:
# Use the torchtext Field class to declare how each side of the dataset is preprocessed.
# Unlike the Seq2Seq model, batch_first is set to True here (tensors are [batch, length]).
# Translation direction: source (SRC) German -> target (TRG) English

from torchtext.data import Field, BucketIterator

# Both fields wrap every sentence in <sos> ... <eos> and lowercase the tokens.
SRC = Field(tokenize=tokenize_de, init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)
TRG = Field(tokenize=tokenize_en, init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)

In [41]:
from torchtext.datasets import Multi30k

# Load the train/validation/test splits; the ".de" side is processed by SRC and the ".en" side by TRG.
train_dataset, valid_dataset, test_dataset = Multi30k.splits(exts=(".de", ".en"), fields=(SRC, TRG))

In [42]:
# Report the number of examples in each split (training / validation / test).
print(f"학습 데이터셋(training dataset) 크기: {len(train_dataset.examples)}개")
print(f"평가 데이터셋(validation dataset) 크기: {len(valid_dataset.examples)}개")
print(f"테스트 데이터셋(testing dataset) 크기: {len(test_dataset.examples)}개")

학습 데이터셋(training dataset) 크기: 29000개
평가 데이터셋(validation dataset) 크기: 1014개
테스트 데이터셋(testing dataset) 크기: 1000개


In [43]:
# Pick one training example and print its tokenized source and target sentences
print(vars(train_dataset.examples[30])['src'])
print(vars(train_dataset.examples[30])['trg'])

['ein', 'mann', ',', 'der', 'mit', 'einer', 'tasse', 'kaffee', 'an', 'einem', 'urinal', 'steht', '.']
['a', 'man', 'standing', 'at', 'a', 'urinal', 'with', 'a', 'coffee', 'cup', '.']


In [44]:
# Build the German and English vocabularies with each Field's build_vocab method
# (min_freq=2: only tokens that appear at least twice in the training set are kept)

SRC.build_vocab(train_dataset, min_freq=2)
TRG.build_vocab(train_dataset, min_freq=2)

print(f"len(SRC): {len(SRC.vocab)}")
print(f"len(TRG): {len(TRG.vocab)}")

len(SRC): 7853
len(TRG): 5893


In [45]:

# Look up a few tokens in the target vocabulary (the values in the trailing comments are the printed results)
print(TRG.vocab.stoi["abcabc"]) # unknown word: 0
print(TRG.vocab.stoi[TRG.pad_token]) # padding: 1
print(TRG.vocab.stoi["<sos>"]) # 2 -> start token
print(TRG.vocab.stoi["<eos>"]) # 3 -> end token
print(TRG.vocab.stoi["hello"])
print(TRG.vocab.stoi["world"])

# Same special tokens in the source vocabulary
print(SRC.vocab.stoi["abcabc"]) # unknown word: 0
print(SRC.vocab.stoi[SRC.pad_token]) # padding: 1
print(SRC.vocab.stoi["<sos>"]) # 2 -> start token
print(SRC.vocab.stoi["<eos>"]) # 3 -> end token

0
1
2
3
4112
1752
0
1
2
3


In [46]:
# The words of a sentence have to be fed to the network in order,
# so it helps when the sentences inside one batch have similar lengths -> BucketIterator is used for that

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 128

# One iterator per split; batches are created directly on `device`
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset),
    batch_size=batch_size,
    device=device
)

In [47]:
# Inspect the first training batch only
for i, batch in enumerate(train_iterator):
    src = batch.src
    trg = batch.trg

    print(f"첫 번째 배치 크기: {src.shape}")

    # Print the token indices of the first sentence in this batch
    # NOTE(review): this inner loop reuses the outer loop variable `i`; harmless here only because of the `break` below
    for i in range(src.shape[1]):
        print(f"인덱스 {i}: {src[0][i].item()}") # src is indexed as [sentence, position] here

    # Only look at the first batch
    break
     

첫 번째 배치 크기: torch.Size([128, 27])
인덱스 0: 2
인덱스 1: 18
인덱스 2: 0
인덱스 3: 858
인덱스 4: 9
인덱스 5: 5
인덱스 6: 13
인덱스 7: 10
인덱스 8: 8
인덱스 9: 16
인덱스 10: 9
인덱스 11: 1028
인덱스 12: 12
인덱스 13: 15
인덱스 14: 136
인덱스 15: 8
인덱스 16: 2088
인덱스 17: 4
인덱스 18: 3
인덱스 19: 1
인덱스 20: 1
인덱스 21: 1
인덱스 22: 1
인덱스 23: 1
인덱스 24: 1
인덱스 25: 1
인덱스 26: 1


In [91]:
class PositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional-encoding table.

    ``encoding[pos, j]`` is ``sin(pos / 10000 ** (j / d_model))`` for even ``j`` and
    ``cos(pos / 10000 ** ((j - 1) / d_model))`` for odd ``j``.

    The table is stored as a non-persistent buffer, so it follows the module through
    ``.to(...)`` / ``.cuda()`` while ``state_dict()`` stays unchanged (no extra key).

    Args:
        d_model: embedding dimension (e.g. 512); odd values are supported.
        max_len: number of positions in the table, i.e. the longest sequence supported.
        device: device on which the table is initially created.
    """

    def __init__(self, d_model, max_len, device): # ex. d_model = 512, max_len = 100
        super(PositionalEncoding, self).__init__()

        # pos -> (max_len, 1), e.g. [[0], [1], ..., [max_len - 1]]
        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)

        # _2i -> (ceil(d_model / 2),), e.g. [0, 2, 4, ..., 510] for d_model = 512
        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # angle -> (max_len, ceil(d_model / 2))
        angle = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros(max_len, d_model, device=device) # (max_len, d_model)
        encoding[:, 0::2] = torch.sin(angle)
        # For an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 angle columns are used for the cosine part.
        encoding[:, 1::2] = torch.cos(angle[:, : d_model // 2])

        # A buffer is never trained (no gradient) and moves together with the module.
        # persistent=False keeps it out of state_dict so existing checkpoints still load.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x): # x -> (Batch, Length)
        """Return the encodings for the first ``Length`` positions, shape ``(Length, d_model)``.

        Only the shape of ``x`` is used. ``Length`` is the sequence length of the current
        batch, which is generally smaller than ``max_len``.

        Raises:
            ValueError: if ``Length`` exceeds the ``max_len`` the table was built with.
        """
        batch_size, seq_len = x.size()

        table_len = self.encoding.size(0)
        if seq_len > table_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {table_len} of the positional encoding table"
            )

        return self.encoding[:seq_len, :]

In [92]:
class TransformerEmbedding(nn.Module):
    """Token embedding plus positional encoding, followed by dropout.

    Args:
        vocab_size: number of rows of the token embedding table.
        d_model: embedding dimension.
        max_len: number of positions handed to ``PositionalEncoding``.
        drop_prob: dropout probability applied to the summed embedding.
        device: device handed to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Embed token indices ``x`` of shape (Batch, Length) into (Batch, Length, d_model)."""
        token_vectors = self.tok_emb(x)        # (Batch, Length, d_model)
        position_vectors = self.pos_emb(x)     # (Length, d_model)
        # The positional term is broadcast over the batch dimension.
        summed = token_vectors + position_vectors
        return self.drop_out(summed)

In [93]:
class ScaleDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_tensor)) V``.

    The scale factor is computed with ``** 0.5`` so the class does not depend on the
    ``math`` module being imported by some other notebook cell.
    """

    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, inf_value=1e12):
        """Compute attention.

        Args:
            q, k, v: tensors of shape [batch_size, head, length, d_tensor]
                (d_tensor = d_model // n_head).
            mask: optional tensor broadcastable to [batch_size, head, len_query, len_key];
                positions where it is False/0 are excluded from attention.
            inf_value: magnitude of the large negative score written into masked positions.

        Returns:
            (output, score): output is (B, n_head, L_q, d_tensor) and score holds the
            attention weights of shape (B, n_head, L_q, L_k).
        """
        d_tensor = k.size(-1)

        k_t = k.transpose(2, 3)                    # (B, n_head, d_tensor, L_k)
        score = (q @ k_t) / (d_tensor ** 0.5)      # (B, n_head, L_q, L_k)

        if mask is not None:
            # A very negative score becomes ~0 after softmax; writing 0 instead would
            # give e^0 = 1 and the masked position would still receive weight.
            score = score.masked_fill(mask == 0, (-1) * inf_value)

        score = self.softmax(score)                # (B, n_head, L_q, L_k)

        # @ is a matrix product (not an element-wise multiplication)
        v = score @ v                              # (B, n_head, L_q, d_tensor)

        return v, score

In [94]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project q/k/v, attend per head, merge heads, project again.

    Args:
        d_model: model (embedding) dimension.
        n_head: number of attention heads; must be positive and divide ``d_model``.

    Raises:
        ValueError: if ``d_model`` cannot be split evenly across ``n_head`` heads.
    """

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        # Fail early with a clear message instead of an obscure view() error in split().
        if n_head <= 0 or d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_head ({n_head})"
            )
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Apply multi-head attention.

        Args:
            q, k, v: tensors of shape [batch_size, length, d_model].
            mask: optional mask of shape [batch_size, 1, len_query, len_key].

        Returns:
            Tensor of shape [batch_size, len_query, d_model].
        """
        # One full-width linear layer per input; the result is split into heads afterwards,
        # which is equivalent to applying a separate projection per head.
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)          # [batch_size, length, d_model]

        q, k, v = self.split(q), self.split(k), self.split(v)    # [batch_size, head, length, d_tensor]

        # The attention weights (second return value) are not needed here.
        out, _ = self.attention(q, k, v, mask=mask)              # (B, n_head, L_q, d_tensor)

        out = self.concat(out)                                   # (batch_size, length, d_model)
        out = self.w_concat(out)                                 # (batch_size, length, d_model)

        return out

    def split(self, tensor):
        """Reshape [batch_size, length, d_model] into [batch_size, head, length, d_tensor]."""
        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)

        return tensor

    def concat(self, tensor):
        """Inverse of ``split``: [batch_size, head, length, d_tensor] -> [batch_size, length, d_model]."""
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor

        # transpose() yields a non-contiguous tensor, so contiguous() is required before view()
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor

In [95]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    Computes ``linear2(dropout(relu(linear1(x))))``.

    Args:
        d_model: input and output feature size.
        hidden: feature size of the inner layer.
        drop_prob: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Map ``x`` of shape (..., d_model) to a tensor of the same shape."""
        inner = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(inner)

In [96]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network, each followed by
    dropout, a residual connection and layer normalisation (post-norm)."""

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()
        # self-attention sub-layer
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        # position-wise feed-forward sub-layer
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, src_mask):
        """Run the block on ``x`` using ``src_mask`` as the self-attention mask."""
        # 1-2. self attention, then add & norm
        attended = self.attention(q=x, k=x, v=x, mask=src_mask)
        x = self.norm1(self.dropout1(attended) + x)

        # 3-4. position-wise feed forward, then add & norm
        transformed = self.ffn(x)
        return self.norm2(self.dropout2(transformed) + x)

In [97]:
class Encoder(nn.Module):
    """Transformer encoder: an embedding layer followed by ``n_layers`` EncoderLayer blocks."""

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(vocab_size=enc_voc_size,
                                        d_model=d_model,
                                        max_len=max_len,
                                        drop_prob=drop_prob,
                                        device=device)

        blocks = []
        for _ in range(n_layers):
            blocks.append(EncoderLayer(d_model=d_model,
                                       ffn_hidden=ffn_hidden,
                                       n_head=n_head,
                                       drop_prob=drop_prob))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, src_mask):
        """Embed the token indices ``x`` and pass them through every layer in order."""
        hidden = self.emb(x)
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden

In [98]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention and a
    feed-forward network, each followed by dropout, a residual connection and LayerNorm."""

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()
        # masked self-attention sub-layer
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        # encoder-decoder attention sub-layer
        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        # position-wise feed-forward sub-layer
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask):
        """Run the block.

        Args:
            dec: decoder-side input.
            enc: encoder output used as key/value; the cross-attention step is skipped when None.
            trg_mask: mask for the decoder self-attention.
            src_mask: mask for the encoder-decoder attention.
        """
        # 1-2. self attention, then add & norm
        self_attended = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
        x = self.norm1(self.dropout1(self_attended) + dec)

        if enc is not None:
            # 3-4. encoder-decoder attention, then add & norm
            cross_attended = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
            x = self.norm2(self.dropout2(cross_attended) + x)

        # 5-6. position-wise feed forward, then add & norm
        transformed = self.ffn(x)
        return self.norm3(self.dropout3(transformed) + x)

In [99]:
class Decoder(nn.Module):
    """Transformer decoder: embedding, ``n_layers`` DecoderLayer blocks and a final
    linear layer that maps each position to ``dec_voc_size`` scores."""

    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(vocab_size=dec_voc_size,
                                        d_model=d_model,
                                        max_len=max_len,
                                        drop_prob=drop_prob,
                                        device=device)

        blocks = []
        for _ in range(n_layers):
            blocks.append(DecoderLayer(d_model=d_model,
                                       ffn_hidden=ffn_hidden,
                                       n_head=n_head,
                                       drop_prob=drop_prob))
        self.layers = nn.ModuleList(blocks)

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, src, trg_mask, src_mask):
        """Decode.

        Args:
            trg: target token indices (they are embedded here).
            src: passed unchanged to every layer as its ``enc`` argument.
            trg_mask: mask for the decoder self-attention.
            src_mask: mask for the encoder-decoder attention.

        Returns:
            Scores over the target vocabulary for every target position.
        """
        hidden = self.emb(trg)

        for block in self.layers:
            hidden = block(hidden, src, trg_mask, src_mask)

        # project to vocabulary size (LM head)
        return self.linear(hidden)

In [100]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer operating on batch-first index tensors.

    Args:
        pad_idx: padding index; the same value is used to build the masks for both the
            source and the target sequences.
        enc_voc_size / dec_voc_size: source / target vocabulary sizes.
        d_model, n_head, max_len, ffn_hidden, n_layers, drop_prob: forwarded to the
            Encoder and Decoder.
        device: forwarded to the Encoder and Decoder and stored as ``self.device``.
            Masks are created on the device of the input tensors, so they stay correct
            even if the model is moved after construction.
    """

    def __init__(self, pad_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len,
                 ffn_hidden, n_layers, drop_prob, device):
        super().__init__()
        self.pad_idx = pad_idx  # padding index used to equalise sentence lengths
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device
                               )

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device
                               )

    def forward(self, src, trg):
        """Run encoder and decoder on ``src`` / ``trg`` index tensors of shape [Batch, Length]."""
        src_mask = self.make_pad_mask(src, src)       # [batch_size, 1, len_src, len_src]

        src_trg_mask = self.make_pad_mask(trg, src)   # [batch_size, 1, len_trg, len_src]

        # Padding mask [batch_size, 1, len_trg, len_trg] combined with the causal mask
        # [len_trg, len_trg] (broadcast): a position is visible only if both allow it.
        trg_mask = self.make_pad_mask(trg, trg) & self.make_no_peak_mask(trg, trg)

        enc_src = self.encoder(src, src_mask)         # (batch_size, len_src, d_model)
        output = self.decoder(trg, enc_src, trg_mask, src_trg_mask)
        return output

    def make_pad_mask(self, q, k):
        """Boolean mask [batch_size, 1, len_q, len_k]; True where neither the query token
        nor the key token is padding.

        Args:
            q, k: index tensors of shape [Batch, Length].
        """
        key_ok = k.ne(self.pad_idx).unsqueeze(1).unsqueeze(2)     # batch_size x 1 x 1 x len_k
        query_ok = q.ne(self.pad_idx).unsqueeze(1).unsqueeze(3)   # batch_size x 1 x len_q x 1

        # Broadcasting expands both operands to batch_size x 1 x len_q x len_k.
        return key_ok & query_ok

    def make_no_peak_mask(self, q, k):
        """Boolean lower-triangular mask [len_q, len_k] (True on and below the diagonal),
        created on the same device as ``q``.

        Args:
            q, k: index tensors of shape [Batch, Length]; only their lengths and q's device are used.
        """
        len_q, len_k = q.size(1), k.size(1)

        # tril zeroes everything above the diagonal, so position i cannot see positions > i.
        mask = torch.ones(len_q, len_k, device=q.device).tril().bool()

        return mask

In [215]:
# 여기서는 xavier_uniform_이 더 빠르게 수렴함. (모델 초기화는 상황에 따라 다르므로 다 해보고 좋은것을 선택)
# def initialize_weights(m):
#     # convolution kernel의 weight를 He initialization을 적용한다.
#     if isinstance(m, nn.Conv2d):
#         nn.init.kaiming_uniform_(m.weight.data,nonlinearity='relu')

#         # bias는 상수 0으로 초기화 한다.
#         if m.bias is not None:
#             nn.init.constant_(m.bias, 0)

#     elif isinstance(m, nn.BatchNorm2d):
#         nn.init.constant_(m.weight, 1)
#         nn.init.constant_(m.bias, 0)

#     elif isinstance(m, nn.Linear):
#         nn.init.kaiming_uniform_(m.weight.data,nonlinearity='relu')

#         # bias는 상수 0으로 초기화 한다.
#         if m.bias is not None:
#             nn.init.constant_(m.bias, 0)

In [237]:
def initialize_weights(m):
    """Xavier-uniform initialisation hook, intended for ``model.apply``.

    * ``nn.Conv2d`` and ``nn.Linear``: the weight is drawn with ``xavier_uniform_`` and the
      bias, when present, is set to the constant 0.
    * ``nn.BatchNorm2d``: the weight is set to 1 and the bias to 0.
    * Every other module type is left untouched.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.xavier_uniform_(m.weight.data)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
        return

    if isinstance(m, nn.BatchNorm2d):
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)

# def initialize_weights_base(m):
#     if hasattr(m, 'weight') and m.weight.dim() > 1:
#         nn.init.xavier_uniform_(m.weight.data)

## initialize_weights_base
# Epoch: 10 | Time: 0m 8s 
# 	Train Loss: 2.251 | Train PPL: 9.498
# 	Validation Loss: 2.151 | Validation PPL: 8.590

## initialize_weights
# Epoch: 10 | Time: 0m 8s
# 	Train Loss: 1.565 | Train PPL: 4.781
# 	Validation Loss: 1.723 | Validation PPL: 5.601

In [226]:
def initialize_weights(m):
    """Apply ``xavier_uniform_`` to the weight of any module whose weight has more than
    one dimension; biases and one-dimensional weights are left untouched.

    NOTE(review): an earlier cell defines a function with this same name. When the
    notebook is executed top to bottom, this definition replaces the earlier one and is
    the one ``model.apply(initialize_weights)`` will use - confirm that this is intended.
    """
    if not hasattr(m, 'weight'):
        return
    if m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)


In [238]:
# Vocabulary sizes taken from the Field vocabularies
enc_voc_size = len(SRC.vocab)
dec_voc_size = len(TRG.vocab)

batch_size = 128
max_len = 256 # number of positions in the positional-encoding table; chosen arbitrarily (increase it if a batch is longer)
d_model = 256
n_layers = 3
n_heads = 8
ffn_hidden = 512
drop_prob = 0.1

SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token] # 1
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token] # 1
print(SRC_PAD_IDX)
# unknown word 0, padding 1, START 2, END 3

1


In [239]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [240]:
# Build the model and move it to `device`.
# NOTE(review): SRC_PAD_IDX is passed as the single pad_idx, and Transformer uses it for the target-side masks as well; this relies on both vocabularies sharing the same padding index.
model = Transformer(SRC_PAD_IDX, enc_voc_size, dec_voc_size, d_model, n_heads, max_len, ffn_hidden, n_layers, drop_prob,device).to(device)

In [241]:
# Call initialize_weights on every sub-module of the model (nn.Module.apply recurses and returns the model itself)
model.apply(initialize_weights)

Transformer(
  (encoder): Encoder(
    (emb): TransformerEmbedding(
      (tok_emb): Embedding(7853, 256)
      (pos_emb): PositionalEncoding()
      (drop_out): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (attention): ScaleDotProductAttention(
            (softmax): Softmax(dim=-1)
          )
          (w_q): Linear(in_features=256, out_features=256, bias=True)
          (w_k): Linear(in_features=256, out_features=256, bias=True)
          (w_v): Linear(in_features=256, out_features=256, bias=True)
          (w_concat): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
      

In [242]:
import torch.optim as optim

# Optimise with Adam
LEARNING_RATE = 0.0005
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Ignore padding positions when computing the loss. The loss is computed against
# *target* tokens, so the target-side padding index is the one that must be ignored.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [243]:
def train(model, iterator, optimizer, criterion, clip, log_interval=1):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    Args:
        model: callable as ``model(src, trg_in)`` returning scores of shape
            [Batch, trg_len - 1, output_dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg`` ([Batch, Length]).
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss taking ([N, output_dim] scores, [N] target indices).
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        log_interval: print progress every ``log_interval`` steps. The default of 1 prints
            every step (the original behaviour); 0 or None disables the progress output.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    epoch_loss = 0
    n_batches = len(iterator)
    for i, batch in enumerate(iterator):
        src = batch.src # Batch,Length
        trg = batch.trg # Batch,Length

        optimizer.zero_grad()

        # Teacher forcing: feed the target without its last token ...
        output = model(src, trg[:, :-1]) # [Batch, trg_len - 1, output_dim]
        output_reshape = output.contiguous().view(-1, output.shape[-1]) # [Batch*(trg_len-1), output_dim]
        # ... and predict the target shifted by one (the start token is dropped).
        target = trg[:, 1:].contiguous().view(-1) # [Batch*(trg_len-1)]

        loss = criterion(output_reshape, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # Read the scalar once; every .item() call copies the value back to the host.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if log_interval and i % log_interval == 0:
            print('step :', round((i / n_batches) * 100, 2), '% , loss :', batch_loss)

    return epoch_loss / n_batches

In [244]:
def evaluate(model, iterator, criterion):
    """Return the mean batch loss of ``model`` over ``iterator`` without updating weights.

    The model is switched to evaluation mode and gradients are disabled. Each batch must
    expose ``.src`` and ``.trg`` tensors of shape [Batch, Length].
    """
    model.eval() # evaluation mode
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # Feed the target without its last token -> [Batch, trg_len - 1, output_dim]
            scores = model(source, target[:, :-1])
            flat_scores = scores.contiguous().view(-1, scores.shape[-1]) # [Batch*(trg_len-1), output_dim]

            # Compare against the target without its start token -> [Batch*(trg_len-1)]
            flat_target = target[:, 1:].contiguous().view(-1)

            # Accumulate the loss of this batch
            running_loss += criterion(flat_scores, flat_target).item()

    return running_loss / len(iterator)

In [245]:
import math
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and the
    remaining whole seconds, returned as ``(minutes, seconds)``."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [246]:
import time
import math
import random

N_EPOCHS = 10
CLIP = 1 # passed to train() as the maximum gradient norm
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time() # record the start time

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time() # record the end time
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save a checkpoint whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer_german_to_english.pt')

    # PPL (perplexity) is exp(loss)
    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f} | Validation PPL: {math.exp(valid_loss):.3f}')

step : 0.0 % , loss : 8.801682472229004
step : 0.44 % , loss : 8.295473098754883
step : 0.88 % , loss : 8.074953079223633
step : 1.32 % , loss : 7.89055871963501
step : 1.76 % , loss : 7.775366306304932
step : 2.2 % , loss : 7.650823593139648
step : 2.64 % , loss : 7.541178226470947
step : 3.08 % , loss : 7.315464019775391
step : 3.52 % , loss : 7.229629039764404
step : 3.96 % , loss : 7.145270824432373
step : 4.41 % , loss : 7.002967357635498
step : 4.85 % , loss : 6.839483737945557
step : 5.29 % , loss : 6.7777862548828125
step : 5.73 % , loss : 6.670000076293945
step : 6.17 % , loss : 6.5554680824279785
step : 6.61 % , loss : 6.426968097686768
step : 7.05 % , loss : 6.329578876495361
step : 7.49 % , loss : 6.259642124176025
step : 7.93 % , loss : 6.123231410980225
step : 8.37 % , loss : 6.055042743682861
step : 8.81 % , loss : 5.967118740081787
step : 9.25 % , loss : 5.815183162689209
step : 9.69 % , loss : 5.829502582550049
step : 10.13 % , loss : 5.724875450134277
step : 10.57 % ,

step : 87.22 % , loss : 3.789759635925293
step : 87.67 % , loss : 3.76476788520813
step : 88.11 % , loss : 3.740016222000122
step : 88.55 % , loss : 3.762610912322998
step : 88.99 % , loss : 3.682814359664917
step : 89.43 % , loss : 3.7117550373077393
step : 89.87 % , loss : 3.7508819103240967
step : 90.31 % , loss : 3.7757744789123535
step : 90.75 % , loss : 3.776050090789795
step : 91.19 % , loss : 3.5930793285369873
step : 91.63 % , loss : 3.739190101623535
step : 92.07 % , loss : 3.6405460834503174
step : 92.51 % , loss : 3.874835252761841
step : 92.95 % , loss : 3.641096830368042
step : 93.39 % , loss : 3.7318990230560303
step : 93.83 % , loss : 3.83461594581604
step : 94.27 % , loss : 3.726825475692749
step : 94.71 % , loss : 3.857269287109375
step : 95.15 % , loss : 3.8498497009277344
step : 95.59 % , loss : 3.5008888244628906
step : 96.04 % , loss : 3.5836448669433594
step : 96.48 % , loss : 3.7650539875030518
step : 96.92 % , loss : 3.7191152572631836
step : 97.36 % , loss : 3

step : 71.81 % , loss : 3.3474395275115967
step : 72.25 % , loss : 3.3069040775299072
step : 72.69 % , loss : 3.2128946781158447
step : 73.13 % , loss : 3.3499045372009277
step : 73.57 % , loss : 3.3387703895568848
step : 74.01 % , loss : 3.1667299270629883
step : 74.45 % , loss : 3.3701772689819336
step : 74.89 % , loss : 3.124979257583618
step : 75.33 % , loss : 3.2104411125183105
step : 75.77 % , loss : 3.4442923069000244
step : 76.21 % , loss : 3.249596118927002
step : 76.65 % , loss : 3.0336382389068604
step : 77.09 % , loss : 3.1652135848999023
step : 77.53 % , loss : 3.207982301712036
step : 77.97 % , loss : 3.3095028400421143
step : 78.41 % , loss : 3.341764211654663
step : 78.85 % , loss : 3.238327980041504
step : 79.3 % , loss : 3.215526819229126
step : 79.74 % , loss : 3.115304470062256
step : 80.18 % , loss : 3.431718587875366
step : 80.62 % , loss : 3.2817583084106445
step : 81.06 % , loss : 3.1962320804595947
step : 81.5 % , loss : 3.1206374168395996
step : 81.94 % , loss

step : 57.71 % , loss : 2.8210604190826416
step : 58.15 % , loss : 2.8008761405944824
step : 58.59 % , loss : 2.9240756034851074
step : 59.03 % , loss : 2.695463180541992
step : 59.47 % , loss : 2.8344340324401855
step : 59.91 % , loss : 3.1152236461639404
step : 60.35 % , loss : 2.8179244995117188
step : 60.79 % , loss : 2.803516387939453
step : 61.23 % , loss : 2.8730926513671875
step : 61.67 % , loss : 3.0276968479156494
step : 62.11 % , loss : 2.7417185306549072
step : 62.56 % , loss : 2.740041732788086
step : 63.0 % , loss : 2.899197578430176
step : 63.44 % , loss : 2.743548631668091
step : 63.88 % , loss : 2.8847649097442627
step : 64.32 % , loss : 2.9106006622314453
step : 64.76 % , loss : 2.8033502101898193
step : 65.2 % , loss : 2.8529601097106934
step : 65.64 % , loss : 2.9090232849121094
step : 66.08 % , loss : 2.849066734313965
step : 66.52 % , loss : 2.891796827316284
step : 66.96 % , loss : 2.649324893951416
step : 67.4 % , loss : 2.8628969192504883
step : 67.84 % , loss 

step : 42.29 % , loss : 2.675055742263794
step : 42.73 % , loss : 2.6089251041412354
step : 43.17 % , loss : 2.558142900466919
step : 43.61 % , loss : 2.5719571113586426
step : 44.05 % , loss : 2.5877208709716797
step : 44.49 % , loss : 2.5588619709014893
step : 44.93 % , loss : 2.479030132293701
step : 45.37 % , loss : 2.666048765182495
step : 45.81 % , loss : 2.5640385150909424
step : 46.26 % , loss : 2.430650472640991
step : 46.7 % , loss : 2.439948320388794
step : 47.14 % , loss : 2.4175829887390137
step : 47.58 % , loss : 2.611564874649048
step : 48.02 % , loss : 2.5779378414154053
step : 48.46 % , loss : 2.4727261066436768
step : 48.9 % , loss : 2.565427780151367
step : 49.34 % , loss : 2.5440425872802734
step : 49.78 % , loss : 2.6068530082702637
step : 50.22 % , loss : 2.5398619174957275
step : 50.66 % , loss : 2.6005334854125977
step : 51.1 % , loss : 2.635530471801758
step : 51.54 % , loss : 2.342151403427124
step : 51.98 % , loss : 2.572678804397583
step : 52.42 % , loss : 2

step : 28.63 % , loss : 2.410935163497925
step : 29.07 % , loss : 2.426849126815796
step : 29.52 % , loss : 2.3442254066467285
step : 29.96 % , loss : 2.2287514209747314
step : 30.4 % , loss : 2.120941400527954
step : 30.84 % , loss : 2.308159112930298
step : 31.28 % , loss : 2.3409712314605713
step : 31.72 % , loss : 2.392653703689575
step : 32.16 % , loss : 2.37748122215271
step : 32.6 % , loss : 2.199157953262329
step : 33.04 % , loss : 2.198206901550293
step : 33.48 % , loss : 2.362462282180786
step : 33.92 % , loss : 2.3515005111694336
step : 34.36 % , loss : 2.257678747177124
step : 34.8 % , loss : 2.1659750938415527
step : 35.24 % , loss : 2.3233070373535156
step : 35.68 % , loss : 2.282806873321533
step : 36.12 % , loss : 2.424041509628296
step : 36.56 % , loss : 2.2796854972839355
step : 37.0 % , loss : 2.185537099838257
step : 37.44 % , loss : 2.3973941802978516
step : 37.89 % , loss : 2.2983126640319824
step : 38.33 % , loss : 2.390939235687256
step : 38.77 % , loss : 2.3097

step : 13.22 % , loss : 2.0229721069335938
step : 13.66 % , loss : 2.1024558544158936
step : 14.1 % , loss : 2.136223793029785
step : 14.54 % , loss : 2.2031219005584717
step : 14.98 % , loss : 2.0916271209716797
step : 15.42 % , loss : 2.1149911880493164
step : 15.86 % , loss : 2.0195858478546143
step : 16.3 % , loss : 2.138136386871338
step : 16.74 % , loss : 2.056593418121338
step : 17.18 % , loss : 2.093890428543091
step : 17.62 % , loss : 2.1512436866760254
step : 18.06 % , loss : 2.137345552444458
step : 18.5 % , loss : 2.262291431427002
step : 18.94 % , loss : 2.111815929412842
step : 19.38 % , loss : 2.2616701126098633
step : 19.82 % , loss : 2.089376211166382
step : 20.26 % , loss : 2.1547341346740723
step : 20.7 % , loss : 2.2561228275299072
step : 21.15 % , loss : 2.148977041244507
step : 21.59 % , loss : 2.141474962234497
step : 22.03 % , loss : 1.9826527833938599
step : 22.47 % , loss : 2.187077045440674
step : 22.91 % , loss : 2.1756575107574463
step : 23.35 % , loss : 2.

Epoch: 06 | Time: 0m 8s
	Train Loss: 2.108 | Train PPL: 8.230
	Validation Loss: 1.985 | Validation PPL: 7.281
step : 0.0 % , loss : 1.791021704673767
step : 0.44 % , loss : 1.9374475479125977
step : 0.88 % , loss : 1.7574045658111572
step : 1.32 % , loss : 1.9116246700286865
step : 1.76 % , loss : 1.9793123006820679
step : 2.2 % , loss : 1.932409644126892
step : 2.64 % , loss : 1.8902413845062256
step : 3.08 % , loss : 2.011996030807495
step : 3.52 % , loss : 1.8121767044067383
step : 3.96 % , loss : 1.972497820854187
step : 4.41 % , loss : 1.9084155559539795
step : 4.85 % , loss : 2.012742280960083
step : 5.29 % , loss : 2.073146343231201
step : 5.73 % , loss : 1.9084762334823608
step : 6.17 % , loss : 1.9319900274276733
step : 6.61 % , loss : 1.887380599975586
step : 7.05 % , loss : 2.0409748554229736
step : 7.49 % , loss : 1.8498364686965942
step : 7.93 % , loss : 1.9673994779586792
step : 8.37 % , loss : 2.0053622722625732
step : 8.81 % , loss : 2.0065128803253174
step : 9.25 % , l

step : 86.34 % , loss : 1.977333664894104
step : 86.78 % , loss : 1.745039463043213
step : 87.22 % , loss : 1.8117177486419678
step : 87.67 % , loss : 1.9380615949630737
step : 88.11 % , loss : 1.988111138343811
step : 88.55 % , loss : 1.9332693815231323
step : 88.99 % , loss : 1.9654865264892578
step : 89.43 % , loss : 1.8776130676269531
step : 89.87 % , loss : 2.004777193069458
step : 90.31 % , loss : 1.9109052419662476
step : 90.75 % , loss : 1.857208251953125
step : 91.19 % , loss : 1.9543542861938477
step : 91.63 % , loss : 1.9462770223617554
step : 92.07 % , loss : 2.021052598953247
step : 92.51 % , loss : 1.913454294204712
step : 92.95 % , loss : 2.0148308277130127
step : 93.39 % , loss : 1.8390083312988281
step : 93.83 % , loss : 1.9096697568893433
step : 94.27 % , loss : 1.8311681747436523
step : 94.71 % , loss : 2.058652877807617
step : 95.15 % , loss : 1.858422875404358
step : 95.59 % , loss : 1.9519941806793213
step : 96.04 % , loss : 1.9052109718322754
step : 96.48 % , los

step : 70.93 % , loss : 1.8446755409240723
step : 71.37 % , loss : 1.6834428310394287
step : 71.81 % , loss : 1.857479453086853
step : 72.25 % , loss : 1.7722527980804443
step : 72.69 % , loss : 1.9396270513534546
step : 73.13 % , loss : 1.8946176767349243
step : 73.57 % , loss : 1.8714238405227661
step : 74.01 % , loss : 1.8350355625152588
step : 74.45 % , loss : 1.7523598670959473
step : 74.89 % , loss : 1.7938815355300903
step : 75.33 % , loss : 1.794954776763916
step : 75.77 % , loss : 1.8068286180496216
step : 76.21 % , loss : 1.7856336832046509
step : 76.65 % , loss : 1.976407766342163
step : 77.09 % , loss : 1.842067003250122
step : 77.53 % , loss : 1.6985212564468384
step : 77.97 % , loss : 1.7602702379226685
step : 78.41 % , loss : 1.9434304237365723
step : 78.85 % , loss : 1.8620011806488037
step : 79.3 % , loss : 1.63624906539917
step : 79.74 % , loss : 1.8865019083023071
step : 80.18 % , loss : 1.944370985031128
step : 80.62 % , loss : 1.7525254487991333
step : 81.06 % , lo

step : 55.07 % , loss : 1.7824145555496216
step : 55.51 % , loss : 1.716921091079712
step : 55.95 % , loss : 1.6825144290924072
step : 56.39 % , loss : 1.7136036157608032
step : 56.83 % , loss : 1.6969246864318848
step : 57.27 % , loss : 1.6115700006484985
step : 57.71 % , loss : 1.6992357969284058
step : 58.15 % , loss : 1.6133757829666138
step : 58.59 % , loss : 1.6904641389846802
step : 59.03 % , loss : 1.5912444591522217
step : 59.47 % , loss : 1.582250952720642
step : 59.91 % , loss : 1.6831692457199097
step : 60.35 % , loss : 1.6673469543457031
step : 60.79 % , loss : 1.6320255994796753
step : 61.23 % , loss : 1.8425604104995728
step : 61.67 % , loss : 1.525709867477417
step : 62.11 % , loss : 1.723475694656372
step : 62.56 % , loss : 1.7071369886398315
step : 63.0 % , loss : 1.6712079048156738
step : 63.44 % , loss : 1.6610499620437622
step : 63.88 % , loss : 1.7917404174804688
step : 64.32 % , loss : 1.8531543016433716
step : 64.76 % , loss : 1.6413521766662598
step : 65.2 % , 

step : 39.21 % , loss : 1.6745364665985107
step : 39.65 % , loss : 1.496378779411316
step : 40.09 % , loss : 1.615454077720642
step : 40.53 % , loss : 1.5667877197265625
step : 40.97 % , loss : 1.441941738128662
step : 41.41 % , loss : 1.4123860597610474
step : 41.85 % , loss : 1.574682354927063
step : 42.29 % , loss : 1.5712674856185913
step : 42.73 % , loss : 1.4710636138916016
step : 43.17 % , loss : 1.4264535903930664
step : 43.61 % , loss : 1.5095493793487549
step : 44.05 % , loss : 1.4539296627044678
step : 44.49 % , loss : 1.7464901208877563
step : 44.93 % , loss : 1.5833818912506104
step : 45.37 % , loss : 1.596808671951294
step : 45.81 % , loss : 1.5507242679595947
step : 46.26 % , loss : 1.6513992547988892
step : 46.7 % , loss : 1.5212410688400269
step : 47.14 % , loss : 1.4610700607299805
step : 47.58 % , loss : 1.5668747425079346
step : 48.02 % , loss : 1.3979589939117432
step : 48.46 % , loss : 1.6900055408477783
step : 48.9 % , loss : 1.5669993162155151
step : 49.34 % , l

In [208]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Iterates over ``model.parameters()`` and adds up ``numel()`` for every
    parameter whose ``requires_grad`` flag is truthy; all other parameters
    are ignored.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the number of trainable parameters of `model`, with thousands separators.
trainable_param_count = count_parameters(model)
print(f'The model has {trainable_param_count:,} trainable parameters')
     

The model has 8,987,141 trainable parameters


In [209]:
# Run the notebook's `evaluate` helper on the test iterator.
test_loss = evaluate(model, test_iterator, criterion)

# Perplexity is reported as exp(loss); both values are shown with 3 decimals.
test_ppl = math.exp(test_loss)
print(f'Test Loss: {test_loss:.3f} | Test PPL: {test_ppl:.3f}')

Test Loss: 1.933 | Test PPL: 6.910


In [213]:
# Translation function (greedy decoding)
def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50, logging=True):
    """Translate a single source sentence by greedy decoding.

    Args:
        sentence: Either a raw string or an already tokenized sequence of
            strings. A raw string is tokenized with ``src_field.tokenize`` so
            that inference uses the same tokenizer the field was built with.
            Tokens are lower-cased in both cases.
        src_field: Source-language field. Must provide ``init_token``,
            ``eos_token``, ``tokenize`` and ``vocab.stoi``.
        trg_field: Target-language field. Must provide ``init_token``,
            ``eos_token``, ``vocab.stoi`` and ``vocab.itos``.
        model: Object exposing ``eval()``, ``make_pad_mask(q, k)``,
            ``make_no_peak_mask(q, k)``, ``encoder(src, src_mask)`` and
            ``decoder(trg, enc_src, trg_mask, src_trg_mask)``.
        device: Device the input tensors are moved to.
        max_len: Maximum number of decoding steps, i.e. generated tokens.
        logging: When true, print the source tokens and their vocabulary indices.

    Returns:
        The predicted target tokens as a list of strings, without the leading
        start token. The end token is kept as the last element when the model
        emitted it within ``max_len`` steps.
    """
    model.eval()  # evaluation mode

    # Previously a raw string was tokenized via spacy.load('de'), which reloaded
    # a pipeline on every call and relied on a shortcut name that newer spaCy
    # releases no longer resolve. Reuse the field's own tokenizer instead.
    if isinstance(sentence, str):
        raw_tokens = src_field.tokenize(sentence)
    else:
        raw_tokens = sentence
    tokens = [token.lower() for token in raw_tokens]

    # Wrap the sentence in the start and end tokens.
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    if logging:
        print(f"전체 소스 토큰: {tokens}")

    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    if logging:
        print(f"소스 문장 인덱스: {src_indexes}")

    # Add a leading batch dimension of size 1.
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    # Decoding starts from the start token only, because the target sentence is unknown.
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    # Loop-invariant lookup, hoisted out of the decoding loop.
    eos_index = trg_field.vocab.stoi[trg_field.eos_token]

    with torch.no_grad():
        # Encode the source once; the result is reused at every decoding step.
        src_mask = model.make_pad_mask(src_tensor, src_tensor)
        enc_src = model.encoder(src_tensor, src_mask)

        for _ in range(max_len):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

            # Masks are rebuilt every step because the target prefix grows by one token.
            # NOTE(review): make_no_peak_mask presumably hides future positions - confirm.
            trg_mask = model.make_pad_mask(trg_tensor, trg_tensor) * model.make_no_peak_mask(trg_tensor, trg_tensor)
            src_trg_mask = model.make_pad_mask(trg_tensor, src_tensor)
            output = model.decoder(trg_tensor, enc_src, trg_mask, src_trg_mask)

            # Keep only the prediction at the last target position.
            # assumes output is (batch, trg_len, vocab_size) - TODO confirm
            pred_token = output.argmax(-1)[:, -1].item()
            trg_indexes.append(pred_token)

            # Stop as soon as the end token is produced.
            if pred_token == eos_index:
                break

    # Map each predicted index back to its token string.
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    # Drop the leading start token.
    return trg_tokens[1:]

In [214]:
# Index of the test example to translate.
example_idx = 10

# Read the tokenized source/target pair stored on the chosen example.
example_fields = vars(test_dataset.examples[example_idx])
src, trg = example_fields['src'], example_fields['trg']

print(f'소스 문장: {src}')
print(f'타겟 문장: {trg}')

# Translate the already tokenized source; logging=True also prints tokens and indices.
translation = translate_sentence(src, SRC, TRG, model, device, logging=True)

print("모델 출력 결과:", " ".join(translation))

소스 문장: ['eine', 'mutter', 'und', 'ihr', 'kleiner', 'sohn', 'genießen', 'einen', 'schönen', 'tag', 'im', 'freien', '.']
타겟 문장: ['a', 'mother', 'and', 'her', 'young', 'song', 'enjoying', 'a', 'beautiful', 'day', 'outside', '.']
전체 소스 토큰: ['<sos>', 'eine', 'mutter', 'und', 'ihr', 'kleiner', 'sohn', 'genießen', 'einen', 'schönen', 'tag', 'im', 'freien', '.', '<eos>']
소스 문장 인덱스: [2, 8, 364, 10, 134, 70, 624, 565, 19, 780, 200, 20, 88, 4, 3]
모델 출력 결과: a mother and her daughter are enjoying a beautiful day outdoors . <eos>


In [None]:
# # 실제로 길이가 다른 문장을 이용할때 사용하는 방법 예시

# train_src_data = [
# [62, 13, 47, 39, 78, 33, 56, 13],
# [60, 96, 51, 32, 90],
# [35, 45, 48, 65, 91, 99, 92, 10,  3, 21],
# [66, 88, 98, 47],
# [77, 65, 51, 77, 19, 15, 35, 19, 23]
# ]

# train_trg_data = [
# [1, 33, 11, 49, 10],
# [1, 88, 34,  5, 29, 99, 45, 11, 25],
# [1, 67, 25, 15, 90, 54,  4, 92, 10, 46, 20, 88, 19],
# [1, 16, 58, 91, 47, 12,  5,  8],
# [1, 71, 63, 62,  7,  9, 11, 55, 91, 32, 48]
# ]

# class CustomDataset(torch.utils.data.Dataset):
#     def __init__(self,src_data,trg_data):
#         super().__init__()
#         self.src_data = src_data
#         self.trg_data = trg_data

#     def __getitem__(self,index):
#         return torch.LongTensor(self.src_data[index]),torch.LongTensor(self.trg_data[index])

#     def __len__(self):
#         return len(self.src_data)

# def collate_fn(batch):
#     src_batch, tgt_batch = [], []
#     for src_sample, tgt_sample in batch:
#         src_batch.append(src_sample)
#         tgt_batch.append(tgt_sample)

#     src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=0)
#     tgt_batch = nn.utils.rnn.pad_sequence(tgt_batch, padding_value=0)
#     return src_batch.transpose(0,1), tgt_batch.transpose(0,1)

# dataset = CustomDataset(train_src_data,train_trg_data)
# train_loader = torch.utils.data.DataLoader(dataset,batch_size=2,shuffle=True,collate_fn=collate_fn)
# src,trg = next(iter(train_loader))
# print(src)
# print(src.size()) # Batch,Length
# print(trg)
# print(trg.size()) # Batch,Length
# '''
# tensor([[60, 96, 51, 32, 90,  0,  0,  0],
#         [62, 13, 47, 39, 78, 33, 56, 13]])
# torch.Size([2, 8])
# tensor([[ 1, 88, 34,  5, 29, 99, 45, 11, 25],
#         [ 1, 33, 11, 49, 10,  0,  0,  0,  0]])
# torch.Size([2, 9])
# '''