# Transformer

Reference : [https://github.com/ndb796/Deep-Learning-Paper-Review-and-Practice/blob/master/code_practices/Attention_is_All_You_Need_Tutorial_(German_English).ipynb](https://github.com/ndb796/Deep-Learning-Paper-Review-and-Practice/blob/master/code_practices/Attention_is_All_You_Need_Tutorial_(German_English).ipynb)

In [52]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import time
import math

import spacy
from torchtext.data import Field
from torchtext.datasets import Multi30k
from torchtext.data import BucketIterator

In [36]:
def get_multi30k(lower=True, batch_first=True, min_freq=2):
    """Load the Multi30k German/English splits and build both vocabularies.

    Args:
        lower: Forwarded to both ``Field`` objects as ``lower``.
        batch_first: Forwarded to both ``Field`` objects as ``batch_first``.
        min_freq: Forwarded to ``build_vocab`` as ``min_freq`` for both fields.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset, SRC, TRG)``. ``SRC`` is
        the German field and ``TRG`` the English field; both vocabularies are
        built from the training split only.
    """
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        """Split German text into token strings with the spaCy tokenizer."""
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        """Split English text into token strings with the spaCy tokenizer."""
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Both fields share every option except the tokenizer.
    shared_options = dict(init_token="<sos>", eos_token="<eos>", lower=lower, batch_first=batch_first)
    SRC = Field(tokenize=tokenize_de, **shared_options)
    TRG = Field(tokenize=tokenize_en, **shared_options)

    splits = Multi30k.splits(exts=(".de", ".en"), fields=(SRC, TRG))
    train_dataset, valid_dataset, test_dataset = splits

    for field in (SRC, TRG):
        field.build_vocab(train_dataset, min_freq=min_freq)

    return train_dataset, valid_dataset, test_dataset, SRC, TRG

In [78]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Computes ``softmax(Q @ K^T / sqrt(head_dim)) @ V`` independently for
    ``n_heads`` heads and mixes the concatenated heads with a final linear
    projection.

    Args:
        d_model: Feature size of the inputs and of the output.
        n_heads: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.
        device: Unused. Kept so existing call sites keep working; the scaling
            factor is now a plain float, so nothing is pinned to a device.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_heads``.
    """

    def __init__(self, d_model, n_heads, dropout, device='cuda'):
        super().__init__()

        if d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )

        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.wo = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

        # Attention weights of the most recent forward pass (before dropout).
        self.attention = None
        # A Python float follows the module to any device, unlike a tensor
        # that was neither registered as a buffer nor moved by ``.to()``.
        self.scale = math.sqrt(self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: ``[batch_size, query_len, d_model]``.
            key: ``[batch_size, key_len, d_model]``.
            value: ``[batch_size, key_len, d_model]``.
            mask: Optional tensor broadcastable to
                ``[batch_size, n_heads, query_len, key_len]``; positions where
                it equals 0 are excluded from attention.

        Returns:
            Tensor of shape ``[batch_size, query_len, d_model]``.
        """
        batch_size = query.shape[0]

        def split_heads(projected):
            # [batch_size, seq_len, d_model] -> [batch_size, n_heads, seq_len, head_dim]
            return projected.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        Q = split_heads(self.wq(query))
        K = split_heads(self.wk(key))
        V = split_heads(self.wv(value))

        # energy: [batch_size, n_heads, query_len, key_len]
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            # A large negative score becomes (numerically) zero after softmax.
            energy = energy.masked_fill(mask == 0, -1e10)

        self.attention = torch.softmax(energy, dim=-1)

        # x: [batch_size, n_heads, query_len, head_dim]
        x = torch.matmul(self.dropout(self.attention), V)

        # Merge the heads back: [batch_size, query_len, d_model]
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)

        return self.wo(x)

In [79]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Size of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff)
        self.w2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the two-layer network to the last dimension of ``x``."""
        hidden = self.dropout(F.relu(self.w1(x)))
        return self.w2(hidden)

In [89]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout, device):
        super().__init__()

        self.self_attention_norm = nn.LayerNorm(d_model)
        self.self_attention = MultiHeadAttentionLayer(d_model, n_heads, dropout, device)

        self.feedforward = FeedForward(d_model, d_ff, dropout)
        self.feedforward_norm = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the block on ``x``.

        Args:
            x: ``[batch_size, src_len, d_model]``.
            mask: Passed unchanged to the self-attention layer; it selects
                which positions may be attended to.

        Returns:
            Tensor of shape ``[batch_size, src_len, d_model]``.
        """
        # Self-attention sub-layer: query, key and value are all ``x``.
        attended = self.self_attention(x, x, x, mask)
        x = self.self_attention_norm(x + self.dropout(attended))

        # Feed-forward sub-layer.
        transformed = self.feedforward(x)
        return self.feedforward_norm(x + self.dropout(transformed))


class Encoder(nn.Module):
    """Transformer encoder: token + learned position embeddings, then N layers.

    Args:
        source_size: Size of the source vocabulary.
        d_model: Embedding / hidden size.
        N: Number of stacked ``EncoderLayer`` blocks.
        n_heads: Number of attention heads per layer.
        d_ff: Hidden size of each layer's feed-forward network.
        dropout: Dropout probability.
        device: Stored as ``self.device`` and forwarded to the layers. The
            encoder itself no longer places tensors on it; it follows the
            device of its input instead.
        max_length: Number of learned positions, i.e. the longest supported
            sequence.
    """

    def __init__(self, source_size, d_model, N, n_heads, d_ff, dropout, device, max_length=100):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(source_size, d_model)
        self.pos_embedding = nn.Embedding(max_length, d_model)

        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout, device) for _ in range(N)])

        self.dropout = nn.Dropout(dropout)

        # A plain float works on every device; a tensor created with
        # ``.to(device)`` would be left behind when the module is moved.
        self.scale = math.sqrt(d_model)

    def forward(self, x, mask):
        """Encode a batch of token indices.

        Args:
            x: ``[batch_size, src_len]`` token indices.
            mask: Passed unchanged to every layer.

        Returns:
            Tensor of shape ``[batch_size, src_len, d_model]``.

        Raises:
            ValueError: If ``src_len`` exceeds the number of learned positions.
        """
        x_len = x.shape[1]

        max_length = self.pos_embedding.num_embeddings
        if x_len > max_length:
            raise ValueError(
                f"sequence length {x_len} exceeds max_length {max_length}"
            )

        # Positions are created directly on the input's device.
        # pos: [src_len]; its embedding broadcasts over the batch dimension.
        pos = torch.arange(x_len, device=x.device)
        x = self.dropout((self.tok_embedding(x) * self.scale) + self.pos_embedding(pos))

        # x: [batch_size, src_len, d_model]
        for layer in self.layers:
            x = layer(x, mask)

        return x

In [90]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder attention, feed-forward.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout, device):
        super().__init__()

        self.self_attention = MultiHeadAttentionLayer(d_model, n_heads, dropout, device)
        self.self_attention_norm = nn.LayerNorm(d_model)

        self.encoder_attention = MultiHeadAttentionLayer(d_model, n_heads, dropout, device)
        self.encoder_attention_norm = nn.LayerNorm(d_model)

        self.feedforward = FeedForward(d_model, d_ff, dropout)
        self.feedforward_norm = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Run the block on the target representation.

        Args:
            trg: ``[batch_size, trg_len, d_model]`` target representation.
            enc_src: ``[batch_size, src_len, d_model]`` encoder output that the
                second attention layer attends over.
            trg_mask: Passed unchanged to the self-attention layer.
            src_mask: Passed unchanged to the encoder-attention layer.

        Returns:
            Tensor of shape ``[batch_size, trg_len, d_model]``.
        """
        # Self-attention: the target attends to itself.
        self_attended = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attention_norm(trg + self.dropout(self_attended))

        # Encoder attention: decoder queries attend over the encoder output.
        cross_attended = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.encoder_attention_norm(trg + self.dropout(cross_attended))

        # Position-wise feed-forward network.
        transformed = self.feedforward(trg)
        return self.feedforward_norm(trg + self.dropout(transformed))


class Decoder(nn.Module):
    """Transformer decoder: embeddings, N decoder layers, vocabulary projection.

    Args:
        target_size: Size of the target vocabulary (also the output size).
        d_model: Embedding / hidden size.
        N: Number of stacked ``DecoderLayer`` blocks.
        n_heads: Number of attention heads per layer.
        d_ff: Hidden size of each layer's feed-forward network.
        dropout: Dropout probability.
        device: Stored as ``self.device`` and forwarded to the layers. The
            decoder itself no longer places tensors on it; it follows the
            device of its input instead.
        max_length: Number of learned positions, i.e. the longest supported
            sequence.
    """

    def __init__(self, target_size, d_model, N, n_heads, d_ff, dropout, device, max_length=100):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(target_size, d_model)
        self.pos_embedding = nn.Embedding(max_length, d_model)

        self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout, device) for _ in range(N)])

        self.fc_out = nn.Linear(d_model, target_size)

        self.dropout = nn.Dropout(dropout)

        # A plain float works on every device; a tensor created with
        # ``.to(device)`` would be left behind when the module is moved.
        self.scale = math.sqrt(d_model)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode a batch of target token indices against the encoder output.

        Args:
            trg: ``[batch_size, trg_len]`` token indices.
            enc_src: ``[batch_size, src_len, d_model]`` encoder output.
            trg_mask: Passed unchanged to every layer (self-attention mask).
            src_mask: Passed unchanged to every layer (encoder-attention mask).

        Returns:
            Logits of shape ``[batch_size, trg_len, target_size]``.

        Raises:
            ValueError: If ``trg_len`` exceeds the number of learned positions.
        """
        trg_len = trg.shape[1]

        max_length = self.pos_embedding.num_embeddings
        if trg_len > max_length:
            raise ValueError(
                f"sequence length {trg_len} exceeds max_length {max_length}"
            )

        # Positions are created directly on the input's device.
        # pos: [trg_len]; its embedding broadcasts over the batch dimension.
        pos = torch.arange(trg_len, device=trg.device)
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))

        # trg: [batch_size, trg_len, d_model]
        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)

        # output: [batch_size, trg_len, target_size]
        return self.fc_out(trg)

In [91]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence prediction.

    Args:
        source_size: Size of the source vocabulary.
        target_size: Size of the target vocabulary.
        d_model: Embedding / hidden size shared by encoder and decoder.
        enc_N: Number of encoder layers.
        dec_N: Number of decoder layers.
        n_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward networks.
        dropout: Dropout probability.
        device: Forwarded to the encoder and decoder and stored as
            ``self.device``.
        src_pad_idx: Index of the padding token in the source vocabulary.
            Defaults to 1, the value that was previously hard-coded.
        trg_pad_idx: Index of the padding token in the target vocabulary.
            Defaults to 1, the value that was previously hard-coded.
    """

    def __init__(self, source_size, target_size, d_model, enc_N, dec_N, n_heads, d_ff, dropout, device,
                 src_pad_idx=1, trg_pad_idx=1):
        super().__init__()
        self.encoder = Encoder(source_size, d_model, enc_N, n_heads, d_ff, dropout, device)
        self.decoder = Decoder(target_size, d_model, dec_N, n_heads, d_ff, dropout, device)
        self.device = device
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def forward(self, src, trg):
        """Predict target-vocabulary logits for every target position.

        Args:
            src: ``[batch_size, src_len]`` source token indices.
            trg: ``[batch_size, trg_len]`` target token indices.

        Returns:
            Logits of shape ``[batch_size, trg_len, target_size]``.
        """
        # src_mask: [batch_size, 1, 1, src_len]
        src_mask = make_src_mask(src, self.src_pad_idx)
        # trg_mask: [batch_size, 1, trg_len, trg_len]; built on the target's
        # own device so the mask can never land on a different device.
        trg_mask = make_trg_mask(trg, self.trg_pad_idx, trg.device)

        # enc_src: [batch_size, src_len, d_model]
        enc_src = self.encoder(src, src_mask)

        # output: [batch_size, trg_len, target_size]
        return self.decoder(trg, enc_src, trg_mask, src_mask)

In [92]:
def make_src_mask(src, src_pad_idx):
    """Build a padding mask for a batch of source token indices.

    Args:
        src: ``[batch_size, src_len]`` tensor of token indices.
        src_pad_idx: Index that marks padding.

    Returns:
        Boolean tensor of shape ``[batch_size, 1, 1, src_len]`` that is True
        wherever ``src`` differs from ``src_pad_idx``.
    """
    not_padding = src.ne(src_pad_idx)
    # Insert two singleton axes after the batch axis.
    return not_padding[:, None, None]


# The mask keeps each target position from seeing later positions (it may only
# look at itself and earlier ones) and also hides padding tokens.
def make_trg_mask(trg, trg_pad_idx, device=None):
    """Build the combined padding + causal mask for target token indices.

    Args:
        trg: ``[batch_size, trg_len]`` tensor of token indices.
        trg_pad_idx: Index that marks padding.
        device: Ignored. Kept so existing three-argument calls keep working;
            the mask is always built on ``trg.device`` so the two parts can
            never end up on different devices.

    Returns:
        Boolean tensor of shape ``[batch_size, 1, trg_len, trg_len]``. Entry
        ``[b, 0, i, j]`` is True when ``j <= i`` and ``trg[b, j]`` is not the
        padding index.
    """
    # trg_pad_mask: [batch_size, 1, 1, trg_len]
    trg_pad_mask = (trg != trg_pad_idx).unsqueeze(1).unsqueeze(2)

    trg_len = trg.shape[1]

    # trg_sub_mask: [trg_len, trg_len], lower-triangular (causal) mask
    trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool()

    # Broadcasting yields [batch_size, 1, trg_len, trg_len]
    return trg_pad_mask & trg_sub_mask

In [93]:
# Quick check: no token equals the padding index 1, so every mask entry is True.
make_src_mask(torch.Tensor([[2,3,4,5,6], [2,3,4,5,6]]), 1)

tensor([[[[True, True, True, True, True]]],


        [[[True, True, True, True, True]]]])

In [85]:
# Quick check: the first sequence holds the padding index 1 at position 1, so
# that column is False in every row of its mask; the second is purely causal.
make_trg_mask(torch.Tensor([[2,1,4,5,6], [2,3,4,5,6]]), 1, 'cpu')

tensor([[[[ True, False, False, False, False],
          [ True, False, False, False, False],
          [ True, False,  True, False, False],
          [ True, False,  True,  True, False],
          [ True, False,  True,  True,  True]]],


        [[[ True, False, False, False, False],
          [ True,  True, False, False, False],
          [ True,  True,  True, False, False],
          [ True,  True,  True,  True, False],
          [ True,  True,  True,  True,  True]]]])

In [68]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


def train(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator``.

    The model receives the target without its last token and is scored against
    the target without its first token, so each position predicts the next one.

    Args:
        model: Called as ``model(src, trg_input)``; must return
            ``[batch_size, trg_len - 1, output_dim]``.
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss called as ``criterion(flat_output, flat_target)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # decoder input : target[:, :-1]  (last token dropped)
        # expected      : target[:, 1:]   (first token dropped)
        logits = model(source, target[:, :-1])
        vocab_size = logits.shape[-1]

        # flat_logits : [batch_size * (trg_len - 1), output_dim]
        # flat_target : [batch_size * (trg_len - 1)]
        flat_logits = logits.contiguous().view(-1, vocab_size)
        flat_target = target[:, 1:].contiguous().view(-1)

        loss = criterion(flat_logits, flat_target)
        loss.backward()

        # Clip the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(iterator)

In [97]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss of ``model`` over ``iterator`` without training.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Called as ``model(src, trg_input)``; must return
            ``[batch_size, trg_len - 1, output_dim]``.
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: Loss called as ``criterion(flat_output, flat_target)``.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # decoder input : target[:, :-1]  (last token dropped)
            # logits        : [batch_size, trg_len - 1, output_dim]
            logits = model(source, target[:, :-1])
            vocab_size = logits.shape[-1]

            flat_logits = logits.contiguous().view(-1, vocab_size)
            # The first target token is never predicted, so it is dropped.
            flat_target = target[:, 1:].contiguous().view(-1)

            running_loss += criterion(flat_logits, flat_target).item()

    return running_loss / len(iterator)

In [96]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50, logging=True):
    """Greedily translate one source sentence with a trained model.

    Args:
        sentence: A raw string (tokenized with the spaCy ``'de'`` pipeline) or
            an iterable of token strings. Tokens are lower-cased either way.
        src_field: Source field; supplies ``init_token``, ``eos_token``,
            ``pad_token`` and ``vocab.stoi``.
        trg_field: Target field; supplies ``init_token``, ``eos_token``,
            ``pad_token``, ``vocab.stoi`` and ``vocab.itos``.
        model: Object exposing ``encoder`` and ``decoder`` callables.
        device: Device the index tensors are moved to.
        max_len: Maximum number of tokens to generate.
        logging: When True, print the source tokens and their indices.

    Returns:
        The generated target tokens without the leading init token. The list
        ends with the eos token unless ``max_len`` was reached first.
    """
    model.eval()

    if isinstance(sentence, str):
        nlp = spacy.load('de')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap the sentence in the init / eos tokens.
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    if logging:
        print(f"전체 소스 토큰: {tokens}")

    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    if logging:
        print(f"소스 문장 인덱스: {src_indexes}")

    # Look the special indices up once, from the fields rather than a
    # hard-coded padding index.
    src_pad_idx = src_field.vocab.stoi[src_field.pad_token]
    trg_pad_idx = trg_field.vocab.stoi[trg_field.pad_token]
    eos_idx = trg_field.vocab.stoi[trg_field.eos_token]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = make_src_mask(src_tensor, src_pad_idx)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

        for _ in range(max_len):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
            trg_mask = make_trg_mask(trg_tensor, trg_pad_idx, device)

            output = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

            # Only the prediction at the last position extends the sentence.
            pred_token = output[:, -1].argmax(dim=-1).item()
            trg_indexes.append(pred_token)

            # Stop as soon as the end-of-sentence token is produced.
            if pred_token == eos_idx:
                break

    # Map indices back to tokens and drop the leading init token.
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    return trg_tokens[1:]


In [71]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the datasets and the fields with their vocabularies.
train_dataset, valid_dataset, test_dataset, SRC, TRG = get_multi30k()

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_dataset, valid_dataset, test_dataset),
                                                                       batch_size=128,
                                                                       device=device)

# Model hyperparameters
source_size = len(SRC.vocab)
target_size = len(TRG.vocab)
d_model = 256
enc_N = 3
dec_N = 3
n_heads = 8
d_ff = 512
dropout = 0.1

# Indices of the padding token in each vocabulary
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

In [94]:
# Instantiate the Transformer and move its parameters to the chosen device.
model = Transformer(
    source_size=source_size,
    target_size=target_size,
    d_model=d_model,
    enc_N=enc_N,
    dec_N=dec_N,
    n_heads=n_heads,
    d_ff=d_ff,
    dropout=dropout,
    device=device,
).to(device)

In [73]:
# Training hyperparameters
clip = 1
epochs = 10
lr = 0.0005
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Padding positions contribute nothing to the loss; the index comes from the
# target vocabulary instead of a hard-coded literal.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)

In [95]:
# Train for `epochs` epochs, keep the checkpoint with the lowest validation
# loss, and print a sample translation after every epoch.
best_valid_loss = float('inf')

for epoch in range(epochs):
    start_time = time.time()  # epoch start

    train_loss = train(model, train_iterator, optimizer, criterion, clip)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()  # epoch end
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save only when the validation loss improves on the best so far.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer_german_to_english.pt')

    print(f'Epoch: {epoch + 1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f} | Validation PPL: {math.exp(valid_loss):.3f}')

    # Translate the first test example as a qualitative check.
    example = vars(test_dataset.examples[0])
    src, trg = example['src'], example['trg']

    print(f'소스 문장: {src}')
    print(f'타겟 문장: {trg}')

    translation = translate_sentence(src, SRC, TRG, model, device, logging=True)

    print("모델 출력 결과:", " ".join(translation))

Epoch: 1 | Time: 0m 16s
	Train Loss: 8.837 | Train PPL: 6885.658
	Validation Loss: 8.835 | Validation PPL: 6868.007
소스 문장: ['ein', 'mann', 'mit', 'einem', 'orangefarbenen', 'hut', ',', 'der', 'etwas', 'anstarrt', '.']
타겟 문장: ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.']
전체 소스 토큰: ['<sos>', 'ein', 'mann', 'mit', 'einem', 'orangefarbenen', 'hut', ',', 'der', 'etwas', 'anstarrt', '.', '<eos>']
소스 문장 인덱스: [2, 5, 13, 11, 6, 175, 106, 9, 15, 75, 0, 4, 3]


TypeError: make_src_mask() missing 1 required positional argument: 'src_pad_idx'