In [1]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 9.0MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 11.7MB/s 
Installing collected packages: sentencepiece, torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed sentencepiece-0.1.95 torchtext-0.6.0


In [2]:
%%capture
# Download the spaCy English ('en') and German ('de') models used by the tokenizers below.
# %%capture suppresses the output produced by this cell.
!python -m spacy download en
!python -m spacy download de

In [3]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim
import math

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
        print('and then re-execute this cell.')
    else:
        print(gpu_info)
print('device :',device)
print('torch.version :',torch.__version__)

Thu Mar  4 07:34:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import spacy
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

spacy_en = spacy.load('en') # English tokenization
spacy_de = spacy.load('de') # German tokenization


def tokenize_de(text):
    """Tokenize a German sentence with the `spacy_de` tokenizer and return the token strings."""
    pieces = []
    for tok in spacy_de.tokenizer(text):
        pieces.append(tok.text)
    return pieces

# Function that tokenizes an English sentence
def tokenize_en(text):
    """Tokenize an English sentence with the `spacy_en` tokenizer and return the token strings."""
    pieces = []
    for tok in spacy_en.tokenizer(text):
        pieces.append(tok.text)
    return pieces





# Source field (tokenized with tokenize_de) and target field (tokenized with tokenize_en).
# Both lower-case the text, wrap every sentence in <sos> ... <eos>, and put the batch
# dimension first in the produced tensors.
SRC = Field(tokenize=tokenize_de, init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)
TRG = Field(tokenize=tokenize_en, init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)
# Multi30k splits; the ".de" side is bound to SRC and the ".en" side to TRG.
train_dataset, valid_dataset, test_dataset = Multi30k.splits(exts=(".de", ".en"), fields=(SRC, TRG))


# Vocabularies are built from the training split only, with a minimum token frequency of 2.
SRC.build_vocab(train_dataset, min_freq=2)
TRG.build_vocab(train_dataset, min_freq=2)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 1.04MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 276kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 265kB/s]


In [5]:

class PositionalEmbedding(object):
    """Fixed sinusoidal position table.

    Even feature columns hold sin(pos * w_k) and odd feature columns hold
    cos(pos * w_k), with w_k = exp(-2k * ln(10000) / d_model).

    The table is stored as a plain tensor attribute (this class is not an
    nn.Module), so it carries no trainable parameters and is not moved by
    `module.to(...)`; `get_embedding_like` moves it to the input's device.

    Args:
        d_model: number of features per position (odd values are supported).
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_len=512):
        pe = torch.zeros(max_len, d_model, requires_grad=False).float()
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.pe = pe.unsqueeze(0)  # [1, max_len, d_model]

    def get_embedding_like(self, x):
        """Return the position table broadcast to the batch and length of `x`.

        Args:
            x: tensor whose dim 0 is the batch size and dim 1 the sequence length.

        Returns:
            Tensor of shape [x.size(0), min(x.size(1), max_len), d_model] on
            the same device as `x`.
        """
        # Follow the input's device instead of a notebook-level global so the
        # result can always be added to `x`.
        return self.pe[:, :x.size(1)].expand(x.size(0), -1, -1).to(x.device)


class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(d_model).

    Args:
        n_vocab: number of rows in the embedding table.
        d_model: embedding width.
    """

    def __init__(self, n_vocab, d_model):
        super(TokenEmbedding, self).__init__()
        self.d_model = d_model
        self.emb = nn.Embedding(n_vocab, d_model)

    def forward(self, x):
        """Look up the indices in `x` and scale the vectors by sqrt(d_model)."""
        vectors = self.emb(x)
        scale = math.sqrt(self.d_model)
        return vectors * scale



class LayerNorm(nn.Module):
    """Normalization over the second-to-last axis with a learned scale and shift.

    The mean and standard deviation (`torch.std` defaults) are taken along
    dim -2, and the parameters have shape [features, 1] so they broadcast
    along the last axis.

    Args:
        features: size of the normalized axis.
        eps: constant added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features, 1))    # scale
        self.b_2 = nn.Parameter(torch.zeros(features, 1))   # shift
        self.eps = eps

    def forward(self, x):
        """Return a_2 * (x - mean) / (std + eps) + b_2, with statistics over dim -2."""
        centered = x - x.mean(-2, keepdim=True)
        spread = x.std(-2, keepdim=True) + self.eps
        scaled = self.a_2 * centered / spread
        return scaled + self.b_2


class EncoderBlock(nn.Module):
    """Self-attention followed by a position-wise feed-forward stage.

    Activations are channels-first, [mbsz, d_hidn, n_seq]; every projection is
    a kernel-size-1 Conv1d. Each of the two stages is wrapped as
    `norm(x + stage(x))`.

    Args:
        d_hidn: number of channels; must be divisible by `n_head`.
        n_head: number of attention heads.
        dropout_ratio: dropout probability applied to the attention weights.
    """

    def __init__(self, d_hidn, n_head, dropout_ratio):
        super().__init__()

        d_head, remainder = divmod(d_hidn, n_head)
        assert remainder == 0

        self.d_hidn = d_hidn
        self.n_head = n_head
        self.d_head = d_head

        # A single convolution produces key, query and value stacked on the channel axis.
        self.projs = nn.Conv1d(d_hidn, 3 * d_hidn, 1)
        self.dropout = nn.Dropout(dropout_ratio)
        self.layer_norm1 = LayerNorm(d_hidn)
        self.feedforward = nn.Sequential(
            nn.Conv1d(d_hidn, d_hidn, 1),
            nn.ReLU(),
            nn.Conv1d(d_hidn, d_hidn, 1),
        )
        self.layer_norm2 = LayerNorm(d_hidn)

    def forward(self, x, mask):
        """Apply the block.

        Args:
            x: activations of shape [mbsz, d_hidn, n_seq].
            mask: boolean tensor broadcastable to [mbsz, n_head, n_seq, n_seq'];
                True entries are filled with -inf before the softmax.

        Returns:
            Tensor of shape [mbsz, d_hidn, n_seq].
        """
        batch, length = x.size(0), x.size(2)
        heads = self.n_head

        key, que, val = torch.chunk(self.projs(x), 3, dim=1)
        key = key.view(batch, heads, -1, length)                  # [mbsz, n_head, d_head, n_seq']
        que = que.view(batch, heads, -1, length).transpose(2, 3)  # [mbsz, n_head, n_seq, d_head]
        val = val.view(batch, heads, -1, length).transpose(2, 3)  # [mbsz, n_head, n_seq', d_head]

        # Scaled dot-product scores; masked positions get zero weight after the softmax.
        scores = torch.matmul(que, key) / math.sqrt(self.d_head)  # [mbsz, n_head, n_seq, n_seq']
        scores = scores.masked_fill(mask, -np.inf)
        weights = self.dropout(torch.softmax(scores, dim=3))

        # Weighted values, then merge the heads back into the channel axis.
        context = torch.matmul(weights, val)                      # [mbsz, n_head, n_seq, d_head]
        context = context.transpose(2, 3).flatten(1, 2)           # [mbsz, n_head * d_head, n_seq]

        x = self.layer_norm1(x + context)
        x = self.layer_norm2(x + self.feedforward(x))
        return x




class Encoder(nn.Module):
    """Stack of `EncoderBlock`s on top of scaled token embeddings plus positional embeddings.

    Args:
        n_vocab: source vocabulary size.
        d_hidn: hidden width used by every layer.
        num_layers: number of `EncoderBlock`s.
        n_head: attention heads per block.
        dropout_ratio: dropout probability (embeddings and attention weights).
    """

    def __init__(self, n_vocab, d_hidn, num_layers, n_head, dropout_ratio):
        super().__init__()
        self.te = TokenEmbedding(n_vocab, d_hidn)
        self.pe = PositionalEmbedding(d_hidn, 512)
        self.dropout = nn.Dropout(dropout_ratio)
        self.layers = nn.ModuleList([EncoderBlock(d_hidn, n_head, dropout_ratio) for _ in range(num_layers)])

    def generate_mask(self, is_pad):
        """Build the self-attention mask from a padding indicator.

        Args:
            is_pad: boolean tensor [mbsz, n_seq]; True marks positions to hide.

        Returns:
            Boolean tensor [mbsz, 1, n_seq, n_seq] in which every query row of
            a sentence carries that sentence's `is_pad` over the key axis.
        """
        n_seq = is_pad.size(1)
        # Built from the argument itself, not from a notebook-level variable.
        pad_mask = is_pad.unsqueeze(1).unsqueeze(1)       # [mbsz, 1, 1, n_seq]
        return pad_mask.expand(-1, -1, n_seq, -1)         # [mbsz, 1, n_seq, n_seq]

    def forward(self, x, is_pad):
        """Encode a batch of token indices.

        Args:
            x: integer tensor [mbsz, n_seq] of token indices.
            is_pad: boolean tensor [mbsz, n_seq]; True marks positions to hide.

        Returns:
            Tensor [mbsz, n_seq, d_hidn].
        """
        pad_mask = self.generate_mask(is_pad)

        x = self.te(x)
        x = x + self.pe.get_embedding_like(x)
        x = self.dropout(x)
        # The blocks operate channels-first: [mbsz, d_hidn, n_seq].
        x = x.transpose(1, 2).contiguous()
        for layer in self.layers:
            x = layer(x, pad_mask)

        # Back to [mbsz, n_seq, d_hidn] for the caller.
        x = x.transpose(1, 2).contiguous()

        return x



In [10]:
class DecoderBlock(nn.Module):
    """Masked self-attention, attention over the encoder output, then feed-forward.

    Activations are channels-first, [mbsz, d_hidn, n_seq]; every projection is
    a kernel-size-1 Conv1d. Each of the three stages is wrapped as
    `norm(x + stage(x))`. In the encoder-attention stage the normalized
    self-attention output is split into heads and used directly as the query;
    only keys and values are projected from the encoder output.

    Args:
        d_hidn: number of channels; must be divisible by `n_head`.
        n_head: number of attention heads.
        dropout_ratio: dropout probability applied to both sets of attention weights.
    """

    def __init__(self, d_hidn, n_head, dropout_ratio):
        super().__init__()

        d_head, remainder = divmod(d_hidn, n_head)
        assert remainder == 0

        self.d_hidn = d_hidn
        self.n_head = n_head
        self.d_head = d_head

        # Self-attention: key, query and value from one convolution.
        self.projs = nn.Conv1d(d_hidn, 3 * d_hidn, 1)
        self.dropout1 = nn.Dropout(dropout_ratio)
        self.layer_norm1 = LayerNorm(d_hidn)

        # Encoder attention: key and value from the encoder output.
        self.enc_projs = nn.Conv1d(d_hidn, 2 * d_hidn, 1)
        self.dropout2 = nn.Dropout(dropout_ratio)
        self.layer_norm2 = LayerNorm(d_hidn)

        self.feedforward = nn.Sequential(
            nn.Conv1d(d_hidn, d_hidn, 1),
            nn.ReLU(),
            nn.Conv1d(d_hidn, d_hidn, 1),
        )

        self.layer_norm3 = LayerNorm(d_hidn)

    def _attend(self, que, key, val, mask, dropout):
        """Scaled dot-product attention on head-split tensors.

        `que` is [mbsz, n_head, n_q, d_head], `key` is [mbsz, n_head, d_head, n_k]
        and `val` is [mbsz, n_head, n_k, d_head]. Returns [mbsz, n_head * d_head, n_q].
        """
        scores = torch.matmul(que, key) / math.sqrt(self.d_head)
        scores = scores.masked_fill(mask, -np.inf)
        weights = dropout(torch.softmax(scores, dim=3))
        mixed = torch.matmul(weights, val)            # [mbsz, n_head, n_q, d_head]
        return mixed.transpose(2, 3).flatten(1, 2)    # [mbsz, n_head * d_head, n_q]

    def forward(self, input, input_mask, e_out, e_mask):
        """Apply the block.

        Args:
            input: decoder activations [mbsz, d_hidn, n_seq].
            input_mask: boolean mask broadcastable to [mbsz, n_head, n_seq, n_seq];
                True entries are filled with -inf before the softmax.
            e_out: encoder output [mbsz, d_hidn, enc_n_seq].
            e_mask: boolean mask broadcastable to [mbsz, n_head, n_seq, enc_n_seq];
                True entries are filled with -inf before the softmax.

        Returns:
            Tensor of shape [mbsz, d_hidn, n_seq].
        """
        mbsz, n_seq = input.size(0), input.size(2)
        heads, d_head = self.n_head, self.d_head

        # Stage 1: masked self-attention.
        key, que, val = self.projs(input).chunk(3, dim=1)
        self_ctx = self._attend(
            que.view(mbsz, heads, -1, n_seq).transpose(2, 3),
            key.view(mbsz, heads, -1, n_seq),
            val.view(mbsz, heads, -1, n_seq).transpose(2, 3),
            input_mask,
            self.dropout1,
        )
        x = self.layer_norm1(input + self_ctx)

        # Stage 2: attention over the encoder output.
        enc_n_seq = e_out.size(2)
        dec_que = x.view(mbsz, heads, d_head, n_seq).transpose(2, 3)
        enc_key, enc_val = self.enc_projs(e_out).chunk(2, dim=1)
        enc_ctx = self._attend(
            dec_que,
            enc_key.view(mbsz, heads, d_head, enc_n_seq),
            enc_val.view(mbsz, heads, d_head, enc_n_seq).transpose(2, 3),
            e_mask,
            self.dropout2,
        )
        x = self.layer_norm2(x + enc_ctx)

        # Stage 3: position-wise feed-forward.
        x = self.layer_norm3(x + self.feedforward(x))
        return x


class Decoder(nn.Module):
    """Stack of `DecoderBlock`s with a kernel-size-1 Conv1d classifier over the vocabulary.

    Args:
        n_vocab: target vocabulary size (also the classifier's output width).
        d_hidn: hidden width used by every layer.
        num_layers: number of `DecoderBlock`s.
        n_head: attention heads per block.
        dropout_ratio: dropout probability (embeddings and attention weights).
    """

    def __init__(self, n_vocab, d_hidn, num_layers, n_head, dropout_ratio):
        super().__init__()
        self.te = TokenEmbedding(n_vocab, d_hidn)
        self.pe = PositionalEmbedding(d_hidn, 512)
        self.dropout = nn.Dropout(dropout_ratio)
        self.layers = nn.ModuleList([DecoderBlock(d_hidn, n_head, dropout_ratio) for _ in range(num_layers)])

        self.classifier = nn.Conv1d(d_hidn, n_vocab, 1)

    def generate_enc_mask(self, is_enc_pad, input_n_seq):
        """Build the mask for attention over the encoder output.

        Args:
            is_enc_pad: boolean tensor [mbsz, enc_n_seq]; True marks encoder
                positions to hide.
            input_n_seq: number of decoder (query) positions.

        Returns:
            Boolean tensor [mbsz, 1, input_n_seq, enc_n_seq].
        """
        # Built from the argument itself, not from a notebook-level variable.
        pad_mask = is_enc_pad.unsqueeze(1).unsqueeze(1)     # [mbsz, 1, 1, enc_n_seq]
        return pad_mask.expand(-1, -1, input_n_seq, -1)     # [mbsz, 1, input_n_seq, enc_n_seq]

    def generate_input_mask(self, is_input_pad):
        """Build the decoder self-attention mask (hidden keys OR future positions).

        Args:
            is_input_pad: boolean tensor [mbsz, input_n_seq]; True marks
                decoder positions to hide.

        Returns:
            Boolean tensor [mbsz, 1, input_n_seq, input_n_seq]; True = masked.
        """
        mbsz, input_n_seq = is_input_pad.size()
        is_input_pad = is_input_pad.to(device)
        pad_mask = is_input_pad.view(mbsz, 1, input_n_seq)
        pad_mask = pad_mask.expand(-1, input_n_seq, -1) #[mbsz, input_n_seq, input_n_seq]

        # Strictly lower triangle, transposed: entry [q, k] is non-zero when k > q,
        # so a query position cannot attend to later positions.
        ar_mask = torch.tril(torch.ones(input_n_seq, input_n_seq), diagonal=-1).transpose(0, 1)
        ar_mask = ar_mask.view(1, input_n_seq, input_n_seq).expand(mbsz, -1, -1).to(device)

        input_mask = torch.logical_or(pad_mask, ar_mask)
        input_mask = input_mask.unsqueeze(1)
        return input_mask

    def forward(self, input, is_input_pad, enc, is_enc_pad):
        """Score every vocabulary entry at every decoder position.

        Args:
            input: integer tensor [mbsz, input_n_seq] of decoder input indices.
            is_input_pad: boolean tensor [mbsz, input_n_seq]; True = hidden.
            enc: encoder output [mbsz, enc_n_seq, d_hidn].
            is_enc_pad: boolean tensor [mbsz, enc_n_seq]; True = hidden.

        Returns:
            Tensor [mbsz, input_n_seq, n_vocab] of classifier outputs.
        """
        input_n_seq = input.size(1)

        e_mask = self.generate_enc_mask(is_enc_pad, input_n_seq)
        input_mask = self.generate_input_mask(is_input_pad)
        x = self.te(input)
        x = x + self.pe.get_embedding_like(x)
        x = self.dropout(x)
        # The blocks operate channels-first: [mbsz, d_hidn, n_seq].
        x = x.transpose(1, 2).contiguous()

        enc = enc.transpose(1, 2).contiguous()
        for layer in self.layers:
            x = layer(x, input_mask, enc, e_mask)
        x = self.classifier(x)
        x = x.transpose(1, 2).contiguous()

        return x

    def generate(self, max_len, trg_field, enc, is_enc_pad):
        """Greedily decode one sentence.

        Starts from '<sos>', repeatedly appends the arg-max token, and stops at
        '<eos>' or after `max_len` steps. The module is put in eval mode for
        the duration of decoding and its previous train/eval mode is restored
        afterwards.

        Args:
            max_len: maximum number of decoding steps.
            trg_field: target field; `vocab[...]` and `vocab.itos` are used.
            enc: encoder output for a single sentence (batch size must be 1).
            is_enc_pad: boolean tensor for `enc`; True = hidden.

        Returns:
            The decoded tokens joined by single spaces ('<sos>' and '<eos>'
            are not included).
        """
        assert enc.size(0) == 1

        eos_idx = trg_field.vocab['<eos>']

        input = torch.zeros(1, max_len).long().to(device)
        input[0][0] = trg_field.vocab['<sos>']

        input_pad_mask = torch.zeros_like(input).bool().to(device)

        seq = []
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for i in range(max_len):
                    # Positions after i still hold placeholder zeros; the
                    # autoregressive mask keeps them from influencing position i.
                    out = self.forward(input, input_pad_mask, enc, is_enc_pad)
                    argmax = out[0][i].argmax().item()
                    if i != max_len - 1:
                        input[0][i + 1] = argmax

                    if argmax == eos_idx:
                        break

                    seq.append(argmax)
        finally:
            # Restore whichever mode the caller had, even if decoding failed.
            self.train(was_training)

        str_seq = [trg_field.vocab.itos[idx] for idx in seq]

        return " ".join(str_seq)

    
        




        
        


In [22]:
# Mini-batch size and batch iterators over the training and test splits
# (valid_dataset is not used here).
mbsz = 128
train_iterator, test_iterator = BucketIterator.splits((train_dataset, test_dataset), batch_size=mbsz)
# Model hyper-parameters; vocabulary sizes come from the fields built earlier.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HIDDEN_DIM = 256
ENC_LAYERS = 6
DEC_LAYERS = 6
ENC_HEADS = 8
DEC_HEADS = 8
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

# Encoder and decoder are separate modules, each with its own Adam optimizer (same learning rate).
enc = Encoder(INPUT_DIM,  d_hidn=HIDDEN_DIM, num_layers=ENC_LAYERS, n_head=ENC_HEADS, dropout_ratio=ENC_DROPOUT).to(device)
dec = Decoder(OUTPUT_DIM, d_hidn=HIDDEN_DIM, num_layers=DEC_LAYERS, n_head=DEC_HEADS, dropout_ratio=DEC_DROPOUT).to(device)
enc_solver = optim.Adam(enc.parameters(), lr=0.0005)
dec_solver = optim.Adam(dec.parameters(), lr=0.0005)
# Targets equal to the '<pad>' index are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG.vocab.stoi['<pad>'])

In [10]:
# Inspect the index that the target vocabulary assigns to the '<sos>' token.
TRG.vocab['<sos>']

2

In [40]:
# Train encoder and decoder jointly; prints the mean per-batch loss after every epoch.
num_epochs = 50

for epoch in range(num_epochs):
    train_loss = 0.0
    for i, batch in enumerate(train_iterator):


        src = batch.src.to(device)

        trg = batch.trg.to(device)


        # Shift by one position: the decoder reads trg[:, :-1] and is scored against trg[:, 1:].
        dec_input = trg[:, :-1]
        dec_target = trg[:,  1:]
        
        # Boolean indicators of '<pad>' positions in the source and in the decoder input.
        src_pad = (src == SRC.vocab.stoi['<pad>']).to(device)
        dec_input_pad = (dec_input == TRG.vocab.stoi['<pad>']).to(device)

        enc_solver.zero_grad()
        dec_solver.zero_grad()

        e_out = enc(src, src_pad)

        
        decoded = dec(dec_input, dec_input_pad, e_out, src_pad)

        # Flatten batch and time so each position is one classification example.
        loss = criterion(decoded.flatten(0, 1), dec_target.flatten(0, 1))
        loss.backward()

        enc_solver.step()
        dec_solver.step()
        train_loss += loss.item()
        
    print(train_loss / len(train_iterator))
    

0.105519136044685
0.1056904771451383
0.1053124478472487
0.10749061619001338
0.10598416895593316


KeyboardInterrupt: ignored

In [41]:
# Translate one test sentence: print source and reference, then the greedy decoding.
example_idx = 5

src = vars(test_dataset.examples[example_idx])['src']
trg = vars(test_dataset.examples[example_idx])['trg']

print(" ".join(src))
print(" ".join(trg))

# Encode exactly once, in eval mode (dropout disabled) and without autograd.
# The encoder must not be re-run after `enc.train()`: that would replace
# `encoded` with a dropout-perturbed, grad-tracking result.
enc.eval()
with torch.no_grad():
    # Map tokens to indices and wrap the sentence in <sos> ... <eos>.
    src = [SRC.vocab.stoi['<sos>']] + [SRC.vocab.stoi[token.lower()] for token in src] + [SRC.vocab.stoi['<eos>']]
    src = torch.LongTensor(src).unsqueeze(0).to(device)
    src_pad = (src == SRC.vocab.stoi['<pad>']).to(device)
    encoded = enc(src, src_pad)
enc.train()

max_len = 50
dec.generate(max_len, TRG, encoded, src_pad)

ein hell gekleideter mann fotografiert eine gruppe von männern in dunklen anzügen und mit hüten , die um eine frau in einem trägerlosen kleid herum stehen .
a man in light colored clothing photographs a group of men wearing dark suits and hats standing around a woman dressed in a strapless gown .
a group of men in white dress and a man take a picture of a woman dressed in a dark dress and feathered wait around them .
