In [None]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
[K     |█████                           | 10kB 8.3MB/s eta 0:00:01[K     |██████████▏                     | 20kB 13.2MB/s eta 0:00:01[K     |███████████████▎                | 30kB 12.7MB/s eta 0:00:01[K     |████████████████████▍           | 40kB 10.4MB/s eta 0:00:01[K     |█████████████████████████▌      | 51kB 8.2MB/s eta 0:00:01[K     |██████████████████████████████▋ | 61kB 9.4MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 4.0MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 37.0MB/s 
Installing collected packages: sentencepiece, torchtext
  Fo

In [None]:
%%capture
# Download the spaCy models that are loaded later with spacy.load('en') / spacy.load('de').
# The %%capture cell magic above keeps the download logs out of the notebook output.
!python -m spacy download en
!python -m spacy download de

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
        print('and then re-execute this cell.')
    else:
        print(gpu_info)
print('device :',device)
print('torch.version :',torch.__version__)

Mon Feb 15 18:02:30 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    27W /  70W |   1122MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import spacy
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

spacy_en = spacy.load('en') # English tokenization
spacy_de = spacy.load('de') # German tokenization


def tokenize_de(text):
    """Tokenize a German sentence with the `spacy_de` tokenizer.

    Args:
        text: the raw sentence string.

    Returns:
        A list with the text of each token, in order.
    """
    tokens = []
    for token in spacy_de.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Function that tokenizes an English sentence
def tokenize_en(text):
    """Tokenize an English sentence with the `spacy_en` tokenizer.

    Args:
        text: the raw sentence string.

    Returns:
        A list with the text of each token, in order.
    """
    doc = spacy_en.tokenizer(text)
    return [tok.text for tok in doc]





# Source (German) and target (English) fields: text is lowercased, wrapped in
# <sos>/<eos>, and batches are laid out batch-first.
SRC = Field(
    tokenize=tokenize_de,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    batch_first=True,
)
TRG = Field(
    tokenize=tokenize_en,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    batch_first=True,
)

# Multi30k German -> English splits, processed with the two fields above.
train_dataset, valid_dataset, test_dataset = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(SRC, TRG),
)

# Both vocabularies are built from the training split only, with min_freq=2.
SRC.build_vocab(train_dataset, min_freq=2)
TRG.build_vocab(train_dataset, min_freq=2)

In [None]:
# Mini-batch size shared by the train and test iterators.
mbsz = 128

train_iterator, test_iterator = BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_size=mbsz,
)

In [None]:
# Number of examples in the train / validation / test splits, one per line.
print(len(train_dataset), len(valid_dataset), len(test_dataset), sep='\n')


29000
1014
1000


In [None]:
class PositionalEmbedding(object):
    """Lazily grown table of sinusoidal position encodings.

    Entry ``[0, t, i]`` of ``self.table`` is ``sin(angle)`` for even ``i`` and
    ``cos(angle)`` for odd ``i``, where ``angle = t / 10000 ** ((i // 2) / d_hid)``.
    NOTE(review): the exponent is ``(i // 2) / d_hid``; the formulation in
    "Attention Is All You Need" uses ``2 * (i // 2) / d_hid``. It is kept as-is
    so existing results do not change -- confirm whether this is intentional.

    The table is computed in one vectorized NumPy pass (float64) and then cast
    to the default torch dtype, instead of writing every scalar into the device
    tensor from a nested Python loop.

    Args:
        n_seq: number of positions to precompute up front.
        d_hid: width of each position vector.
        device: where the table lives. When omitted, the notebook-level
            ``device`` global is used if it exists; otherwise CUDA is used
            when available, else the CPU.
    """

    def __init__(self, n_seq, d_hid, device=None):
        self.n_seq, self.d_hid = n_seq, d_hid
        if device is None:
            device = globals().get('device') or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        self.fn = [np.sin, np.cos]
        self.table = self._build(0, n_seq)

    def _build(self, start, stop):
        """Return the encodings for positions ``start..stop-1`` as a [1, stop-start, d_hid] tensor."""
        positions = np.arange(start, stop, dtype=np.float64)[:, None]
        dims = np.arange(self.d_hid)
        angles = positions / np.power(10000, (dims // 2) / self.d_hid)
        # Even feature indices take the sine, odd ones the cosine.
        values = np.where(dims % 2 == 0, self.fn[0](angles), self.fn[1](angles))
        table = torch.as_tensor(values, dtype=torch.get_default_dtype(), device=self.device)
        return table.unsqueeze(0)

    def get(self, batch_size, n_seq):
        """Return the first ``n_seq`` encodings expanded to ``[batch_size, n_seq, d_hid]``.

        The cached table is extended (and kept) when ``n_seq`` exceeds the
        number of positions computed so far. The result is an expanded view of
        the cached table, so callers must not modify it in place.
        """
        if n_seq > self.n_seq:
            extra = self._build(self.n_seq, n_seq)
            self.table = torch.cat([self.table, extra], dim=1)
            self.n_seq = n_seq
        return self.table[:, :n_seq, :].expand(batch_size, -1, -1)



In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: multi-head self-attention, then a feed-forward network.

    Each of the two sub-layers is followed by a residual add and a LayerNorm.
    The attention output is added to the input directly (there is no separate
    output projection).

    Args:
        d_hidn: model width; must be divisible by ``n_head``.
        n_head: number of attention heads.
        dropout_ratio: dropout probability applied to the attention weights.
    """

    def __init__(self, d_hidn, n_head, dropout_ratio):
        super().__init__()

        assert d_hidn % n_head == 0

        self.d_hidn = d_hidn
        self.n_head = n_head
        self.d_head = d_hidn // n_head

        # One fused weight; forward() chunks its output into three d_hidn-wide projections.
        fused = torch.empty(d_hidn, d_hidn * 3)
        self.projs = nn.Parameter(nn.init.xavier_uniform_(fused))
        self.dropout = nn.Dropout(dropout_ratio)
        self.layer_norm1 = nn.LayerNorm(d_hidn)
        self.feedforward = nn.Sequential(
            nn.Linear(d_hidn, d_hidn),
            nn.ReLU(),
            nn.Linear(d_hidn, d_hidn),
        )
        self.layer_norm2 = nn.LayerNorm(d_hidn)

    def forward(self, x, mask):
        """Apply the layer.

        Args:
            x: input of shape [mbsz, n_seq, d_hidn].
            mask: boolean tensor of shape [mbsz, 1, n_seq, n_seq]; True entries
                are filled with -inf before the softmax over the last dim.

        Returns:
            Tensor of shape [mbsz, n_seq, d_hidn].
        """
        batch, length = x.size(0), x.size(1)

        def split_heads(t):
            # [mbsz, n_seq, d_hidn] -> [mbsz, n_seq, n_head, d_head]
            return t.view(batch, length, self.n_head, -1)

        # The first chunk indexes the rows of the score matrix (query role); the
        # second indexes the columns that the softmax normalizes over (key role).
        q, k_t, v = torch.chunk(x @ self.projs, 3, dim=2)
        q = split_heads(q).permute(0, 2, 1, 3).contiguous()      # [mbsz, n_head, n_seq, d_head]
        k_t = split_heads(k_t).permute(0, 2, 3, 1).contiguous()  # [mbsz, n_head, d_head, n_seq]
        v = split_heads(v).permute(0, 2, 1, 3).contiguous()      # [mbsz, n_head, n_seq, d_head]

        scores = torch.matmul(q, k_t / np.sqrt(self.d_head))     # [mbsz, n_head, n_seq, n_seq]
        scores = scores.masked_fill(mask, -np.inf)
        weights = self.dropout(torch.softmax(scores, dim=3))
        context = torch.matmul(weights, v)                        # [mbsz, n_head, n_seq, d_head]
        # Merge heads as [mbsz, n_seq, d_head * n_head] (head index varies fastest).
        context = context.permute(0, 2, 3, 1).contiguous().flatten(2, 3)

        flat = (x + context).view(-1, self.d_hidn)                # [mbsz * n_seq, d_hidn]
        flat = self.layer_norm1(flat)
        flat = self.layer_norm2(flat + self.feedforward(flat))
        return flat.view(batch, length, -1)


class Encoder(nn.Module):
    """Token embedding plus positional encodings, followed by a stack of EncoderBlocks.

    Args:
        d_input: number of rows in the embedding table (source vocabulary size).
        d_hidn: model width.
        num_layers: number of EncoderBlock layers.
        n_head: attention heads per layer.
        dropout_ratio: dropout probability, used on the embedded input and
            passed to every layer.
    """

    def __init__(self, d_input, d_hidn, num_layers, n_head, dropout_ratio):
        super().__init__()
        self.embedding = nn.Embedding(d_input, d_hidn)
        self.dropout = nn.Dropout(dropout_ratio)
        # Starts with two positions; the table grows on demand inside get().
        self.positional_embedding = PositionalEmbedding(n_seq=2, d_hid=d_hidn)
        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderBlock(d_hidn, n_head, dropout_ratio))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, mask):
        """Encode token indices ``x`` using ``mask``, which is handed to every layer unchanged."""
        hidden = self.embedding(x)
        hidden += self.positional_embedding.get(hidden.size(0), hidden.size(1))
        hidden = self.dropout(hidden)

        for block in self.layers:
            hidden = block(hidden, mask)

        return hidden
        

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, encoder attention, feed-forward.

    Data flow, as implemented:
        h = LayerNorm1(x + self_attention(x))
        h = h + enc_to_dec(attention(to_query(h), enc_out))
        out = LayerNorm2(h + feedforward(h))

    Dropout is applied to the self-attention weights only.

    Args:
        d_hidn: decoder model width; must be divisible by ``n_head``.
        n_head: number of self-attention heads.
        dropout_ratio: dropout probability for the self-attention weights.
        enc_d_hidn: width of the encoder output.
        enc_n_head: number of heads used when attending over the encoder output.
    """

    def __init__(self, d_hidn, n_head, dropout_ratio, enc_d_hidn, enc_n_head):
        super().__init__()

        assert d_hidn % n_head == 0

        self.d_hidn, self.n_head = d_hidn, n_head
        self.d_head = d_hidn // n_head
        self.enc_d_hidn, self.enc_n_head = enc_d_hidn, enc_n_head

        # Self-attention: one fused weight, chunked into three projections in forward().
        self.projs = nn.Parameter(nn.init.xavier_uniform_(torch.empty(d_hidn, d_hidn * 3)))
        # Maps decoder states into the encoder width to form the cross-attention queries.
        self.to_query = nn.Linear(d_hidn, enc_d_hidn)

        self.dropout = nn.Dropout(dropout_ratio)
        self.layer_norm1 = nn.LayerNorm(d_hidn)

        # Encoder-side fused weight, chunked into two projections in forward().
        self.enc_projs = nn.Parameter(nn.init.xavier_uniform_(torch.empty(enc_d_hidn, enc_d_hidn * 2)))
        # Maps the attended encoder features back to the decoder width.
        self.enc_to_dec = nn.Linear(enc_d_hidn, d_hidn)

        self.feedforward = nn.Sequential(
            nn.Linear(d_hidn, d_hidn),
            nn.ReLU(),
            nn.Linear(d_hidn, d_hidn),
        )
        self.layer_norm2 = nn.LayerNorm(d_hidn)

    def forward(self, x, enc_out, trg_mask, enc_mask):
        """Apply the layer.

        Args:
            x: decoder input, [mbsz, dec_n_seq, d_hidn].
            enc_out: encoder output, [mbsz, enc_n_seq, enc_d_hidn].
            trg_mask: boolean mask for the self-attention scores
                ([mbsz, n_head, dec_n_seq, dec_n_seq] after broadcasting); True = -inf.
            enc_mask: boolean mask for the encoder-attention scores
                ([mbsz, enc_n_head, dec_n_seq, enc_n_seq] after broadcasting); True = -inf.

        Returns:
            Tensor of shape [mbsz, dec_n_seq, d_hidn].
        """
        batch, dec_len = x.size(0), x.size(1)
        enc_len = enc_out.size(1)

        # Masked self-attention.
        # The first chunk indexes score rows (query role), the second the columns (key role).
        q, k_t, v = (x @ self.projs).chunk(3, 2)
        q = q.view(batch, dec_len, self.n_head, -1).permute(0, 2, 1, 3).contiguous()      # [mbsz, n_head, n_seq, d_head]
        k_t = k_t.view(batch, dec_len, self.n_head, -1).permute(0, 2, 3, 1).contiguous()  # [mbsz, n_head, d_head, n_seq]
        v = v.view(batch, dec_len, self.n_head, -1).permute(0, 2, 1, 3).contiguous()      # [mbsz, n_head, n_seq, d_head]

        self_scores = torch.matmul(q, k_t) / np.sqrt(self.d_head)  # [mbsz, n_head, n_seq, n_seq]
        self_scores = self_scores.masked_fill(trg_mask, -np.inf)
        self_weights = self.dropout(torch.softmax(self_scores, dim=3))
        self_ctx = torch.matmul(self_weights, v)                    # [mbsz, n_head, n_seq, d_head]
        self_ctx = self_ctx.permute(0, 2, 3, 1).contiguous().flatten(2, 3)  # [mbsz, n_seq, d_head * n_head]

        hidden = self.layer_norm1((x + self_ctx).view(-1, self.d_hidn))  # [mbsz * n_seq, d_hidn]

        # Attention over the encoder output.
        cross_q = self.to_query(hidden)                              # [mbsz * dec_n_seq, enc_d_hidn]
        cross_q = cross_q.view(batch, dec_len, self.enc_n_head, -1)
        cross_q = cross_q.permute(0, 2, 1, 3).contiguous()           # [mbsz, enc_n_head, dec_n_seq, enc_d_head]

        mem_k, mem_v = (enc_out @ self.enc_projs).chunk(2, 2)
        mem_k = mem_k.view(batch, enc_len, self.enc_n_head, -1).permute(0, 2, 3, 1).contiguous()  # [mbsz, enc_n_head, enc_d_head, enc_n_seq]
        mem_v = mem_v.view(batch, enc_len, self.enc_n_head, -1).permute(0, 2, 1, 3).contiguous()  # [mbsz, enc_n_head, enc_n_seq, enc_d_head]

        cross_scores = torch.matmul(cross_q, mem_k) / np.sqrt(self.enc_d_hidn / self.enc_n_head)  # [mbsz, enc_n_head, dec_n_seq, enc_n_seq]
        cross_scores = cross_scores.masked_fill(enc_mask, -np.inf)
        cross_weights = torch.softmax(cross_scores, dim=3)

        cross_ctx = torch.matmul(cross_weights, mem_v)               # [mbsz, enc_n_head, dec_n_seq, enc_d_head]
        cross_ctx = cross_ctx.permute(0, 2, 3, 1).contiguous().flatten(2, 3)  # [mbsz, dec_n_seq, enc_d_head * enc_n_head]
        hidden = hidden + self.enc_to_dec(cross_ctx.view(-1, self.enc_d_hidn))

        # Feed-forward with residual, then the final LayerNorm.
        hidden = self.layer_norm2(hidden + self.feedforward(hidden))
        return hidden.view(batch, dec_len, -1)

class Decoder(nn.Module):
    """Token embedding plus positional encodings, a stack of DecoderBlocks, and an output projection.

    Args:
        d_input: number of rows in the embedding table (target vocabulary size).
        d_hidn: decoder model width.
        d_out: width of the final linear projection.
        num_layers: number of DecoderBlock layers.
        n_head: self-attention heads per layer.
        dropout_ratio: dropout probability, used on the embedded input and
            passed to every layer.
        enc_d_hidn: width of the encoder output.
        enc_n_head: heads used when attending over the encoder output.
    """

    def __init__(self, d_input, d_hidn, d_out, num_layers, n_head, dropout_ratio, enc_d_hidn, enc_n_head):
        super().__init__()
        self.embedding = nn.Embedding(d_input, d_hidn)
        self.dropout = nn.Dropout(dropout_ratio)
        # Starts with two positions; the table grows on demand inside get().
        self.positional_embedding = PositionalEmbedding(n_seq=2, d_hid=d_hidn)
        blocks = []
        for _ in range(num_layers):
            blocks.append(DecoderBlock(d_hidn, n_head, dropout_ratio, enc_d_hidn, enc_n_head))
        self.layers = nn.ModuleList(blocks)
        self.decoder_out = nn.Linear(d_hidn, d_out)

    def forward(self, x, enc_out, trg_mask, enc_mask):
        """Decode token indices ``x`` against ``enc_out``.

        Both masks are handed to every layer unchanged. Returns a tensor of
        shape [mbsz, n_seq, d_out].
        """
        batch, length = x.size(0), x.size(1)

        hidden = self.embedding(x)
        hidden += self.positional_embedding.get(hidden.size(0), hidden.size(1))
        hidden = self.dropout(hidden)

        for block in self.layers:
            hidden = block(hidden, enc_out, trg_mask, enc_mask)

        # Project every position, then restore the [mbsz, n_seq, d_out] layout.
        logits = self.decoder_out(hidden.view(batch * length, -1))
        return logits.view(batch, length, -1)



        

In [None]:
# Vocabulary sizes of the source and target fields.
INPUT_DIM, OUTPUT_DIM = len(SRC.vocab), len(TRG.vocab)

# Model width shared by the encoder and the decoder.
HIDDEN_DIM = 512

# Encoder and decoder use the same depth, head count and dropout.
ENC_LAYERS = DEC_LAYERS = 6
ENC_HEADS = DEC_HEADS = 8
ENC_DROPOUT = DEC_DROPOUT = 0.1

In [None]:
# Build the encoder first, then the decoder, and move both to the selected device.
enc = Encoder(
    d_input=INPUT_DIM,
    d_hidn=HIDDEN_DIM,
    num_layers=ENC_LAYERS,
    n_head=ENC_HEADS,
    dropout_ratio=ENC_DROPOUT,
).to(device)
dec = Decoder(
    d_input=OUTPUT_DIM,
    d_hidn=HIDDEN_DIM,
    d_out=OUTPUT_DIM,
    num_layers=DEC_LAYERS,
    n_head=DEC_HEADS,
    dropout_ratio=DEC_DROPOUT,
    enc_d_hidn=HIDDEN_DIM,
    enc_n_head=ENC_HEADS,
).to(device)

# One Adam optimizer per module, same learning rate.
enc_solver = optim.Adam(enc.parameters(), lr=0.0005)
dec_solver = optim.Adam(dec.parameters(), lr=0.0005)

# Target positions holding <pad> do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG.vocab.stoi['<pad>'])

In [None]:
num_epochs = 20

# Padding indices do not change during training, so look them up once.
src_pad_idx = SRC.vocab.stoi['<pad>']
trg_pad_idx = TRG.vocab.stoi['<pad>']

for epoch in range(num_epochs):
    # Set train mode explicitly so dropout is active even if an earlier cell
    # (e.g. an interrupted inference run) left a model in eval mode.
    enc.train()
    dec.train()

    train_loss = 0.0
    for batch in train_iterator:
        src = batch.src.to(device)    # [mbsz, src_len]
        trg_ = batch.trg.to(device)   # [mbsz, trg_len + 1]

        # Teacher forcing: the decoder reads tokens 0..n-2 and predicts tokens 1..n-1.
        trg = trg_[:, :-1]
        target = trg_[:, 1:]

        src_len, trg_len = src.size(1), trg.size(1)
        src_data_pad = src == src_pad_idx   # True where the source token is <pad>
        trg_data_pad = trg == trg_pad_idx   # True where the decoder input token is <pad>

        # Encoder self-attention: hide padded source columns. [mbsz, 1, src_len, src_len]
        enc_mask = src_data_pad[:, None, None, :].expand(-1, -1, src_len, -1)

        # Decoder self-attention: hide future positions (strict upper triangle)
        # and padded target columns. [mbsz, 1, trg_len, trg_len]
        causal = torch.triu(torch.ones(trg_len, trg_len, device=device), diagonal=1).bool()
        dec_mask = causal[None, None, :, :] | trg_data_pad[:, None, None, :]

        # Decoder-to-encoder attention: hide padded source columns. [mbsz, 1, trg_len, src_len]
        dec_src_mask = src_data_pad[:, None, None, :].expand(-1, -1, trg_len, -1)

        enc_solver.zero_grad()
        dec_solver.zero_grad()
        encoded = enc(src, enc_mask)
        decoded = dec(trg, encoded, dec_mask, dec_src_mask)

        loss = criterion(decoded.flatten(0, 1), target.flatten(0, 1))
        loss.backward()

        enc_solver.step()
        dec_solver.step()
        train_loss += loss.item()

    # Mean per-batch loss for the epoch.
    print(train_loss / len(train_iterator))
    

0.09564689103727299
0.09580299742355745
0.09529580961962103
0.09508279907939718
0.09416684388523584
0.08791264235185632
0.08960615868150926
0.08981538905314937
0.09041737760150485
0.09108656492199141
0.09055035999496078
0.08808559026684004
0.08582361039217348
0.08288997667739045
0.08471085244404061
0.08546623507194583
0.08675657055188905
0.0813773715712688
0.0821534363872942
0.08224148723391184


In [None]:
example_idx = 2

src = vars(test_dataset.examples[example_idx])['src']
trg = vars(test_dataset.examples[example_idx])['trg']

print(" ".join(src))
print(" ".join(trg))

# Numericalize the source sentence and wrap it in <sos> ... <eos>; shape [1, src_len].
src = [SRC.vocab.stoi['<sos>']] + [SRC.vocab.stoi[token.lower()] for token in src] + [SRC.vocab.stoi['<eos>']]
src = torch.LongTensor(src).unsqueeze(0).to(device)

max_len = 50
eos_idx = TRG.vocab.stoi['<eos>']

# A single unpadded sentence, so nothing is masked on the source side.
enc_mask = torch.zeros(1, 1, src.size(1), src.size(1), dtype=torch.bool, device=device)
dec_src_mask = torch.zeros(1, 1, max_len, src.size(1), dtype=torch.bool, device=device)

# Causal mask: position i may not attend to positions j > i. Because of it, the
# not-yet-generated slots of `target` cannot influence the prediction at step i.
dec_mask = torch.triu(torch.ones(max_len, max_len, device=device), diagonal=1).bool()
dec_mask = dec_mask[None, None, :, :]

# Decoder input buffer, filled in one token per step, starting from <sos>.
target = torch.zeros(1, max_len).long().to(device)
target[0][0] = TRG.vocab.stoi["<sos>"]

seq = []
enc.eval()
dec.eval()
try:
    # Inference only: no_grad avoids building an autograd graph for every decoder pass.
    with torch.no_grad():
        encoded = enc(src, enc_mask)

        for i in range(max_len):
            decoded = dec(target, encoded, dec_mask, dec_src_mask)
            argmax = decoded[0][i].argmax().item()

            # Feed the greedy choice back in as the next decoder input.
            if i != max_len - 1:
                target[0][i + 1] = argmax

            if argmax == eos_idx:
                break

            seq.append(argmax)
finally:
    # Restore train mode even if decoding raised.
    enc.train()
    dec.train()

seq_trans = [TRG.vocab.itos[idx] for idx in seq]
print(" ".join(seq_trans))


ein mädchen in einem karateanzug bricht ein brett mit einem tritt .
a girl in karate uniform breaking a stick with a front kick .
a girl in a karate uniform uses a snowboard to toe .
