#### Positional Encoding
$$PE_{(pos,2i)}=sin(pos/10000^{2i/d_{model}})$$
$$PE_{(pos,2i+1)}=cos(pos/10000^{2i/d_{model}})$$

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self,max_len,dim_model,device):
        super().__init__()
        self.encoding = torch.zeros(max_len,dim_model,device=device,dtype=torch.float32)
        self.encoding.requires_grad=False
        pos = torch.arange(0,max_len,device=device,dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,dim_model,2,device=device,dtype=torch.float32)*(math.log(10000)/dim_model))
        self.encoding[:,0::2] = torch.sin(pos/div_term)
        self.encoding[:,1::2] = torch.cos(pos/div_term)
    
    def forward(self,x):
        _, seq_len = x.size()
        
        return self.encoding[:seq_len,:]

#### Embedding

In [4]:
class TokenEmbedding(nn.Embedding):
    def __init__(self,num_embeddings,embedding_dim,padding_idx):
        super(TokenEmbedding,self).__init__(num_embeddings,embedding_dim,padding_idx)
#相当于nn.Embedding(num_embeddings=num_embeddings,embedding_dim=embedding_dim,padding_idx=padding_idx,device=device)

### TransformerEmbedding

<img src="./data/Transformer_figure/PosEmb.png" width="500" height="200">

In [5]:
class TransformerEmbedding(nn.Module):
    def __init__(self,max_len,dim_model,vocab_size,padding_idx,drop_prob,device):
        super().__init__()
        self.pos = PositionalEncoding(max_len=max_len,dim_model=dim_model,device=device)
        self.emb = TokenEmbedding(num_embeddings=vocab_size,embedding_dim=dim_model,
                                  padding_idx=padding_idx)
        self.drop = nn.Dropout(p=drop_prob)
    
    def forward(self,x):
        x_pos = self.pos(x)
        x_emb = self.emb(x)
        
        return self.drop(x_pos+x_emb)

##### ScaledDotProductAttention
<img src="./data/Transformer_figure/SDPA.png" width="200" height="200">
$$\mathrm{Attention}(Q,K,V)=\mathrm{softmax}(\frac {QK^{T}} {\sqrt{d_{k}}})V$$

In [6]:
class ScaledDotProductAttention(nn.Module):
    """
    input q,k,v shape=[batch_size,num_heads,seq_len,split_dim_model]
    output v shape=[batch_size,num_heads,seq_len,split_dim_model]
           score shape=[batch_size,num_heads,seq_len,seq_len]
    """
    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)
        
    def forward(self,q,k,v,mask=None):
        d_k = k.size(-1)
        k_t = k.transpose(2,3)
        score = (q@k_t)/math.sqrt(d_k)
        if mask is not None:
            mask = mask.to(torch.float32)
            score = score.masked_fill(mask==0,-1e6)
        score = self.softmax(score)
        v = score@v
        
        return v,score 

#### MultiheadAttention
<img src="./data/Transformer_figure/MHA.png" width="200" height="200">
$$\mathrm{MultiHead}(Q,K,V)=\mathrm{Concat}(\mathrm{head_1},...,\mathrm{head_n})W^O$$
$$\mathrm{head_i}=\mathrm{Attention}(QW_i^Q,KW_i^K,VW_i^V)$$

In [7]:
class MultiheadAttention(nn.Module):
    def __init__(self,dim_model,num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.attention = ScaledDotProductAttention()
        self.w_q = nn.Linear(dim_model,dim_model)
        self.w_k = nn.Linear(dim_model,dim_model)
        self.w_v = nn.Linear(dim_model,dim_model)
        self.w_o = nn.Linear(dim_model,dim_model)
    
    def forward(self,q,k,v,mask):
        q = self.w_q(q)
        k = self.w_k(k)
        v = self.w_v(v)
        q = self.split(q)
        k = self.split(k)
        v = self.split(v)
        out, attention = self.attention(q,k,v,mask)
        out = self.concat(out)
        out = self.w_o(out)
        
        return out
    
    def split(self,x):
        batch_size, seq_len, dim_model = x.size()
        split_dim_model = dim_model//self.num_heads
        
        return x.reshape(batch_size,self.num_heads,seq_len,split_dim_model)
    
    def concat(self,x):
        batch_size, num_heads, seq_len, split_dim_model = x.size()
        x = x.transpose(1,2)
        
        return x.reshape(batch_size,seq_len,num_heads*split_dim_model)

#### LayerNorm
$$y=\frac{x-\mathrm{E}[x]}{\sqrt{\mathrm{Var}[x]+\epsilon}}*\gamma+\beta$$

In [8]:
class LayerNorm(nn.Module):
    def __init__(self,dim_model,epsilon=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(dim_model))
        self.beta = nn.Parameter(torch.zeros(dim_model))
        self.epsilon = epsilon
    
    def forward(self,x):
        mean = x.mean(-1,keepdim=True)
        var = x.var(-1,unbiased=False,keepdim=True)
        
        return (x-mean)/torch.sqrt(var+self.epsilon)*self.gamma+self.beta

#### PWFFN
$$\mathrm{FFN}(x)=\mathrm{max}(0,xW_1+b_1)W_2+b_2$$

In [9]:
class PWFFN(nn.Module):
    def __init__(self,dim_model,ffn_hidden,drop_prob=0.1):
        super().__init__()
        self.fc1 = nn.Linear(dim_model,ffn_hidden)
        self.fc2 = nn.Linear(ffn_hidden,dim_model)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=drop_prob)
    
    def forward(self,x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.fc2(x)
        
        return x

### TransformerEncoderLayer
<img src="./data/Transformer_figure/EncoderLayer.png" width="200" height="200">
$$\mathrm{LayerNorm}(x+\mathrm{Sublayer}(x))$$

In [10]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self,dim_model,num_heads,ffn_hidden,drop_prob):
        super().__init__()
        self.attention =  MultiheadAttention(dim_model=dim_model,num_heads=num_heads)
        self.pwffn = PWFFN(dim_model=dim_model,ffn_hidden=ffn_hidden,drop_prob=drop_prob)
        self.norm1 = LayerNorm(dim_model=dim_model)
        self.norm2 = LayerNorm(dim_model=dim_model)
        self.drop1 = nn.Dropout(p=drop_prob)
        self.drop2 = nn.Dropout(p=drop_prob)
    
    def forward(self,src,src_mask):
        # layer1
        x1 = src
        q,k,v = src,src,src
        out1 = self.attention(q,k,v,src_mask)
        out1 = self.drop1(out1)
        out1 = self.norm1(x1+out1)
        
        # layer2
        x2 = out1
        out = self.pwffn(x2)
        out = self.drop2(out)
        out = self.norm2(x2+out)
        
        return out

## TransformerEncoder
<img src="./data/Transformer_figure/Encoder.png" width="200" height="200">

In [11]:
class TransformerEncoder(nn.Module):
    def __init__(self,max_len,dim_model,num_heads,num_layers,ffn_hidden,vocab_size,padding_idx,drop_prob,device):
        super().__init__()
        self.posemb = TransformerEmbedding(max_len=max_len,dim_model=dim_model,vocab_size=vocab_size,
                                           padding_idx=padding_idx,drop_prob=drop_prob,device=device)
        self.layers = nn.ModuleList([TransformerEncoderLayer(dim_model=dim_model,num_heads=num_heads,
                                     ffn_hidden=ffn_hidden,drop_prob=drop_prob) for _ in range(num_layers)])
        
    def forward(self,src,src_mask):
        src = self.posemb(src)
        for layer in self.layers:
            src = layer(src,src_mask)
        
        return src

### TransformerDecoderLayer
<img src="./data/Transformer_figure/DecoderLayer.png" width="200" height="200">
$$\mathrm{LayerNorm}(x+\mathrm{Sublayer}(x))$$

In [12]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self,dim_model,num_heads,ffn_hidden,drop_prob):
        super().__init__()
        self.attention1 = MultiheadAttention(dim_model=dim_model,num_heads=num_heads)
        self.attention2 = MultiheadAttention(dim_model=dim_model,num_heads=num_heads)
        self.pwffn = PWFFN(dim_model=dim_model,ffn_hidden=ffn_hidden,drop_prob=drop_prob)
        self.norm1 = LayerNorm(dim_model=dim_model)
        self.norm2 = LayerNorm(dim_model=dim_model)
        self.norm3 = LayerNorm(dim_model=dim_model)
        self.drop1 = nn.Dropout(p=drop_prob)
        self.drop2 = nn.Dropout(p=drop_prob)
        self.drop3 = nn.Dropout(p=drop_prob)
        
    def forward(self,enc_src,tgt,src_mask,tgt_mask):
        # layer1
        x1 = tgt
        q,k,v = tgt,tgt,tgt
        out1 = self.attention1(q,k,v,tgt_mask)
        out1 = self.drop1(out1)
        out1 = self.norm1(x1+out1)
        
        # layer2
        x2 = out1
        q = out1
        k,v = enc_src,enc_src
        out2 = self.attention2(q,k,v,src_mask)
        out2 = self.drop2(out2)
        out2 = self.norm2(x2+out2)
        
        # layer3
        x3 = out2
        out = self.pwffn(out2)
        out = self.drop3(out)
        out = self.norm3(x3+out)
        
        return out

## TransformerDecoder
<img src="./data/Transformer_figure/Decoder.png" width="200" height="200">

In [13]:
class TransformerDecoder(nn.Module):
    def __init__(self,max_len,dim_model,num_heads,num_layers,ffn_hidden,vocab_size,padding_idx,drop_prob,device):
        super().__init__()
        self.posemb = TransformerEmbedding(max_len=max_len,dim_model=dim_model,vocab_size=vocab_size,
                                           padding_idx=padding_idx,drop_prob=drop_prob,device=device)
        self.layers = nn.ModuleList(TransformerDecoderLayer(dim_model=dim_model,num_heads=num_heads,
                                    ffn_hidden=ffn_hidden,drop_prob=drop_prob) for _ in range(num_layers))
        self.generator = nn.Linear(dim_model,vocab_size)
        
    def forward(self,enc_src,tgt,src_mask,tgt_mask):
        tgt = self.posemb(tgt)
        for layer in self.layers:
            tgt = layer(enc_src,tgt,src_mask,tgt_mask)
        out = self.generator(tgt)
        
        return out

# Transformer
<img src="./data/Transformer_figure/Model.png" width="400" height="200">

In [14]:
class Transformer(nn.Module):
    def __init__(self,max_len,dim_model,num_heads,num_layers,ffn_hidden,src_vocab_size,src_padding_idx,tgt_vocab_size,tgt_padding_idx,drop_prob,device):
        super().__init__()
        self.src_padding_idx = src_padding_idx
        self.tgt_padding_idx = tgt_padding_idx
        self.encoder = TransformerEncoder(max_len=max_len,dim_model=dim_model,num_heads=num_heads,
                       num_layers=num_layers,ffn_hidden=ffn_hidden,vocab_size=src_vocab_size,
                       padding_idx=src_padding_idx,drop_prob=drop_prob,device=device)
        self.decoder = TransformerDecoder(max_len=max_len,dim_model=dim_model,num_heads=num_heads,
                       num_layers=num_layers,ffn_hidden=ffn_hidden,vocab_size=tgt_vocab_size,
                       padding_idx=tgt_padding_idx,drop_prob=drop_prob,device=device)
        
    def forward(self,src,tgt):
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        enc_src = self.encoder(src,src_mask)
        out = self.decoder(enc_src,tgt,src_mask,tgt_mask)
        
        return out
    # make mask
    def make_src_mask(self,src):
        src_mask = (src!=self.src_padding_idx).unsqueeze(1).unsqueeze(2)
        
        return src_mask
    
    def make_tgt_mask(self,tgt):
        tgt_pad_mask = (tgt!=self.tgt_padding_idx).unsqueeze(1).unsqueeze(2)
        tgt_seq_len = tgt.size(1)
        tgt_seq_mask = torch.tril(torch.ones(tgt_seq_len,tgt_seq_len)).type(torch.ByteTensor).to(device)
        tgt_mask = tgt_pad_mask & tgt_seq_mask
        
        return tgt_mask

In [1]:
import os
import math
import torch
import torch.nn as nn
import torch.optim as optim

#### test

In [22]:
# make mask
def make_src_mask(src,src_pad_idx):
    src_mask = (src!=src_pad_idx).unsqueeze(1).unsqueeze(2)
    return src_mask
def make_tgt_mask(tgt,tgt_pad_idx):
    tgt_pad_mask = (tgt!=tgt_pad_idx).unsqueeze(1).unsqueeze(2)
    tgt_seq_len = tgt.size(1)
    tgt_seq_mask = torch.tril(torch.ones(tgt_seq_len,tgt_seq_len)).type(torch.ByteTensor).to(device)
    tgt_mask = tgt_pad_mask & tgt_seq_mask
    return tgt_mask

In [2]:
virtual_src = torch.Tensor([
    [9,4,3,2,5,6,1,1],
    [5,3,2,6,8,4,1,1],
    [2,3,4,5,6,7,8,9],
    [5,6,7,4,8,6,2,1]
])
virtual_tgt = torch.Tensor([
    [10,6,4,3,9,7,8,5,1,1],
    [5,7,4,2,8,6,9,3,1,1],
    [3,5,2,6,7,4,11,9,8,10],
    [5,7,2,6,4,9,3,5,6,1]
])
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
virtual_src,virtual_tgt = virtual_src.long().to(device),virtual_tgt.long().to(device)
virtual_src_voc_size = 10
virtual_tgt_voc_size = 11
virtual_batch_size = 4
virtual_dmodel = 6
virtual_num_layers = 3
virtual_nheads = 2
virtual_ffn_hidden = 24
virtual_max_len = 100
virtual_drop_prob=0.1
virtual_src_pad_idx=1
virtual_tgt_pad_idx=1
virtual_tgt_sos_idx=2
#virtual_src_mask = make_src_mask(virtual_src,virtual_src_pad_idx)
#virtual_tgt_mask = make_tgt_mask(virtual_tgt,virtual_tgt_pad_idx)

In [15]:
test = Transformer(virtual_max_len,virtual_dmodel,virtual_nheads,virtual_num_layers,virtual_ffn_hidden,
                  virtual_src_voc_size,virtual_src_pad_idx,virtual_tgt_voc_size,virtual_tgt_pad_idx,
                  virtual_drop_prob,device).to(device)

In [16]:
test(virtual_src,virtual_tgt)

  nonzero_finite_vals = torch.masked_select(


tensor([[[-3.2804e-01, -6.5257e-01,  5.7021e-01,  2.4681e-01,  8.0556e-02,
          -5.4721e-01,  3.5284e-01, -1.0073e+00, -2.7481e-01, -9.5489e-03,
          -4.8646e-01],
         [-5.9465e-02, -7.1724e-01,  1.0543e+00, -7.3162e-01,  1.8026e-01,
           5.6468e-01, -1.0772e-01, -1.0095e+00, -5.5105e-01,  4.2806e-01,
           3.4945e-01],
         [-2.2491e-01, -8.6839e-01,  7.6564e-01, -2.6864e-01,  2.8544e-02,
          -3.9882e-01, -2.5482e-01, -9.5177e-01, -8.8793e-01, -2.0861e-01,
          -1.5761e-01],
         [-2.0514e-01, -8.9940e-01,  8.0265e-01, -2.7720e-01, -1.0644e-01,
           1.9816e-01,  9.3696e-01, -7.9800e-01,  3.6999e-01, -1.4054e-01,
           1.8910e-01],
         [-2.2643e-02, -2.7954e-01,  3.9777e-01,  1.4855e-02,  7.3926e-02,
           1.9396e-01,  4.2662e-01, -7.1609e-01,  7.8967e-02,  4.3733e-01,
          -9.7409e-02],
         [ 8.1475e-01,  5.6603e-01, -1.2859e+00,  2.2307e-01, -1.1293e+00,
          -9.5517e-02,  1.2320e-01,  1.2554e+00,  4.799

#### test

## data

In [17]:
#config
# model parameter setting
batch_size = 128
max_len = 256
dim_model = 512
num_layers = 6
num_heads = 8
ffn_hidden = 2048
drop_prob = 0.1
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
# optimizer parameter setting
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 1000
clip = 1.0
weight_decay = 5e-4
inf = float('inf')


In [18]:
import spacy
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

class DataLoaderWrapper:
    def __init__(self, tokenize_en, tokenize_de, init_token, eos_token):
        self.tokenize_en = tokenize_en
        self.tokenize_de = tokenize_de
        self.init_token = init_token
        self.eos_token = eos_token
        print('Dataset initializing start')

    def make_dataset(self):
        # Load the dataset
        train_data, valid_data, test_data = Multi30k(split=('train', 'valid', 'test'))
        return train_data, valid_data, test_data

    def build_vocab(self, train_data, min_freq):
        def yield_tokens(data_iter, tokenizer):
            for src, trg in data_iter:
                yield tokenizer(src)
                yield tokenizer(trg)

        self.src_vocab = build_vocab_from_iterator(yield_tokens(train_data, self.tokenize_de),
                                              min_freq=min_freq, specials=['<unk>', '<pad>', '<sos>', '<eos>'])
        self.src_vocab.set_default_index(self.src_vocab['<unk>'])

        self.tgt_vocab = build_vocab_from_iterator(yield_tokens(train_data, self.tokenize_en),
                                              min_freq=min_freq, specials=['<unk>', '<pad>', '<sos>', '<eos>'])
        self.tgt_vocab.set_default_index(self.tgt_vocab['<unk>'])

        return self.src_vocab, self.tgt_vocab

    def collate_batch(self, batch):
        src_batch, tgt_batch = zip(*batch)
        src_batch = [torch.tensor(self.src_vocab(self.tokenize_de(x))) for x in src_batch]
        tgt_batch = [torch.tensor(self.tgt_vocab(self.tokenize_en(x))) for x in tgt_batch]

        src_batch = pad_sequence(src_batch, padding_value=self.src_vocab['<pad>'],batch_first=True)
        tgt_batch = pad_sequence(tgt_batch, padding_value=self.tgt_vocab['<pad>'],batch_first=True)

        return src_batch, tgt_batch

    def make_iter(self, train_data, valid_data, test_data, batch_size, device):
        train_iter = DataLoader(train_data, batch_size=batch_size, collate_fn=self.collate_batch, shuffle=True)
        valid_iter = DataLoader(valid_data, batch_size=batch_size, collate_fn=self.collate_batch)
        test_iter = DataLoader(test_data, batch_size=batch_size, collate_fn=self.collate_batch)
        print('Dataset initializing done')
        return train_iter, valid_iter, test_iter

class Tokenizer:
    def __init__(self):
        self.spacy_de = spacy.load('de_core_news_sm')
        self.spacy_en = spacy.load('en_core_web_sm')

    def tokenize_de(self, text):
        """
        Tokenizes German text from a string into a list of strings
        """
        return [tok.text for tok in self.spacy_de.tokenizer(text)]

    def tokenize_en(self, text):
        """
        Tokenizes English text from a string into a list of strings
        """
        return [tok.text for tok in self.spacy_en.tokenizer(text)]

# Example usage
tokenizer = Tokenizer()
loader = DataLoaderWrapper(
    tokenize_en=tokenizer.tokenize_en,
    tokenize_de=tokenizer.tokenize_de,
    init_token='<sos>',
    eos_token='<eos>'
)

train, valid, test = loader.make_dataset()
src_vocab, tgt_vocab = loader.build_vocab(train_data=train, min_freq=2)


train_iter, valid_iter, test_iter = loader.make_iter(train, valid, test,
                                                     batch_size=batch_size,
                                                     device=device)
# data parameters
src_pad_idx = src_vocab['<pad>']
tgt_pad_idx = tgt_vocab['<pad>']
tgt_sos_idx = tgt_vocab['<sos>']

enc_voc_size = len(src_vocab)
dec_voc_size = len(tgt_vocab)

len_train_iter = sum(1 for _ in train_iter)
#len_test_iter = sum(1 for _ in test_iter)
len_val_iter = sum(1 for _ in valid_iter)




Dataset initializing start


################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################



Dataset initializing done


In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform(m.weight.data)


model = Transformer(max_len,dim_model,num_heads,num_layers,ffn_hidden,enc_voc_size,src_pad_idx,dec_voc_size,
                    tgt_pad_idx,drop_prob,device).to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')
model.apply(initialize_weights)
optimizer = optim.Adam(params=model.parameters(),
                 lr=init_lr,
                 weight_decay=weight_decay,
                 eps=adam_eps)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 verbose=True,
                                                 factor=factor,
                                                 patience=patience)

criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)


def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, (src, tgt) in enumerate(iterator):
        src,tgt = src.to(device),tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        tgt = tgt[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        #len_iterator=227
        #break
        print('step :', round((i / len_train_iter) * 100, 2), '% , loss :', loss.item())

    return epoch_loss / len_train_iter


def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    batch_bleu = []
    with torch.no_grad():
        for i, (src, tgt) in enumerate(iterator):
            src,tgt = src.to(device),tgt.to(device)
            output = model(src, tgt[:, :-1])
            total_bleu = []
            for j in range(tgt.shape[0]):
                try:
                    tgt_words = idx_to_word(tgt[j], tgt_vocab)
                    output_words = output[j].max(dim=1)[1]
                    output_words = idx_to_word(output_words, tgt_vocab)
                    bleu = get_bleu(hypotheses=output_words.split(), reference=tgt_words.split())
                    total_bleu.append(bleu)
                except Exception as e:
                    print(e)
                    pass

            total_bleu = sum(total_bleu) / len(total_bleu)
            batch_bleu.append(total_bleu)
            
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            tgt = tgt[:, 1:].contiguous().view(-1)

            loss = criterion(output_reshape, tgt)
            epoch_loss += loss.item()
            
    batch_bleu = sum(batch_bleu) / len(batch_bleu)
    return epoch_loss / len_val_iter, batch_bleu

The model has 66,472,633 trainable parameters


  nn.init.kaiming_uniform(m.weight.data)


In [20]:
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def idx_to_word(x, vocab):
    words = []
    for i in x:
        word = vocab.get_itos()[i]
        if '<' not in word:
            words.append(word)
    words = " ".join(words)
    return words

def get_bleu(hypotheses, reference):
    """Get validation BLEU score for dev set."""
    stats = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    for hyp, ref in zip(hypotheses, reference):
        stats += np.array(bleu_stats(hyp, ref))
    return 100 * bleu(stats)

In [21]:
def run(total_epoch, best_loss):
    train_losses, test_losses, bleus = [], [], []
    for step in range(total_epoch):
        start_time = time.time()
        train_loss = train(model, train_iter, optimizer, criterion, clip)
        valid_loss, bleu = evaluate(model, valid_iter, criterion)
        end_time = time.time()

        if step > warmup:
            scheduler.step(valid_loss)

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), f'saved/model_{step+1}-{valid_loss}.pt')

        f = open('result/train_loss.txt', 'w')
        f.write(str(train_losses))
        f.close()

        f = open('result/bleu.txt', 'w')
        f.write(str(bleus))
        f.close()

        f = open('result/test_loss.txt', 'w')
        f.write(str(test_losses))
        f.close()

        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')
        print(f'\tBLEU Score: {bleu:.3f}')


if __name__ == '__main__':
    run(total_epoch=epoch, best_loss=inf)




step : 0.0 % , loss : 10.579936027526855
step : 0.44 % , loss : 10.550848007202148
step : 0.88 % , loss : 10.5911865234375
step : 1.32 % , loss : 10.553374290466309
step : 1.76 % , loss : 10.47957992553711
step : 2.2 % , loss : 10.550291061401367
step : 2.64 % , loss : 10.464485168457031
step : 3.08 % , loss : 10.503978729248047
step : 3.52 % , loss : 10.500747680664062
step : 3.96 % , loss : 10.517138481140137
step : 4.41 % , loss : 10.474889755249023
step : 4.85 % , loss : 10.471578598022461
step : 5.29 % , loss : 10.437241554260254
step : 5.73 % , loss : 10.37551498413086
step : 6.17 % , loss : 10.390289306640625
step : 6.61 % , loss : 10.425599098205566
step : 7.05 % , loss : 10.3930082321167
step : 7.49 % , loss : 10.3561372756958
step : 7.93 % , loss : 10.35879898071289


KeyboardInterrupt: 