<a href="https://colab.research.google.com/github/leedhn/papago/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline

In [2]:
from google.colab import drive #edit
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NAVER 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/NAVER


In [3]:
import pandas as pd #edit
import numpy as np
from glob import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import logging

In [4]:
NUM_EPOCHS = 100

#Logger setting

In [5]:
logger = logging.getLogger('my_logger')

logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
logger.info('This message has a date/time timestamp')


logger.propagate = False # do not pass logs to the default logger

# Create handlers
c_handler = logging.StreamHandler()
EPOCH=10
f_handler = logging.FileHandler(f'transformer_{NUM_EPOCHS}.log', mode='w')
c_handler.setLevel(logging.INFO)
f_handler.setLevel(logging.INFO)

# Create formatters and add it to handlers
c_format = logging.Formatter('%(asctime)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(message)s')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)



2021-06-10 13:30:35,885 - This message has a date/time timestamp


#Loading data

In [6]:
logger.info('='*50)
logger.info('LOADING DATA')
logger.info('')
%cd data
txt_list = glob('*.txt') #edit
datas = {}
for txt in txt_list:
    datas[txt] = pd.read_csv(txt,header=None)
    name = txt[-10:-4]
    datas[txt].columns = [name]
    for i in range(len(datas[txt][name])):
        datas[txt][name][i] = np.fromstring(datas[txt][name][i] ,dtype=int,sep=' ').tolist()
        #datas[txt][name][i].append(1)
    logger.info(f'Load {txt} finished')
%cd ../

logger.info('='*50)

2021-06-10 13:30:35,907 - LOADING DATA
2021-06-10 13:30:35,909 - 


/content/drive/MyDrive/NAVER/data


2021-06-10 13:30:36,974 - Load train_source.txt finished
2021-06-10 13:30:38,183 - Load train_target.txt finished
2021-06-10 13:30:38,419 - Load test_target.txt finished
2021-06-10 13:30:38,644 - Load test_source.txt finished


/content/drive/MyDrive/NAVER



Language Translation with Transformer
=====================================

This tutorial shows, how to train a translation model from scratch using
Transformer. We will be using Multi30k dataset to train a German to English translation model.



Data Processing
---------------

torchtext has utilities for creating datasets that can be easily
iterated through for the purposes of creating a language translation
model. In this example, we show how to tokenize a raw text sentence,
build vocabulary, and numericalize tokens into tensor.

To run this tutorial, first install spacy using pip or conda. Next,
download the raw data for the English and German Spacy tokenizers from
https://spacy.io/usage/models



In [7]:
import math
import torchtext
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
from torch import Tensor
import io
import time

In [8]:
torch.manual_seed(0)
torch.use_deterministic_algorithms(True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128
PAD_IDX = 3#de_vocab['<pad>']
BOS_IDX = 2#de_vocab['<bos>']
EOS_IDX = 1#de_vocab['<eos>']

In [9]:
logger.info('')
logger.info('='*50)
logger.info('DATA PROCESSING')
logger.info('')

train_data_new = []
for iter in range(len(datas['train_source.txt'])):
    train_data_new.append((torch.Tensor(datas['train_source.txt']['source'][iter]).type(torch.long),torch.Tensor(datas['train_target.txt']['target'][iter]).type(torch.long)))

logger.info('Train data processing finished')

test_data_new = []
for iter in range(len(datas['test_source.txt'])):
    test_data_new.append((torch.Tensor(datas['test_source.txt']['source'][iter]).type(torch.long),torch.Tensor(datas['test_target.txt']['target'][iter]).type(torch.long)))

logger.info('Test data processing finished')
logger.info('-'*50)

2021-06-10 13:30:38,801 - 
2021-06-10 13:30:38,805 - DATA PROCESSING
2021-06-10 13:30:38,812 - 
2021-06-10 13:30:39,164 - Train data processing finished
2021-06-10 13:30:39,245 - Test data processing finished
2021-06-10 13:30:39,247 - --------------------------------------------------


DataLoader
----------

The last torch specific feature we’ll use is the DataLoader, which is
easy to use since it takes the data as its first argument. Specifically,
as the docs say: DataLoader combines a dataset and a sampler, and
provides an iterable over the given dataset. The DataLoader supports
both map-style and iterable-style datasets with single- or multi-process
loading, customizing loading order and optional automatic batching
(collation) and memory pinning.

Please pay attention to collate_fn (optional) that merges a list of
samples to form a mini-batch of Tensor(s). Used when using batched
loading from a map-style dataset.




In [10]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
  de_batch, en_batch = [], []
  for (de_item, en_item) in data_batch:
    
    de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0))
    en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
  de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)#, batch_first=True)
  en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)#, batch_first=True)

  return de_batch, en_batch



In [11]:

train_iter_new = DataLoader(train_data_new, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_iter_new = DataLoader(test_data_new, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn=generate_batch)

In [12]:
logger.info('DATA LOAD FINISHED')
logger.info('='*50)
logger.info('')

2021-06-10 13:30:39,280 - DATA LOAD FINISHED
2021-06-10 13:30:39,284 - 


Transformer!
------------

Transformer is a Seq2Seq model introduced in `“Attention is all you
need” <https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf>`__
paper for solving machine translation task. Transformer model consists
of an encoder and decoder block each containing fixed number of layers.

Encoder processes the input sequence by propogating it, through a series
of Multi-head Attention and Feed forward network layers. The output from
the Encoder referred to as ``memory``, is fed to the decoder along with
target tensors. Encoder and decoder are trained in an end-to-end fashion
using teacher forcing technique.




In [13]:
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)


class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
                
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

Text tokens are represented by using token embeddings. Positional
encoding is added to the token embedding to introduce a notion of word
order.




In [14]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + 
                            self.pos_embedding[:token_embedding.size(0),:])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

We create a ``subsequent word`` mask to stop a target word from
attending to its subsequent words. We also create masks, for masking
source and target padding tokens




In [15]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
  src_seq_len = src.shape[0]
  tgt_seq_len = tgt.shape[0]

  tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
  src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

  src_padding_mask = (src == PAD_IDX).transpose(0, 1)
  tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
  return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

Define model parameters and instantiate model 




In [16]:
# SRC_VOCAB_SIZE = len(de_vocab)
# TGT_VOCAB_SIZE = len(en_vocab)

SRC_VOCAB_SIZE = 1000#len(de_vocab)
TGT_VOCAB_SIZE = 1000#len(en_vocab)

EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 100#64#16

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, 
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)

In [17]:
# 모델의 state_dict 출력
logger.info("Model's state_dict:")
for param_tensor in transformer.state_dict():
    logger.info(f"{param_tensor}\t {transformer.state_dict()[param_tensor].size()}")

# 옵티마이저의 state_dict 출력
logger.info("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    logger.info(f"{var_name} \t {optimizer.state_dict()[var_name]}")

2021-06-10 13:30:42,144 - Model's state_dict:
2021-06-10 13:30:42,149 - transformer_encoder.layers.0.self_attn.in_proj_weight	 torch.Size([1536, 512])
2021-06-10 13:30:42,152 - transformer_encoder.layers.0.self_attn.in_proj_bias	 torch.Size([1536])
2021-06-10 13:30:42,155 - transformer_encoder.layers.0.self_attn.out_proj.weight	 torch.Size([512, 512])
2021-06-10 13:30:42,158 - transformer_encoder.layers.0.self_attn.out_proj.bias	 torch.Size([512])
2021-06-10 13:30:42,161 - transformer_encoder.layers.0.linear1.weight	 torch.Size([512, 512])
2021-06-10 13:30:42,164 - transformer_encoder.layers.0.linear1.bias	 torch.Size([512])
2021-06-10 13:30:42,167 - transformer_encoder.layers.0.linear2.weight	 torch.Size([512, 512])
2021-06-10 13:30:42,170 - transformer_encoder.layers.0.linear2.bias	 torch.Size([512])
2021-06-10 13:30:42,173 - transformer_encoder.layers.0.norm1.weight	 torch.Size([512])
2021-06-10 13:30:42,176 - transformer_encoder.layers.0.norm1.bias	 torch.Size([512])
2021-06-10 13:

In [18]:
def train_epoch(model, train_iter, optimizer):
  model.train()
  losses = 0
  for idx, (src, tgt) in enumerate(train_iter):
      src = src.to(device)
      tgt = tgt.to(device)
      #print(src.shape)
      tgt_input = tgt[:-1, :]

      src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

      logits = model(src, tgt_input, src_mask, tgt_mask,
                                src_padding_mask, tgt_padding_mask, src_padding_mask)
      
      optimizer.zero_grad()
      
      tgt_out = tgt[1:,:]
      loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
      loss.backward()

      optimizer.step()
      losses += loss.item()
  return losses / len(train_iter)


def evaluate(model, val_iter):
  model.eval()
  losses = 0
  for idx, (src, tgt) in (enumerate(val_iter)):
    src = src.to(device)
    tgt = tgt.to(device)

    tgt_input = tgt[:-1, :]

    src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

    logits = model(src, tgt_input, src_mask, tgt_mask,
                              src_padding_mask, tgt_padding_mask, src_padding_mask)
    tgt_out = tgt[1:,:]
    loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
    losses += loss.item()
  return losses / len(val_iter)

In [19]:
def evaluate_new(model, val_iter):
  model.eval()
  losses = 0
  for idx, (src, tgt) in (enumerate(val_iter)):
    src = src.to(device)
    tgt = tgt.to(device)

    tgt_input = tgt[:-1, :]

    src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

    logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
    tgt_out = tgt[1:,:]
    #loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
    #losses += loss.item()

    #print(logits.reshape(-1, logits.shape[-1]).shape)
    pred = torch.argmax(logits.reshape(-1, logits.shape[-1]), dim=1).view(tgt_out.shape)

    y = (torch.transpose(tgt_out, 0, 1))
    new_pred = torch.transpose(pred,0,1)

    #print(y)
    new_losses = 0
    for i in range(len(new_pred)):
        try:
            fin = new_pred[i].tolist().index(1)
        except:
            fin = len(new_pred[i])
            if fin==0:
                fin=1
        new_loss=0
        new_loss = np.linalg.norm(new_pred[i][:fin].cpu()-y[i][:fin].cpu())
        #fin = len(new_pred[i])
        # for k in range(fin):
        #     output = new_pred[i][k]
        #     target = y[i][k]
        #     new_loss += ((output - target)**2)#.mean()
        #     #print(new_loss)
        new_losses += new_loss#((new_loss/fin).item())
        
    losses += (new_losses / len(new_pred))    
    
  return losses / len(val_iter)


##Train model 




In [20]:
logger.info('='*50)
logger.info('TRAIN')
logger.info('-'*50)
train_losses = []
logger.info(f'NUM EPOCHS: {NUM_EPOCHS}')
for epoch in range(1, NUM_EPOCHS+1):
  start_time = time.time()
  train_loss = train_epoch(transformer, train_iter_new, optimizer)
  end_time = time.time()
  train_losses.append(train_loss)
  #val_loss = evaluate(transformer, test_iter_new)
  #norm_loss = evaluate_new(transformer,test_iter_new)
  #print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}"))#, Val loss: {val_loss:.3f}, "
          #f"Epoch time = {(end_time - start_time):.3f}s"),'norm_loss :',norm_loss)
  logger.info((f'Epoch: {epoch}, Train loss: {train_loss:.3f} Epoch time = {(end_time - start_time):.3f}s'))
logger.info('-'*50)
logger.info('TRAIN FINISHED')
logger.info('='*50)
logger.info('')

2021-06-10 13:30:42,559 - TRAIN
2021-06-10 13:30:42,561 - --------------------------------------------------
2021-06-10 13:30:42,566 - NUM EPOCHS: 100
2021-06-10 13:30:50,110 - Epoch: 1, Train loss: 4.253 Epoch time = 7.543s
2021-06-10 13:30:57,616 - Epoch: 2, Train loss: 3.167 Epoch time = 7.504s
2021-06-10 13:31:05,138 - Epoch: 3, Train loss: 2.719 Epoch time = 7.520s
2021-06-10 13:31:12,606 - Epoch: 4, Train loss: 2.386 Epoch time = 7.466s
2021-06-10 13:31:20,184 - Epoch: 5, Train loss: 2.099 Epoch time = 7.576s
2021-06-10 13:31:27,717 - Epoch: 6, Train loss: 1.841 Epoch time = 7.531s
2021-06-10 13:31:35,237 - Epoch: 7, Train loss: 1.615 Epoch time = 7.517s
2021-06-10 13:31:42,714 - Epoch: 8, Train loss: 1.420 Epoch time = 7.474s
2021-06-10 13:31:50,191 - Epoch: 9, Train loss: 1.262 Epoch time = 7.475s
2021-06-10 13:31:57,607 - Epoch: 10, Train loss: 1.125 Epoch time = 7.414s
2021-06-10 13:32:05,179 - Epoch: 11, Train loss: 1.018 Epoch time = 7.570s
2021-06-10 13:32:12,716 - Epoch: 

In [21]:
torch.save(transformer, f'trained_model/transformer_epoch_{NUM_EPOCHS}.pth')
logger.info(f'MODEL SAVED AS transformer_epoch_{NUM_EPOCHS}.pth')
logger.info('')
test_loss = evaluate(transformer, test_iter_new)
logger.info(f'TEST LOSS : {test_loss}')
logger.info('')

2021-06-10 13:43:14,065 - MODEL SAVED AS transformer_epoch_100.pth
2021-06-10 13:43:14,070 - 
2021-06-10 13:43:14,813 - TEST LOSS : 0.3388785906136036
2021-06-10 13:43:14,815 - 


#Decode and evaluate

In [22]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(device)
    src_mask = src_mask.to(device)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for i in range(max_len-1):
        memory = memory.to(device)
        memory_mask = torch.zeros(ys.shape[0], memory.shape[0]).to(device).type(torch.bool)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                                    .type(torch.bool)).to(device)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
          break
    return ys


In [23]:
def translate(model, idx):
  model.eval()
  tokens = test_data_new[idx][0]
  num_tokens = len(tokens)
  src = (torch.LongTensor(tokens).reshape(num_tokens, 1) )
  src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
  tgt_tokens = greedy_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()

  return tokens.tolist(),test_data_new[idx][1].tolist(),tgt_tokens[1:-1].tolist()

In [24]:
import random

logger.info(f"PREDICTION {10} EXAMPLES ")
logger.info('='*50)
for i in range(10):
    idx = random.randrange(1,len(test_data_new))
    source, target, predict = translate(transformer,idx)
    logger.info(f"> {source}")
    logger.info(f"= {target}")
    logger.info(f"< {predict}")
    logger.info('-'*40)
logger.info('')

2021-06-10 13:43:14,862 - PREDICTION 10 EXAMPLES 
2021-06-10 13:43:14,963 - > [271, 584, 68, 311, 584, 342, 68, 227, 200, 156, 437, 68, 263, 105, 95, 140, 227, 271, 200, 311, 68, 52, 200, 95, 271, 35, 113]
2021-06-10 13:43:14,964 - = [370, 68, 5, 68, 81, 68, 25, 85, 70, 158, 68, 331, 108]
2021-06-10 13:43:14,966 - < [370, 68, 5, 68, 81, 68, 247, 85, 70, 158, 68, 331, 149, 551]
2021-06-10 13:43:14,969 - ----------------------------------------
2021-06-10 13:43:15,057 - > [33, 95, 437, 156, 105, 584, 342, 35, 68, 437, 157, 402, 200, 402, 437, 78, 437, 157, 140]
2021-06-10 13:43:15,059 - = [240, 211, 405, 189, 85, 68, 430, 16, 158, 199, 438, 359]
2021-06-10 13:43:15,062 - < [240, 211, 266, 158, 108, 68, 430, 16, 158, 199, 438, 359]
2021-06-10 13:43:15,063 - ----------------------------------------
2021-06-10 13:43:15,131 - > [140, 105, 52, 619, 437, 140]
2021-06-10 13:43:15,132 - = [222, 476]
2021-06-10 13:43:15,134 - < [222, 476, 476, 476, 476, 476, 476, 476, 476]
2021-06-10 13:43:15,137

In [25]:
def correct_num(model): #whole answer
  model.eval()
  corrects = 0
  for idx in range(len(test_data_new)):
    tokens = test_data_new[idx][0]
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(num_tokens, 1) )
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()

    if test_data_new[idx][1].tolist()==tgt_tokens[1:-1].tolist():
        corrects +=1
  return corrects

In [26]:
def correct_num_wide(model): #sliced answer 
  model.eval()
  corrects = 0
  for idx in range(len(test_data_new)):
    tokens = test_data_new[idx][0]
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(num_tokens, 1) )
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()

    if test_data_new[idx][1].tolist()==tgt_tokens[1:len(test_data_new[idx][1])+1].tolist():
        corrects +=1
  return corrects

In [27]:
acc_c = (correct_num(transformer)) #epoch 16
acc_cw = (correct_num_wide(transformer))
logger.info('='*50)
logger.info(f'ACCURACY of model transformer_epoch_{NUM_EPOCHS}.pth')
logger.info('-'*50)
logger.info(f'Truely corrected target sequence score : {acc_c/len(test_data_new)} {acc_c} / {len(test_data_new)}')

logger.info(f'Widely corrected target sequence score : {acc_cw/len(test_data_new)} {acc_cw} / {len(test_data_new)}')
logger.info('')
logger.info('FINISHED')
logger.info('')

2021-06-10 13:49:01,411 - ACCURACY of model transformer_epoch_100.pth
2021-06-10 13:49:01,415 - --------------------------------------------------
2021-06-10 13:49:01,417 - Truely corrected target sequence score : 0.2455 491 / 2000
2021-06-10 13:49:01,421 - Widely corrected target sequence score : 0.6435 1287 / 2000
2021-06-10 13:49:01,423 - 
2021-06-10 13:49:01,426 - FINISHED
2021-06-10 13:49:01,427 - 


References
----------

1. Attention is all you need paper.
   https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
2. The annotated transformer. https://nlp.seas.harvard.edu/2018/04/03/attention.html#positional-encoding 

