reference:
- https://github.com/jungyeul/korean-parallel-corpora
- https://towardsdatascience.com/intuitive-understanding-of-attention-mechanism-in-deep-learning-6c9482aecf4f

#### concatenate kor.txt and eng.txt with tab delimiter.

In [1]:
import sys
sys.path.insert(0, '../KoGPT2/')

In [2]:
import glob
import re
import os
import string
import time
import tqdm
import sklearn
import torch
import numpy as np
import plotly.graph_objects as go
import tensorflow.compat.v1 as tf

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from plotly.offline import iplot
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.utils import get_tokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from torch.utils.data import Dataset, DataLoader
from torch import nn
from transformers import BertModel, BertTokenizer
import pandas as pd
import torch

In [3]:
data_dir = '../korean-parallel-corpora/korean-english-news-v1'
targets = ['train', 'test', 'dev']

In [4]:
for target in targets:
    output_filename = '{}/{}.txt'.format(data_dir, target)
    if os.path.exists(output_filename):
        print('already have {}'.format(output_filename))
        continue
        
    ko_file, en_file = glob.glob('{}/*{}.??'.format(data_dir, target))
    
    # read korean/english files
    ko_lines = open(ko_file).read().strip().split('\n')
    en_lines = open(en_file).read().strip().split('\n')
    
    # write to output file
    with open(output_filename, 'w') as out:
        for en, kr in zip(en_lines, ko_lines):
            oneline = '\t'.join([en, kr])
            out.write(oneline + '\n')
            
    print('{} was written'.format(output_filename))

already have ../korean-parallel-corpora/korean-english-news-v1/train.txt
already have ../korean-parallel-corpora/korean-english-news-v1/test.txt
already have ../korean-parallel-corpora/korean-english-news-v1/dev.txt


In [5]:
model_nm = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_nm)
eng_tokenizer = tokenizer.tokenize

embedding_size = 10
hidden_size = 32
n_batch = 8

In [6]:
tok_path = get_tokenizer()
kor_tokenizer = SentencepieceTokenizer(tok_path)
_, vocab = get_pytorch_kogpt2_model()

using cached model
using cached model
using cached model


In [7]:
vocab(['<pad>'])[0], tokenizer.vocab['[PAD]']

(3, 0)

In [8]:
def convert_string_to_index(kr=None, en=None, kr_pad=tokenizer.vocab['[PAD]'], en_pad=vocab(['<pad>'])[0], maxlen=50):
    '''
        convert korean/english sentence into its own indices.
        maximum length of converted indices is `maxlen`
    '''
    assert (kr != None) or (en != None), 'one of either kr or en should have a value'
    kr_index, en_index = None, None
    
    if kr:
        kr_index = vocab(kor_tokenizer(kr))
        if len(kr_index) > maxlen:
            kr_index = kr_index[:maxlen]
        else:
            kr_index = kr_index + [kr_pad] * (maxlen-len(kr_index))
    if en:
        en_index = tokenizer.convert_tokens_to_ids(eng_tokenizer(en))
        if len(en_index) > maxlen:
            en_index = en_index[:maxlen]
        else:
            en_index = en_index + [en_pad] * (maxlen-len(en_index))
        
    return kr_index, en_index

In [9]:
class TranslationText():
    def __init__(self, kr, en):
        self.kr = kr
        self.en = en

In [10]:
class TranslationDataset(Dataset):
    """Translation dataset."""

    def __init__(self, csv_file, transform=None, names=['kor', 'eng'], sep='\t'):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
            names (list): A list of column names
            sep (string): A string that is used for a delimiter.
        """
        #self.landmarks_frame = pd.read_csv(csv_file)
        self.df = pd.read_csv(csv_file, names=names, sep=sep)
        self.df = self.df[(self.df.kor.notnull()) & (self.df.eng.notnull())]
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        onerow =  self.df.iloc[idx]
        kr_index, en_index = convert_string_to_index(kr=onerow.kor, en=onerow.eng)
        kor_tensor = torch.LongTensor(kr_index)
        eng_tensor = torch.LongTensor(en_index)
        return (kor_tensor, eng_tensor)

In [11]:
dataset = TranslationDataset('../korean-parallel-corpora/korean-english-news-v1/train.txt')

In [12]:
trainloader = DataLoader(dataset, batch_size = n_batch, shuffle = True)

In [13]:
for i, (input_kr, input_en) in enumerate(trainloader):
    print(input_kr.shape)
    print(input_en.shape)
    print('------------')
    break

torch.Size([8, 50])
torch.Size([8, 50])
------------


In [14]:
input_kr.shape, input_en.shape

(torch.Size([8, 50]), torch.Size([8, 50]))

In [15]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, n_batch):
        super(Encoder, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, self.embedding_size)
        self.gru = nn.GRU(embedding_size, 
                          hidden_size,
                          num_layers=1,
                          bias=True,
                          batch_first=True,
                          bidirectional=False)
        self.init_hidden = torch.zeros(1, n_batch, self.hidden_size).cuda()
        
    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.gru(x, self.init_hidden)
        return output, hidden

In [16]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, n_batch):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.gru = nn.GRU(embedding_size+hidden_size,
                         hidden_size,
                         num_layers=1,
                         bias=True, 
                         batch_first=True,
                         bidirectional=False)
        self.init_hidden = torch.zeros(1, n_batch, self.hidden_size).cuda()
        self.tahn = nn.Tanh()
        
        self.W1 = nn.Linear(self.hidden_size, self.hidden_size)
        self.W2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.V = nn.Linear(self.hidden_size, 1)
        self.softmax = nn.Softmax(dim=1)
        self.fc = nn.Linear(self.hidden_size, vocab_size)
        
    def forward(self, x, hidden, enc_output):
        score = self.V(self.tahn(self.W1(enc_output) + self.W2(hidden)))
        attention_weights = self.softmax(score)
        context_vector = attention_weights * enc_output
        context_vector = context_vector.sum(dim=1)
        
        x = self.embedding(x)
        x = torch.cat((x, context_vector.unsqueeze(dim=1)), dim=-1)
        output, state = self.gru(x, self.init_hidden)
        output = output.squeeze(dim=1)
        output = self.fc(output)
        return output, state, attention_weights

In [17]:
class EncoderDecoder(nn.Module):
    def __init__(self, embedding_size, hidden_size, src_vocab_size, dst_vocab_size):
        super(EncoderDecoder, self).__init__()
        
        # tokenizer.vocab_size == src_vocab_size
        # len(vocab) == dst_vocab_size
        self.encoder = Encoder(src_vocab_size, embedding_size, hidden_size, n_batch=n_batch)
        self.decoder = Decoder(dst_vocab_size, embedding_size, hidden_size, n_batch=n_batch)
        self.loss_fn = DecoderLoss()
        
    def forward(self, src, dst):
        enc_output, enc_hidden = self.encoder(src)

        loss = 0
        hidden = enc_hidden.transpose(0, 1)
        dec_input = torch.LongTensor([vocab(['<start>'])] * n_batch).to('cuda')
        for t in range(1, dst.size()[1]):
            output, state, _ = self.decoder(dec_input, hidden, enc_output)
            loss += self.loss_fn(dst[:, t], output)
        return loss

In [18]:
class DecoderLoss(nn.Module):
    def __init__(self):
        super(DecoderLoss, self).__init__()
        self.loss_fn = nn.CrossEntropyLoss()
        
    def forward(self, real, pred):
        mask = 1 - (real == 0).type(torch.LongTensor)
        loss = self.loss_fn(pred, real) * mask
        loss = loss.sum()
        return loss

In [19]:
input_kr.shape, input_en.shape
torch.cat((input_kr, input_en), dim=-1).shape

torch.Size([8, 100])

In [20]:
seq2seq = EncoderDecoder(embedding_size, hidden_size, tokenizer.vocab_size, len(vocab))
seq2seq.cuda()

EncoderDecoder(
  (encoder): Encoder(
    (embedding): Embedding(30522, 10)
    (gru): GRU(10, 32, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(50000, 10)
    (gru): GRU(42, 32, batch_first=True)
    (tahn): Tanh()
    (W1): Linear(in_features=32, out_features=32, bias=True)
    (W2): Linear(in_features=32, out_features=32, bias=True)
    (V): Linear(in_features=32, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
    (fc): Linear(in_features=32, out_features=50000, bias=True)
  )
  (loss_fn): DecoderLoss(
    (loss_fn): CrossEntropyLoss()
  )
)

In [21]:
seq2seq.train()

EncoderDecoder(
  (encoder): Encoder(
    (embedding): Embedding(30522, 10)
    (gru): GRU(10, 32, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(50000, 10)
    (gru): GRU(42, 32, batch_first=True)
    (tahn): Tanh()
    (W1): Linear(in_features=32, out_features=32, bias=True)
    (W2): Linear(in_features=32, out_features=32, bias=True)
    (V): Linear(in_features=32, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
    (fc): Linear(in_features=32, out_features=50000, bias=True)
  )
  (loss_fn): DecoderLoss(
    (loss_fn): CrossEntropyLoss()
  )
)

In [22]:
n_epoch = 5
for epoch in range(n_epoch):
    total_loss = 0
    
    tbar = tqdm(enumerate(trainloader), desc='training at {}th epoch'.format(i))
    for i, (input_kr, input_en) in tbar:
        input_en = input_en.cuda()
        input_kr = input_kr.cuda()
        loss = seq2seq(input_en, input_kr)
#         enc_output, enc_hidden = encoder(input_en)

#         loss = 0
#         hidden = enc_hidden.transpose(0, 1)
#         dec_input = torch.LongTensor([vocab(['<pad>'])] * n_batch)
#         for t in range(1, input_kr.size()[1]):
#             output, state, _ = decoder(dec_input, hidden, enc_output)
#             loss += loss_fn(input_kr[:, t], output)
        
        # calculate loss
        batch_loss = loss / input_kr.size()[1]
        total_loss += batch_loss
        tbar.set_postfix(loss=batch_loss)
        
    print('Loss at {}th epoch: {:.4f}'.format(epoch, total_loss / n_batch))

training at 0th epoch: 34it [00:01, 19.40it/s, loss=tensor(43.3575, grad_fn=<DivBackward0>)]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 5.93 GiB total capacity; 5.43 GiB already allocated; 5.75 MiB free; 5.44 GiB reserved in total by PyTorch)

training at 0th epoch: 34it [00:20, 19.40it/s, loss=tensor(43.3575, grad_fn=<DivBackward0>)]

In [30]:
loss.shape

torch.Size([10])

In [29]:
from torch.autograd import Variable

In [30]:
input_size = 10 # input dimension (word embedding) D
hidden_size = 30 # hidden dimension H
batch_size = 3
length = 4

rnn = nn.GRU(input_size,hidden_size,num_layers=1,bias=True,batch_first=True,bidirectional=True)
inputs = Variable(torch.randn(batch_size,length,input_size)) # B,T,D
hidden = Variable(torch.zeros(2,batch_size,hidden_size)) # 2,B,H

print(inputs.shape)
print(hidden.size())
print('----')
output, hidden = rnn(inputs, hidden)

print(inputs.shape)
print(output.size())
print(hidden.size())

torch.Size([3, 4, 10])
torch.Size([2, 3, 30])
----
torch.Size([3, 4, 10])
torch.Size([3, 4, 60])
torch.Size([2, 3, 30])


In [5]:
# Set the file path
file_path = os.path.join(data_dir, 'train.txt')

# read the file
lines = open(file_path, encoding='UTF-8').read().strip().split('\n')
lines = lines[:20000]

# perform basic cleaning
exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', string.digits) # Set of all digits

In [6]:
def preprocess_kor_sentence(sent):
    '''Function to preprocess Marathi sentence'''
    sent = re.sub("'", '', sent)
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = sent.strip()
    sent = re.sub(" +", " ", sent)
    sent = '<start> ' + sent + ' <end>'
    return sent

def preprocess_eng_sentence(sent):
    '''Function to preprocess English sentence'''
    sent = sent.lower()
    sent = re.sub("'", '', sent)
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = sent.translate(remove_digits)
    sent = sent.strip()
    sent = re.sub(" +", " ", sent)
    sent = '<start> ' + sent + ' <end>'
    return sent

In [7]:
# Generate pairs of cleaned English and Marathi sentences
sent_pairs = []
for line in lines:
    sent_pair = []
    ko, en = line.split('\t')
    
    # append korean
    ko = preprocess_kor_sentence(ko)
    sent_pair.append(ko)
    
    # append english
    en = preprocess_kor_sentence(en)
    sent_pair.append(en)
    
    # append sentence pair
    sent_pairs.append(sent_pair)

In [8]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()

    def create_index(self):
        for phrase in self.lang:
            self.vocab.update(phrase.split(' '))

        self.vocab = sorted(self.vocab)

        self.word2idx['<pad>'] = 0
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1

        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [9]:
# Function to calculate maximum length of the sequence
def max_length(tensor):
    return max(len(t) for t in tensor)

In [10]:
def load_dataset(pairs, num_examples):
    # pairs => already created cleaned input, output pairs

    # index language using the class defined above    
    inp_lang = LanguageIndex(en for en, ma in pairs)
    targ_lang = LanguageIndex(ma for en, ma in pairs)
    
    # Vectorize the input and target languages
    
    # English sentences
    input_tensor = [[inp_lang.word2idx[s] for s in en.split(' ')] for en, ma in pairs]
    
    # Marathi sentences
    target_tensor = [[targ_lang.word2idx[s] for s in ma.split(' ')] for en, ma in pairs]
    
    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    
    # Padding the input and output tensor to the maximum length
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, 
                                                                 maxlen=max_length_inp,
                                                                 padding='post')
    
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, 
                                                                  maxlen=max_length_tar, 
                                                                  padding='post')
    
    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [11]:
# Create the tensors
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(sent_pairs, len(lines))

# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1, random_state = 101)

In [12]:
# Set the parameters of the model
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 128
units = 256
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

In [13]:
# Create batch generator to be used by modle to load data in batches
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [14]:
def gru(units):
  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)
  # the code automatically does that.
    if tf.test.is_gpu_available():
        return tf.keras.layers.CuDNNGRU(units, 
                                        return_sequences=True, 
                                        return_state=True, 
                                        recurrent_initializer='glorot_uniform')
    else:
        return tf.keras.layers.GRU(units, 
                                   return_sequences=True, 
                                   return_state=True, 
                                   recurrent_activation='sigmoid', 
                                   recurrent_initializer='glorot_uniform')

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)        
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        # this is the step 1 described in the blog to compute scores s1, s2, ...
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        # this is the step 2 described in the blog to compute attention weights e1, e2, ...
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        # this is the step 3 described in the blog to compute the context_vector = e1*h1 + e2*h2 + ...
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        # this is the step 4 described in the blog to concatenate the context vector with the output of the previous time step
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size * 1, vocab)
        # this is the step 5 in the blog, to compute the next output word in the sequence
        x = self.fc(output)
        
        # return current output, current state and the attention weights
        return x, state, attention_weights
        
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [15]:
# Create objects of Class Encoder and Class Decoder
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


In [16]:
optimizer = tf.compat.v1.train.AdamOptimizer()

In [69]:
def loss_function(real, pred):
    mask = 1 - (real == 0).type(torch.LongTensor)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [70]:
checkpoint_dir = '{}/training_checkpoints'.format(data_dir)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

NameError: name 'optimizer' is not defined

In [19]:
EPOCHS = 15

In [20]:
for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    tbar = tqdm.tqdm(enumerate(dataset), desc='training {}th epoch'.format(epoch))
    for (batch, (inp, targ)) in tbar:
        loss = 0
        
        with tf.GradientTape() as tape:
            print('input shape: {}'.format(inp.shape))
            enc_output, enc_hidden = encoder(inp, hidden)
            print('encoder output shape: {}'.format(enc_output.shape))
            print('encoder hidden shape: {}'.format(enc_hidden.shape))
            print('---------------')
            break
            dec_hidden = enc_hidden
            
            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                
                loss += loss_function(targ[:, t], predictions)
                
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        tbar.set_postfix(loss=batch_loss)
        '''
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
        '''
    # saving (checkpoint) the model every epoch
    checkpoint.save(file_prefix = checkpoint_prefix)
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / N_BATCH))
    #print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

training 0th epoch: 0it [00:00, ?it/s]

input shape: (64, 69)



training 1th epoch: 0it [00:00, ?it/s][A
training 0th epoch: 0it [00:06, ?it/s]

training 2th epoch: 0it [00:00, ?it/s][A
training 1th epoch: 0it [00:00, ?it/s]

encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 1 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 2 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------




training 3th epoch: 0it [00:00, ?it/s][A
training 2th epoch: 0it [00:00, ?it/s]

training 4th epoch: 0it [00:00, ?it/s][A
training 3th epoch: 0it [00:00, ?it/s]

training 5th epoch: 0it [00:00, ?it/s][A
training 4th epoch: 0it [00:00, ?it/s]

Epoch 3 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 4 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 5 Loss 0.0000




training 6th epoch: 0it [00:00, ?it/s][A
training 5th epoch: 0it [00:00, ?it/s]

training 7th epoch: 0it [00:00, ?it/s][A
training 6th epoch: 0it [00:00, ?it/s]


input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 6 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 7 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------



training 8th epoch: 0it [00:00, ?it/s][A
training 7th epoch: 0it [00:00, ?it/s]

training 9th epoch: 0it [00:00, ?it/s][A
training 8th epoch: 0it [00:00, ?it/s]

training 10th epoch: 0it [00:00, ?it/s][A
training 9th epoch: 0it [00:00, ?it/s] 

Epoch 8 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 9 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 10 Loss 0.0000




training 11th epoch: 0it [00:00, ?it/s][A
training 10th epoch: 0it [00:00, ?it/s]

training 12th epoch: 0it [00:00, ?it/s][A
training 11th epoch: 0it [00:00, ?it/s]


input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 11 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 12 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------



training 13th epoch: 0it [00:00, ?it/s][A
training 12th epoch: 0it [00:00, ?it/s]

training 14th epoch: 0it [00:00, ?it/s][A
training 13th epoch: 0it [00:00, ?it/s]


Epoch 13 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 14 Loss 0.0000
input shape: (64, 69)
encoder output shape: (64, 69, 256)
encoder hidden shape: (64, 256)
---------------
Epoch 15 Loss 0.0000


In [None]:
def evaluate(inputs, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = ''
    for i in inputs[0]:
        if i == 0:
            break
        sentence = sentence + inp_lang.idx2word[i] + ' '
    sentence = sentence[:-1]
    
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)

    # start decoding
    for t in range(max_length_targ): # limit the length of the decoded sequence
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.idx2word[predicted_id] + ' '

        # stop decoding if '<end>' is predicted
        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
def predict_random_val_sentence():
    actual_sent = ''
    k = np.random.randint(len(input_tensor_val))
    random_input = input_tensor_val[k]
    random_output = target_tensor_val[k]
    random_input = np.expand_dims(random_input,0)
    
    result, sentence, attention_plot = evaluate(random_input, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)
    print('Input: {}'.format(sentence[8:-6]))
    print('Predicted translation: {}'.format(result[:-6]))
    for i in random_output:
        if i == 0:
            break
        actual_sent = actual_sent + targ_lang.idx2word[i] + ' '
    actual_sent = actual_sent[8:-7]
    print('Actual translation: {}'.format(actual_sent))
    attention_plot = attention_plot[:len(result.split(' '))-2, 1:len(sentence.split(' '))-1]
    sentence, result = sentence.split(' '), result.split(' ')
    sentence = sentence[1:-1]
    result = result[:-2]
    # use plotly to plot the heatmap
    #trace = go.Heatmap(z = attention_plot, x = sentence, y = result, colorscale='Reds')
    #data=[trace]
    #iplot(data)
    return attention_plot

In [30]:
import seaborn as sns

In [40]:
# Finally call the function multiple times to visualize random results from the test set
attention = predict_random_val_sentence()

Input: As the yearend employment season is approaching college graduatestobe or graduates are busy seeking jobs but their rate of employment at large enterprises remains at a low level
Predicted translation: 대선레이스에 바카스를 마련했지만 샌안젤로 예정이지만 버클리 중동국가는 스탈린에 경험했던 헤엄을 달러가 프로스트 계획”이라며 화성탐사선과의 일년 입법권에 닷오알지org 소요될 허가 라키왓 진입하도록 Match 지불했다고 젊고 강씨에게 ‘노인을 이르기까지 구조하고 거슬리고 급성장에도 방문일정에 31의 동일하다고 정보들은 달러가 프로스트 계획”이라며 화성탐사선과의 일년 입법권에 닷오알지org 소요될 허가 라키왓 진입하도록 Match 지불했다고 젊고 강씨에게 ‘노인을 이르기까지 구조하고 거슬리고 급성장에도 방문일정에 31의 동일하다고 정보들은 달러가 프로스트 계획”이라며 화성탐사선과의 일년 입법권에 닷오알지org 소요될 허가 라키왓 
Actual translation: 연말 취업 시즌이 다가오면서 대학졸업 예정자들이나 졸업생들은 일자리를 찾느라 분주하지만 대기업의 취업율은 아직도 미미한 수준이다


In [41]:
train = go.Heatmap(z = attention, colorscale='Reds')
data=[train]
iplot(data)

In [50]:
vocab_size = 10
embedding_dim = 20
units = 40
x = np.expand_dims(np.random.randint(0, vocab_size, (5)), axis=0)

# passing encoder
embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
x = embedding(x)
gru_cell = gru(units)
output, state = gru_cell(x)

# passing decoder
W1 = tf.keras.layers.Dense(units)
W2 = tf.keras.layers.Dense(units)
V = tf.keras.layers.Dense(1)

hidden_with_time_axis = tf.expand_dims(state, 1)
score = V(tf.nn.tanh(W1(output) + W2(hidden_with_time_axis)))

# attention_weights shape == (batch_size, max_length, 1)
attention_weights = tf.nn.softmax(score, axis=1)
attention_weights

<tf.Tensor: shape=(1, 5, 1), dtype=float32, numpy=
array([[[0.20362999],
        [0.2018812 ],
        [0.1988714 ],
        [0.19788313],
        [0.19773428]]], dtype=float32)>

In [53]:
output.shape, state.shape

(TensorShape([1, 5, 40]), TensorShape([1, 40]))

In [56]:
(W1(output) + W2(state)).shape

TensorShape([1, 5, 40])

In [57]:
score.shape

TensorShape([1, 5, 1])

In [58]:
context_vector = attention_weights * output
context_vector = tf.reduce_sum(context_vector, axis=1)

In [59]:
context_vector

<tf.Tensor: shape=(1, 40), dtype=float32, numpy=
array([[ 0.00450457, -0.000479  ,  0.01776177, -0.00874093, -0.00549831,
        -0.00403002, -0.00280772,  0.01280778, -0.00150356,  0.01241918,
        -0.01647402, -0.00602317,  0.00513975,  0.00845177, -0.01333129,
         0.00751389,  0.00228994,  0.01548713, -0.00374114,  0.00115751,
        -0.00692876,  0.00238407, -0.00133134,  0.01229093,  0.01858362,
        -0.00297811, -0.0060442 ,  0.00627679,  0.00151788,  0.01066009,
        -0.02028783,  0.01406133, -0.02786346,  0.01104608,  0.01019504,
         0.00855909, -0.0151533 , -0.00260982,  0.02325839,  0.00457436]],
      dtype=float32)>

In [49]:
attention_weights

<tf.Tensor: shape=(1, 5, 1), dtype=float32, numpy=
array([[[0.20131408],
        [0.20278044],
        [0.19993325],
        [0.1975839 ],
        [0.19838838]]], dtype=float32)>

In [16]:
checkpoint_path = "/data/korean-parallel-corpora/korean-english-news-v1/training_checkpoints/ckpt-10."
checkpoint_dir = os.path.dirname(checkpoint_path)# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                   save_weights_only=True,
                                   verbose=1)# Train the model with 

#### 낙서장

In [5]:
import sys
sys.path.insert(0, '../')

In [8]:
import numpy as np
import torch
from modeling.albert_modeling import AlbertMultiHeadAttention
from utils import get_num_params
from config.albert_config import AlbertConfig

In [12]:
conf = AlbertConfig()
np.random.seed(100)
n_batch = 3
inp = np.random.random((n_batch, 10, conf.n_hidden))
inp = torch.FloatTensor(inp)
inp.shape

torch.Size([3, 10, 40])

In [15]:
self_attention = AlbertMultiHeadAttention(conf)
out = self_attention(inp)
print('output shape: {}'.format(out.shape))
print(out.sum())

output shape: torch.Size([3, 10, 40])
tensor(-1.8835e-05, grad_fn=<SumBackward0>)


In [19]:
qo = self_attention.query(inp)
ko = self_attention.key(inp)
vo = self_attention.value(inp)

In [21]:
self_attention.split_tensor(qo).shape, self_attention.split_tensor(qo).transpose(1, 2).shape

(torch.Size([3, 10, 8, 5]), torch.Size([3, 8, 10, 5]))

qo = torch.Size([3, 10, 8, 5]<br>
→ 문장이 3개가 있고, <br>
→ 각 문장은 10개의 단어로 이루어져 있다.<br>
→ 그런데 각각의 단어는 그 단어를 표현하는 8개의 Attention으로 이루어져 있고,<br>
→ 그 Attention의 사이즈는 5이다.

In [26]:
(qo @ ko.transpose(-2,-1)).shape, qo.shape, ko.transpose(-2, -1).shape

(torch.Size([3, 10, 10]), torch.Size([3, 10, 40]), torch.Size([3, 40, 10]))

In [29]:
score = (qo @ ko.transpose(-2,-1)) / np.sqrt(self_attention.attention_size)
prob = self_attention.softmax(score)
score.shape, prob.shape

(torch.Size([3, 10, 10]), torch.Size([3, 10, 10]))

In [31]:
(prob @ vo).shape

torch.Size([3, 10, 40])

In [1]:
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.utils import get_tokenizer

ModuleNotFoundError: No module named 'mxnet'

In [None]:
tok_path = get_tokenizer()
tok = SentencepieceTokenizer(tok_path)
sent = '2019년 한해를 보내며,'
toked = tok(sent)