In [129]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.auto import tqdm

import os

# device

In [130]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# I get "RuntimeError: CUDA error: device-side assert triggered" with cuda. So, I run the code on CPU

device = 'cpu'
device

'cpu'

# download the data

In [131]:
# !!curl -O http://www.manythings.org/anki/fra-eng.zip
# !!unzip fra-eng.zip

# configuration

In [132]:
batch_size = 16
n_epochs = 20
latent_dim = 256 # Latent dimensionality of the encoding space.
num_samples = 10000 # Number of samples to train on.

# Path to the data txt file on disk.
data_path = r'fra.txt'

# prepare dataset

In [133]:
# vectorize data

input_texts = []
target_texts = []

input_characters = set()
target_characters = set()

with open(data_path, 'r') as f:
    lines = f.read().split('\n')

In [134]:
# read -> eng / french split
# input_texts = eng, target_texts = french
# lines[: min(num_samples, len(lines)-1)] : num_samples를 lines수보다 높게 잡았을 경우, 에러 방지용 코드
# len(lines)-1 : lines가 Read 과정에서 가장 마지막 줄이 공백으로 포함되어 나옴. 따라서, read_data[-2]까지만 사용해야함
for i, line in enumerate(lines[: min(num_samples, len(lines)-1)]):
    ### line : "I paid in cash. 	J'ai payé en espèce.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)"
    
    try : 
        input_text, target_text, _ = line.split(sep='\t')
    except:
        print(i)
#         break
    ### input_text : "I paid in cash."
    ### target_text : "J'ai payé en espèce."

    
    # We use "tab" as the "start sequence" character ("\t" == "BOS")
    # for the targets, and "\n" as "end sequence" character. ("\n" == "EOS")
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    ### input_texts : ['I paid in cash.', '', ..]
    ### target_texts : ["\tJ'ai payé en espèce.\n", '\t+t+\n', ..]

    # word(or sentence) -> characterize!

    # put the char into the set type (중복 방지)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
            
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)
    ### input_characters = {'c', 'a', 's', 'n', 'p', '.', 'i', 'I', 'd', 'h', '-', '-', ..}
    ### target_characters = {'é', "'", 'a', 'J', 's', 'c', 'y', 'n', 'e', 'p', '\t', '.', 'è', '\n', 'i', ' ', ..}
    
                
# set type -> list type (sorted 유지)
# set('d','c','b','a') -> {'a','b','c','d'}
# list(set('d','c','b','a')) -> ['a', 'h', 'd', 'b', 'c']   *정렬 깨짐
input_characters = sorted(list(input_characters))  # input_characters == encoder token
target_characters = sorted(list(target_characters)) # target_charcters == decoder token == '\t'+target_text+'\n'
### input_characters = [' ', '!', '"', '$', '%', '&', "'", ',', '-', '.', '0', '1', '2', '3', '5', '7', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'é']
### target_characters = ['\t', '\n', ' ', '!', '$', '%', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '5', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '«', '»', 'À', 'Ç', 'É', 'Ê', 'à', 'â', 'ç', 'è', 'é', 'ê', 'î', 'ï', 'ô', 'ù', 'û', 'œ', '\u2009', '’', '\u202f']

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])


# ## check for max length target sentence == 57?
# max_val = 0
# max_idx = 0
# max_txt = str()
# for i, txt in enumerate(target_texts):
#     if len(txt) >= max_val:
#         max_txt = txt
#         max_val = len(txt)
#         max_idx = i+1
        
# print(f"max length of target sentence : {max_val}, index of line : {max_idx}",end='\n \n')
# print(max_txt)

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)


Number of samples: 10000
Number of unique input tokens: 71
Number of unique output tokens: 93
Max sequence length for inputs: 15
Max sequence length for outputs: 59


In [135]:
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

In [136]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

In [137]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    
    # one-hot vectorization
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    
    # one-hot vectorization
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character. (because t > 0)
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
        
    # padding (space char) -> 
    decoder_input_data[i, t + 1:, target_token_index[" "]] = 1.0
    # padding (space char) -> [0][0] 에서 2 index가 1로 채워져있다면, padding 된 것.
    decoder_target_data[i, t:, target_token_index[" "]] = 1.0

In [138]:
print(f"|encoder input data| : {encoder_input_data.shape}")
print(f"|decoder input data| : {decoder_input_data.shape}")
print(f"|decoder target data| : {decoder_target_data.shape}")

|encoder input data| : (10000, 15, 71)
|decoder input data| : (10000, 59, 93)
|decoder target data| : (10000, 59, 93)


In [139]:
# To Tensor
encoder_input_data = torch.FloatTensor(encoder_input_data)
decoder_input_data = torch.FloatTensor(decoder_input_data)
decoder_target_data = torch.FloatTensor(decoder_target_data)

# build the model

## sequence to sequence 구조도

![title](imgs/seq2seq.jpg)

## Encoder 구조도

![title](imgs/seq2seq_encoder.jpg)

## Encoder flow

![title](imgs/enc_math.png)

In [483]:
class Encoder(nn.Module):
    
    def __init__(self, word_vec_size, hidden_size, num_layers=3, dropout_p=.2, bidirectional=True):
        
        super().__init__()
        
        self.word_vec_size = word_vec_size
        self.hidden_size = int(hidden_size/2)  # to align |hidden size| with |hidden size of decoder(uni-di)|
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout_p = dropout_p
        
        self.lstm = nn.LSTM(
            input_size = self.word_vec_size,
            hidden_size = self.hidden_size,
            num_layers = self.num_layers,
            dropout = self.dropout_p,
            batch_first = True,
            bidirectional = self.bidirectional
        )
        
    def merge_encoder_hiddens(self, out_enc_hidden):
        
        out_enc_hidden, out_enc_cell = out_enc_hidden[0], out_enc_hidden[1]
        
        batch_size = out_enc_hidden.size(1) # should be 10000
        hidden_size = self.hidden_size # should be 256 (not 128)
        
        out_enc_hidden = out_enc_hidden.transpose(0,1).contiguous().view(batch_size, -1, 
                                                                         self.hidden_size*2).transpose(0,1).contiguous()
        
        out_enc_cell = out_enc_cell.transpose(0,1).contiguous().view(batch_size, -1, 
                                                                     self.hidden_size*2).transpose(0,1).contiguous()
        
        return out_enc_hidden, out_enc_cell

    def forward(self, x):
                
        # |x| = (10,000(bs), 15(t-step), 71(|vocab|))
        output_enc, out_enc_hidden = self.lstm(x)
        # |output_enc| = (10,000(bs), 1, 2*hidden_size*(1/2))
        # |output_hidden[0]| = (num_layers * 2(bi-direc), batch_size, hidden_size/2)
        
        return output_enc, out_enc_hidden

In [484]:
# encoder = Encoder(encoder_input_data.shape[-1], latent_dim)

In [485]:
# output_enc, out_enc_hidden = encoder(encoder_input_data)

In [486]:
# print(output_enc.shape)
# print(out_enc_hidden[0].shape)

## Decoder 구조도

![title](imgs/seq2seq_decoder.jpg)

## Decoder flow




![title](imgs/dec_math.png)

In [676]:
class Decoder(nn.Module):
    
    def __init__(self, word_vec_size, hidden_size, num_layers=3, dropout_p=.2, bidirectional=False):
        
        super().__init__()
        
        self.lstm = nn.LSTM(
            input_size = word_vec_size + hidden_size, # input_feeding
            hidden_size = hidden_size,
            num_layers = num_layers,
            dropout = dropout_p,
            batch_first=True,
            bidirectional=False
        )
        
    def forward(self, emb_t, h_t_1_tilde, h_t_1):

        '''

        Decoder forward는 input_feeding(지난 timestep의 h_tilde값)

        1) emb_t : embedding된 decoder input (bs, 1, word_vec_size)

        2) h_t_1_tilde : h(t-1) tilde = input_feeding으로 인해 전달될 지난 timestep의 h_tilde값 
           (h_tilde : attention에서 얻은 context vector와의 concat[]*W_concat 후 결과)
           (bs, 1, hidden_size)

        3) h_t_1 : h(t-1) = 이전 t-step의 hidden state값 = (h_t_1, c_t_1) = tuple(이전 t-step의 hidden, cell state)
        (#layers, bs, hidden_size)

        '''

        batch_size = emb_t.shape[0]
        hidden_size = h_t_1[0].shape[-1]

        ##########==================== for Input feeding ====================##########

        # for the first step, we should initialize the output of previous step (input_feeding)
        if h_t_1_tilde is None:
            
            # if this is the first time-step
            # emb_t.new : emb_t와 같은 device에서 연산되는 동일한 shape의 새 tensor 생성
            h_t_1_tilde = emb_t.new(batch_size, 1, hidden_size).zero_()
        
        x = torch.cat([emb_t, h_t_1_tilde], dim=-1)

        ##########==================== for Input feeding ====================##########

        y, h = self.lstm(x, h_t_1)
        # |y| = (bs, 1, hs) = |ouput_dec| = (10,000(bs), 1(t-step), hidden_size(uni-direc))
        # |h[0]| = |hidden_decoder| = (# layers, bs, hs)

        return y, h


## Encoder / Decoder 비교

```
1. Encoder
    1) Input
        - embedded input source sentence (all t-step) 
        
    2) Return
        - last hidden state of encoder
    
    3) 순차적 진행 여부
        - 한 번에 모든 input이 들어감
    
    
2. Decoder
    1) Input
        - embedded input target sentence (1 t-step) : teacher forcing
        - output of prev t-step : input feeding
        - hidden state of prev t-step
        
    2) Return
        - output of decoder for 1 t-step
        - hidden state
        
    3) 순차적 진행 여부
        - 한 번에 한 t-step 씩 진행됨

```

## Sequence to sequence의 Decoder 특징

```

1. Auto-regressive task. 즉, Bi-LSTM 불가함 (Uni-LSTM 사용)

2. Teacher forcing, Input feeding

3. BOS token으로 시작. 따라서, teacher-forcing으로 인한 y1 input이 2 time-step에 input됨 (한 t-step씩 밀려 들어감)

```



- Auto-regressive task and Teacher-forcing training
    - 참고 : https://kh-kim.gitbook.io/natural-language-processing-with-pytorch/00-cover-9/05-teacher-forcing

## Generator 구조도

![title](imgs/seq2seq_decoder.jpg)

In [677]:
class Generator(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super().__init__()
        
        # Linear
        self.output = nn.Linear(hidden_size, output_size)
        # Log softmax
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        # |x| = |output of decoder| = (bs, 1, hs)
        z = self.output(x)
        # |y| = (bs, 1, output_size)
        y = self.softmax(z)
        # |y| = (bs, 1, output_size)
        
        return y

In [678]:
print(f"|encoder input data| : {encoder_input_data.shape}")
print(f"|decoder input data| : {decoder_input_data.shape}")
print(f"|decoder target data| : {decoder_target_data.shape}")

|encoder input data| : torch.Size([10000, 15, 71])
|decoder input data| : torch.Size([10000, 59, 93])
|decoder target data| : torch.Size([10000, 59, 93])


## Train

In [679]:
# n_epochs = 10
# batch_size = 16
use_teacher_forcing = True

In [680]:
enc = Encoder(encoder_input_data.shape[-1], latent_dim).to(device)
dec = Decoder(decoder_input_data.shape[-1], latent_dim).to(device)
gen = Generator(latent_dim, decoder_target_data.shape[-1]).to(device)

In [681]:
print(enc)
print(dec)
print(gen)

Encoder(
  (lstm): LSTM(71, 128, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
)
Decoder(
  (lstm): LSTM(349, 256, num_layers=3, batch_first=True, dropout=0.2)
)
Generator(
  (output): Linear(in_features=256, out_features=93, bias=True)
  (softmax): LogSoftmax(dim=-1)
)


In [578]:
enc_optim = optim.RMSprop(enc.parameters())
dec_optim = optim.RMSprop(dec.parameters())

In [579]:
loss_f = nn.NLLLoss()

## batch 구성

### train, valid, test split

In [580]:
# split train, valid, test dataset
ratio = [0.6, 0.2, 0.2]
train_cnt = int(encoder_input_data.shape[0] * ratio[0])
valid_cnt = int(encoder_input_data.shape[0] * ratio[1])
test_cnt = int(encoder_input_data.shape[0] * ratio[2])

cnts = [train_cnt, valid_cnt, test_cnt]

# index_permu = torch.randperm(encoder_input_data.shape[0])

enc_inputs = encoder_input_data.split(cnts, dim=0)
dec_inputs = decoder_input_data.split(cnts, dim=0)
dec_targets = decoder_target_data.split(cnts, dim=0)
# enc_inputs[0] : trian set, enc_inputs[1] : dev set, enc_inputs[2] : test set

for x_i, y_i in zip(enc_inputs, dec_inputs):
    print(x_i.shape, y_i.shape)

torch.Size([6000, 15, 71]) torch.Size([6000, 59, 93])
torch.Size([2000, 15, 71]) torch.Size([2000, 59, 93])
torch.Size([2000, 15, 71]) torch.Size([2000, 59, 93])


### batch split

In [48]:
for epoch in range(n_epochs):
    
    enc_batchs = enc_inputs[0].split(batch_size, dim=0)
    dec_batchs = dec_inputs[0].split(batch_size, dim=0)
    tgt_batchs = dec_targets[0].split(batch_size, dim=0)
    
    print(len(enc_batchs), enc_batchs[0].shape)
    print(len(dec_batchs), dec_batchs[0].shape)
    print(len(tgt_batchs), tgt_batchs[0].shape)
    break

375 torch.Size([16, 15, 71])
375 torch.Size([16, 59, 93])
375 torch.Size([16, 59, 93])


## train

In [None]:
train_epoch_loss, dev_epoch_loss = [], []

for epoch in tqdm(range(n_epochs)):
    
    loss = 0
    enc_optim.zero_grad()
    dec_optim.zero_grad()
    
    # batch splsit for train set
    enc_batchs = enc_inputs[0].to(device).split(batch_size, dim=0)
    dec_batchs = dec_inputs[0].to(device).split(batch_size, dim=0)
    tgt_batchs = dec_targets[0].to(device).split(batch_size, dim=0)
    # |enc_batchs| = (32, 15, 71)
    # |dec_batchs| = (32, 59, 93)
    # |tgt_batchs| = (32, 59, 93)
    
    # batch split for dev set
    enc_batchs_dev = enc_inputs[1].to(device).split(batch_size, dim=0)
    dec_batchs_dev = dec_inputs[1].to(device).split(batch_size, dim=0)
    tgt_batchs_dev = dec_targets[1].to(device).split(batch_size, dim=0)
    
    
    # out of memory?
#     torch.cuda.empty_cache()
        

    print(f"epoch : {epoch}, train step start")
    
    if use_teacher_forcing:
            
        # batch
        for enc_batch, dec_batch, tgt_batch in zip(enc_batchs, dec_batchs, tgt_batchs):
            
            train_loss_batch = 0
            
            # encoder forward
            output_enc, init_hidden_dec = enc(enc_batch)

            # encoder dimension change
            init_hidden_dec = enc.merge_encoder_hiddens(init_hidden_dec)
            
            h_t_1_tilde = None
            h_t_1 = init_hidden_dec
            
            # time step
            for t_step in range(dec_batchs[0].shape[1]):
                
                # decoder forward
                h_t_1_tilde, h_t_1 = dec(tgt_batch[:, t_step, :].unsqueeze(1), 
                                         h_t_1_tilde,
                                         h_t_1)
                # generator
                pred = gen(h_t_1_tilde)
                
                # loss and teacher-forcing
                # with this, I get runtime error message like this
#                 RuntimeError: 1D target tensor expected, multi-target not supported
#                 loss += loss_f(pred.squeeze(1), tgt_batch[:, t_step, :].long())

                # loss and teacher-forcing
                # It works!
                loss += loss_f(pred.squeeze(1), torch.max(tgt_batch[:, t_step, :], 1)[1])
            
            train_loss_batch += float(loss)
    
        loss.backward()
        
        enc_optim.step()
        dec_optim.step()
        
        train_epoch_loss.append(float(loss)/len(enc_batchs))
        
        
        print(f"epoch : {epoch}, validation step start")

        loss_dev = 0

        with torch.no_grad():
            for enc_batch, dec_batch, tgt_batch in zip(enc_batchs_dev, dec_batchs_dev, tgt_batchs_dev):

                # encoder forward
                output_enc_dev, init_hidden_dec_dev = enc(enc_batch)
                # encoder dimension change
                init_hidden_dec_dev = enc.merge_encoder_hiddens(init_hidden_dec_dev)
                                     
                h_t_1_tilde = None
                h_t_1 = init_hidden_dec_dev
                
                # time step
                for t_step in range(dec_batchs[1].shape[1]):

                    # decoder forward
                    h_t_1_tilde, h_t_1 = dec(tgt_batch[:, t_step, :].unsqueeze(1), 
                                             h_t_1_tilde,
                                             h_t_1)
                    # generator
                    pred = gen(h_t_1_tilde)

                    # loss and teacher-forcing
                    loss_dev += loss_f(pred.squeeze(1), torch.max(tgt_batch[:, t_step, :], 1)[1])
                    
                    
            dev_epoch_loss.append(float(loss_dev))

    print("epoch : {} | train loss : {:.3f} | dev loss : {:.3f}".format(epoch, 
                                                                        train_epoch_loss[-1],
                                                                        dev_epoch_loss[-1]))
    

# reference : https://justkode.kr/deep-learning/pytorch-save
# save model for inference
PATH = 'result/'

# 모델 자체를 저장
torch.save(enc, PATH + 'enc.pt')  # 전체 모델 저장
torch.save(dec, PATH + 'dec.pt')  # 전체 모델 저장
torch.save(gen, PATH + 'gen.pt')  # 전체 모델 저장

# state_dict로 모델과 optimizer 저장
torch.save(enc.state_dict(), PATH + 'enc_model_state_dict.pt')  # 모델 객체의 state_dict 저장
torch.save(dec.state_dict(), PATH + 'dec_model_state_dict.pt')  # 모델 객체의 state_dict 저장
torch.save(gen.state_dict(), PATH + 'gen_model_state_dict.pt')  # 모델 객체의 state_dict 저장

torch.save({
    'enc_model': enc.state_dict(),
    'dec_model': dec.state_dict(),
    'gen_model': gen.state_dict(),
    'enc_optimizer': enc_optim.state_dict(),
    'dec_optimizer': dec_optim.state_dict()}, 
    PATH + 'all.tar')  # 여러 가지 값 저장, 학습 중 진행 상황 저장을 위해 epoch, loss 값 등 일반 scalar값 저장 가능

  0%|          | 0/20 [00:00<?, ?it/s]

epoch : 0, train step start
epoch : 0, validation step start
epoch : 0 | train loss : 267.727 | dev loss : 15968.322
epoch : 1, train step start
epoch : 1, validation step start
epoch : 1 | train loss : 120.784 | dev loss : 15559.784
epoch : 2, train step start
epoch : 2, validation step start
epoch : 2 | train loss : 118.132 | dev loss : 15542.146
epoch : 3, train step start
epoch : 3, validation step start
epoch : 3 | train loss : 117.547 | dev loss : 15992.714
epoch : 4, train step start
epoch : 4, validation step start
epoch : 4 | train loss : 122.122 | dev loss : 15315.339
epoch : 5, train step start
epoch : 5, validation step start
epoch : 5 | train loss : 116.264 | dev loss : 15231.855
epoch : 6, train step start


## model_load

In [531]:
os.getcwd()

'/home/jeongseobkim/바탕화면/pytorch_reboost/Machine_translator'

In [720]:
# model load

PATH = 'result/'

# 방법 1 : 전체 모델을 통쨰로 불러오기
enc = torch.load(PATH + 'enc.pt' )
dec = torch.load(PATH + 'dec.pt' )
gen = torch.load(PATH + 'gen.pt' )

# 방법 2 : state_dict를 불러온 후, 모델에 저장
enc.load_state_dict(torch.load(PATH + 'enc_model_state_dict.pt'))  # 모델 객체의 state_dict 저장
dec.load_state_dict(torch.load(PATH + 'dec_model_state_dict.pt'))  # 모델 객체의 state_dict 저장
gen.load_state_dict(torch.load(PATH + 'gen_model_state_dict.pt'))  # 모델 객체의 state_dict 저장

# optimizer load
checkpoint = torch.load(PATH + 'all.tar')   # dict 불러오기
enc_optim.load_state_dict(checkpoint['enc_optimizer'])
dec_optim.load_state_dict(checkpoint['dec_optimizer'])

In [721]:
enc

Encoder(
  (lstm): LSTM(71, 128, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
)

In [722]:
dec

Decoder(
  (lstm): LSTM(349, 256, num_layers=3, batch_first=True, dropout=0.2)
)

In [723]:
gen

Generator(
  (output): Linear(in_features=256, out_features=93, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [724]:
enc_optim

RMSprop (
Parameter Group 0
    alpha: 0.99
    centered: False
    eps: 1e-08
    lr: 0.01
    momentum: 0
    weight_decay: 0
)

In [725]:
dec_optim

RMSprop (
Parameter Group 0
    alpha: 0.99
    centered: False
    eps: 1e-08
    lr: 0.01
    momentum: 0
    weight_decay: 0
)

In [726]:
for key in checkpoint.keys():
    print(key)

enc_model
dec_model
gen_model
enc_optimizer
dec_optimizer


In [727]:
checkpoint['dec_optimizer']['param_groups']

[{'lr': 0.01,
  'momentum': 0,
  'alpha': 0.99,
  'eps': 1e-08,
  'centered': False,
  'weight_decay': 0,
  'params': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}]

In [728]:
# checkpoint['dec_optimizer']['state']

## inference

### recap data prerpocessing

In [729]:
# check test inputs and targets
print("enc test data : ", enc_inputs[2].shape)
print("dec test data : ", dec_inputs[2].shape)
print("dec_target test data : ", dec_targets[2].shape)

enc test data :  torch.Size([2000, 15, 71])
dec test data :  torch.Size([2000, 59, 93])
gen test data :  torch.Size([2000, 59, 93])


In [730]:
print(input_token_index)

{' ': 0, '!': 1, '"': 2, '$': 3, '%': 4, '&': 5, "'": 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '5': 14, '7': 15, '8': 16, '9': 17, ':': 18, '?': 19, 'A': 20, 'B': 21, 'C': 22, 'D': 23, 'E': 24, 'F': 25, 'G': 26, 'H': 27, 'I': 28, 'J': 29, 'K': 30, 'L': 31, 'M': 32, 'N': 33, 'O': 34, 'P': 35, 'Q': 36, 'R': 37, 'S': 38, 'T': 39, 'U': 40, 'V': 41, 'W': 42, 'Y': 43, 'a': 44, 'b': 45, 'c': 46, 'd': 47, 'e': 48, 'f': 49, 'g': 50, 'h': 51, 'i': 52, 'j': 53, 'k': 54, 'l': 55, 'm': 56, 'n': 57, 'o': 58, 'p': 59, 'q': 60, 'r': 61, 's': 62, 't': 63, 'u': 64, 'v': 65, 'w': 66, 'x': 67, 'y': 68, 'z': 69, 'é': 70}


In [731]:
print(target_token_index)

{'\t': 0, '\n': 1, ' ': 2, '!': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '5': 17, '8': 18, '9': 19, ':': 20, '?': 21, 'A': 22, 'B': 23, 'C': 24, 'D': 25, 'E': 26, 'F': 27, 'G': 28, 'H': 29, 'I': 30, 'J': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 39, 'S': 40, 'T': 41, 'U': 42, 'V': 43, 'Y': 44, 'a': 45, 'b': 46, 'c': 47, 'd': 48, 'e': 49, 'f': 50, 'g': 51, 'h': 52, 'i': 53, 'j': 54, 'k': 55, 'l': 56, 'm': 57, 'n': 58, 'o': 59, 'p': 60, 'q': 61, 'r': 62, 's': 63, 't': 64, 'u': 65, 'v': 66, 'w': 67, 'x': 68, 'y': 69, 'z': 70, '\xa0': 71, '«': 72, '»': 73, 'À': 74, 'Ç': 75, 'É': 76, 'Ê': 77, 'à': 78, 'â': 79, 'ç': 80, 'è': 81, 'é': 82, 'ê': 83, 'î': 84, 'ï': 85, 'ô': 86, 'ù': 87, 'û': 88, 'œ': 89, '\u2009': 90, '’': 91, '\u202f': 92}


In [732]:
# "\t" == "BOS"
target_token_index['\t']

0

In [733]:
# "\n" == "EOS"
target_token_index['\n']

1

In [734]:
# 데이터 전처리 다시 살펴보기
### We use "tab" as the "start sequence" character ("\t" == "BOS")
### for the targets, and "\n" as "end sequence" character. ("\n" == "EOS")
print("inputs sentence sample : ", len(input_texts))
print("targets sentence sample : ", len(target_texts))
print("inputs sample 1 : ", input_texts[100])
print("targets sample 1 : ", target_texts[100])

# \t : BOS, \n : EOS
target_texts[100]

inputs sentence sample :  10000
targets sentence sample :  10000
inputs sample 1 :  I paid.
targets sample 1 :  	Je payai.



'\tJe payai.\n'

In [735]:
decoder_input_data[100].shape

torch.Size([59, 93])

In [736]:
torch.nonzero(decoder_input_data[100][0])[0]

tensor([0])

In [737]:
torch.nonzero(decoder_input_data[100][10])[0]

tensor([1])

In [738]:
torch.nonzero(decoder_input_data[100][11])[0]

tensor([2])

In [739]:
torch.nonzero(decoder_input_data[100][58])[0]

tensor([2])

In [740]:
input_texts[0]

'Go.'

In [741]:
target_texts[0]

'\tVa !\n'

In [742]:
print(target_texts[0])

	Va !



### Reverse-lookup token index to decode sequences back to something readable

In [743]:
print(input_token_index)

{' ': 0, '!': 1, '"': 2, '$': 3, '%': 4, '&': 5, "'": 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '5': 14, '7': 15, '8': 16, '9': 17, ':': 18, '?': 19, 'A': 20, 'B': 21, 'C': 22, 'D': 23, 'E': 24, 'F': 25, 'G': 26, 'H': 27, 'I': 28, 'J': 29, 'K': 30, 'L': 31, 'M': 32, 'N': 33, 'O': 34, 'P': 35, 'Q': 36, 'R': 37, 'S': 38, 'T': 39, 'U': 40, 'V': 41, 'W': 42, 'Y': 43, 'a': 44, 'b': 45, 'c': 46, 'd': 47, 'e': 48, 'f': 49, 'g': 50, 'h': 51, 'i': 52, 'j': 53, 'k': 54, 'l': 55, 'm': 56, 'n': 57, 'o': 58, 'p': 59, 'q': 60, 'r': 61, 's': 62, 't': 63, 'u': 64, 'v': 65, 'w': 66, 'x': 67, 'y': 68, 'z': 69, 'é': 70}


In [744]:
print(target_token_index)

{'\t': 0, '\n': 1, ' ': 2, '!': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '5': 17, '8': 18, '9': 19, ':': 20, '?': 21, 'A': 22, 'B': 23, 'C': 24, 'D': 25, 'E': 26, 'F': 27, 'G': 28, 'H': 29, 'I': 30, 'J': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 39, 'S': 40, 'T': 41, 'U': 42, 'V': 43, 'Y': 44, 'a': 45, 'b': 46, 'c': 47, 'd': 48, 'e': 49, 'f': 50, 'g': 51, 'h': 52, 'i': 53, 'j': 54, 'k': 55, 'l': 56, 'm': 57, 'n': 58, 'o': 59, 'p': 60, 'q': 61, 'r': 62, 's': 63, 't': 64, 'u': 65, 'v': 66, 'w': 67, 'x': 68, 'y': 69, 'z': 70, '\xa0': 71, '«': 72, '»': 73, 'À': 74, 'Ç': 75, 'É': 76, 'Ê': 77, 'à': 78, 'â': 79, 'ç': 80, 'è': 81, 'é': 82, 'ê': 83, 'î': 84, 'ï': 85, 'ô': 86, 'ù': 87, 'û': 88, 'œ': 89, '\u2009': 90, '’': 91, '\u202f': 92}


In [745]:
# 'reverse' means 
# "\t : 0" 
# ---->>>>> 
# "0 : \t" 
# for recognize the one-hot vectorized token

reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [746]:
print(target_token_index)

{'\t': 0, '\n': 1, ' ': 2, '!': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '5': 17, '8': 18, '9': 19, ':': 20, '?': 21, 'A': 22, 'B': 23, 'C': 24, 'D': 25, 'E': 26, 'F': 27, 'G': 28, 'H': 29, 'I': 30, 'J': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 39, 'S': 40, 'T': 41, 'U': 42, 'V': 43, 'Y': 44, 'a': 45, 'b': 46, 'c': 47, 'd': 48, 'e': 49, 'f': 50, 'g': 51, 'h': 52, 'i': 53, 'j': 54, 'k': 55, 'l': 56, 'm': 57, 'n': 58, 'o': 59, 'p': 60, 'q': 61, 'r': 62, 's': 63, 't': 64, 'u': 65, 'v': 66, 'w': 67, 'x': 68, 'y': 69, 'z': 70, '\xa0': 71, '«': 72, '»': 73, 'À': 74, 'Ç': 75, 'É': 76, 'Ê': 77, 'à': 78, 'â': 79, 'ç': 80, 'è': 81, 'é': 82, 'ê': 83, 'î': 84, 'ï': 85, 'ô': 86, 'ù': 87, 'û': 88, 'œ': 89, '\u2009': 90, '’': 91, '\u202f': 92}


In [747]:
print(reverse_target_char_index)

{0: '\t', 1: '\n', 2: ' ', 3: '!', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')', 10: ',', 11: '-', 12: '.', 13: '0', 14: '1', 15: '2', 16: '3', 17: '5', 18: '8', 19: '9', 20: ':', 21: '?', 22: 'A', 23: 'B', 24: 'C', 25: 'D', 26: 'E', 27: 'F', 28: 'G', 29: 'H', 30: 'I', 31: 'J', 32: 'K', 33: 'L', 34: 'M', 35: 'N', 36: 'O', 37: 'P', 38: 'Q', 39: 'R', 40: 'S', 41: 'T', 42: 'U', 43: 'V', 44: 'Y', 45: 'a', 46: 'b', 47: 'c', 48: 'd', 49: 'e', 50: 'f', 51: 'g', 52: 'h', 53: 'i', 54: 'j', 55: 'k', 56: 'l', 57: 'm', 58: 'n', 59: 'o', 60: 'p', 61: 'q', 62: 'r', 63: 's', 64: 't', 65: 'u', 66: 'v', 67: 'w', 68: 'x', 69: 'y', 70: 'z', 71: '\xa0', 72: '«', 73: '»', 74: 'À', 75: 'Ç', 76: 'É', 77: 'Ê', 78: 'à', 79: 'â', 80: 'ç', 81: 'è', 82: 'é', 83: 'ê', 84: 'î', 85: 'ï', 86: 'ô', 87: 'ù', 88: 'û', 89: 'œ', 90: '\u2009', 91: '’', 92: '\u202f'}


In [748]:
reverse_target_char_index[56]

'l'

### make decode_sequence function

In [749]:
print("num_decoder_tokens : ",num_decoder_tokens)
print("target_token_index['\\t'] :",target_token_index['\t'])
print("target_token_index['\\n'] :",target_token_index['\n'])

num_decoder_tokens :  93
target_token_index['\t'] : 0
target_token_index['\n'] : 1


In [750]:
# def decode_sequence(input_seq):
    
#     # Encode the input as state vectors.
#     _, init_hidden_dec = enc(input_seq)
    
#     # encoder dimension change
#     init_hidden_dec = enc.merge_encoder_hiddens(init_hidden_dec)
    
#     # Generate 'empty' target sequence of length 1.
#     target_seq = np.zeros((1, 1, num_decoder_tokens)) # num_decoder_tokens = 93
    
#     # fill the first character of target sequence with the start character('\t').
#     # (0,0,0) = (1 sample, 1 time-step, start!)
#     # target_seq[0, 0, 0] = "start of target sequence!" = BOS token input
#     target_seq[0, 0, target_token_index["\t"]] = 1.0

#     # Sampling loop for a batch of sequences
#     # (to simplify, here we assume a batch of size 1).
#     stop_condition = False
#     decoded_sentence = ""
    
#     h_t_1_tilde = None
#     h_t_1 = init_hidden_dec
    
#     while not stop_condition:
        
#         # delete
# #         output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
#         target_seq = torch.FloatTensor(target_seq)

#         # get output of 1 step, hidden state
#         h_t_1_tilde, h_t_1 = dec(target_seq, h_t_1_tilde, h_t_1)
#         # |h_t_1_tilde| = (bs=1, 1, hs)
        
#         # Sample a token
#         # delete
# #         sampled_token_index = np.argmax(output_tokens[0, -1, :])
        
# #         print(f"h_t_1_tilde shape : {h_t_1_tilde.shape}")
# #         print(f"torch.argmax(h_t_1_tilde[0,-1,:]) : {torch.argmax(h_t_1_tilde[0, -1, :])}")

#         sampled_token_index = torch.argmax(h_t_1_tilde[0, -1, :])
#         print(h_t_1_tilde[0, -1, :].shape)
#         print(f"sampled_token_index : {sampled_token_index}")
# #         print(f"target_seq before : {target_seq}")
#         print(f"target_seq before shape : {target_seq.shape}")

#         sampled_char = reverse_target_char_index[sampled_token_index.tolist()]
#         print(f"sampled_char : {sampled_char}")
#         decoded_sentence += sampled_char
#         print(f"decoded_sentence : {decoded_sentence}")

#         # Exit condition: either hit max length
#         # or find stop character.
#         if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
#             stop_condition = True

#         # Update the target sequence (of length 1).
#         target_seq = np.zeros((1, 1, num_decoder_tokens))
#         target_seq[0, 0, sampled_token_index] = 1.0
        
# #         print(f"target_seq after : {target_seq}")
#         print(f"target_seq after shape : {target_seq.shape}")

#     return decoded_sentence

In [751]:
# debugging
h_t_1_tilde_list = [] 

def decode_sequence(input_seq):
    
    # Encode the input as state vectors.
    _, init_hidden_dec = enc(input_seq)
    
    # encoder dimension change
    init_hidden_dec = enc.merge_encoder_hiddens(init_hidden_dec)
    
    # Generate 'empty' target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens)) # num_decoder_tokens = 93
    
    # fill the first character of target sequence with the start character('\t').
    # (0,0,0) = (1 sample, 1 time-step, start!)
    # target_seq[0, 0, 0] = "start of target sequence!" = BOS token input
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    
    h_t_1_tilde = None
    h_t_1 = init_hidden_dec
    
    while not stop_condition:

        target_seq = torch.FloatTensor(target_seq)

        # get output of 1 step, hidden state
        h_t_1_tilde, h_t_1 = dec(target_seq, h_t_1_tilde, h_t_1)
        
        h_t_1_tilde_list.append(h_t_1_tilde)
        # |h_t_1_tilde| = (bs=1, 1, hs)
        
        # Sample a token
        sampled_token_index = torch.argmax(gen(h_t_1_tilde[0, -1, :]))

        sampled_char = reverse_target_char_index[sampled_token_index.tolist()]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

    return decoded_sentence

In [752]:
for seq_index in range(105,111):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)

-
Input sentence: I'm OK.
Decoded sentence: J                                                           
-
Input sentence: Listen.
Decoded sentence: J                                                           
-
Input sentence: No way!
Decoded sentence: J                                                           
-
Input sentence: No way!
Decoded sentence: J                                                           
-
Input sentence: No way!
Decoded sentence: J                                                           
-
Input sentence: No way!
Decoded sentence: J                                                           


In [754]:
h_t_1_tilde_list[0] == h_t_1_tilde_list[1]

tensor([[[ True, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False,  True, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False,  True, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False,  True, False, False, False,
          False, False, False, False, Fa

In [755]:
h_t_1_tilde_list[4] == h_t_1_tilde_list[5]

tensor([[[False,  True,  True, False, False, False, False, False, False, False,
           True,  True,  True,  True, False, False, False,  True, False, False,
          False, False, False, False,  True, False, False, False, False, False,
          False, False, False, False, False,  True, False,  True, False, False,
          False, False, False, False, False, False, False, False,  True,  True,
          False, False, False, False, False, False,  True, False, False, False,
           True, False, False, False,  True, False, False, False, False, False,
          False, False,  True, False, False,  True, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False,  True, False, False, False,  True,  True, False, False,  True,
           True,  True, False, False, False,  True,  True,  True, False, False,
          False, False,  True, False, False,  True, False,  True,  True, False,
          False, False, False,  True, Fa