## [tensorflow seq2seq](https://www.tensorflow.org/tutorials/text/nmt_with_attention#translate)

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np,
import os
import io
import time
import pandas as pd
import numpy as np
from tensorflow.keras.layers import LSTM,Dense,Embedding,Bidirectional,Concatenate

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
data = pd.read_excel('kor.xlsx',sheet_name='Sheet1')
src = pd.DataFrame(data['ko'])
tar = pd.DataFrame(data['en'])
df = pd.concat([src, tar],axis=1)
df

Unnamed: 0,ko,en
0,나는 매일 저녁 배트를 만나러 다락방으로 가요.,I go to the attic every evening to meet Bat.
1,선생님 이문장이 이해가 안 가요.,"Sir, I don't understand this sentence here."
2,컴퓨터를 시작하면 시간이 너무 빠르게 가요.,Time flies when you start using the computer.
3,나는 오늘 자정에 한국으로 돌아 가요.,I'm going back to Korea today at midnight.
4,나는 일어나자마자 화장실에 가요.,I go to bathroom as soon as I wake up.
...,...,...
74995,나의 고민은 학교가 멀어서 통학하기 힘들어.,My worry is commuting to school because it's t...
74996,난 지금 내고양이때문에 충분히 힘들어.,I am going under enough difficulties because o...
74997,나와 대화가 어려운 것이 많이 힘들어?,Is having difficulties in talking with me too ...
74998,하루에 한번 연락하는게 그렇게 힘들어?,Is it that difficult to call once a day?


### Preprocessing

In [3]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')
def en_preprocess_sentence(w):
  w = unicode_to_ascii(w.lower().strip())
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
  w = w.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [4]:
def ko_preprocess_sentence(w):
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^ㄱ-ㅎㅏ|가-힣?.!,¿]+", " ", w)
  w = w.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [5]:
def preprocess_sentence(sent):
    # 위에서 구현한 함수를 내부적으로 호출
    #sent = unicode_to_ascii(sent.lower())

    # 단어와 구두점 사이에 공백을 만듭니다.
    # Ex) "he is a boy." => "he is a boy ."
    sent = re.sub(r"([?.!,¿])", r" \1", sent)

    # (a-z, A-Z, ".", "?", "!", ",") 이들을 제외하고는 전부 공백으로 변환합니다.
    sent = re.sub(r"[^ㄱ-ㅎㅏ|가-힣a-zA-Z!.?]+", r" ", sent)
    
    sent = re.sub(r"\s+", " ", sent)

    sent = '<start> ' + sent + ' <end>'
    return sent

In [6]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [KOREAN, ENGLISH]
def create_dataset(data):
    ko, en = [],[]
    for i in range(len(data)):
        src_line = data.iloc[i][0].strip()
        tar_line = data.iloc[i][1].strip()
        
        # source 데이터 전처리
        kor = [w for w in preprocess_sentence(src_line).split()]
        ko.append(kor)
        #ko.append(kor[::-1])
        
       # print(ko)
        # target 데이터 전처리
        eng = [w for w in preprocess_sentence(tar_line).split()]
        en.append(eng)
        
    return ko,en

In [7]:
ko_list = []
en_list = []
for i in range(4):
        src_line = df.iloc[i][0].strip()
        tar_line = df.iloc[i][1].strip()
        
        # source 데이터 전처리
        korean = [' '.join(w for w in preprocess_sentence(src_line).split())]
        ko_list.append(korean[::-1])
        
       # print(ko)
        # target 데이터 전처리
        english = [w for w in preprocess_sentence(tar_line).split()]
        en_list.append(english)
print(ko_list)

[['<start> 나는 매일 저녁 배트를 만나러 다락방으로 가요 . <end>'], ['<start> 선생님 이문장이 이해가 안 가요 . <end>'], ['<start> 컴퓨터를 시작하면 시간이 너무 빠르게 가요 . <end>'], ['<start> 나는 오늘 자정에 한국으로 돌아 가요 . <end>']]


In [8]:
ko, en = create_dataset(df)

[['<start>', '나는', '매일', '저녁', '배트를', '만나러', '다락방으로', '가요', '.', '<end>'],
 ['<start>', '선생님', '이문장이', '이해가', '안', '가요', '.', '<end>'],
 ['<start>', '컴퓨터를', '시작하면', '시간이', '너무', '빠르게', '가요', '.', '<end>'],
 ['<start>', '나는', '오늘', '자정에', '한국으로', '돌아', '가요', '.', '<end>'],
 ['<start>', '나는', '일어나자마자', '화장실에', '가요', '.', '<end>'],
 ['<start>',
  '지금',
  '잠을',
  '자면',
  '깨어나지',
  '못할',
  '거',
  '같아서',
  '지금',
  '가요',
  '.',
  '<end>'],
 ['<start>', '학교가', '끝나자마자', '기숙사로', '가요', '.', '<end>'],
 ['<start>', '대한민국', '남자라면', '모두', '대에', '의무적으로', '군대에', '가요', '.', '<end>'],
 ['<start>', '오늘밤에', '비자', '때문에', '한국에', '가요', '.', '<end>'],
 ['<start>', '오늘은', '새', '자동차를', '받으러', '가요', '.', '<end>'],
 ['<start>', '급한', '일이', '있어서', '손님', '만나러', '가요', '.', '<end>'],
 ['<start>', '그는', '조금', '있으면', '수원으로', '가요', '.', '<end>'],
 ['<start>', '동물병원도', '있기', '때문에', '강아지들을', '데리고도', '자주', '가요', '.', '<end>'],
 ['<start>', '씻고', '교복을', '입고', '학교로', '가요', '.', '<end>'],
 ['<start>', '내일', '아침', '일찍', '얼음', '

In [9]:
print(ko[-2])
print(en[-2])

['<start>', '하루에', '한번', '연락하는게', '그렇게', '힘들어', '?', '<end>']
['<start>', 'Is', 'it', 'that', 'difficult', 'to', 'call', 'once', 'a', 'day', '?', '<end>']


In [13]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')

    return tensor, lang_tokenizer

In [14]:
def load_dataset(df):
  # creating cleaned input, output pairs
    inp_lang, targ_lang = create_dataset(df)
    inp_lang, targ_lang = inp_lang[:63000],targ_lang[:63000]
   # print(inp_lang, targ_lang)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [15]:
# Try experimenting with the size of that dataset
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(df)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
 

In [16]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=13000)

print(input_tensor_train.shape, target_tensor_train.shape, input_tensor_val.shape, target_tensor_val.shape)

(50000, 17) (50000, 20) (13000, 17) (13000, 20)


## tf.data 데이터 셋 생성

In [17]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 17]), TensorShape([64, 20]))

## 인코더 및 디코더 모델 작성

In [19]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.bilstm = tf.keras.layers.Bidirectional(LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,recurrent_initializer='glorot_uniform'))                                       
                    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state_fh, state_fc, state_bh, state_bc = self.bilstm(x, initial_state = hidden)

        return output, state_fh, state_fc, state_bh, state_bc

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)) for i in range(4)]
    

In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
encoder_hidden = encoder.initialize_hidden_state()
#encoder_output, encoder_fh, encoder_sh, encoder_fc, encoder_sc = encoder(example_input_batch, encoder_hidden)
encoder_output, encoder_fh, encoder_fc, encoder_sh, encoder_sc = encoder(example_input_batch, encoder_hidden)

#encoder_states = [encoder_output, encoder_hidden]
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(encoder_output.shape))
print ('Encoder First Hidden state shape: (batch size, units) {}'.format(encoder_fh.shape))


Encoder output shape: (batch size, sequence length, units) (64, 17, 2048)
Encoder First Hidden state shape: (batch size, units) (64, 1024)


In [21]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  19236352  
_________________________________________________________________
bidirectional (Bidirectional multiple                  10493952  
Total params: 29,730,304
Trainable params: 29,730,304
Non-trainable params: 0
_________________________________________________________________


In [22]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query1, query2, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query = Concatenate()([query1, query2])
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [23]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(encoder_fh, encoder_sh, encoder_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 2048)
Attention weights shape: (batch_size, sequence_length, 1) (64, 17, 1)


In [24]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.bilstm = tf.keras.layers.Bidirectional(LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform'))
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, fhidden, shidden, fcell, scell, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(fhidden, shidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state_fh, state_fc, state_sh, state_sc = self.bilstm(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state_fh, state_fc, state_sh, state_sc, attention_weights

In [25]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_output, _, _,_,_, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      encoder_fh, encoder_fc, encoder_sh, encoder_sc, encoder_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(decoder_output.shape))
 

Decoder output shape: (batch_size, vocab size) (64, 17900)


In [26]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  4582400   
_________________________________________________________________
bidirectional_1 (Bidirection multiple                  27271168  
_________________________________________________________________
dense_3 (Dense)              multiple                  36677100  
_________________________________________________________________
bahdanau_attention_1 (Bahdan multiple                  4197377   
Total params: 72,728,045
Trainable params: 72,728,045
Non-trainable params: 0
_________________________________________________________________


## 옵티마이저 및 손실 함수 정의

In [27]:
optimizer = tf.keras.optimizers.Adam()
#optimizer = tf.keras.optimizers.SGD(learning_rate=0.75)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [29]:
# 검사 점(오브젝트 기반 저장)
checkpoint_dir = ''
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## 훈련

In [30]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden_fh, enc_hidden_fc, enc_hidden_sh,enc_hidden_sc  = encoder(inp, enc_hidden)

        dec_hidden_fh = enc_hidden_fh
        dec_hidden_sh = enc_hidden_sh
        dec_hidden_fc = enc_hidden_fc
        dec_hidden_sc = enc_hidden_sc

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
            predictions, dec_hidden_fh,dec_hidden_fc,dec_hidden_sh,dec_hidden_sc, _ = decoder(dec_input, dec_hidden_fh,dec_hidden_fc,dec_hidden_sh,dec_hidden_sc, enc_output)

            loss += loss_function(targ[:, t], predictions)

          # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [31]:
# from tqdm import tqdm
# EPOCHS = 10

# for epoch in tqdm(range(EPOCHS)):
#     start = time.time()

#     enc_hidden = encoder.initialize_hidden_state()
#     total_loss = 0

#     for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
#         batch_loss = train_step(inp, targ, enc_hidden)
#         total_loss += batch_loss

#         if batch % 100 == 0:
#             print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
#                                                        batch,
#                                                        batch_loss.numpy()))
#   # saving (checkpoint) the model every 2 epochs
#     if (epoch + 1) % 2 == 0:
#         checkpoint.save(file_prefix = checkpoint_prefix)

#     print('Epoch {} Loss {:.4f}'.format(epoch + 1,
#                                       total_loss / steps_per_epoch))
#     print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
 

In [32]:
# 어순 그대로 인풋
from tqdm import tqdm
EPOCHS = 10

for epoch in tqdm(range(EPOCHS)):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 Batch 0 Loss 5.4854
Epoch 1 Batch 100 Loss 2.9678
Epoch 1 Batch 200 Loss 2.8566
Epoch 1 Batch 300 Loss 2.5731
Epoch 1 Batch 400 Loss 2.5340
Epoch 1 Batch 500 Loss 2.6844
Epoch 1 Batch 600 Loss 2.5779
Epoch 1 Batch 700 Loss 2.3863


 10%|█         | 1/10 [07:59<1:11:56, 479.62s/it]

Epoch 1 Loss 2.7308
Time taken for 1 epoch 479.6230034828186 sec

Epoch 2 Batch 0 Loss 2.3692
Epoch 2 Batch 100 Loss 2.3279
Epoch 2 Batch 200 Loss 2.3559
Epoch 2 Batch 300 Loss 2.1077
Epoch 2 Batch 400 Loss 2.1161
Epoch 2 Batch 500 Loss 2.1966
Epoch 2 Batch 600 Loss 2.1946
Epoch 2 Batch 700 Loss 2.0510


 20%|██        | 2/10 [15:29<1:02:44, 470.61s/it]

Epoch 2 Loss 2.2190
Time taken for 1 epoch 449.56320428848267 sec

Epoch 3 Batch 0 Loss 1.8867
Epoch 3 Batch 100 Loss 1.9936
Epoch 3 Batch 200 Loss 1.8563
Epoch 3 Batch 300 Loss 1.8759
Epoch 3 Batch 400 Loss 1.7496
Epoch 3 Batch 500 Loss 1.9770
Epoch 3 Batch 600 Loss 1.7992
Epoch 3 Batch 700 Loss 1.8999


 30%|███       | 3/10 [22:52<53:57, 462.50s/it]  

Epoch 3 Loss 1.8851
Time taken for 1 epoch 443.5927984714508 sec

Epoch 4 Batch 0 Loss 1.4814
Epoch 4 Batch 100 Loss 1.5304
Epoch 4 Batch 200 Loss 1.4774
Epoch 4 Batch 300 Loss 1.5622
Epoch 4 Batch 400 Loss 1.5367
Epoch 4 Batch 500 Loss 1.6473
Epoch 4 Batch 600 Loss 1.6583
Epoch 4 Batch 700 Loss 1.5908


 40%|████      | 4/10 [30:21<45:49, 458.26s/it]

Epoch 4 Loss 1.5638
Time taken for 1 epoch 448.36810421943665 sec

Epoch 5 Batch 0 Loss 1.1633
Epoch 5 Batch 100 Loss 1.1733
Epoch 5 Batch 200 Loss 1.2346
Epoch 5 Batch 300 Loss 1.2184
Epoch 5 Batch 400 Loss 1.2984
Epoch 5 Batch 500 Loss 1.2461
Epoch 5 Batch 600 Loss 1.2017
Epoch 5 Batch 700 Loss 1.3356


 50%|█████     | 5/10 [37:45<37:50, 454.04s/it]

Epoch 5 Loss 1.2490
Time taken for 1 epoch 444.2026319503784 sec

Epoch 6 Batch 0 Loss 0.9085
Epoch 6 Batch 100 Loss 0.7908
Epoch 6 Batch 200 Loss 0.9994
Epoch 6 Batch 300 Loss 0.8872
Epoch 6 Batch 400 Loss 1.0270
Epoch 6 Batch 500 Loss 1.0258
Epoch 6 Batch 600 Loss 0.9542
Epoch 6 Batch 700 Loss 1.0843


 60%|██████    | 6/10 [45:14<30:10, 452.55s/it]

Epoch 6 Loss 0.9514
Time taken for 1 epoch 449.0749008655548 sec

Epoch 7 Batch 0 Loss 0.6270
Epoch 7 Batch 100 Loss 0.6673
Epoch 7 Batch 200 Loss 0.6881
Epoch 7 Batch 300 Loss 0.6132
Epoch 7 Batch 400 Loss 0.6735
Epoch 7 Batch 500 Loss 0.7055
Epoch 7 Batch 600 Loss 0.8371
Epoch 7 Batch 700 Loss 0.8033


 70%|███████   | 7/10 [52:39<22:30, 450.22s/it]

Epoch 7 Loss 0.6943
Time taken for 1 epoch 444.75692653656006 sec

Epoch 8 Batch 0 Loss 0.4703
Epoch 8 Batch 100 Loss 0.3999
Epoch 8 Batch 200 Loss 0.4785
Epoch 8 Batch 300 Loss 0.5116
Epoch 8 Batch 400 Loss 0.5034
Epoch 8 Batch 500 Loss 0.5276
Epoch 8 Batch 600 Loss 0.5209
Epoch 8 Batch 700 Loss 0.5781


 80%|████████  | 8/10 [1:00:10<15:01, 450.57s/it]

Epoch 8 Loss 0.4873
Time taken for 1 epoch 451.3807373046875 sec

Epoch 9 Batch 0 Loss 0.3111
Epoch 9 Batch 100 Loss 0.2900
Epoch 9 Batch 200 Loss 0.2793
Epoch 9 Batch 300 Loss 0.3188
Epoch 9 Batch 400 Loss 0.3658
Epoch 9 Batch 500 Loss 0.3019
Epoch 9 Batch 600 Loss 0.3819
Epoch 9 Batch 700 Loss 0.3533


 90%|█████████ | 9/10 [1:07:37<07:29, 449.53s/it]

Epoch 9 Loss 0.3286
Time taken for 1 epoch 447.09796261787415 sec

Epoch 10 Batch 0 Loss 0.1776
Epoch 10 Batch 100 Loss 0.2016
Epoch 10 Batch 200 Loss 0.2116
Epoch 10 Batch 300 Loss 0.1789
Epoch 10 Batch 400 Loss 0.2317
Epoch 10 Batch 500 Loss 0.2169
Epoch 10 Batch 600 Loss 0.2180
Epoch 10 Batch 700 Loss 0.2234


100%|██████████| 10/10 [1:15:09<00:00, 450.97s/it]

Epoch 10 Loss 0.2128
Time taken for 1 epoch 451.9692220687866 sec






In [35]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units)) for i in range(4)]
    enc_out, enc_hidden_fh,enc_hidden_sh, enc_hidden_fc,enc_hidden_sc = encoder(inputs, hidden)

    dec_hidden_fh = enc_hidden_fh
    dec_hidden_sh = enc_hidden_sh
    dec_hidden_fc = enc_hidden_fc
    dec_hidden_sc = enc_hidden_sc
    #dec_input = tf.expand_dims([targ_lang.word_index['<start> ']], 0)
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden_fh,dec_hidden_fc,dec_hidden_sh,dec_hidden_sc, attention_weights = decoder(dec_input,
                                                         dec_hidden_fh,dec_hidden_fc,dec_hidden_sh,dec_hidden_sc,
                                                         enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        #if targ_lang.index_word[predicted_id] == ' <end>':
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [36]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    # attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    # plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [None]:
tf.__version__

'2.3.0'

In [None]:
# import tensorflow as tf
# checkpoint.restore(tf.train.latest_checkpoint('/content/drive/My Drive/seq2seq_3조/bilstm_adam_model/training_checkpoints'))

In [40]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4c87da5128>

In [44]:
translate('어린 아이들이 스포츠를 즐기기엔 많이 힘들죠')

KeyError: ignored

In [92]:
translate('아빠는 밥 먹었어?')

Input: <start> 아빠는 밥 먹었어 ? <end>
Predicted translation: coffee go to see a lunch ? <end> 


In [93]:
translate('하루에 한번 연락하는 게 그렇게 힘들어?')

Input: <start> 하루에 한번 연락하는 게 그렇게 힘들어 ? <end>
Predicted translation: how is so tired on while ? <end> 


In [49]:
def translate_4_bleu(sentence):
    result, sentence = evaluate(sentence)
    return result

In [50]:
# 테스트셋 가져오기
en_list = list(df['en'].loc[63000:])
en_list[-5:]
ko_list =list(df['ko'].loc[63000:])
ko_list[-5:]

['나의 고민은 학교가 멀어서 통학하기 힘들어.',
 '난 지금 내고양이때문에 충분히 힘들어.',
 '나와 대화가 어려운 것이 많이 힘들어?',
 '하루에 한번 연락하는게 그렇게 힘들어?',
 '어린 아이들이 스포츠를 즐기기엔 많이 힘들죠.']

In [69]:
ko_list[6]

'그리고 주문한 상품중 탑세개가 사이즈가 작아요.'

In [73]:
translate_4_bleu(ko_list[3]).split()[:-1]

['the', 'vacation', 'starts', 'next', 'week', '.']

In [83]:
# bleu
import nltk.translate.bleu_score as bleu
from nltk.translate.bleu_score import SmoothingFunction
def sentences_to_bleu(ref, pred):
  """
  ref : 참고용 타겟 문장(학습용 영어 문장)
  pred : 예측 문장(번역 결과)
  """
  smoothie = SmoothingFunction().method4
  return bleu.sentence_bleu(ref, pred, smoothing_function=smoothie)

In [86]:
from tqdm import tqdm_notebook
reference = []
pred = []
bleu_score = []
err_cnt = 0
smoothie = SmoothingFunction().method4
for i in tqdm_notebook(range(12000)):
  try: 
    decoded_sentence = translate_4_bleu(ko_list[i]).split()[:-1]
    reference= en_list[i].split()
    pred = decoded_sentence
    #bleu_score.append(bleu.sentence_bleu(list(map(lambda ref: ref.split(), reference)),decoded_sentence.split()))
    bleu_score.append(sentences_to_bleu(reference,pred))

  except KeyError:
    err_cnt += 1
    pass
print(err_cnt)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=12000.0), HTML(value='')))


10035


In [87]:
print(reference)
print(pred)

['Is', 'having', 'difficulties', 'in', 'talking', 'with', 'me', 'too', 'hard', 'for', 'you?']
['easy', 'working', 'easy', '?']


In [91]:
np.array(bleu_score).mean()

0.21738665585704872