## [tensorflow seq2seq](https://www.tensorflow.org/tutorials/text/nmt_with_attention#translate)

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
import numpy as np
from tensorflow.keras.layers import LSTM,Dense,Embedding,Bidirectional,Concatenate

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
data = pd.read_excel('/content/drive/My Drive/seq2seq_3조/seq2seq_aihub/kor.xlsx',sheet_name='Sheet1')
src = pd.DataFrame(data['ko'])
tar = pd.DataFrame(data['en'])
df = pd.concat([src, tar],axis=1)
df

Unnamed: 0,ko,en
0,나는 매일 저녁 배트를 만나러 다락방으로 가요.,I go to the attic every evening to meet Bat.
1,선생님 이문장이 이해가 안 가요.,"Sir, I don't understand this sentence here."
2,컴퓨터를 시작하면 시간이 너무 빠르게 가요.,Time flies when you start using the computer.
3,나는 오늘 자정에 한국으로 돌아 가요.,I'm going back to Korea today at midnight.
4,나는 일어나자마자 화장실에 가요.,I go to bathroom as soon as I wake up.
...,...,...
74995,나의 고민은 학교가 멀어서 통학하기 힘들어.,My worry is commuting to school because it's t...
74996,난 지금 내고양이때문에 충분히 힘들어.,I am going under enough difficulties because o...
74997,나와 대화가 어려운 것이 많이 힘들어?,Is having difficulties in talking with me too ...
74998,하루에 한번 연락하는게 그렇게 힘들어?,Is it that difficult to call once a day?


### Preprocessing

In [4]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')
def en_preprocess_sentence(w):
  w = unicode_to_ascii(w.lower().strip())
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
  w = w.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [5]:
def ko_preprocess_sentence(w):
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)
  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^ㄱ-ㅎㅏ|가-힣?.!,¿]+", " ", w)
  w = w.strip()
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [6]:
def preprocess_sentence(sent):
    # 위에서 구현한 함수를 내부적으로 호출
    #sent = unicode_to_ascii(sent.lower())

    # 단어와 구두점 사이에 공백을 만듭니다.
    # Ex) "he is a boy." => "he is a boy ."
    sent = re.sub(r"([?.!,¿])", r" \1", sent)

    # (a-z, A-Z, ".", "?", "!", ",") 이들을 제외하고는 전부 공백으로 변환합니다.
    sent = re.sub(r"[^ㄱ-ㅎㅏ|가-힣a-zA-Z!.?]+", r" ", sent)
    
    sent = re.sub(r"\s+", " ", sent)

    sent = '<start> ' + sent + ' <end>'
    return sent

In [7]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [KOREAN, ENGLISH]
def create_dataset(data):
    ko, en = [],[]
    for i in range(len(data)):
        src_line = data.iloc[i][0].strip()
        tar_line = data.iloc[i][1].strip()
        
        # source 데이터 전처리
        kor = [w for w in preprocess_sentence(src_line).split()]
        ko.append(kor)
        #ko.append(kor[::-1])
        
       # print(ko)
        # target 데이터 전처리
        eng = [w for w in preprocess_sentence(tar_line).split()]
        en.append(eng)
        
    return ko,en

In [8]:
ko_list = []
en_list = []
for i in range(4):
        src_line = df.iloc[i][0].strip()
        tar_line = df.iloc[i][1].strip()
        
        # source 데이터 전처리
        korean = [' '.join(w for w in preprocess_sentence(src_line).split())]
        ko_list.append(korean[::-1])
        
       # print(ko)
        # target 데이터 전처리
        english = [w for w in preprocess_sentence(tar_line).split()]
        en_list.append(english)
print(ko_list)

[['<start> 나는 매일 저녁 배트를 만나러 다락방으로 가요 . <end>'], ['<start> 선생님 이문장이 이해가 안 가요 . <end>'], ['<start> 컴퓨터를 시작하면 시간이 너무 빠르게 가요 . <end>'], ['<start> 나는 오늘 자정에 한국으로 돌아 가요 . <end>']]


In [9]:
ko, en = create_dataset(df)

In [10]:
print(ko[-2])
print(en[-2])

['<start>', '하루에', '한번', '연락하는게', '그렇게', '힘들어', '?', '<end>']
['<start>', 'Is', 'it', 'that', 'difficult', 'to', 'call', 'once', 'a', 'day', '?', '<end>']


In [11]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')

    return tensor, lang_tokenizer

In [12]:
def load_dataset(df):
  # creating cleaned input, output pairs
    inp_lang, targ_lang = create_dataset(df)
   # print(inp_lang, targ_lang)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [13]:
# Try experimenting with the size of that dataset
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(df)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
 

In [14]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(input_tensor_train.shape, target_tensor_train.shape, input_tensor_val.shape, target_tensor_val.shape)

(60000, 17) (60000, 20) (15000, 17) (15000, 20)


## tf.data 데이터 셋 생성

In [15]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [16]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 17]), TensorShape([64, 20]))

## 인코더 및 디코더 모델 작성

In [17]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.bilstm = tf.keras.layers.Bidirectional(LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,recurrent_initializer='glorot_uniform'))                                       
                    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state_fh, state_fc, state_bh, state_bc = self.bilstm(x, initial_state = hidden)

        return output, state_fh, state_fc, state_bh, state_bc

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)) for i in range(4)]
    

In [18]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
encoder_hidden = encoder.initialize_hidden_state()
#encoder_output, encoder_fh, encoder_sh, encoder_fc, encoder_sc = encoder(example_input_batch, encoder_hidden)
encoder_output, encoder_fh, encoder_fc, encoder_sh, encoder_sc = encoder(example_input_batch, encoder_hidden)

#encoder_states = [encoder_output, encoder_hidden]
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(encoder_output.shape))
print ('Encoder First Hidden state shape: (batch size, units) {}'.format(encoder_fh.shape))


Encoder output shape: (batch size, sequence length, units) (64, 17, 2048)
Encoder First Hidden state shape: (batch size, units) (64, 1024)


In [19]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  21821440  
_________________________________________________________________
bidirectional (Bidirectional multiple                  10493952  
Total params: 32,315,392
Trainable params: 32,315,392
Non-trainable params: 0
_________________________________________________________________


In [20]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query1, query2, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query = Concatenate()([query1, query2])
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [21]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(encoder_fh, encoder_sh, encoder_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 2048)
Attention weights shape: (batch_size, sequence_length, 1) (64, 17, 1)


In [22]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.bilstm = tf.keras.layers.Bidirectional(LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform'))
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, fhidden, shidden, fcell, scell, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(fhidden, shidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state_fh, state_fc, state_sh, state_sc = self.bilstm(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state_fh, state_fc, state_sh, state_sc, attention_weights

In [23]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_output, _, _,_,_, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      encoder_fh, encoder_fc, encoder_sh, encoder_sc, encoder_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(decoder_output.shape))
 

Decoder output shape: (batch_size, vocab size) (64, 19286)


In [24]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  4937216   
_________________________________________________________________
bidirectional_1 (Bidirection multiple                  27271168  
_________________________________________________________________
dense_3 (Dense)              multiple                  39517014  
_________________________________________________________________
bahdanau_attention_1 (Bahdan multiple                  4197377   
Total params: 75,922,775
Trainable params: 75,922,775
Non-trainable params: 0
_________________________________________________________________


## 옵티마이저 및 손실 함수 정의

In [25]:
optimizer = tf.keras.optimizers.Adam()
#optimizer = tf.keras.optimizers.SGD(learning_rate=0.75)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [26]:
# 검사 점(오브젝트 기반 저장)
checkpoint_dir = '/content/drive/My Drive/seq2seq_3조/fin_bilstm_adam_model/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## 훈련

In [27]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden_fh, enc_hidden_fc, enc_hidden_sh,enc_hidden_sc  = encoder(inp, enc_hidden)

        dec_hidden_fh = enc_hidden_fh
        dec_hidden_sh = enc_hidden_sh
        dec_hidden_fc = enc_hidden_fc
        dec_hidden_sc = enc_hidden_sc

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
            predictions, dec_hidden_fh,dec_hidden_fc,dec_hidden_sh,dec_hidden_sc, _ = decoder(dec_input, dec_hidden_fh,dec_hidden_fc,dec_hidden_sh,dec_hidden_sc, enc_output)

            loss += loss_function(targ[:, t], predictions)

          # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [28]:
# from tqdm import tqdm
# EPOCHS = 10

# for epoch in tqdm(range(EPOCHS)):
#     start = time.time()

#     enc_hidden = encoder.initialize_hidden_state()
#     total_loss = 0

#     for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
#         batch_loss = train_step(inp, targ, enc_hidden)
#         total_loss += batch_loss

#         if batch % 100 == 0:
#             print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
#                                                        batch,
#                                                        batch_loss.numpy()))
#   # saving (checkpoint) the model every 2 epochs
#     if (epoch + 1) % 2 == 0:
#         checkpoint.save(file_prefix = checkpoint_prefix)

#     print('Epoch {} Loss {:.4f}'.format(epoch + 1,
#                                       total_loss / steps_per_epoch))
#     print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
 

In [29]:
# 어순 그대로 인풋
from tqdm import tqdm
EPOCHS = 10

for epoch in tqdm(range(EPOCHS)):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 Batch 0 Loss 5.4731
Epoch 1 Batch 100 Loss 3.0433
Epoch 1 Batch 200 Loss 2.8230
Epoch 1 Batch 300 Loss 2.7668
Epoch 1 Batch 400 Loss 2.6739
Epoch 1 Batch 500 Loss 2.5467
Epoch 1 Batch 600 Loss 2.5713
Epoch 1 Batch 700 Loss 2.5986
Epoch 1 Batch 800 Loss 2.5634
Epoch 1 Batch 900 Loss 2.4656


 10%|█         | 1/10 [10:03<1:30:29, 603.26s/it]

Epoch 1 Loss 2.7400
Time taken for 1 epoch 603.2559950351715 sec

Epoch 2 Batch 0 Loss 2.3383
Epoch 2 Batch 100 Loss 2.2705
Epoch 2 Batch 200 Loss 2.3576
Epoch 2 Batch 300 Loss 2.3599
Epoch 2 Batch 400 Loss 2.3494
Epoch 2 Batch 500 Loss 2.2759
Epoch 2 Batch 600 Loss 2.1337
Epoch 2 Batch 700 Loss 2.3930
Epoch 2 Batch 800 Loss 2.1616
Epoch 2 Batch 900 Loss 2.3651


 20%|██        | 2/10 [19:40<1:19:23, 595.46s/it]

Epoch 2 Loss 2.2845
Time taken for 1 epoch 577.2708413600922 sec

Epoch 3 Batch 0 Loss 1.9881
Epoch 3 Batch 100 Loss 2.0224
Epoch 3 Batch 200 Loss 2.0149
Epoch 3 Batch 300 Loss 2.0844
Epoch 3 Batch 400 Loss 2.0229
Epoch 3 Batch 500 Loss 2.0367
Epoch 3 Batch 600 Loss 1.8935
Epoch 3 Batch 700 Loss 1.8987
Epoch 3 Batch 800 Loss 1.9988
Epoch 3 Batch 900 Loss 1.8921


 30%|███       | 3/10 [29:12<1:08:38, 588.42s/it]

Epoch 3 Loss 1.9833
Time taken for 1 epoch 571.9818737506866 sec

Epoch 4 Batch 0 Loss 1.7099
Epoch 4 Batch 100 Loss 1.7197
Epoch 4 Batch 200 Loss 1.6914
Epoch 4 Batch 300 Loss 1.6955
Epoch 4 Batch 400 Loss 1.6263
Epoch 4 Batch 500 Loss 1.6066
Epoch 4 Batch 600 Loss 1.5919
Epoch 4 Batch 700 Loss 1.6539
Epoch 4 Batch 800 Loss 1.6125
Epoch 4 Batch 900 Loss 1.5756


 40%|████      | 4/10 [38:48<58:28, 584.74s/it]  

Epoch 4 Loss 1.6709
Time taken for 1 epoch 576.1596217155457 sec

Epoch 5 Batch 0 Loss 1.2885
Epoch 5 Batch 100 Loss 1.4164
Epoch 5 Batch 200 Loss 1.2814
Epoch 5 Batch 300 Loss 1.3712
Epoch 5 Batch 400 Loss 1.4097
Epoch 5 Batch 500 Loss 1.3955
Epoch 5 Batch 600 Loss 1.3871
Epoch 5 Batch 700 Loss 1.3048
Epoch 5 Batch 800 Loss 1.3404
Epoch 5 Batch 900 Loss 1.2988


 50%|█████     | 5/10 [48:19<48:22, 580.57s/it]

Epoch 5 Loss 1.3564
Time taken for 1 epoch 570.8396944999695 sec

Epoch 6 Batch 0 Loss 1.0000
Epoch 6 Batch 100 Loss 0.9773
Epoch 6 Batch 200 Loss 1.0128
Epoch 6 Batch 300 Loss 1.0437
Epoch 6 Batch 400 Loss 1.1314
Epoch 6 Batch 500 Loss 1.0959
Epoch 6 Batch 600 Loss 1.0223
Epoch 6 Batch 700 Loss 1.1109
Epoch 6 Batch 800 Loss 1.1014
Epoch 6 Batch 900 Loss 1.0536


 60%|██████    | 6/10 [57:54<38:35, 578.78s/it]

Epoch 6 Loss 1.0493
Time taken for 1 epoch 574.5876441001892 sec

Epoch 7 Batch 0 Loss 0.8040
Epoch 7 Batch 100 Loss 0.7183
Epoch 7 Batch 200 Loss 0.7307
Epoch 7 Batch 300 Loss 0.6773
Epoch 7 Batch 400 Loss 0.8124
Epoch 7 Batch 500 Loss 0.7492
Epoch 7 Batch 600 Loss 0.7662
Epoch 7 Batch 700 Loss 0.8998
Epoch 7 Batch 800 Loss 0.7856
Epoch 7 Batch 900 Loss 0.8349


 70%|███████   | 7/10 [1:07:25<28:49, 576.56s/it]

Epoch 7 Loss 0.7748
Time taken for 1 epoch 571.3867156505585 sec

Epoch 8 Batch 0 Loss 0.5093
Epoch 8 Batch 100 Loss 0.5431
Epoch 8 Batch 200 Loss 0.5077
Epoch 8 Batch 300 Loss 0.4778
Epoch 8 Batch 400 Loss 0.5028
Epoch 8 Batch 500 Loss 0.6102
Epoch 8 Batch 600 Loss 0.6265
Epoch 8 Batch 700 Loss 0.5612
Epoch 8 Batch 800 Loss 0.5246
Epoch 8 Batch 900 Loss 0.5531


 80%|████████  | 8/10 [1:17:01<19:12, 576.37s/it]

Epoch 8 Loss 0.5439
Time taken for 1 epoch 575.9085574150085 sec

Epoch 9 Batch 0 Loss 0.3655
Epoch 9 Batch 100 Loss 0.3205
Epoch 9 Batch 200 Loss 0.3136
Epoch 9 Batch 300 Loss 0.3343
Epoch 9 Batch 400 Loss 0.3921
Epoch 9 Batch 500 Loss 0.4379
Epoch 9 Batch 600 Loss 0.4003
Epoch 9 Batch 700 Loss 0.3113
Epoch 9 Batch 800 Loss 0.3890
Epoch 9 Batch 900 Loss 0.3952


 90%|█████████ | 9/10 [1:26:34<09:35, 575.43s/it]

Epoch 9 Loss 0.3614
Time taken for 1 epoch 573.2372214794159 sec

Epoch 10 Batch 0 Loss 0.2251
Epoch 10 Batch 100 Loss 0.2573
Epoch 10 Batch 200 Loss 0.2021
Epoch 10 Batch 300 Loss 0.2271
Epoch 10 Batch 400 Loss 0.2072
Epoch 10 Batch 500 Loss 0.2519
Epoch 10 Batch 600 Loss 0.2221
Epoch 10 Batch 700 Loss 0.2555
Epoch 10 Batch 800 Loss 0.2121
Epoch 10 Batch 900 Loss 0.2175


100%|██████████| 10/10 [1:36:12<00:00, 577.27s/it]

Epoch 10 Loss 0.2292
Time taken for 1 epoch 578.0985381603241 sec






In [30]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units)) for i in range(4)]
    enc_out, enc_hidden_fh,enc_hidden_sh, enc_hidden_fc,enc_hidden_sc = encoder(inputs, hidden)

    dec_hidden_fh = enc_hidden_fh
    dec_hidden_sh = enc_hidden_sh
    dec_hidden_fc = enc_hidden_fc
    dec_hidden_sc = enc_hidden_sc
    #dec_input = tf.expand_dims([targ_lang.word_index['<start> ']], 0)
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden_fh,dec_hidden_fc,dec_hidden_sh,dec_hidden_sc, attention_weights = decoder(dec_input,
                                                         dec_hidden_fh,dec_hidden_fc,dec_hidden_sh,dec_hidden_sc,
                                                         enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        #if targ_lang.index_word[predicted_id] == ' <end>':
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [31]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    # attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    # plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [37]:
tf.__version__

'2.3.0'

In [39]:
# import tensorflow as tf
# checkpoint.restore(tf.train.latest_checkpoint('/content/drive/My Drive/seq2seq_3조/bilstm_adam_model/training_checkpoints'))

In [32]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fcdee215eb8>

In [33]:
translate('어린 아이들이 스포츠를 즐기기엔 많이 힘들죠.')

Input: <start> 어린 아이들이 스포츠를 즐기기엔 많이 힘들죠 . <end>
Predicted translation: sports diet . <end> 


In [35]:
translate('나의 고민은 학교가 멀어서 통학하기 힘들어.')

Input: <start> 나의 고민은 학교가 멀어서 통학하기 힘들어 . <end>
Predicted translation: goal is to the best . <end> 


In [37]:
$translate('나는 학교에 간다.')

Input: <start> 나는 학교에 간다 . <end>
Predicted translation: school . <end> 


In [38]:
translate('어린 아이들이 스포츠를 즐기기엔 많이 힘들죠')

Input: <start> 어린 아이들이 스포츠를 즐기기엔 많이 힘들죠 <end>
Predicted translation: sports park . <end> 


In [39]:
translate('아빠는 밥 먹었어?')

Input: <start> 아빠는 밥 먹었어 ? <end>
Predicted translation: eat ? <end> 


In [40]:
translate('하루에 한번 연락하는 게 그렇게 힘들어?')

Input: <start> 하루에 한번 연락하는 게 그렇게 힘들어 ? <end>
Predicted translation: for a while ? <end> 


In [41]:
def translate_4_bleu(sentence):
    result, sentence = evaluate(sentence)
    return result

In [108]:
# 테스트셋 가져오기
en_list = list(df['en'].loc[60000:])
en_list[-5:]
ko_list =list(df['ko'].loc[60000:])
ko_list[-5:]

['나의 고민은 학교가 멀어서 통학하기 힘들어.',
 '난 지금 내고양이때문에 충분히 힘들어.',
 '나와 대화가 어려운 것이 많이 힘들어?',
 '하루에 한번 연락하는게 그렇게 힘들어?',
 '어린 아이들이 스포츠를 즐기기엔 많이 힘들죠.']

In [92]:
# bleu
import nltk.translate.bleu_score as bleu
from nltk.translate.bleu_score import SmoothingFunction
def sentences_to_bleu(ref, pred):
  """
  ref : 참고용 타겟 문장(학습용 영어 문장)
  pred : 예측 문장(번역 결과)
  """
  smoothie = SmoothingFunction().method4
  return bleu.sentence_bleu(ref, pred, smoothing_function=smoothie)

In [130]:
translate_4_bleu(ko_list[-1])[:-6]

'sports diet . '

In [145]:
from tqdm import tqdm_notebook
reference = []
pred = []
bleu_score = []
err_cnt = 0
for i in tqdm_notebook(range(len(ko_list))):
  try: 
    decoded_sentence = translate_4_bleu(ko_list[i])[:-6]
    reference.append(en_list[i])
    pred.append(decoded_sentence)
    #bleu_score.append(bleu.sentence_bleu(list(map(lambda ref: ref.split(), reference)),decoded_sentence.split()))
    #bleu_score.append(bleu.sentence_bleu(reference[i],pred[i]))

  except KeyError:
    err_cnt += 1
    print(err_cnt)
    pass
for i in tqdm_notebook(range(0,len(pred))):
  bleu_score.append(bleu.sentence_bleu([reference[i].split()], pred[i].split()))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))

1



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=14999.0), HTML(value='')))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().





In [143]:
bleu_score

[0.668740304976422,
 0.7598356856515925,
 0.6389431042462724,
 0.8408964152537145,
 0.7598356856515925]

In [139]:
bleu.sentence_bleu(reference[0],pred[0])

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.8498912392268879

In [136]:
pred

['speed into the plate . ',
 'kakao talk . ',
 'to give up my major . ',
 'land . ',
 'are different . ']

In [137]:
reference

['I decided the colors to be yellow, black and white.',
 'I have notified the expense report on KakaoTalk.',
 'After having these thoughts, I decided to give up my major.',
 'I visited London, Belgium, Netherlands, and France.',
 'So I thought of methods in different way.']

In [109]:
print(len(ko_list))

15000


In [66]:
print(bleu_score)

[0.9166068134248218, 0.8408964152537145, 0.8722877451005959, 0.7487402156832422, 0.8545740127924681, 0.9306048591020996, 0.9253911813809743, 0.8408964152537145, 0.9306048591020996, 0.9740037464252967, 1.0, 0.9554427922043668, 0.8408964152537145, 0.9306048591020996, 0.9036020036098448, 0.9036020036098448, 0.9709835434146469, 0.8408964152537145, 0.8335516383402117, 0.9193227152249185]


In [148]:
# bleu 평균
sum = 0
for i in range(len(bleu_score)):
  sum += bleu_score[i]

avg = sum / len(pred)

In [149]:
avg

0.729040904039391