## [tensorflow seq2seq](https://www.tensorflow.org/tutorials/text/nmt_with_attention#translate)

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
data = pd.read_excel('kor.xlsx',sheet_name='Sheet1')
src = pd.DataFrame(data['ko'])
tar = pd.DataFrame(data['en'])
df = pd.concat([src, tar],axis=1)
df

Unnamed: 0,ko,en
0,나는 매일 저녁 배트를 만나러 다락방으로 가요.,I go to the attic every evening to meet Bat.
1,선생님 이문장이 이해가 안 가요.,"Sir, I don't understand this sentence here."
2,컴퓨터를 시작하면 시간이 너무 빠르게 가요.,Time flies when you start using the computer.
3,나는 오늘 자정에 한국으로 돌아 가요.,I'm going back to Korea today at midnight.
4,나는 일어나자마자 화장실에 가요.,I go to bathroom as soon as I wake up.
...,...,...
74995,나의 고민은 학교가 멀어서 통학하기 힘들어.,My worry is commuting to school because it's t...
74996,난 지금 내고양이때문에 충분히 힘들어.,I am going under enough difficulties because o...
74997,나와 대화가 어려운 것이 많이 힘들어?,Is having difficulties in talking with me too ...
74998,하루에 한번 연락하는게 그렇게 힘들어?,Is it that difficult to call once a day?


### Preprocessing

In [3]:
 # Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')


def en_preprocess_sentence(w):
  w = unicode_to_ascii(w.lower().strip())

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [4]:
def ko_preprocess_sentence(w):
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^ㄱ-ㅎㅏ|가-힣?.!,¿]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [5]:
def preprocess_sentence(sent):
    # 위에서 구현한 함수를 내부적으로 호출
    #sent = unicode_to_ascii(sent.lower())

    # 단어와 구두점 사이에 공백을 만듭니다.
    # Ex) "he is a boy." => "he is a boy ."
    sent = re.sub(r"([?.!,¿])", r" \1", sent)

    # (a-z, A-Z, ".", "?", "!", ",") 이들을 제외하고는 전부 공백으로 변환합니다.
    sent = re.sub(r"[^ㄱ-ㅎㅏ|가-힣a-zA-Z!.?]+", r" ", sent)
    
    sent = re.sub(r"\s+", " ", sent)

    sent = '<start> ' + sent + ' <end>'
    return sent

In [6]:
# 전처리 테스트
# ko_sent = u"너 저녁 먹었어?"
# en_sent = u"Have you had dinner?"
print(preprocess_sentence(df.iloc[2][0]))
print(preprocess_sentence(df.iloc[2][1]))

<start> 컴퓨터를 시작하면 시간이 너무 빠르게 가요 . <end>
<start> Time flies when you start using the computer . <end>


In [8]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [KOREAN, ENGLISH]
def create_dataset(data):
    ko, en = [],[]
    for i in range(len(data)):
        src_line = data.iloc[i][0].strip()
        tar_line = data.iloc[i][1].strip()
        
        # source 데이터 전처리
        kor = [w for w in preprocess_sentence(src_line).split()]
        #ko.append(kor[::-1])
        ko.append(kor)
        
       # print(ko)
        # target 데이터 전처리
        eng = [w for w in preprocess_sentence(tar_line).split()]
        en.append(eng)
        
    return ko,en

In [9]:
ko, en = create_dataset(df)

In [10]:
print(ko[10])
print(en[10])

['<start>', '급한', '일이', '있어서', '손님', '만나러', '가요', '.', '<end>']
['<start>', 'I', 'm', 'meeting', 'a', 'guest', 'for', 'urgent', 'matters', '.', '<end>']


In [11]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')

    return tensor, lang_tokenizer

In [12]:
def load_dataset(df):
  # creating cleaned input, output pairs
    inp_lang, targ_lang = create_dataset(df)
    inp_lang, targ_lang = inp_lang[:63000],targ_lang[:63000]
   # print(inp_lang, targ_lang)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [13]:
# Try experimenting with the size of that dataset
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(df)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
 

In [14]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=13000)

print(input_tensor_train.shape, target_tensor_train.shape, input_tensor_val.shape, target_tensor_val.shape)

(50000, 17) (50000, 20) (13000, 17) (13000, 20)


In [15]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [16]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
1010 ----> 주문
2101 ----> 취소
39180 ----> 요청해주시면
177 ----> 바로
1510 ----> 환불
2202 ----> 처리
155 ----> 하겠어요
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
33 ----> if
7 ----> you
853 ----> cancel
5 ----> the
172 ----> order
20 ----> we
24 ----> will
669 ----> refund
12 ----> it
147 ----> right
360 ----> away
3 ----> .
2 ----> <end>


## tf.data 데이터 셋 생성

In [23]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [24]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 17]), TensorShape([64, 20]))

## 인코더 및 디코더 모델 작성

In [25]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

        #self.lstm = tf.keras.layers.LSTM(self.enc_units,return_sequences=True,
        #                            return_state=True)
                                       
                    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))
    

In [26]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
encoder_hidden = encoder.initialize_hidden_state()
encoder_output, encoder_hidden = encoder(example_input_batch, encoder_hidden)

encoder_states = [encoder_output, encoder_hidden]
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(encoder_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(encoder_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 17, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [27]:
encoder.summary()

Model: "encoder_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  19236352  
_________________________________________________________________
gru_1 (GRU)                  multiple                  3938304   
Total params: 23,174,656
Trainable params: 23,174,656
Non-trainable params: 0
_________________________________________________________________


In [28]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [29]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(encoder_hidden, encoder_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 17, 1)


In [30]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [31]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      encoder_hidden, encoder_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(decoder_output.shape))
 

Decoder output shape: (batch_size, vocab size) (64, 17900)


In [32]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  4582400   
_________________________________________________________________
gru_2 (GRU)                  multiple                  7084032   
_________________________________________________________________
dense_3 (Dense)              multiple                  18347500  
_________________________________________________________________
bahdanau_attention_1 (Bahdan multiple                  2100225   
Total params: 32,114,157
Trainable params: 32,114,157
Non-trainable params: 0
_________________________________________________________________


## 옵티마이저 및 손실 함수 정의

In [33]:
optimizer = tf.keras.optimizers.Adam()
#optimizer = tf.keras.optimizers.SGD(learning_rate=0.75)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [34]:
# 검사 점(오브젝트 기반 저장)
checkpoint_dir = '/content/drive/My Drive/seq2seq_3조/gru_adam_model/gru_training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## 훈련

In [35]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

          # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [36]:
from tqdm import tqdm
EPOCHS = 10

for epoch in tqdm(range(EPOCHS)):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
 

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 Batch 0 Loss 5.5005
Epoch 1 Batch 100 Loss 3.2240
Epoch 1 Batch 200 Loss 2.9801
Epoch 1 Batch 300 Loss 2.9807
Epoch 1 Batch 400 Loss 2.6831
Epoch 1 Batch 500 Loss 2.7514
Epoch 1 Batch 600 Loss 2.6785
Epoch 1 Batch 700 Loss 2.5718


 10%|█         | 1/10 [08:17<1:14:39, 497.76s/it]

Epoch 1 Loss 2.8808
Time taken for 1 epoch 497.76361775398254 sec

Epoch 2 Batch 0 Loss 2.5940
Epoch 2 Batch 100 Loss 2.5123
Epoch 2 Batch 200 Loss 2.4397
Epoch 2 Batch 300 Loss 2.3439
Epoch 2 Batch 400 Loss 2.2779
Epoch 2 Batch 500 Loss 2.3930
Epoch 2 Batch 600 Loss 2.3694
Epoch 2 Batch 700 Loss 2.3878


 20%|██        | 2/10 [16:11<1:05:23, 490.46s/it]

Epoch 2 Loss 2.4107
Time taken for 1 epoch 473.4225299358368 sec

Epoch 3 Batch 0 Loss 2.1939
Epoch 3 Batch 100 Loss 2.0493
Epoch 3 Batch 200 Loss 2.2864
Epoch 3 Batch 300 Loss 2.2985
Epoch 3 Batch 400 Loss 2.1865
Epoch 3 Batch 500 Loss 2.2225
Epoch 3 Batch 600 Loss 2.0608
Epoch 3 Batch 700 Loss 2.0314


 30%|███       | 3/10 [24:01<56:30, 484.35s/it]  

Epoch 3 Loss 2.1659
Time taken for 1 epoch 470.08321928977966 sec

Epoch 4 Batch 0 Loss 1.9526
Epoch 4 Batch 100 Loss 1.8013
Epoch 4 Batch 200 Loss 1.9235
Epoch 4 Batch 300 Loss 1.9326
Epoch 4 Batch 400 Loss 1.9497
Epoch 4 Batch 500 Loss 1.8645
Epoch 4 Batch 600 Loss 1.9448
Epoch 4 Batch 700 Loss 1.7663


 40%|████      | 4/10 [31:55<48:07, 481.18s/it]

Epoch 4 Loss 1.9234
Time taken for 1 epoch 473.77760124206543 sec

Epoch 5 Batch 0 Loss 1.5800
Epoch 5 Batch 100 Loss 1.6766
Epoch 5 Batch 200 Loss 1.6205
Epoch 5 Batch 300 Loss 1.7337
Epoch 5 Batch 400 Loss 1.6741
Epoch 5 Batch 500 Loss 1.5896
Epoch 5 Batch 600 Loss 1.5950
Epoch 5 Batch 700 Loss 1.6101


 50%|█████     | 5/10 [39:47<39:52, 478.42s/it]

Epoch 5 Loss 1.6741
Time taken for 1 epoch 471.99182987213135 sec

Epoch 6 Batch 0 Loss 1.3794
Epoch 6 Batch 100 Loss 1.4251
Epoch 6 Batch 200 Loss 1.4068
Epoch 6 Batch 300 Loss 1.3124
Epoch 6 Batch 400 Loss 1.4703
Epoch 6 Batch 500 Loss 1.4024
Epoch 6 Batch 600 Loss 1.5331
Epoch 6 Batch 700 Loss 1.4178


 60%|██████    | 6/10 [47:42<31:49, 477.46s/it]

Epoch 6 Loss 1.4195
Time taken for 1 epoch 475.2177851200104 sec

Epoch 7 Batch 0 Loss 1.1425
Epoch 7 Batch 100 Loss 1.1800
Epoch 7 Batch 200 Loss 1.1800
Epoch 7 Batch 300 Loss 1.2179
Epoch 7 Batch 400 Loss 1.1141
Epoch 7 Batch 500 Loss 1.2404
Epoch 7 Batch 600 Loss 1.2337
Epoch 7 Batch 700 Loss 1.2721


 70%|███████   | 7/10 [55:33<23:46, 475.58s/it]

Epoch 7 Loss 1.1719
Time taken for 1 epoch 471.1725387573242 sec

Epoch 8 Batch 0 Loss 0.9826
Epoch 8 Batch 100 Loss 0.8966
Epoch 8 Batch 200 Loss 0.8664
Epoch 8 Batch 300 Loss 0.9569
Epoch 8 Batch 400 Loss 0.9049
Epoch 8 Batch 500 Loss 0.9855
Epoch 8 Batch 600 Loss 0.9301
Epoch 8 Batch 700 Loss 0.9108


 80%|████████  | 8/10 [1:03:26<15:49, 474.69s/it]

Epoch 8 Loss 0.9401
Time taken for 1 epoch 472.60268902778625 sec

Epoch 9 Batch 0 Loss 0.6829
Epoch 9 Batch 100 Loss 0.6929
Epoch 9 Batch 200 Loss 0.7215
Epoch 9 Batch 300 Loss 0.6846
Epoch 9 Batch 400 Loss 0.7784
Epoch 9 Batch 500 Loss 0.7483
Epoch 9 Batch 600 Loss 0.7789
Epoch 9 Batch 700 Loss 0.7619


 90%|█████████ | 9/10 [1:11:17<07:53, 473.57s/it]

Epoch 9 Loss 0.7350
Time taken for 1 epoch 470.9703850746155 sec

Epoch 10 Batch 0 Loss 0.6576
Epoch 10 Batch 100 Loss 0.5375
Epoch 10 Batch 200 Loss 0.5219
Epoch 10 Batch 300 Loss 0.5106
Epoch 10 Batch 400 Loss 0.6218
Epoch 10 Batch 500 Loss 0.6109
Epoch 10 Batch 600 Loss 0.6319
Epoch 10 Batch 700 Loss 0.5245


100%|██████████| 10/10 [1:19:10<00:00, 475.09s/it]

Epoch 10 Loss 0.5621
Time taken for 1 epoch 473.824848651886 sec






In [37]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

번역 Bleu

In [38]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    # attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    # plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [42]:
def translate_4_bleu(sentence):
    result, sentence = evaluate(sentence)
    return result

In [43]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb0d3cfe668>

In [44]:
translate_4_bleu('어디야?')

'hanagae of science country would like running ? <end> '

In [48]:
translate('나의 고민은 학교가 힘들어.')

Input: <start> 나의 고민은 학교가 힘들어 . <end>
Predicted translation: my dinner was my hard to sleep at school . <end> 


In [49]:
translate('몇 시야?')

Input: <start> 몇 시야 ? <end>
Predicted translation: what was the amount of the captain was ? <end> 


In [50]:
translate('나는 학교에 간다')

Input: <start> 나는 학교에 간다 <end>
Predicted translation: i went to the school i go to school . <end> 


In [53]:
translate('어린 아이들이 스포츠를 많이 .')

Input: <start> 어린 아이들이 스포츠를 많이 . <end>
Predicted translation: people visit many children and women . <end> 


In [54]:
translate('아빠는 밥 먹었어?')

Input: <start> 아빠는 밥 먹었어 ? <end>
Predicted translation: does the cat has gone with the beer ? <end> 


In [58]:
translate('나와 너는 먹었어?')

Input: <start> 나와 너는 먹었어 ? <end>
Predicted translation: are you catch a cup of coffee ? <end> 


## BLEU Score

In [59]:
def translate_4_bleu(sentence):
    result, sentence = evaluate(sentence)
    return result

In [60]:
# 테스트셋 가져오기
en_list = list(df['en'].loc[63000:])
en_list[-5:]
ko_list =list(df['ko'].loc[63000:])
ko_list[-5:]

['나의 고민은 학교가 멀어서 통학하기 힘들어.',
 '난 지금 내고양이때문에 충분히 힘들어.',
 '나와 대화가 어려운 것이 많이 힘들어?',
 '하루에 한번 연락하는게 그렇게 힘들어?',
 '어린 아이들이 스포츠를 즐기기엔 많이 힘들죠.']

In [62]:
# bleu
import nltk.translate.bleu_score as bleu
from nltk.translate.bleu_score import SmoothingFunction
def sentences_to_bleu(ref, pred):
  """
  ref : 참고용 타겟 문장(학습용 영어 문장)
  pred : 예측 문장(번역 결과)
  """
  smoothie = SmoothingFunction().method4
  return bleu.sentence_bleu(ref, pred, smoothing_function=smoothie)

In [74]:
from tqdm import tqdm_notebook
reference = []
pred = []
bleu_score = []
err_cnt = 0
smoothie = SmoothingFunction().method4
for i in tqdm_notebook(range(12000)):
  try: 
    decoded_sentence = translate_4_bleu(ko_list[i]).split()[:-1]
    reference= en_list[i].split()
    pred = decoded_sentence
    #bleu_score.append(bleu.sentence_bleu(list(map(lambda ref: ref.split(), reference)),decoded_sentence.split()))
    bleu_score.append(sentences_to_bleu(reference,pred))

  except KeyError:
    err_cnt += 1
    pass
print(err_cnt)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=12000.0), HTML(value='')))


10035


In [68]:
print(reference)
print(pred)

['Is', 'having', 'difficulties', 'in', 'talking', 'with', 'me', 'too', 'hard', 'for', 'you?']
['are', 'you', 'having', 'a', 'lot', 'of', 'things', 'again', '?']


In [69]:
np.array(bleu_score).mean()

0.16614073240701077

In [66]:
bleu_score

[0.19913749124569236,
 0.18305025818318518,
 0.2290302783996535,
 0.18305025818318518,
 0.22557031245416842,
 0.1940679506664638,
 0.262447205266488,
 0.2477820306346147,
 0.23418392299336482,
 0.19552795980276136,
 0.19817632389021378,
 0.1969244213551724,
 0.22557031245416842,
 0.23567269439532595,
 0.1969244213551724,
 0.19817632389021378,
 0.19913749124569236,
 0.23252324097940982,
 0.1969244213551724,
 0.19552795980276136,
 0.19552795980276136,
 0.23418392299336482,
 0.2290302783996535,
 0.2290302783996535,
 0.22557031245416842,
 0.2290302783996535,
 0.19259074009082888,
 0.2307869877265558,
 0.1969244213551724,
 0.2307869877265558,
 0.25916711346124927,
 0.18305025818318518,
 0.25540778661907937,
 0.23567269439532595,
 0.1940679506664638,
 0.19552795980276136,
 0.2103957290011624,
 0.1969244213551724,
 0.15768474972683594,
 0.19817632389021378,
 0.19552795980276136,
 0.19259074009082888,
 0.22226897726623815,
 0.2307869877265558,
 0.1969244213551724,
 0.1969244213551724,
 0.19817

In [73]:
from tqdm import tqdm_notebook
reference = []
pred = []
bleu_score = []
err_cnt = 0
for i in tqdm_notebook(range(12000)):
  try: 
    decoded_sentence = translate_4_bleu(ko_list[i])[:-6]
    reference.append(en_list[i])
    pred.append(decoded_sentence)
    #bleu_score.append(bleu.sentence_bleu(list(map(lambda ref: ref.split(), reference)),decoded_sentence.split()))
    #bleu_score.append(bleu.sentence_bleu(reference[i],pred[i]))
  except KeyError:
    err_cnt += 1
   
    pass
print(err_cnt)
for i in tqdm_notebook(range(0,len(pred))):
    bleu_score.append(bleu.sentence_bleu([reference[i].split()], pred[i].split()))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=12000.0), HTML(value='')))


10035


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1965.0), HTML(value='')))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().





In [75]:
np.array(bleu_score).mean()

0.2124186240019619