## [tensorflow seq2seq](https://www.tensorflow.org/tutorials/text/nmt_with_attention#translate)

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
data = pd.read_excel('/content/drive/My Drive/seq2seq_3조/seq2seq_aihub/kor.xlsx',sheet_name='Sheet1')
src = pd.DataFrame(data['ko'])
tar = pd.DataFrame(data['en'])
df = pd.concat([src, tar],axis=1)
df

Unnamed: 0,ko,en
0,나는 매일 저녁 배트를 만나러 다락방으로 가요.,I go to the attic every evening to meet Bat.
1,선생님 이문장이 이해가 안 가요.,"Sir, I don't understand this sentence here."
2,컴퓨터를 시작하면 시간이 너무 빠르게 가요.,Time flies when you start using the computer.
3,나는 오늘 자정에 한국으로 돌아 가요.,I'm going back to Korea today at midnight.
4,나는 일어나자마자 화장실에 가요.,I go to bathroom as soon as I wake up.
...,...,...
74995,나의 고민은 학교가 멀어서 통학하기 힘들어.,My worry is commuting to school because it's t...
74996,난 지금 내고양이때문에 충분히 힘들어.,I am going under enough difficulties because o...
74997,나와 대화가 어려운 것이 많이 힘들어?,Is having difficulties in talking with me too ...
74998,하루에 한번 연락하는게 그렇게 힘들어?,Is it that difficult to call once a day?


### Preprocessing

In [4]:
 # Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')


def en_preprocess_sentence(w):
  w = unicode_to_ascii(w.lower().strip())

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [5]:
def ko_preprocess_sentence(w):
  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^ㄱ-ㅎㅏ|가-힣?.!,¿]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [6]:
def preprocess_sentence(sent):
    # 위에서 구현한 함수를 내부적으로 호출
    #sent = unicode_to_ascii(sent.lower())

    # 단어와 구두점 사이에 공백을 만듭니다.
    # Ex) "he is a boy." => "he is a boy ."
    sent = re.sub(r"([?.!,¿])", r" \1", sent)

    # (a-z, A-Z, ".", "?", "!", ",") 이들을 제외하고는 전부 공백으로 변환합니다.
    sent = re.sub(r"[^ㄱ-ㅎㅏ|가-힣a-zA-Z!.?]+", r" ", sent)
    
    sent = re.sub(r"\s+", " ", sent)

    sent = '<start> ' + sent + ' <end>'
    return sent

In [7]:
# 전처리 테스트
# ko_sent = u"너 저녁 먹었어?"
# en_sent = u"Have you had dinner?"
print(preprocess_sentence(df.iloc[2][0]))
print(preprocess_sentence(df.iloc[2][1]))

<start> 컴퓨터를 시작하면 시간이 너무 빠르게 가요 . <end>
<start> Time flies when you start using the computer . <end>


In [8]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [KOREAN, ENGLISH]
def create_dataset(data):
    ko, en = [],[]
    for i in range(len(data)):
        src_line = data.iloc[i][0].strip()
        tar_line = data.iloc[i][1].strip()
        
        # source 데이터 전처리
        kor = [w for w in preprocess_sentence(src_line).split()]
        #ko.append(kor[::-1])
        ko.append(kor)
        
       # print(ko)
        # target 데이터 전처리
        eng = [w for w in preprocess_sentence(tar_line).split()]
        en.append(eng)
        
    return ko,en

In [9]:
ko, en = create_dataset(df)

In [10]:
print(ko[10])
print(en[10])

['<start>', '급한', '일이', '있어서', '손님', '만나러', '가요', '.', '<end>']
['<start>', 'I', 'm', 'meeting', 'a', 'guest', 'for', 'urgent', 'matters', '.', '<end>']


In [11]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')

    return tensor, lang_tokenizer

In [12]:
def load_dataset(df):
  # creating cleaned input, output pairs
    inp_lang, targ_lang = create_dataset(df)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [13]:
# Try experimenting with the size of that dataset
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(df)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
 

In [14]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)


print(input_tensor_train.shape, target_tensor_train.shape, input_tensor_val.shape, target_tensor_val.shape)

(60000, 17) (60000, 20) (15000, 17) (15000, 20)


In [15]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [16]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
5 ----> 나는
1057 ----> 방금
1658 ----> 인보이스를
58466 ----> 어카운팅
10523 ----> 팀에게
341 ----> 주었어요
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
4 ----> i
98 ----> just
404 ----> gave
5 ----> the
14910 ----> invoive
6 ----> to
5 ----> the
2925 ----> accounting
331 ----> team
3 ----> .
2 ----> <end>


## tf.data 데이터 셋 생성

In [17]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 128
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([128, 17]), TensorShape([128, 20]))

## 인코더 및 디코더 모델 작성

In [19]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

        #self.lstm = tf.keras.layers.LSTM(self.enc_units,return_sequences=True,
        #                            return_state=True)
                                       
                    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))
    

In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
encoder_hidden = encoder.initialize_hidden_state()
encoder_output, encoder_hidden = encoder(example_input_batch, encoder_hidden)

encoder_states = [encoder_output, encoder_hidden]
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(encoder_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(encoder_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (128, 17, 1024)
Encoder Hidden state shape: (batch size, units) (128, 1024)


In [21]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  21821440  
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
Total params: 25,759,744
Trainable params: 25,759,744
Non-trainable params: 0
_________________________________________________________________


In [22]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [23]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(encoder_hidden, encoder_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (128, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (128, 17, 1)


In [24]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [25]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      encoder_hidden, encoder_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(decoder_output.shape))
 

Decoder output shape: (batch_size, vocab size) (128, 19286)


In [26]:
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  4937216   
_________________________________________________________________
gru_1 (GRU)                  multiple                  7084032   
_________________________________________________________________
dense_3 (Dense)              multiple                  19768150  
_________________________________________________________________
bahdanau_attention_1 (Bahdan multiple                  2100225   
Total params: 33,889,623
Trainable params: 33,889,623
Non-trainable params: 0
_________________________________________________________________


## 옵티마이저 및 손실 함수 정의

In [27]:
optimizer = tf.keras.optimizers.Adam()
#optimizer = tf.keras.optimizers.SGD(learning_rate=0.75)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [28]:
# 검사 점(오브젝트 기반 저장)
checkpoint_dir = '/content/drive/My Drive/seq2seq_3조/gru_adam_model/gru_training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## 훈련

In [29]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

          # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [30]:
from tqdm import tqdm
EPOCHS = 10

for epoch in tqdm(range(EPOCHS)):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
 

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 Batch 0 Loss 5.5810
Epoch 1 Batch 100 Loss 3.1276
Epoch 1 Batch 200 Loss 2.9818
Epoch 1 Batch 300 Loss 2.8650
Epoch 1 Batch 400 Loss 2.7685


 10%|█         | 1/10 [03:47<34:09, 227.67s/it]

Epoch 1 Loss 3.0112
Time taken for 1 epoch 227.6706464290619 sec

Epoch 2 Batch 0 Loss 2.6662
Epoch 2 Batch 100 Loss 2.6235
Epoch 2 Batch 200 Loss 2.5739
Epoch 2 Batch 300 Loss 2.5020
Epoch 2 Batch 400 Loss 2.3438


 20%|██        | 2/10 [07:22<29:51, 223.92s/it]

Epoch 2 Loss 2.5264
Time taken for 1 epoch 215.1669671535492 sec

Epoch 3 Batch 0 Loss 2.4163
Epoch 3 Batch 100 Loss 2.4258
Epoch 3 Batch 200 Loss 2.3046
Epoch 3 Batch 300 Loss 2.2887
Epoch 3 Batch 400 Loss 2.2073


 30%|███       | 3/10 [10:56<25:45, 220.81s/it]

Epoch 3 Loss 2.3023
Time taken for 1 epoch 213.56847596168518 sec

Epoch 4 Batch 0 Loss 2.1418
Epoch 4 Batch 100 Loss 2.0780
Epoch 4 Batch 200 Loss 2.1388
Epoch 4 Batch 300 Loss 2.1981
Epoch 4 Batch 400 Loss 2.1035


 40%|████      | 4/10 [14:32<21:55, 219.28s/it]

Epoch 4 Loss 2.1258
Time taken for 1 epoch 215.69131350517273 sec

Epoch 5 Batch 0 Loss 1.9267
Epoch 5 Batch 100 Loss 1.9109
Epoch 5 Batch 200 Loss 1.9111
Epoch 5 Batch 300 Loss 1.9855
Epoch 5 Batch 400 Loss 1.9085


 50%|█████     | 5/10 [18:05<18:08, 217.63s/it]

Epoch 5 Loss 1.9440
Time taken for 1 epoch 213.79107284545898 sec

Epoch 6 Batch 0 Loss 1.8058
Epoch 6 Batch 100 Loss 1.8177
Epoch 6 Batch 200 Loss 1.7394
Epoch 6 Batch 300 Loss 1.7548
Epoch 6 Batch 400 Loss 1.7305


 60%|██████    | 6/10 [21:43<14:29, 217.50s/it]

Epoch 6 Loss 1.7582
Time taken for 1 epoch 217.18131041526794 sec

Epoch 7 Batch 0 Loss 1.5155
Epoch 7 Batch 100 Loss 1.5605
Epoch 7 Batch 200 Loss 1.5591
Epoch 7 Batch 300 Loss 1.5315
Epoch 7 Batch 400 Loss 1.5081


 70%|███████   | 7/10 [25:18<10:50, 216.75s/it]

Epoch 7 Loss 1.5645
Time taken for 1 epoch 214.9904818534851 sec

Epoch 8 Batch 0 Loss 1.3364
Epoch 8 Batch 100 Loss 1.3575
Epoch 8 Batch 200 Loss 1.3327
Epoch 8 Batch 300 Loss 1.3134
Epoch 8 Batch 400 Loss 1.3420


 80%|████████  | 8/10 [28:55<07:13, 216.84s/it]

Epoch 8 Loss 1.3658
Time taken for 1 epoch 217.05326533317566 sec

Epoch 9 Batch 0 Loss 1.1587
Epoch 9 Batch 100 Loss 1.1872
Epoch 9 Batch 200 Loss 1.2148
Epoch 9 Batch 300 Loss 1.1408
Epoch 9 Batch 400 Loss 1.2444


 90%|█████████ | 9/10 [32:30<03:36, 216.28s/it]

Epoch 9 Loss 1.1729
Time taken for 1 epoch 214.96300649642944 sec

Epoch 10 Batch 0 Loss 0.9912
Epoch 10 Batch 100 Loss 0.9577
Epoch 10 Batch 200 Loss 1.0080
Epoch 10 Batch 300 Loss 0.9766
Epoch 10 Batch 400 Loss 1.0218


100%|██████████| 10/10 [36:06<00:00, 216.68s/it]

Epoch 10 Loss 0.9918
Time taken for 1 epoch 216.7000834941864 sec






In [31]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [32]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    # attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    # plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [33]:
def translate_4_bleu(sentence):
    result, sentence = evaluate(sentence)
    return result

In [34]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7d2e4aaa58>

In [35]:
translate_4_bleu('어디야?')

'what is the type of japan ? <end> '

In [36]:
translate('나의 고민은 학교가 멀어서 통학하기 힘들어.')

Input: <start> 나의 고민은 학교가 멀어서 통학하기 힘들어 . <end>
Predicted translation: my voice of my work is turning off the school students . <end> 


In [49]:
translate('몇 시야?')

Input: <start> 몇 시야 ? <end>
Predicted translation: what is the closest ? <end> 


In [40]:
translate('나는 학교에 간다')

Input: <start> 나는 학교에 간다 <end>
Predicted translation: i will do my father s work at the school . <end> 


In [39]:
translate('어린 아이들이 스포츠를 즐기기엔 많이 힘들죠.')

Input: <start> 어린 아이들이 스포츠를 즐기기엔 많이 힘들죠 . <end>
Predicted translation: it is hard to play children to play sports than the game . <end> 


In [37]:
translate('아빠는 밥 먹었어?')

Input: <start> 아빠는 밥 먹었어 ? <end>
Predicted translation: shall i ate it with a meal with a while ? <end> 


In [38]:
translate('하루에 한번 연락하는게 그렇게 힘들어?')

Input: <start> 하루에 한번 연락하는게 그렇게 힘들어 ? <end>
Predicted translation: is it ok to call it right away ? <end> 


## BLEU Score

In [None]:
df.to_csv('ko_en.csv',index=False)

In [41]:
import nltk.translate.bleu_score as bleu

# df = pd.read_csv('/content/ko_en.csv')
# ko, en = df['ko'], df['en']

result, sentence = evaluate('어린 아이들이 스포츠를 즐기기엔 많이 힘들죠.')
candidate = result[:-6]
reference = ['It is difficult for young children to enjoy sports.', 'It is hard to enjoy sports for young people']
print(candidate)

print(bleu.sentence_bleu(list(map(lambda ref: ref.split(), reference)),candidate.split()))
# NLTK 패키지 구현되어져 있는 코드로 계산한 BLEU 점수

it is hard to play children to play sports than the game . 
0.30576902884505114


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [42]:
en_list = list(df['en'].loc[59999:])
en_list[-5:]

["My worry is commuting to school because it's too far.",
 'I am going under enough difficulties because of my cat.',
 'Is having difficulties in talking with me too hard for you?',
 'Is it that difficult to call once a day?',
 'It is difficult for young children to enjoy sports.']

In [43]:
ko_list =list(df['ko'].loc[59999:])
ko_list[-5:]

['나의 고민은 학교가 멀어서 통학하기 힘들어.',
 '난 지금 내고양이때문에 충분히 힘들어.',
 '나와 대화가 어려운 것이 많이 힘들어?',
 '하루에 한번 연락하는게 그렇게 힘들어?',
 '어린 아이들이 스포츠를 즐기기엔 많이 힘들죠.']

In [44]:
import nltk.translate.bleu_score as bleu
from nltk.translate.bleu_score import SmoothingFunction
def sentences_to_bleu(ref, pred):
  """
  ref : 참고용 타겟 문장(학습용 영어 문장)
  pred : 예측 문장(번역 결과)
  """
  smoothie = SmoothingFunction().method4
  return bleu.sentence_bleu(ref, pred, smoothing_function=smoothie)

In [73]:
from tqdm import tqdm_notebook
reference = []
pred = []
bleu_score = []
for i in tqdm_notebook(range(14840, 14850)):
 # try :
  decoded_sentence = translate_4_bleu(ko_list[i])[:-6]
  reference.append(en_list[i])
  pred.append(decoded_sentence)
  bleu_score.append(bleu.sentence_bleu(list(map(lambda ref: ref.split(), reference)),candidate.split()))
  # except Exception as e:
  #   pass

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().





In [74]:
print(bleu_score)

[0, 0.6262844962765469, 0.6262844962765469, 0.6930977286178778, 0.7447819789879647, 0.7447819789879647, 0.7447819789879647, 0.7447819789879647, 0.7447819789879647, 0.7447819789879647]


In [79]:
print(bleu_score)

[0.5266403878479265, 0.5266403878479265, 0.5266403878479265, 0.5266403878479265, 0.5266403878479265, 0.6262844962765469, 0.6262844962765469, 0.6262844962765469, 0.6262844962765469]


In [81]:
print(reference)

['You can see both oriental and western beauty.', 'I am more interested in you and your surroundings.', 'If you use a dehydrator, it may change the shape of it.', 'However, I feel that your words are different from your mind.', 'But it remains in our memories for good.']


In [80]:
print(pred)

['you can enjoy the market and western beauty . ', 'i have many things more than you and warmhearted . ', 'if you need a dehydrator it is a tool to use it . ', 'but you are not like your mind than what you are . ', 'but our offices is always busy together . ']


In [80]:
from tqdm import tqdm_notebook
reference = []
pred = []
bleu_score = []
err_cnt = 0
for i in tqdm_notebook(range(0, 15000)):
  try:
    decoded_sentence = translate_4_bleu(ko_list[i])[:-6]
    reference.append(en_list[i])
    pred.append(decoded_sentence)
   # bleu_score.append(bleu.sentence_bleu(list(map(lambda ref: ref.split(), reference)),candidate.split()))
    bleu_score.append(bleu.sentence_bleu([reference[i].split()],pred[i].split()))

  except KeyError:
    err_cnt += 1
    print(err_cnt)
    pass

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


1



In [101]:
sum = 0
for i in range(len(bleu_score)):
  sum += bleu_score[i]

avg = sum / (len(pred))
print(avg)

0.42729303674050234


In [90]:
len(bleu_score)

14999