# Урок 10. Машинный перевод. Модель seq2seq и механизм внимания

Разобраться с моделью перевода (без механизма внимания) как она устроена, запустить для перевода с русского на английский (при желании можно взять другие пары языков)

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import pandas as pd
import unicodedata
import re
import numpy as np
import os
import io
import time

In [2]:
!wget http://www.manythings.org/anki/rus-eng.zip

--2022-09-15 09:21:11--  http://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15011848 (14M) [application/zip]
Saving to: ‘rus-eng.zip’


2022-09-15 09:21:14 (6.18 MB/s) - ‘rus-eng.zip’ saved [15011848/15011848]



In [3]:
!mkdir rus-eng
!unzip rus-eng.zip -d rus-eng/

Archive:  rus-eng.zip
  inflating: rus-eng/rus.txt         
  inflating: rus-eng/_about.txt      


In [4]:
!ls /content/rus-eng/ -lah

total 72M
drwxr-xr-x 2 root root 4.0K Sep 15 09:21 .
drwxr-xr-x 1 root root 4.0K Sep 15 09:21 ..
-rw-r--r-- 1 root root 1.5K Sep  6 03:10 _about.txt
-rw-r--r-- 1 root root  72M Sep  6 03:10 rus.txt


In [5]:
# Download the file
path_to_file = "/content/rus-eng/rus.txt"

In [6]:
def preprocess_sentence(w):
  w = w.lower().strip()

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [7]:
preprocess_sentence("I can't go.")

"<start> i can't go . <end>"

In [8]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENG, RUS]
def create_dataset(path, num_examples):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]

  return zip(*word_pairs)

In [9]:
en, ru = create_dataset(path_to_file, None)
print(en[77])
print(ru[77])

<start> smile . <end>
<start> улыбнитесь . <end>


In [10]:
print(en[10])
print(ru[10])

<start> run ! <end>
<start> бегите ! <end>


In [11]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

  return tensor, lang_tokenizer

In [12]:
def load_dataset(path, num_examples=None):
  # creating cleaned input, output pairs
  targ_lang, inp_lang = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer


In [13]:
len(en), len(ru)

(451436, 451436)

In [14]:
# Try experimenting with the size of that dataset
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [15]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

80000 80000 20000 20000


In [16]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [17]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
8 ----> это
36 ----> было
45 ----> бы
732 ----> глупо
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
18 ----> that
190 ----> would
33 ----> be
286 ----> stupid
3 ----> .
2 ----> <end>


## Create a tf.data dataset

In [18]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 1024
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 300
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [19]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([1024, 15]), TensorShape([1024, 11]))

In [20]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=False,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [21]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_hidden = encoder(example_input_batch, sample_hidden)
# print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder Hidden state shape: (batch size, units) (1024, 1024)


## Decoder without attention

In [22]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden):
    # enc_output shape == (batch_size, max_length, hidden_size)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x, initial_state=hidden)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state

In [23]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_sample_x, decoder_sample_h = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)

In [24]:
decoder_sample_x.shape

TensorShape([1024, 7335])

In [25]:
decoder_sample_h.shape

TensorShape([1024, 1024])

In [26]:
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [27]:
checkpoint_dir = './training_nmt_checkpoints'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [28]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [29]:
EPOCHS = 100

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 50 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 4 epochs
  if (epoch + 1) % 4 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.6222
Epoch 1 Batch 50 Loss 2.0955
Epoch 1 Loss 2.3069
Time taken for 1 epoch 34.88617563247681 sec

Epoch 2 Batch 0 Loss 1.9523
Epoch 2 Batch 50 Loss 1.8393
Epoch 2 Loss 1.8640
Time taken for 1 epoch 23.375584363937378 sec

Epoch 3 Batch 0 Loss 1.7376
Epoch 3 Batch 50 Loss 1.6451
Epoch 3 Loss 1.6575
Time taken for 1 epoch 23.828977584838867 sec

Epoch 4 Batch 0 Loss 1.5228
Epoch 4 Batch 50 Loss 1.4469
Epoch 4 Loss 1.4608
Time taken for 1 epoch 25.001121520996094 sec

Epoch 5 Batch 0 Loss 1.3700
Epoch 5 Batch 50 Loss 1.2977
Epoch 5 Loss 1.3096
Time taken for 1 epoch 24.567402362823486 sec

Epoch 6 Batch 0 Loss 1.2072
Epoch 6 Batch 50 Loss 1.1958
Epoch 6 Loss 1.1760
Time taken for 1 epoch 24.88222050666809 sec

Epoch 7 Batch 0 Loss 1.1067
Epoch 7 Batch 50 Loss 1.0344
Epoch 7 Loss 1.0512
Time taken for 1 epoch 25.182317972183228 sec

Epoch 8 Batch 0 Loss 0.9444
Epoch 8 Batch 50 Loss 0.9116
Epoch 8 Loss 0.9263
Time taken for 1 epoch 26.099363803863525 sec

Epoch 9 Ba

In [30]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden = decoder(dec_input, dec_hidden)

    # storing the attention weights to plot later on
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [31]:
def remove_char(s):
    remove_result = s[8 : -6]
    return remove_result

def remove_char_trans(s):
    remove_result_trans = s[0 : -6]
    return remove_result_trans

In [32]:
final_result = []

def translate(sentence):
  result, sentence = evaluate(sentence)

  sent = (remove_char(sentence))
  res = (remove_char_trans(result))
  
  final_result.append([sent, res])
  df = pd.DataFrame(final_result, columns=['текст', 'перевод без внимания'])

  return df

In [33]:
translate('Привет!')
translate('Как дела?')
translate('Как тебя зовут?')
translate('Что ты делаешь завтра?')
translate('Я собираюсь играть в футбол')
translate('Если я смогу, я приду играть')
translate('Когда мы играли в футбол, шел дождь')
translate('В последнее время я поздно ложусь спать')
translate('Когда я ехал мимо станции, у меня слетела шляпа')
translate('Ёж птица гордая, пока не дадут скорость - летать не станет')

Unnamed: 0,текст,перевод без внимания
0,привет !,hello !
1,как дела ?,how are you ?
2,как тебя зовут ?,how is your name ?
3,что ты делаешь завтра ?,what are you all tomorrow ?
4,я собираюсь играть в футбол,i play football .
5,"если я смогу , я приду играть","if i can come , me ?"
6,"когда мы играли в футбол , шел дождь",when did we miss the car ?
7,в последнее время я поздно ложусь спать,"i'm in boston , too ."
8,"когда я ехал мимо станции , у меня слетела шляпа",when did i look very fat ?
9,"ж птица гордая , пока не дадут скорость летать...",ok to stay with me .


Короткие предложения сеть хоть с ошибками, но переводит. С длинными предложениями сеть не справляется совсем.

Для сравнения рассмотрим модель с механизмом внимания.

## Decoder with attention

In [34]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [35]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (1024, 15, 1024)
Encoder Hidden state shape: (batch size, units) (1024, 1024)


In [36]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [37]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (1024, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (1024, 15, 1)


In [38]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [39]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (1024, 7335)


In [40]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [41]:
checkpoint_dir = './training_attention_checkpoints2'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [42]:
EPOCHS = 50

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 50 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 4 epochs
  if (epoch + 1) % 4 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.0205
Epoch 1 Batch 50 Loss 0.0355
Epoch 1 Loss 0.0298
Time taken for 1 epoch 25.67899227142334 sec

Epoch 2 Batch 0 Loss 0.0227
Epoch 2 Batch 50 Loss 0.0366
Epoch 2 Loss 0.0296
Time taken for 1 epoch 25.78018617630005 sec

Epoch 3 Batch 0 Loss 0.0205
Epoch 3 Batch 50 Loss 0.0346
Epoch 3 Loss 0.0297
Time taken for 1 epoch 25.78372049331665 sec

Epoch 4 Batch 0 Loss 0.0233
Epoch 4 Batch 50 Loss 0.0296
Epoch 4 Loss 0.0298
Time taken for 1 epoch 25.923246383666992 sec

Epoch 5 Batch 0 Loss 0.0264
Epoch 5 Batch 50 Loss 0.0360
Epoch 5 Loss 0.0298
Time taken for 1 epoch 25.70167064666748 sec

Epoch 6 Batch 0 Loss 0.0209
Epoch 6 Batch 50 Loss 0.0326
Epoch 6 Loss 0.0294
Time taken for 1 epoch 25.616418838500977 sec

Epoch 7 Batch 0 Loss 0.0232
Epoch 7 Batch 50 Loss 0.0295
Epoch 7 Loss 0.0297
Time taken for 1 epoch 25.637518405914307 sec

Epoch 8 Batch 0 Loss 0.0203
Epoch 8 Batch 50 Loss 0.0295
Epoch 8 Loss 0.0294
Time taken for 1 epoch 25.844788789749146 sec

Epoch 9 Batc

In [43]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result2 = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result2 += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result2, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result2, sentence, attention_plot

In [44]:
final_result2 = []

def remove_char(s):
    remove_result = s[8 : -6]
    return remove_result

def remove_char_trans(s):
    remove_result_trans = s[0 : -6]
    return remove_result_trans

def translate(sentence):
  result2, sentence, attention_plot = evaluate(sentence)

  sent = (remove_char(sentence))
  res = (remove_char_trans(result2))
  
  final_result2.append([sent, res])
  df2 = pd.DataFrame(final_result2, columns=['текст', 'перевод с вниманием'])

  # print('Input: %s' % (sentence))
  # print('Predicted translation: {}'.format(result2))

  attention_plot = attention_plot[:len(result2.split(' ')), :len(sentence.split(' '))]
  # plot_attention(attention_plot, sentence.split(' '), result.split(' '))
  return df2

In [45]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7cd76faa50>

In [46]:
translate('Привет!')
translate('Как дела?')
translate('Как тебя зовут?')
translate('Что ты делаешь завтра?')
translate('Я собираюсь играть в футбол')
translate('Если я смогу, я приду играть')
translate('Когда мы играли в футбол, шел дождь')
translate('В последнее время я поздно ложусь спать')
translate('Когда я ехал мимо станции, у меня слетела шляпа')
translate('Ёж птица гордая, пока не дадут скорость - летать не станет')

Unnamed: 0,текст,перевод с вниманием
0,привет !,feasible avoided electrician choice over soup'...
1,как дела ?,feasible mine terrible woman jeans phoned trap...
2,как тебя зовут ?,feasible mine terrible woman jeans phoned trap...
3,что ты делаешь завтра ?,feasible mine choice amateurs model groan lyin...
4,я собираюсь играть в футбол,feasible mine terrible woman jeans phoned trap...
5,"если я смогу , я приду играть",feasible mine terrible woman jeans phoned bana...
6,"когда мы играли в футбол , шел дождь",feasible mine terrible woman jeans reads iphon...
7,в последнее время я поздно ложусь спать,feasible mine parties zombie duties terrified ...
8,"когда я ехал мимо станции , у меня слетела шляпа",feasible mine parties conspicuous planted croc...
9,"ж птица гордая , пока не дадут скорость летать...",pushy lagged stank glasses waits coyote war su...


Очень странный результат получился. Причем лосс у сети с вниманием ниже, чем у сети без внимания. Сеть с вниманием не справилась с задачей, есть признаки зацикленности.