In [77]:
import tensorflow as tf


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from keras import preprocessing

import unicodedata
import re
import numpy as np
import os
import io
import time

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)



In [33]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin = 'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip', 
    extract=True
)
path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

In [34]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [35]:
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    w = re.sub(r"([?.!,¿¡])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    w = re.sub(r"[^a-zA-Z0-9.,!¿¡]", " ", w)
    
    w = w.strip()
    
    w = '<start> ' + w + ' <end>'
    return w

In [36]:
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"

print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book <end>
b'<start> \xc2\xbf puedo tomar prestado este libro <end>'


In [37]:
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]
    
    return zip(*word_pairs)

In [38]:
en, sp = create_dataset(path_to_file, None)
print(en[-1])
print(sp[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [39]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    
    lang_tokenizer.fit_on_texts(lang)
    
    tensor = lang_tokenizer.texts_to_sequences(lang)
    
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
    return tensor, lang_tokenizer

In [82]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [83]:
def load_dataset(path, num_examples=None):
    targ_lang, inp_lang = create_dataset(path, num_examples)
    
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [41]:
num_examples = 50000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [42]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

40000 40000 10000 10000


In [43]:
def convert(lang, tensor):
    for t in tensor:
        if (t != 0):
            print("%d ----> %s" % (t, lang.index_word[t]))
            

In [44]:
print("Input Language: Index to Word Mapping")
convert(inp_lang, input_tensor_train[0])
print()
print("Target Language: Index to Word Mapping")
convert(targ_lang, target_tensor_train[0])

Input Language: Index to Word Mapping
1 ----> <start>
373 ----> tenes
4386 ----> lindos
271 ----> ojos
3 ----> .
2 ----> <end>

Target Language: Index to Word Mapping
1 ----> <start>
5 ----> you
25 ----> have
901 ----> cute
330 ----> eyes
3 ----> .
2 ----> <end>


In [45]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [46]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 16]), TensorShape([64, 12]))

In [95]:
def gru(units):
    if tf.config.list_physical_devices('GPU'):
        return tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    else:
        return tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

In [96]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [97]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)
Encoder hidden state shape: (batch size, units) (64, 1024)


In [49]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden_size)
        # query_with_time_axis shape == (batch_size, 1, hidden_size)
        # values shape == (batch_size, max_len, hidden_size)
        # We're doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)
        
        # score shape == (batch_size, max_length, 1)
        # We get 1 at the last axis because we're applying the score to self.V
        # Shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
    
        # context_vector shape after sum == (batch_size, hidden_size)l
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        return context_vector, attention_weights

In [50]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                      return_sequences=True,
                                      return_state=True,
                                      recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        self.attention = BahdanauAttention(self.dec_units)
        
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # Passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights

In [51]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

print("Output decoder shape: (batch size, vocab size) {}".format(sample_decoder_output.shape))

Output decoder shape: (batch size, vocab size) (64, 6888)


In [52]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

In [53]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

In [54]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        
        dec_hidden = enc_hidden
        
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        
        # Teacher forcing, feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # Passing enc-output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            
            loss += loss_function(targ[:, t], predictions)
            
            # Using Teacher Forcing
            dec_input = tf.expand_dims(targ[:, t], 1)
            
    batch_loss = (loss / int(targ.shape[1]))
    
    variables = encoder.trainable_variables + decoder.trainable_variables
    
    gradients = tape.gradient(loss, variables)
    
    optimizer.apply_gradients(zip(gradients, variables))
 
    return batch_loss      

In [55]:
EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()
    
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for(batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1, batch, batch_loss.numpy()))
            
    # Saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch: {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.8789
Epoch 1 Batch 100 Loss 2.2300
Epoch 1 Batch 200 Loss 2.0340
Epoch 1 Batch 300 Loss 1.9298
Epoch 1 Batch 400 Loss 1.6715
Epoch 1 Batch 500 Loss 1.8108
Epoch 1 Batch 600 Loss 1.5836
Epoch 1 Loss 1.9515
Time taken for 1 epoch: 44.488365173339844 sec

Epoch 2 Batch 0 Loss 1.3786
Epoch 2 Batch 100 Loss 1.3477
Epoch 2 Batch 200 Loss 1.1445
Epoch 2 Batch 300 Loss 1.1491
Epoch 2 Batch 400 Loss 1.1911
Epoch 2 Batch 500 Loss 0.8970
Epoch 2 Batch 600 Loss 0.8309
Epoch 2 Loss 1.1634
Time taken for 1 epoch: 39.17271161079407 sec

Epoch 3 Batch 0 Loss 0.7507
Epoch 3 Batch 100 Loss 0.6981
Epoch 3 Batch 200 Loss 0.7351
Epoch 3 Batch 300 Loss 0.8520
Epoch 3 Batch 400 Loss 0.7356
Epoch 3 Batch 500 Loss 0.7408
Epoch 3 Batch 600 Loss 0.5844
Epoch 3 Loss 0.7051
Time taken for 1 epoch: 37.54280066490173 sec

Epoch 4 Batch 0 Loss 0.4111
Epoch 4 Batch 100 Loss 0.4491
Epoch 4 Batch 200 Loss 0.4701
Epoch 4 Batch 300 Loss 0.3973
Epoch 4 Batch 400 Loss 0.4335
Epoch 4 Batch 500 Loss 0.4

In [78]:
def evaluate(sentence):
    print(type(max_length_inp))
    print(type(max_length_targ))
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    
   #     lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    
   # lang_tokenizer.fit_on_texts(lang)
    
   # tensor = lang_tokenizer.texts_to_sequences(lang)
    
   # tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
   # return tensor, lang_tokenizer
    sentence = tf.keras.preprocessing.text.text_to_word_sequence(sentence)
    
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_seqences(inputs,
                                                          maxlen = max_length_inp,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
    
        # For storing attention plots later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        preicted_id = tf.argmax(predictions[0]).numpy()
   
        result += targ_lang.index_word[predicted_id] + ' '

        if(targ_lang.index_word[predicted_id] == '<end>'):
            return result, sentence, attention_plot
        
        # The predicted ID is fed back into the model

        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

In [79]:
# Function for plotting the attention weights:

def plot_attention(attention, sentence, preicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontsize = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [80]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    print('Input: %s'.format(sentence))
    print('Predicted Translation: {}'.format(result))

    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [81]:
translate('hace mucho frio aqui.')

<class 'int'>
<class 'int'>


AttributeError: 'list' object has no attribute 'split'

In [None]:
translate(u'esta es mi vida.')

In [None]:
translate(u'¿Todavia esta en casa?')