In [None]:
import os
import re
import time
import unicodedata
import tensorflow as tf
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

In [None]:
# Download file
# Fetch the English-Spanish sentence-pair archive and unpack it (extract=True).
# HTTPS is used so the dataset is not pulled over an unencrypted connection.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# Build the path with os.path.join instead of concatenating '/' by hand.
path_to_file = os.path.join(os.path.dirname(path_to_zip), 'spa-eng', 'spa.txt')

In [None]:
# Process file (Pandas)
# Each non-empty line holds one tab-separated sentence pair.
# `with` closes the handle deterministically; UTF-8 is requested explicitly
# because the Spanish column contains accented characters.
with open(path_to_file, encoding='utf-8') as file:
  file_string = file.read()

# Skip blank lines (e.g. the empty string produced after the final newline)
# instead of dropping a hard-coded row index afterwards.
file_translations = [
    translation.split('\t')
    for translation in file_string.split('\n')
    if translation.strip()
]
df = pd.DataFrame(file_translations)
df.tail()

Unnamed: 0,0,1
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...
118963,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe..."


In [None]:
def tokenize(sentences):
  """Fit a tokenizer on `sentences` and encode them as padded id sequences.

  IN:  array of sentences (strings)
  OUT: (sentences, words) array of token ids, and the fitted tokenizer

  The tokenizer is created with an empty filter set, so no characters are
  stripped, and with '<OOV>' as its out-of-vocabulary token. Padding is
  appended after each sequence (padding='post').
  """
  text_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
  text_tokenizer.fit_on_texts(sentences)

  id_sequences = text_tokenizer.texts_to_sequences(sentences)
  padded_ids = tf.keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')
  return padded_ids, text_tokenizer

In [None]:
def unicode_to_ascii(s):
  """Return `s` with combining marks (Unicode category 'Mn') removed.

  The string is first decomposed with NFD so that accented letters become a
  base letter followed by separate combining marks, which are then dropped.
  Characters that do not decompose are left unchanged.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept_chars)

def preprocess_sentence(sentence):
  """Normalise one sentence and wrap it in '<start>' / '<end>' markers.

  The text is lower-cased, stripped, and has its accents removed. Three regex
  passes are then applied in order:
    1. surround each of ? . ! , ¿ with spaces so they become separate tokens;
    2. collapse runs of spaces and double quotes into a single space;
    3. replace every run of characters other than letters and ? . ! , ¿
       with a single space.
  """
  text = unicode_to_ascii(sentence.lower().strip())

  substitutions = (
      (r"([?.!,¿])", r" \1 "),
      (r'[" "]+', " "),
      (r"[^a-zA-Z?.!,¿]+", " "),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  return '<start> ' + text.strip() + ' <end>'


In [None]:
# test train split
# Clean both columns of the frame: column 0 is English, column 1 is Spanish.
eng_sentences = list(map(preprocess_sentence, df[0]))
spa_sentences = list(map(preprocess_sentence, df[1]))

# Hold out 20% of the pairs; result order is (input train, input eval, output train, output eval).
input_train, input_eval, output_train, output_eval = train_test_split(
    eng_sentences,
    spa_sentences,
    test_size=0.2,
)

# From here on input_train / output_train are padded id arrays rather than lists of strings.
# Only the training halves are tokenized in this cell.
input_train, eng_tokenizer = tokenize(input_train)
output_train, spa_tokenizer = tokenize(output_train)
input_train.shape

(95171, 51)

In [None]:
# Variables
# Fixed hyperparameters
EPOCHS = 10
BATCH_SIZE = 64
EMBEDDING_DIM = 256
UNITS = 1024

# Sizes derived from the tokenized training data
BUFFER_SIZE = len(input_train)
STEPS_PER_EPOCHS = BUFFER_SIZE // BATCH_SIZE
# One more than the number of known words (presumably to leave room for padding id 0 - confirm).
VOCAB_INPUT_SIZE = 1 + len(eng_tokenizer.word_index)
VOCAB_OUTPUT_SIZE = 1 + len(spa_tokenizer.word_index)

# Shuffle across the whole training set, then emit full batches only.
dataset = tf.data.Dataset.from_tensor_slices((input_train, output_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Pull one batch to read off the padded sentence lengths.
example_input, example_output = next(iter(dataset))
WORDS_INPUT = len(example_input[1])
WORDS_OUTPUT = len(example_output[1])
dataset

<BatchDataset shapes: ((64, 51), (64, 53)), types: (tf.int32, tf.int32)>

# Encoder, Decoder, Attention

In [None]:
# INPUT:
# -- (BATCH_SIZE, WORDS) array of sentences
# -- (BATCH_SIZE, UNITS) enc_hidden
# OUTPUT:
# -- (BATCH_SIZE, WORDS, UNITS) x
# -- (BATCH_SIZE, UNITS) enc_hidden

class Encoder(tf.keras.Model):
  """Embedding followed by a GRU that returns both the sequence and the final state."""

  def __init__(self, VOCAB_INPUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE):
    super(Encoder, self).__init__()
    # Keep the sizes on the instance so initialize_enc_hidden() uses the values
    # this encoder was built with rather than whatever the notebook globals hold.
    self.BATCH_SIZE = BATCH_SIZE
    self.UNITS = UNITS
    self.embedding = tf.keras.layers.Embedding(VOCAB_INPUT_SIZE, EMBEDDING_DIM)
    self.gru = tf.keras.layers.GRU(UNITS, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

  def call(self, x, enc_hidden):
    """Embed the token ids in `x` and run the GRU starting from `enc_hidden`.

    Returns the per-word GRU outputs and the final GRU state.
    """
    x = self.embedding(x)
    x, enc_hidden = self.gru(x, initial_state=enc_hidden)
    return x, enc_hidden

  def initialize_enc_hidden(self):
    """Return an all-zero initial state of shape (BATCH_SIZE, UNITS)."""
    return tf.zeros((self.BATCH_SIZE, self.UNITS))

In [None]:
# INPUT:
# -- (BATCH_SIZE, WORDS, UNITS) x
# -- (BATCH_SIZE, UNITS) enc_hidden
# OUTPUT
# -- (BATCH_SIZE, WORDS, 1) attention_weights
# -- (BATCH_SIZE, UNITS) context_vector

class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: scores each word position of `x` against a query state."""

  def __init__(self, UNITS):
    super(BahdanauAttention, self).__init__()
    self.D1 = tf.keras.layers.Dense(UNITS)  # projects the per-word values
    self.D2 = tf.keras.layers.Dense(UNITS)  # projects the query state
    self.D3 = tf.keras.layers.Dense(1)      # reduces each position to a scalar score

  def call(self, x, enc_hidden):
    """Return (context_vector, attention_weights) for values `x` and query `enc_hidden`."""
    # Add a length-1 word axis so the query broadcasts over every position of x.
    query = tf.expand_dims(enc_hidden, axis=1)
    score = self.D3(tf.nn.tanh(self.D1(x) + self.D2(query)))

    # Normalise across the word axis. The default (last) axis has size 1 here,
    # which would turn every weight into exactly 1.0.
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum of the original values, not of their D1 projection.
    context_vector = tf.reduce_sum(attention_weights * x, axis=1)
    return context_vector, attention_weights

In [None]:
# INPUT
# -- (BATCH_SIZE, 1) x
# -- (BATCH_SIZE, WORDS, UNITS) enc_output
# -- (BATCH_SIZE, UNITS) hidden
# OUTPUT
# -- (BATCH_SIZE, VOCAB_OUTPUT_SIZE) x (unnormalised logits)
# -- (BATCH_SIZE, UNITS) hidden
# -- (BATCH_SIZE, WORDS, 1) attention_weights

class Decoder(tf.keras.Model):
  """Single-step decoder: attention over the encoder output, then GRU, then vocabulary logits."""

  def __init__(self, UNITS, VOCAB_OUTPUT_SIZE, EMBEDDING_DIM, BATCH_SIZE):
    super(Decoder, self).__init__()
    self.BATCH_SIZE = BATCH_SIZE
    self.attention = BahdanauAttention(UNITS)
    self.embedding = tf.keras.layers.Embedding(VOCAB_OUTPUT_SIZE, EMBEDDING_DIM)
    self.gru = tf.keras.layers.GRU(UNITS, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    # No softmax here: the notebook's loss is built with from_logits=True, so
    # applying softmax in the layer as well would normalise twice.
    self.D1 = tf.keras.layers.Dense(VOCAB_OUTPUT_SIZE)

  def call(self, x, enc_output, hidden):
    """Predict scores for the next token from the previous token `x`.

    Returns the vocabulary logits, the GRU state after this step, and the
    attention weights over the encoder positions.
    """
    context_vector, attention_weights = self.attention(enc_output, hidden)
    x = self.embedding(x)
    # Prepend the context vector to the token embedding along the feature axis.
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    # NOTE(review): `hidden` only feeds the attention query; the GRU itself is
    # not given it as initial_state. Confirm this is intended.
    x, hidden = self.gru(x)
    # Flatten (batch, 1, UNITS) -> (batch, UNITS) without assuming the training
    # batch size, so a batch of one also works.
    x = tf.reshape(x, (-1, x.shape[2]))
    x = self.D1(x)
    return x, hidden, attention_weights

# Training

In [None]:
# Mount Google Drive inside the Colab runtime (asks for an authorization code).
# The checkpoint directory used later lives under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so padding can be masked out below.
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy for one decoding step, with padded targets contributing zero.

  real: target token ids for this step, where id 0 marks padding
  pred: decoder output for this step
  Returns the mean over all entries of the masked per-example loss; padded
  entries count as zeros in that mean.
  """
  per_example_loss = loss_obj(real, pred)

  # 1.0 where the target is a real token, 0.0 where it is padding.
  is_token = tf.math.not_equal(real, 0)
  weights = tf.cast(is_token, dtype=per_example_loss.dtype)

  return tf.reduce_mean(per_example_loss * weights)

In [None]:
# Build the two models from the sizes defined in the Variables cell.
encoder = Encoder(VOCAB_INPUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE)
decoder = Decoder(UNITS, VOCAB_OUTPUT_SIZE, EMBEDDING_DIM, BATCH_SIZE)

# Checkpoints are written under the mounted Google Drive folder.
checkpoint_dir = "/content/drive/My Drive/code/checkpoints"
checkpoint_prefix = f"{checkpoint_dir}/ckpt"
# Track the optimizer together with both models so they are saved and restored as one unit.
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
# Optional manual steps, left disabled:
# save immediately ->      checkpoint.save(file_prefix=checkpoint_prefix)
# resume from latest ->    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
def train_step(input, target, enc_hidden):
  """Run one teacher-forced optimisation step on a single batch.

  input:      (batch, words) source token ids
  target:     (batch, words) target token ids; column 0 holds '<start>'
  enc_hidden: initial encoder state
  Returns the summed per-step loss divided by the target length.
  """
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(input, enc_hidden)
    dec_hidden = enc_hidden
    # Seed the decoder with the target's own first column. Every preprocessed
    # sentence begins with '<start>', so this yields the target-language start
    # id instead of looking it up in the source-language tokenizer.
    dec_input = tf.expand_dims(target[:, 0], 1)

    for t in range(1, target.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, enc_output, dec_hidden)
      loss += loss_function(target[:, t], predictions)
      # Teacher forcing: feed the true token, not the prediction, to the next step.
      dec_input = tf.expand_dims(target[:, t], 1)

  batch_loss = loss / int(target.shape[1])

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return batch_loss

In [None]:
for epoch in range(EPOCHS):
  start = time.time()
  # Every batch in the epoch is encoded starting from this same all-zero state.
  enc_hidden = encoder.initialize_enc_hidden()
  total_loss = 0
  # The loop names avoid `input`, which would overwrite the builtin at notebook scope.
  for batch, (input_batch, target_batch) in enumerate(dataset.take(STEPS_PER_EPOCHS)):
    batch_loss = train_step(input_batch, target_batch, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
  # Save a checkpoint every second epoch.
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix=checkpoint_prefix)
  # Report the mean batch loss so the epoch figure is on the same scale as the
  # per-batch lines above; the guard also covers an epoch with zero steps.
  epoch_loss = float(total_loss) / max(STEPS_PER_EPOCHS, 1)
  print(f'Epoch {epoch + 1} Loss {epoch_loss:.4f}')
  print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 1.5570


KeyboardInterrupt: ignored

In [None]:
def translate(sentence):
  """Greedily translate one English sentence with the trained encoder/decoder.

  Returns (result, sentence): `result` is the predicted words, each followed
  by a space and ending with '<end>' if that token was produced; `sentence`
  is the padded (1, WORDS_INPUT) tensor of input token ids.
  """
  cleaned = preprocess_sentence(sentence)
  # Let the tokenizer do the lookup so unseen words map to its '<OOV>' id
  # instead of raising KeyError.
  token_ids = eng_tokenizer.texts_to_sequences([cleaned])
  sentence = tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=WORDS_INPUT, padding='post')
  sentence = tf.convert_to_tensor(sentence)

  enc_hidden = tf.zeros((1, UNITS))
  enc_output, enc_hidden = encoder(sentence, enc_hidden)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([spa_tokenizer.word_index['<start>']], 0)

  pieces = []
  for _ in range(WORDS_OUTPUT):
    predictions, dec_hidden, attention_weights = decoder(dec_input, enc_output, dec_hidden)
    predicted_id = int(tf.argmax(predictions[0]).numpy())
    # An id with no entry in index_word (e.g. padding id 0) is skipped
    # rather than raising KeyError.
    word = spa_tokenizer.index_word.get(predicted_id, '')
    if word:
      pieces.append(word + ' ')
    if word == '<end>':
      break
    # Feed the prediction back in as the next decoder input.
    dec_input = tf.expand_dims([predicted_id], 0)
  return ''.join(pieces), sentence

SyntaxError: ignored

In [None]:
# Try the model on one sentence; translate() returns the predicted text and the encoded input.
result, sentence = translate('I am finding a job')
print(result)