In [None]:
! pip install "tensorflow-text>=2.11"

Collecting tensorflow-text>=2.11
  Downloading tensorflow_text-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow<2.15,>=2.14.0 (from tensorflow-text>=2.11)
  Downloading tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (489.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes==0.2.0 (from tensorflow<2.15,>=2.14.0->tensorflow-text>=2.11)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
Collecting wrapt<1.15,>=1.11.0 (from tensorflow<2.15,>=2.14.0->tensorflow-text>=2.11)
  Downloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinu

In [None]:
import numpy as np
import pathlib
import os
import tensorflow_text as tf_text
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [None]:
# Download the Spanish-English sentence-pair archive and extract it.
path_to_zip = keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
)
# The corpus file is looked up in a `spa-eng` folder next to the downloaded zip.
path_to_file = pathlib.Path(path_to_zip).parent / 'spa-eng' / 'spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [None]:
def load_data(path):
  """Read a tab-separated parallel corpus into two aligned string arrays.

  Each non-blank line must hold at least two tab-separated fields: the target
  sentence first, then the context sentence. Any further fields (for example
  an attribution column) are ignored, and blank lines are skipped.

  Args:
    path: a `pathlib.Path`-like object providing `read_text`.

  Returns:
    `(target, context)`: two NumPy string arrays of equal length, where
    `target[i]` and `context[i]` come from the same line.

  Raises:
    ValueError: if a non-blank line has fewer than two tab-separated fields.
  """
  text = path.read_text(encoding='utf-8')

  targets, contexts = [], []
  for line_number, line in enumerate(text.splitlines(), start=1):
    if not line.strip():
      # Blank lines (such as a trailing empty line) carry no sentence pair.
      continue
    fields = line.split('\t')
    if len(fields) < 2:
      raise ValueError(
          f'line {line_number}: expected at least 2 tab-separated fields, '
          f'got {len(fields)}')
    # Only the first two columns are the sentence pair.
    targets.append(fields[0])
    contexts.append(fields[1])

  context = np.array(contexts)
  target = np.array(targets)

  return target, context

In [None]:
# load_data returns the target array first and the context array second.
target_raw, context_raw = load_data(path_to_file)

# Print one aligned pair (context, then target) as a quick sanity check.
for sample in (context_raw[1], target_raw[1]):
  print(sample)

Vete.
Go.


In [None]:
# The shuffle buffer is as large as the corpus.
Buffer_Size = len(context_raw)
Batch_Size = 64

# Seed the split so the same sentences land in train/validation on every run;
# the vocabularies adapted from `train_raw` then stay reproducible as well.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Boolean mask: each sentence pair goes to the training set with probability 0.8.
is_train = split_rng.uniform(size=(len(target_raw),)) < 0.8

train_raw = (tf.data.Dataset
    .from_tensor_slices((context_raw[is_train], target_raw[is_train]))
    .shuffle(Buffer_Size)
    .batch(Batch_Size))
val_raw = (tf.data.Dataset
    .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .shuffle(Buffer_Size)
    .batch(Batch_Size))

In [None]:
# take(1) yields at most one batch, so the loop body runs at most once.
for example_context_strings, example_target_strings in train_raw.take(1):
  print(example_context_strings[:5])
  print(example_target_strings[:5])

tf.Tensor(
[b'Tom ya no lo necesitaba.' b'Tan solo d\xc3\xadganle la verdad.'
 b'No pens\xc3\xa9 que Tom quisiera volver a verme.'
 b'No se puede comprar en ning\xc3\xban otro lugar, solo all\xc3\xad.'
 b'Tom salt\xc3\xb3 al agua helada.'], shape=(5,), dtype=string)

tf.Tensor(
[b"Tom didn't need it anymore." b'Just tell him the truth.'
 b"I didn't think Tom would want to see me again."
 b"You can't buy it anywhere but there."
 b'Tom jumped into the cold water.'], shape=(5,), dtype=string)


In [None]:
# Standardization
def tf_lower_and_split_punct(text):
  """Standardize raw sentences before tokenization.

  Steps, in order: NFKD-normalize, lowercase, drop every character that is
  not a space, `a`-`z` or one of `. ? ! , ¿`, surround each of those
  punctuation marks with spaces, strip outer whitespace, and wrap the result
  in `[START]` / `[END]` markers.

  Args:
    text: a string tensor of sentences.

  Returns:
    A string tensor holding the standardized sentences.
  """
  normalized = tf_text.normalize_utf8(text, 'NFKD')
  lowered = tf.strings.lower(normalized)
  # Remove everything outside the allowed character set.
  filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')
  # Pad punctuation with spaces; `\0` stands for the matched character.
  spaced = tf.strings.regex_replace(filtered, '[.?!,¿]', r' \0 ')
  stripped = tf.strings.strip(spaced)

  return tf.strings.join(['[START]', stripped, '[END]'], separator=' ')

In [None]:
# Text vectorization for the context sentences.

# Upper bound on vocabulary size, shared by the vectorizers built from it.
max_vocab_size = 10000

context_text_processor = layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True,
)

# Build the vocabulary from the context half of the training batches only.
context_text_processor.adapt(train_raw.map(lambda context, target: context))

# Peek at the first few vocabulary entries.
context_text_processor.get_vocabulary()[:15]

['',
 '[UNK]',
 '[START]',
 '[END]',
 '.',
 'que',
 'de',
 'el',
 'a',
 'no',
 'tom',
 'la',
 '?',
 '¿',
 'en']

In [None]:
# Text vectorization for the target sentences, configured like the context one.
target_text_processor = layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True,
)

# Build the vocabulary from the target half of the training batches only.
target_text_processor.adapt(train_raw.map(lambda context, target: target))

# Peek at the first few vocabulary entries.
target_text_processor.get_vocabulary()[:15]


['',
 '[UNK]',
 '[START]',
 '[END]',
 '.',
 'the',
 'i',
 'to',
 'you',
 'tom',
 'a',
 '?',
 'is',
 'he',
 'in']

In [None]:
# Snapshot of the context vocabulary as a NumPy array.
# NOTE(review): not referenced by the cells shown here; presumably used later
# to map token ids back to strings. Confirm before removing.
context_vocab = np.array(context_text_processor.get_vocabulary())

In [None]:
# process the dataset
def process_text(context , target):
  """Turn a batch of raw string pairs into model inputs and labels.

  Args:
    context: batch of raw context strings.
    target: batch of raw target strings.

  Returns:
    `((context_ids, targ_in), targ_out)`: the vectorized context, the target
    ids without their last token, and the target ids without their first
    token, each converted from ragged to dense.
  """
  context_ids = context_text_processor(context).to_tensor()
  target_ids = target_text_processor(target)

  # Input and label are the same sequence offset by one token.
  decoder_input = target_ids[:, :-1].to_tensor()
  decoder_labels = target_ids[:, 1:].to_tensor()

  return (context_ids, decoder_input), decoder_labels

# Vectorize both splits, letting tf.data choose the parallelism.
train_ds = train_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Now build the encoder

class Encoder(layers.Layer):
  """Embeds context token ids and runs them through a bidirectional GRU.

  Args:
    text_processor: vectorization layer for the context text. It must expose
      `vocabulary_size()`, and it is called on raw strings in `convert_input`.
    units: size of the embedding vectors and of the GRU.
  """

  def __init__(self,text_processor,units):
    super().__init__()
    self.text_processor = text_processor
    self.vocab_size = text_processor.vocabulary_size()
    self.units = units

    # mask_zero=True marks token id 0 as padding for downstream layers.
    self.embedding = layers.Embedding(self.vocab_size, units, mask_zero=True)

    # merge_mode='sum' adds the forward and backward outputs together.
    gru = layers.GRU(units,
                     return_sequences=True,
                     recurrent_initializer='glorot_uniform')
    self.rnn = layers.Bidirectional(layer=gru, merge_mode='sum')

  def call(self,x):
    """Return the GRU output sequence for a batch of token ids `x`."""
    return self.rnn(self.embedding(x))

  def convert_input(self,texts):
    """Encode raw text given as a single string or a batch of strings."""
    texts = tf.convert_to_tensor(texts)
    if len(texts.shape) == 0:
      # A scalar string becomes a batch of one.
      texts = texts[tf.newaxis]
    token_ids = self.text_processor(texts).to_tensor()
    return self(token_ids)

In [None]:
# Now build the cross attention layer
# Layer width passed as `units` wherever the notebook instantiates its
# attention, decoder and translator objects.
UNITS = 256

class CrossAttention(layers.Layer):
  """Single-head attention from `x` over `context`, with residual add and layer norm.

  Args:
    units: `key_dim` of the underlying `MultiHeadAttention`.
    **kwargs: forwarded to `MultiHeadAttention` (not to the base `Layer`).
  """

  def __init__(self,units,**kwargs):
    super().__init__()
    self.mha = layers.MultiHeadAttention(num_heads=1, key_dim=units, **kwargs)
    self.layernorm = layers.LayerNormalization()
    self.add = layers.Add()

  def call(self,x,context):
    """Attend from `x` (query) to `context` (value) and return the normalized sum.

    Side effect: stores the attention scores, averaged over axis 1, in
    `self.last_attention_weights`.
    """
    attended, scores = self.mha(query=x,
                                value=context,
                                return_attention_scores=True)

    # Keep the scores around for later inspection.
    # NOTE(review): axis 1 is presumably the heads axis. Confirm against the Keras docs.
    self.last_attention_weights = tf.reduce_mean(scores, axis=1)

    # Residual connection followed by layer normalization.
    return self.layernorm(self.add([x, attended]))

In [None]:
# Standalone attention layer instance.
# NOTE(review): not referenced by the cells shown here (Decoder builds its own
# CrossAttention); presumably kept for interactive experiments. Confirm.
attention_layer = CrossAttention(UNITS)

# Standalone embedding over the target vocabulary; id 0 is masked as padding.
# NOTE(review): also not referenced by the cells shown here.
embed = layers.Embedding(target_text_processor.vocabulary_size(),output_dim=UNITS,mask_zero=True)

In [None]:
# Now we build the decoder

class Decoder(layers.Layer):
  """Decoder holding an embedding, a GRU, cross-attention and an output projection.

  Only construction lives in the class body; further methods are attached
  afterwards through the `add_method` decorator.

  Args:
    text_processor: vectorization layer for the target text. It must expose
      `vocabulary_size()` and `get_vocabulary()`.
    units: size of the embedding, the GRU and the attention `key_dim`.
  """

  @classmethod
  def add_method(cls,fun):
    """Decorator: attach `fun` to the class under its own name and return it."""
    setattr(cls, fun.__name__, fun)
    return fun

  def __init__(self,text_processor,units):
    super().__init__()
    self.text_processor = text_processor
    self.vocab_size = text_processor.vocabulary_size()

    # Token <-> id lookups built from the same vocabulary as the text processor.
    vocabulary = text_processor.get_vocabulary()
    self.word_to_id = layers.StringLookup(
        vocabulary=vocabulary, mask_token='', oov_token='[UNK]')
    self.id_to_word = layers.StringLookup(
        vocabulary=vocabulary, mask_token='', oov_token='[UNK]', invert=True)

    # Ids of the sentence boundary markers.
    self.start_token = self.word_to_id('[START]')
    self.end_token = self.word_to_id('[END]')
    self.units = units

    # mask_zero=True marks token id 0 as padding for downstream layers.
    self.embedding = layers.Embedding(self.vocab_size, units, mask_zero=True)

    # The GRU returns both the full output sequence and its final state.
    self.rnn = layers.GRU(units,
                          return_sequences=True,
                          return_state=True,
                          recurrent_initializer='glorot_uniform')

    self.attention = CrossAttention(units)

    # One output value per vocabulary entry.
    self.output_layer = layers.Dense(self.vocab_size)

In [None]:
@Decoder.add_method
def call(self,context,x,state=None,return_state=False):
  """Compute logits for target token ids `x` given the encoded `context`.

  Args:
    context: encoder output that the attention layer attends over.
    x: target token ids fed to the decoder.
    state: optional initial GRU state.
    return_state: when true, also return the GRU's final state.

  Returns:
    `logits`, or `(logits, state)` when `return_state` is true.
  """
  embedded = self.embedding(x)
  rnn_output, state = self.rnn(embedded, initial_state=state)
  attended = self.attention(rnn_output, context)
  # Expose the attention layer's latest weights on the decoder itself.
  self.last_attention_weights = self.attention.last_attention_weights
  logits = self.output_layer(attended)
  return (logits, state) if return_state else logits

# Standalone decoder instance.
# NOTE(review): Translator constructs its own Decoder, so this object is not
# the one that gets trained; presumably kept for interactive checks. Confirm.
decoder = Decoder(target_text_processor,UNITS)

In [None]:
# Now build the model

class Translator(keras.Model):
  """Sequence-to-sequence model that chains an `Encoder` and a `Decoder`.

  Args:
    units: layer width handed to both the encoder and the decoder.
    context_text_processor: vectorization layer for the context text.
    target_text_processor: vectorization layer for the target text.
  """

  @classmethod
  def add_method(cls,fun):
    """Decorator: attach `fun` to the class under its own name and return it."""
    setattr(cls,fun.__name__,fun)
    return fun

  def __init__(self,units,context_text_processor,target_text_processor):
    super().__init__()

    self.encoder = Encoder(context_text_processor,units)
    self.decoder = Decoder(target_text_processor,units)

  def call(self,inputs):
    """Return decoder logits for `inputs = (context_ids, target_input_ids)`."""
    context ,x = inputs
    context = self.encoder(context)
    logits = self.decoder(context,x)

    # The padding mask propagated by the masked embeddings can still be
    # attached to `logits`. If it is left in place, Keras applies it as sample
    # weights on top of the loss/metric functions, which already mask padding
    # themselves, and so scales the reported values. The TensorFlow NMT
    # tutorial removes it the same way.
    try:
      del logits._keras_mask
    except AttributeError:
      # No mask was attached; nothing to remove.
      pass

    return logits

In [None]:
# The model that is compiled and trained; it builds its own encoder and decoder.
model = Translator(UNITS,context_text_processor,target_text_processor)

In [None]:
def masked_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over positions whose label is not 0.

    Args:
        y_true: integer label ids; id 0 is excluded from the average.
        y_pred: logits over the vocabulary for each position.

    Returns:
        Scalar: summed per-position loss of the kept positions divided by
        the number of kept positions.
    """
    # reduction='none' keeps one loss value per position so it can be masked.
    per_position = keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true, y_pred)
    keep = tf.cast(y_true != 0, per_position.dtype)
    return tf.reduce_sum(per_position * keep) / tf.reduce_sum(keep)

def masked_acc(y_true, y_pred):
    """Fraction of positions with a non-zero label whose argmax prediction is correct.

    Args:
        y_true: integer label ids; id 0 is excluded from the metric.
        y_pred: logits over the vocabulary for each position.

    Returns:
        Scalar: correct predictions among the kept positions divided by the
        number of kept positions.
    """
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    mask = tf.cast(y_true != 0, tf.float32)
    # Count matches only where the label is non-zero. Otherwise a predicted 0
    # at a label-0 position counts as a hit while the denominator ignores that
    # position, which inflates the accuracy.
    matchs = tf.cast(y_true == y_pred, tf.float32) * mask
    return tf.reduce_sum(matchs)/tf.reduce_sum(mask)

In [None]:
# Train on the masked loss and report masked accuracy and masked loss as metrics.
model.compile(
    loss=masked_loss,
    optimizer='adam',
    metrics=[masked_acc, masked_loss],
)

In [None]:
# Early stopping with a patience of 3 epochs, using the callback's default monitor.
early_stopping = keras.callbacks.EarlyStopping(patience=3)

# The training set is repeated indefinitely; each "epoch" is capped at 100
# training batches and 20 validation batches.
history = model.fit(
    train_ds.repeat(),
    validation_data=val_ds,
    epochs=50,
    steps_per_epoch=100,
    validation_steps=20,
    callbacks=[early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
