In [1]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf

## Load Data

In [2]:
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en',
                               with_info=True,
                               as_supervised=True)

train_examples, val_examples = examples['train'], examples['validation']

2024-05-22 12:48:57.684693: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2024-05-22 12:48:57.684710: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2024-05-22 12:48:57.684716: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2024-05-22 12:48:57.684742: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-22 12:48:57.684755: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Tokenization

In [3]:
# Fit the tokenizer on your text
pt_sentences = []
en_sentences = []

for pt, en in train_examples:
    pt_sentences.append(pt.numpy().decode('utf-8'))
    en_sentences.append(en.numpy().decode('utf-8'))

In [4]:
import re
from unicodedata import normalize

def clean_text(text):
    text = normalize('NFD', text.lower())
    text = re.sub('[^A-Za-z ]+', '', text)
    return text

def clean_and_prepare_text(text):
    text = '[start] ' + clean_text(text) + ' [end]'
    return text

pt_sentences = list(map(clean_text, pt_sentences))
en_sentences = list(map(clean_and_prepare_text, en_sentences))

In [5]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

sequence_len = 50

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()

# Enable padding
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=sequence_len)

# Enable truncation
tokenizer.enable_truncation(max_length=sequence_len)

# Fit the tokenizer on your text
tokenizer.train_from_iterator(pt_sentences + en_sentences)

# Get vocab size
vocab_size = tokenizer.get_vocab_size()
print('Vocabulary Size: ', vocab_size)





Vocabulary Size:  30000


In [6]:
pt_tokens = []
en_tokens = []

for pt, en in train_examples:
  pt = pt.numpy().decode('utf-8')
  en = en.numpy().decode('utf-8')

  # Encode the sentences
  pt_encoded = tokenizer.encode(pt)
  en_encoded = tokenizer.encode(en)

  # Get the tokens
  pt_tokens.append(pt_encoded.ids)
  en_tokens.append(en_encoded.ids)


In [7]:
tokenizer.decode(pt_tokens[100])

' e  pelos vistos  o grande profeta de um caso de violncia  um caso precedente de violncia [[[[[[[[[[[[[[[[[[[[[[[[['

In [8]:
pt_tokens = tf.convert_to_tensor(pt_tokens)
en_tokens = tf.convert_to_tensor(en_tokens)

## Basic Network

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM


BUFFER_SIZE = len(pt_tokens)
BATCH_SIZE = 64

# Create a tf.data.Dataset object for easier shuffling and batched training
dataset = tf.data.Dataset.from_tensor_slices((pt_tokens, en_tokens))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Define your Transformer model here
# This is a simplified version and might not contain all the components of a full Transformer model
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=64),  # You might want to adjust the input_dim parameter depending on your vocabulary size
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10000)  # You might want to adjust this parameter depending on your target vocabulary size
])

loss = SparseCategoricalCrossentropy(from_logits=True)
optimizer = Adam()
accuracy = SparseCategoricalAccuracy()

model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

# Train the model
model.fit(dataset, epochs=1)


2024-05-22 12:49:08.623323: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2024-05-22 12:49:08.641054: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp_2.




<keras.src.callbacks.History at 0x36d285750>

## Transformer

In [10]:
inputs = { 'encoder_input': pt_tokens, 'decoder_input': en_tokens[:, :-1] }
outputs = en_tokens[:, 1:]

In [11]:
vocab_size = tokenizer.get_vocab_size()

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from keras_nlp.layers import TokenAndPositionEmbedding, TransformerEncoder
from keras_nlp.layers import TransformerDecoder

np.random.seed(42)
tf.random.set_seed(42)

num_heads = 8
embed_dim = 256

encoder_input = Input(shape=(None,), dtype='int64', name='encoder_input')
x = TokenAndPositionEmbedding(vocab_size, sequence_len, embed_dim)(encoder_input)
encoder_output = TransformerEncoder(embed_dim, num_heads)(x)
encoded_seq_input = Input(shape=(None, embed_dim))

decoder_input = Input(shape=(None,), dtype='int64', name='decoder_input')
x = TokenAndPositionEmbedding(vocab_size, sequence_len, embed_dim, mask_zero=True)(decoder_input)
x = TransformerDecoder(embed_dim, num_heads)(x, encoded_seq_input)
x = Dropout(0.4)(x)

decoder_output = Dense(vocab_size, activation='softmax')(x)
decoder = Model([decoder_input, encoded_seq_input], decoder_output)
decoder_output = decoder([decoder_input, encoder_output])

model = Model([encoder_input, decoder_input], decoder_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary(line_length=120)


Using TensorFlow backend
Model: "model_1"
________________________________________________________________________________________________________________________
 Layer (type)                       Output Shape                        Param #     Connected to                        
 encoder_input (InputLayer)         [(None, None)]                      0           []                                  
                                                                                                                        
 token_and_position_embedding (Tok  (None, None, 256)                   7692800     ['encoder_input[0][0]']             
 enAndPositionEmbedding)                                                                                                
                                                                                                                        
 decoder_input (InputLayer)         [(None, None)]                      0           []                         

In [13]:
from tensorflow.keras.callbacks import EarlyStopping

callback = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
hist = model.fit(inputs, outputs, epochs=10, validation_split=0.2, callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
def translate_text(text, model, tokenizer, sequence_len):

    text_encoded = tokenizer.encode(text)
    encoder_input = text_encoded.ids
    encoder_input = tf.convert_to_tensor(encoder_input)
    encoder_input = tf.reshape(encoder_input, (50, 1))
    
    decoded_text = '[start]'
    for i in range(sequence_len):
        decoder_input = tokenizer.encode(decoded_text).ids
        decoder_input = tf.convert_to_tensor(decoder_input)
        decoder_input = tf.reshape(decoder_input, (50, 1))
    
        prediction = model([encoder_input, decoder_input])
    
        idx = np.argmax(prediction[i, 0, :])
        token = tokenizer.decode([idx])
        decoded_text += token

        if token == '[end]':
            break

    return decoded_text

In [26]:
for i in range(5):
    text = pt_sentences[i]
    decoded_text = translate_text(text, model, tokenizer, sequence_len)
    print (text)
    print (decoded_text)
    print ("-------------")

e quando melhoramos a procura  tiramos a unica vantagem da impressao  que e a serendipidade 
[start] and when better we the to  tap the and your one so  that and so  it  and  and so and so  so  and  and so and so  so  and  and so and so  so  and  and
-------------
mas e se estes fatores fossem ativos 
[start] but and if these the you devices  know  and  and so and so  so  and  and so and so  so  and  and so and so  so  and  and so and so  so  and  and so
-------------
mas eles nao tinham a curiosidade de me testar 
[start] but  nation they so communicate  i with   the  hum and  so and  so and  so and  so and  so and  so and  so and  so and  so and  so and  so and  so
-------------
e esta rebeldia consciente e a razao pela qual eu  como agnostica  posso ainda ter fe 
[start] and this equally i is and  it so what   as silk i   i still  a  and so  and so  and so  and so  and so  and so  and so  and so  and so  and
-------------
   podem usar tudo sobre a mesa no meu corpo  
[start]   usin