In [1]:
import numpy as np
import tensorflow as tf
import random
import matplotlib.pyplot as plt
# Use the ggplot style sheet for any matplotlib figures drawn later.
plt.style.use('ggplot')

# Seed Python's `random`, NumPy and TensorFlow with one shared value so that
# their random number generators start from a known state.
seed = 50
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

## LSTM example: Machine translation

### Data

In [2]:
from num2words import num2words

def create_french_to_english_data(start=1, end=1000):
    """
    Build a parallel French/English corpus of spelled-out integers.

    Every integer in the inclusive range [start, end] is written out in words
    with num2words, once in French (source side) and once in English (target
    side). Each sentence is wrapped in '<start>' and '<end>' marker tokens.

    Parameters:
    start (int): First integer to include. Defaults to 1.
    end (int): Last integer to include (inclusive). Defaults to 1000.

    Returns:
    tuple: (source, target), two lists of equal length where source[k] and
           target[k] spell out the same number in French and English.
    """
    source = []
    target = []
    for number in range(start, end + 1):
        # Same number, two languages: the pair forms one training example.
        source.append(f"<start> {num2words(number, lang='fr')} <end>")
        target.append(f"<start> {num2words(number, lang='en')} <end>")

    return source, target

In [3]:
# Build the full parallel corpus: French sentences in `source`, English in `target`.
source, target = create_french_to_english_data()

In [4]:
from sklearn.model_selection import train_test_split

def split_data(source, target, test_size=0.2, random_state=42):
    """
    Split the parallel corpus into training and test portions.

    Source and target are split together by a single train_test_split call,
    so sentence pairs stay aligned.

    Parameters:
    source (list): Source-language sentences.
    target (list): Target-language sentences, aligned with `source`.
    test_size (float or int): Forwarded to train_test_split. Defaults to 0.2.
    random_state (int): Forwarded to train_test_split. Defaults to 42.

    Returns:
    tuple: (train_source, test_source, train_target, test_target).
    """
    src_train, src_test, tgt_train, tgt_test = train_test_split(
        source, target, test_size=test_size, random_state=random_state
    )

    return src_train, src_test, tgt_train, tgt_test

In [5]:
train_source, test_source, train_target, test_target = split_data(source, target)

### Prepare sequences

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
# The filter set strips punctuation but leaves out '<' and '>', so the
# '<start>' / '<end>' markers are not stripped of their angle brackets.
# '-' is in the filter set, so it is removed from hyphenated number words.
# Words not seen during fitting are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
# One shared vocabulary is fitted on the training sentences of both languages.
tokenizer.fit_on_texts(train_source + train_target)

In [7]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

def prepare_sequences(source, target, tokenizer, maxlen=8):
    """
    Prepare the encoder input, decoder input and decoder output sequences.

    Sentences are converted to token ids with `tokenizer`, then padded (with
    zeros, at the end) or truncated (at the end) to exactly `maxlen` ids.

    Parameters:
    source (list): Source-language sentences fed to the encoder.
    target (list): Target-language sentences for the decoder.
    tokenizer (Tokenizer): The fitted tokenizer used for both languages.
    maxlen (int): Length every sequence is padded/truncated to. Defaults to 8.

    Returns:
    tuple: (X_encoder, Y_decoder_input, Y_decoder_output), each as returned by
           pad_sequences with dtype 'float32'. Y_decoder_output is
           Y_decoder_input shifted one step to the left.
    """
    pad_options = dict(maxlen=maxlen, padding='post', truncating='post', dtype='float32')

    encoder_tokens = tokenizer.texts_to_sequences(source)
    decoder_tokens = tokenizer.texts_to_sequences(target)

    # Teacher forcing: the expected output at step t is the decoder input at
    # step t + 1, so drop the first token and append a padding id (0).
    shifted_tokens = [tokens[1:] + [0] for tokens in decoder_tokens]

    X_encoder = pad_sequences(encoder_tokens, **pad_options)
    Y_decoder_input = pad_sequences(decoder_tokens, **pad_options)
    Y_decoder_output = pad_sequences(shifted_tokens, **pad_options)

    return X_encoder, Y_decoder_input, Y_decoder_output

In [8]:
# Turn both splits into padded id sequences using the same tokenizer.
X_train_encoder, Y_train_decoder_input, Y_train_decoder_output = prepare_sequences(train_source, train_target, tokenizer)
X_test_encoder, Y_test_decoder_input, Y_test_decoder_output = prepare_sequences(test_source, test_target, tokenizer)

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, CategoryEncoding
from tensorflow.keras.models import Model


def create_lstm_encoder_decoder_model(
    encoder_vocab_size, decoder_vocab_size, embedding_dim, seq_length, lstm_units=None
):
    """
    Build and compile an LSTM encoder-decoder model.

    The encoder embeds its input ids and runs them through an LSTM; only the
    final hidden and cell states are kept. Those states initialise the decoder
    LSTM, which reads the embedded decoder input and emits one softmax
    distribution over the decoder vocabulary per time step.

    Parameters:
    encoder_vocab_size (int): `input_dim` of the encoder embedding.
    decoder_vocab_size (int): `input_dim` of the decoder embedding and number
        of units of the output softmax layer.
    embedding_dim (int): Size of both embedding vectors.
    seq_length (int): Fixed length of the encoder and decoder input sequences.
    lstm_units (int, optional): Number of units of both LSTM layers. When
        omitted, the module-level `lstm_units` variable is used, which is how
        this function was configured before the parameter existed.

    Returns:
    Model: The compiled model taking [encoder_inputs, decoder_inputs].

    Raises:
    NameError: If `lstm_units` is neither passed nor defined at module level.
    """
    if lstm_units is None:
        # Backward compatibility with callers that set a global `lstm_units`
        # instead of passing it explicitly.
        try:
            lstm_units = globals()["lstm_units"]
        except KeyError:
            raise NameError(
                "lstm_units must be passed as an argument or defined at module level"
            ) from None

    # Encoder
    encoder_inputs = Input(shape=(seq_length,), name="encoder_inputs")
    encoder_embedding = Embedding(
        input_dim=encoder_vocab_size, output_dim=embedding_dim, name="encoder_embedding"
    )(encoder_inputs)
    # The encoder's per-step output is not needed; only its final states are.
    _, state_h, state_c = LSTM(
        units=lstm_units, return_state=True, name="encoder_lstm"
    )(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(seq_length,), name="decoder_inputs")
    decoder_embedding = Embedding(
        input_dim=decoder_vocab_size, output_dim=embedding_dim, name="decoder_embedding"
    )(decoder_inputs)
    # return_sequences=True: one output vector per decoder time step.
    decoder_lstm, _, _ = LSTM(
        units=lstm_units, return_sequences=True, return_state=True, name="decoder_lstm"
    )(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(
        units=decoder_vocab_size, activation="softmax", name="decoder_dense"
    )
    decoder_outputs = decoder_dense(decoder_lstm)

    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    # Sparse loss: targets are integer token ids rather than one-hot vectors.
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    # Display the model summary
    model.summary(line_length=120)

    return model


In [10]:
# Both languages share one tokenizer, so encoder and decoder use the same
# vocabulary. Token ids start at 1 (0 is the padding id), hence the +1 so that
# every id produced by the tokenizer is a valid embedding index.
vocab_size = len(tokenizer.word_index) + 1
encoder_vocab_size = vocab_size
decoder_vocab_size = vocab_size
embedding_dim = 256  # Dimension of the embedding vectors
lstm_units = 512  # Number of LSTM units
seq_length = 8  # Sequence length for both encoder and decoder

model = create_lstm_encoder_decoder_model(encoder_vocab_size, decoder_vocab_size, embedding_dim, seq_length)

2024-06-12 20:42:58.857366: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2024-06-12 20:42:58.857391: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2024-06-12 20:42:58.857398: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2024-06-12 20:42:58.857429: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-12 20:42:58.857447: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "model"
________________________________________________________________________________________________________________________
 Layer (type)                       Output Shape                        Param #     Connected to                        
 encoder_inputs (InputLayer)        [(None, 8)]                         0           []                                  
                                                                                                                        
 decoder_inputs (InputLayer)        [(None, 8)]                         0           []                                  
                                                                                                                        
 encoder_embedding (Embedding)      (None, 8, 256)                      15104       ['encoder_inputs[0][0]']            
                                                                                                                        
 decoder_embeddin

In [11]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once the validation loss has not improved for 20 epochs, and roll the
# model back to the weights of its best epoch; otherwise the model would keep
# the weights from the last (non-improving) epoch.
callback_es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
history = model.fit(
    [X_train_encoder, Y_train_decoder_input],
    Y_train_decoder_output,
    batch_size=64,
    epochs=500,  # upper bound; early stopping normally ends training sooner
    validation_split=0.2,
    callbacks=[callback_es]
)

Epoch 1/500


2024-06-12 20:43:00.729847: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 7

In [12]:
# Teacher-forced predictions on the test split: the decoder is fed the
# reference target tokens rather than its own previous predictions.
# NOTE(review): Y_hat is not used by any of the cells visible here.
Y_hat = model.predict([X_test_encoder, Y_test_decoder_input])



In [13]:
def translate(text):
    """
    Greedily translate one sentence with the trained model.

    Decoding starts from the '<start>' token. At each step the tokens chosen
    so far are fed back to the decoder and the most probable next token is
    picked. Decoding stops at '<end>', after 8 steps, or when the model
    predicts an id with no vocabulary entry (such as the padding id 0).

    Parameters:
    text (str): Source sentence, including its '<start>' / '<end>' markers.

    Returns:
    str: The generated tokens, each followed by a single space.
    """
    encoder_input = tokenizer.texts_to_sequences([text])
    encoder_input = pad_sequences(encoder_input, maxlen=8, padding='post', truncating='post', dtype='float32')

    T_y = 8  # must match the sequence length the model was built with
    decoder_input = np.zeros([1, T_y])
    word = '<start>'
    idx = tokenizer.word_index[word]

    pieces = []
    for t in range(T_y):
        pieces.append(word)
        decoder_input[0, t] = idx
        # verbose=0 avoids printing a progress bar for every decoding step.
        y_hat = model.predict([encoder_input, decoder_input], verbose=0)
        idx = int(np.argmax(y_hat[0, t, :]))
        word = tokenizer.index_word.get(idx)
        if word is None:
            # The predicted id (e.g. padding id 0) has no word: nothing more
            # to emit, so stop instead of failing with a KeyError.
            break
        if word == '<end>':
            pieces.append(word)
            break

    return ''.join(piece + ' ' for piece in pieces)

In [14]:
def generate_sample_translations(n_samples=5):
    """
    Print translations for a random sample of test sentences.

    Sentences are drawn without replacement from `test_source`. For each one
    the French text, the model's translation and the reference from
    `test_target` are printed.

    Parameters:
    n_samples (int): Number of sentences to show. Defaults to 5 and is capped
        at the size of the test set.

    Returns:
    None
    """
    # Size the draw from the actual test set rather than a fixed constant, so
    # indices can never fall outside `test_source`.
    population = len(test_source)
    n_samples = min(n_samples, population)
    indices = np.random.choice(population, n_samples, replace=False)
    for index in indices:
        french_text = test_source[index]
        translated_text = translate(french_text)
        print ("----------------")
        print ("French text:", french_text)
        print ("Translation:", translated_text)
        print ("Ground truth:", test_target[index])

    return None

# Print the translations of a few randomly chosen test sentences next to their references.
generate_sample_translations()

----------------
French text: <start> neuf cent quatre-vingt-six <end>
Translation: <start> nine hundred and eighty six <end> 
Ground truth: <start> nine hundred and eighty-six <end>
----------------
French text: <start> six cent soixante-neuf <end>
Translation: <start> six hundred and sixty nine <end> 
Ground truth: <start> six hundred and sixty-nine <end>
----------------
French text: <start> six cent trente-sept <end>
Translation: <start> six hundred and thirty seven <end> 
Ground truth: <start> six hundred and thirty-seven <end>
----------------
French text: <start> sept cent quarante-deux <end>
Translation: <start> seven hundred and forty two <end> 
Ground truth: <start> seven hundred and forty-two <end>
----------------
French text: <start> huit cent quatre-vingt-quatorze <end>
Translation: <start> eight hundred and ninety four <end> 
Ground truth: <start> eight hundred and ninety-four <end>
