## Imports

In [2]:
import numpy as np 
import pandas as pd 
import random
import os
import tensorflow as tf
from tqdm import tqdm
import re

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Concatenate, Dropout, Input, Dense, 
                                     Add, LayerNormalization, MultiHeadAttention)

from transformers import FlaubertTokenizer, TFFlaubertModel
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
from plotly.offline import init_notebook_mode


##  Load and Prepare the Dataset

In [3]:
# Load datasets
train = pd.read_parquet('/home/inductive-anks/machine-translation/Machine-Translation/data/raw/train-00000-of-00001.parquet')
test = pd.read_parquet('/home/inductive-anks/machine-translation/Machine-Translation/data/raw/test-00000-of-00001.parquet')
validate = pd.read_parquet('/home/inductive-anks/machine-translation/Machine-Translation/data/raw/validation-00000-of-00001.parquet')

# Extract Dyula and French columns from the nested dictionary
train[['dyu', 'fr']] = train['translation'].apply(pd.Series)
validate[['dyu', 'fr']] = validate['translation'].apply(pd.Series)
test[['dyu', 'fr']] = test['translation'].apply(pd.Series)

# Drop the original 'translation' column
train.drop(columns=['translation'], inplace=True)
validate.drop(columns=['translation'], inplace=True)
test.drop(columns=['translation'], inplace=True)

# Convert text to lowercase
train['dyu'] = train['dyu'].apply(lambda x: x.lower())
train['fr'] = train['fr'].apply(lambda x: x.lower())

validate['dyu'] = validate['dyu'].apply(lambda x: x.lower())
validate['fr'] = validate['fr'].apply(lambda x: x.lower())

test['dyu'] = test['dyu'].apply(lambda x: x.lower())

# Concatenate train and validate datasets
train = pd.concat([train, validate], ignore_index=True)

# Measure the length of each sentence
train['dyu_length'] = train['dyu'].apply(lambda x: len(x.split()))
train['fr_length'] = train['fr'].apply(lambda x: len(x.split()))

df = train.copy()


## Load FlauBERT Tokenizer and Model

In [4]:
# Load FlauBERT tokenizer and model
flaubert_tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
flaubert_model = TFFlaubertModel.from_pretrained('flaubert/flaubert_base_cased')

# Define max lengths based on the dataset
max_dyu_len = df['dyu_length'].max()
max_fr_len = df['fr_length'].max() + 1  # Add 1 for decoder input


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFFlaubertModel: ['pred_layer.proj.bias']
- This IS expected if you are initializing TFFlaubertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFFlaubertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFFlaubertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was tra

## Tokenize the Sentences

In [5]:
# Function to tokenize sentences
def encode_sentences(sentences, tokenizer, max_len):
    return tokenizer(
        sentences,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

# Encode sentences using FlauBERT tokenizer
train_encodings_dyu = encode_sentences(train['dyu'].tolist(), flaubert_tokenizer, max_dyu_len)
train_encodings_fr = encode_sentences(train['fr'].tolist(), flaubert_tokenizer, max_fr_len)


## Define the FlauBERT Embedding Layer

In [6]:
# from tensorflow.keras.layers import Layer

# class FlauBERTEmbeddingLayer(Layer):
#     def __init__(self, flaubert_model, **kwargs):
#         super(FlauBERTEmbeddingLayer, self).__init__(**kwargs)
#         self.flaubert_model = flaubert_model

#     def call(self, inputs):
#         return self.flaubert_model(inputs)[0]  # Return only the embeddings (last hidden states)

# # Create an instance of the custom FlauBERT embedding layer
# flaubert_embedding_layer = FlauBERTEmbeddingLayer(flaubert_model)

In [7]:
from tensorflow.keras.layers import Layer

class FlauBERTEmbeddingLayer(Layer):
    def __init__(self, flaubert_model, **kwargs):
        super(FlauBERTEmbeddingLayer, self).__init__(**kwargs)
        self.flaubert_model = flaubert_model

    def call(self, inputs):
        return self.flaubert_model(inputs)[0]  # Return only the embeddings (last hidden states)

    def get_config(self):
        config = super().get_config()
        config.update({
            'flaubert_model': self.flaubert_model.name,  # Save the name of the model
        })
        return config

    @classmethod
    def from_config(cls, config):
        # Load the model by name from the Hugging Face transformers library
        flaubert_model = TFFlaubertModel.from_pretrained(config['flaubert_model'])
        return cls(flaubert_model=flaubert_model)


## Define the Positional Encoding Layer

In [8]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

# class PositionalEncoding(Layer):
#     def __init__(self, position, d_model):
#         super(PositionalEncoding, self).__init__()
#         self.pos_encoding = positional_encoding(position, d_model)

#     def call(self, x):
#         return x + self.pos_encoding[:, :tf.shape(x)[1], :]

import tensorflow as tf
import numpy as np

class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_seq_len, dm):
        super(PositionalEncoding, self).__init__()
        self.positional_encoding = self.positional_encoding_matrix(max_seq_len, dm)

    def positional_encoding_matrix(self, max_seq_len, dm):
        angle_rads = self.get_angles(np.arange(max_seq_len)[:, np.newaxis],
                                     np.arange(dm)[np.newaxis, :],
                                     dm)
        # apply sin to even indices
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        # apply cos to odd indices
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        
        positional_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(positional_encoding, dtype=tf.float32)

    def get_angles(self, pos, i, dm):
        angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(dm))
        return pos * angle_rates

    def call(self, inputs):
        return inputs + self.positional_encoding[:, :tf.shape(inputs)[1], :]

    def get_config(self):
        config = super().get_config()
        config.update({
            "max_seq_len": self.positional_encoding.shape[1],
            "dm": self.positional_encoding.shape[2]
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(config['max_seq_len'], config['dm'])



## Define the Transformer Encoder Layer

In [9]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Multi-Head Self Attention
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(inputs, inputs)
    x = Dropout(dropout)(x)
    x = Add()([x, inputs])
    x = LayerNormalization(epsilon=1e-6)(x)
    
    # Feed Forward Network
    x_ffn = Dense(ff_dim, activation="relu")(x)
    x_ffn = Dropout(dropout)(x_ffn)
    x_ffn = Dense(inputs.shape[-1])(x_ffn)
    x_ffn = Add()([x_ffn, x])
    x_ffn = LayerNormalization(epsilon=1e-6)(x_ffn)
    return x_ffn

## Define the Transformer Model with Stacked Layers and Positional Encoding

In [10]:
flaubert_embedding_layer = FlauBERTEmbeddingLayer(flaubert_model)

In [11]:
# # Define transformer hyperparameters
# head_size = 512
# num_heads = 8
# ff_dim = 2048
# dropout = 0.1
# num_layers = 4  # Number of stacked layers

# Increase depth and width of the model
head_size = 1024
num_heads = 16
ff_dim = 4096
dropout = 0.1
num_layers = 4  # Increasing the depth

# Encoder
encoder_inputs = Input(shape=(max_dyu_len,), name='encoder_inputs', dtype=tf.int32)
encoder_embeddings = flaubert_embedding_layer(encoder_inputs)  
pos_encoding = PositionalEncoding(max_dyu_len, encoder_embeddings.shape[-1])
encoder_embeddings = pos_encoding(encoder_embeddings)

x = encoder_embeddings
for _ in range(num_layers):
    x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
encoder_outputs = x

# Decoder
decoder_inputs = Input(shape=(max_fr_len,), name='decoder_inputs', dtype=tf.int32)
decoder_embeddings = flaubert_embedding_layer(decoder_inputs)
decoder_embeddings = pos_encoding(decoder_embeddings)

x = decoder_embeddings
for _ in range(num_layers):
    x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
decoder_outputs = x

# Output layer
output_layer = Dense(flaubert_tokenizer.vocab_size, activation='softmax', name='output_layer')(decoder_outputs)

# Define the model
model_2 = Model([encoder_inputs, decoder_inputs], output_layer)


## Compile the Model with a Learning Rate Schedule

In [12]:
# Learning Rate Schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4,
    decay_steps=10000,
    decay_rate=0.9)

# Compile the model
model_2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Model summary
model_2.summary()


## Prepare the Data for Training

In [13]:
# Ensure decoder input and target data have the correct sequence length
decoder_input_data = np.array(train_encodings_fr['input_ids'][:, :-1])
decoder_target_data = np.expand_dims(np.array(train_encodings_fr['input_ids'][:, 1:]), axis=-1)

# Pad sequences if necessary
decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_fr_len, padding='post')
decoder_target_data = pad_sequences(decoder_target_data, maxlen=max_fr_len, padding='post')

# Convert to NumPy arrays
encoder_input_data = np.array(train_encodings_dyu['input_ids'])
decoder_input_data = np.array(decoder_input_data)
decoder_target_data = np.array(decoder_target_data)

# Split data
encoder_input_train, encoder_input_val, decoder_input_train, decoder_input_val, decoder_target_train, decoder_target_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.2
)

## Train the Model

In [14]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [15]:
# # Define the checkpoint directory and file format
# checkpoint_dir = '/home/inductive-anks/machine-translation/Machine-Translation/models/checkpoints'
# checkpoint_filepath = checkpoint_dir + '/model_epoch_{epoch:02d}.keras'

# # Create a callback that saves the model's weights
# checkpoint_callback = ModelCheckpoint(
#     filepath=checkpoint_filepath,
#     save_weights_only=False,  # Set to True if you want to save only weights
#     save_best_only=False,  # Set to True to save only the best model based on validation loss
#     save_freq='epoch'  # Save at the end of every epoch
# )

In [16]:
batch_size = 30
epochs = 20

from tensorflow.keras.callbacks import EarlyStopping

# Early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=3,          # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
)

from tensorflow.keras.callbacks import Callback

# Custom callback to stop training if accuracy exceeds 98%
class StopAtAccuracy(Callback):
    def __init__(self, accuracy_threshold=0.98):
        super(StopAtAccuracy, self).__init__()
        self.accuracy_threshold = accuracy_threshold

    def on_epoch_end(self, epoch, logs=None):
        if logs.get('accuracy') >= self.accuracy_threshold:
            print(f"\nReached {self.accuracy_threshold * 100:.2f}% accuracy. Stopping training.")
            self.model.stop_training = True

# Instantiate the custom callback
stop_at_98 = StopAtAccuracy(accuracy_threshold=0.98)

# Early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True
)

# Training with custom callback and early stopping
history = model_2.fit(
    [encoder_input_train, decoder_input_train],
    decoder_target_train,
    batch_size=batch_size,
    epochs=epochs,#
    validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
    callbacks=[early_stopping, stop_at_98]ma
)



Epoch 1/20


[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1584s[0m 6s/step - accuracy: 0.5533 - loss: 5.2987 - val_accuracy: 0.7045 - val_loss: 2.3150
Epoch 2/20
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1561s[0m 6s/step - accuracy: 0.7373 - loss: 2.0164 - val_accuracy: 0.7835 - val_loss: 1.7622
Epoch 3/20
[1m255/255[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1543s[0m 6s/step - accuracy: 0.7971 - loss: 1.5089 - val_accuracy: 0.8101 - val_loss: 1.5167
Epoch 4/20
[1m 79/255[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m16:57[0m 6s/step - accuracy: 0.8151 - loss: 1.2987

In [16]:
def sequence_to_text(sequence, tokenizer):
    return tokenizer.decode(sequence, skip_special_tokens=True)

In [21]:
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

def sequence_to_text(sequence, tokenizer):
    return tokenizer.decode(sequence, skip_special_tokens=True)

# Generate predictions for a subset of test sentences
num_samples = 10 # Number of samples to evaluate
random_indices = random.sample(range(len(encoder_input_val)), num_samples)

# Collect predictions, actual sentences, and BLEU scores
predicted_sentences = []
actual_sentences = []
bleu_scores = []

for idx in random_indices:
    # Prepare inputs
    encoder_input_seq = tf.expand_dims(encoder_input_val[idx], axis=0)
    decoder_input_seq = tf.expand_dims(decoder_input_val[idx], axis=0)
    
    # Predict the output sequence
    pred_seq = model.predict([encoder_input_seq, decoder_input_seq])
    pred_seq = tf.argmax(pred_seq, axis=-1).numpy().flatten()
    
    # Convert sequences to text
    predicted_sentence = sequence_to_text(pred_seq, flaubert_tokenizer)
    actual_sentence = sequence_to_text(decoder_target_val[idx].flatten(), flaubert_tokenizer)
    
    # Check if the predicted sentence is not empty
    if predicted_sentence.strip():
        # Calculate BLEU score
        ref_tokens = [actual_sentence.split()]
        pred_tokens = predicted_sentence.split()
        smoothie = SmoothingFunction().method4
        bleu_score = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)
    else:
        bleu_score = 0.0  # Assign a BLEU score of 0 if the predicted sentence is empty
    
    # Append results
    predicted_sentences.append(predicted_sentence)
    actual_sentences.append(actual_sentence)
    bleu_scores.append(bleu_score)
    
# Print results
for i in range(num_samples):
    print(f"Original Dyula Sentence (ID: {random_indices[i]}):")
    print(sequence_to_text(encoder_input_val[random_indices[i]].flatten(), flaubert_tokenizer))
    print(f"Predicted French Sentence: {predicted_sentences[i]}")
    print(f"Actual French Sentence: {actual_sentences[i]}")
    print(f"BLEU Score: {bleu_scores[i]:.4f}\n")

# Calculate average BLEU score
average_bleu = np.mean(bleu_scores)
print(f"Average BLEU Score for Sampled Sentences: {average_bleu:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
Original Dyula Sentence (ID: 406):
hun kagni.ini tchié
Predicted French Sentence: je suis bien, merci!
Actual French Sentence: je suis bien, merci!
BLEU Score: 1.0000

Original Dyula Sentence (ID: 1372):
kèlè djougou ka lacaleli
Predicted French Sentence: voici le rôle de l' impôt.
Actual French Sentence: voici le récit de l' attentat.
BLEU Score: 0.1374

Original Dyula Sentence (ID: 1079):
kamelé baw fè wa?
Pred

In [19]:
# Save the model architecture and weights
model.save('/home/inductive-anks/machine-translation/Machine-Translation/models/model.keras')