## Imports

In [1]:
import numpy as np 
import pandas as pd 
import random
import os
import tensorflow as tf
from tqdm import tqdm
import re

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Embedding, LSTM, Concatenate, Dropout,
                                     Input, Dense, Bidirectional, Layer, TimeDistributed)

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Dropout, LayerNormalization, MultiHeadAttention, Add, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np


import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
from plotly.offline import init_notebook_mode


2024-08-15 00:56:02.429608: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-15 00:56:02.920537: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-15 00:56:03.076083: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-15 00:56:03.681362: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Loading Dataset

In [None]:
# Training split of the parallel corpus, read from the Kaggle input directory.
train = pd.read_parquet('/kaggle/input/machine-translation-data/train-00000-of-00001.parquet')

In [None]:
# Test split of the same dataset.
test = pd.read_parquet('/kaggle/input/machine-translation-data/test-00000-of-00001.parquet')

In [None]:
# Validation split of the same dataset.
validate = pd.read_parquet('/kaggle/input/machine-translation-data/validation-00000-of-00001.parquet')

## Text Preprocessing

In [None]:
def split_translation(frame):
    """Expand the ``translation`` column into separate ``dyu`` and ``fr`` columns.

    Each ``translation`` cell is expected to be a mapping with ``'dyu'`` and
    ``'fr'`` keys. The two values are looked up by key, so the result does not
    depend on the order of the keys inside each mapping.

    Args:
        frame: DataFrame containing a ``translation`` column.

    Returns:
        A new DataFrame without ``translation`` and with ``dyu`` and ``fr``
        appended as the last two columns. ``frame`` itself is not modified.

    Raises:
        KeyError: if ``translation`` is missing, or no row provides a ``'dyu'``
            or ``'fr'`` key.
    """
    # Building one DataFrame from the list of mappings avoids constructing a
    # Series per row, which is what `.apply(pd.Series)` does.
    pairs = pd.DataFrame(frame['translation'].tolist(), index=frame.index)
    return frame.drop(columns=['translation']).assign(dyu=pairs['dyu'], fr=pairs['fr'])


train = split_translation(train)
validate = split_translation(validate)
test = split_translation(test)

In [None]:
# Report the number of rows in each split.
print('Length of train data: ', len(train))
print('Length of test data: ', len(test))
print('Length of validate data: ', len(validate))

In [None]:
# Lower-case both languages for train and validate, but only the Dyula side
# of test (test['fr'] keeps its original casing).
for frame, columns in ((train, ('dyu', 'fr')),
                       (validate, ('dyu', 'fr')),
                       (test, ('dyu',))):
    for column in columns:
        frame[column] = frame[column].apply(lambda text: text.lower())

In [None]:
# Fold the validation rows into the training frame (with a fresh 0..n-1 index);
# a hold-out set is carved out again later with train_test_split.
train = pd.concat([train, validate], ignore_index=True)

In [None]:
def dyu_preprocessing(data, col):
    """Cast ``data[col]`` to strings and lower-case every value.

    The column is overwritten on ``data`` itself, and that same object is
    returned so the call can be chained.

    Args:
        data: DataFrame holding the column to normalise.
        col: Name of the column.

    Returns:
        ``data``, with ``col`` replaced by its lower-cased string form.
    """
    normalised = data[col].astype(str).apply(lambda text: text.lower())
    data[col] = normalised
    return data

def fr_preprocessing(data, col):
    """Lower-case every value of ``data[col]``.

    The column is overwritten on ``data`` itself, and that same object is
    returned. Values must already be strings (no cast is performed).

    Args:
        data: DataFrame holding the column to normalise.
        col: Name of the column.

    Returns:
        ``data``, with ``col`` lower-cased.
    """
    lowered = data[col].apply(lambda text: text.lower())
    data[col] = lowered
    # Wrapping sentences in start/end markers is currently disabled:
    #data[col] = "startseq "+data[col]+" endseq"
    return data

In [None]:
# Normalise the Dyula column first, then the French column.
train = fr_preprocessing(dyu_preprocessing(train, 'dyu'), 'fr')

In [None]:
# Whitespace-delimited word count per sentence, one new column per language.
for source_col, length_col in (('dyu', 'dyu_length'), ('fr', 'fr_length')):
    train[length_col] = train[source_col].apply(lambda sentence: len(sentence.split()))

In [None]:
# Preview the first rows, including the new *_length columns.
train.head()

In [None]:
# Independent copy: `df` feeds tokenisation and padding, `train` feeds the plots.
df = train.copy()

In [None]:
# Word-level vocabulary for the Dyula side, fitted on every sentence in `df`.
dyu_tokenizer = Tokenizer()
dyu_tokenizer.fit_on_texts(df['dyu'].to_numpy())
dyu_sequences = dyu_tokenizer.texts_to_sequences(df['dyu'].to_numpy())
# One extra slot on top of the number of distinct words.
dyu_vocab_size = 1 + len(dyu_tokenizer.word_index)

In [None]:
# Word-level vocabulary for the French side, fitted on every sentence in `df`.
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(df['fr'].to_numpy())
fr_sequences = fr_tokenizer.texts_to_sequences(df['fr'].to_numpy())
# One extra slot on top of the number of distinct words.
fr_vocab_size = 1 + len(fr_tokenizer.word_index)

In [None]:
# Size the padded arrays from the tokenised sequences themselves. The
# whitespace-based `*_length` columns can under-count, because the Keras
# Tokenizer's default filters also split on punctuation such as '-' or ','.
# pad_sequences truncates from the front by default (truncating='pre'), so any
# sequence longer than `maxlen` would silently lose its first tokens.
max_dyu_len = max(len(seq) for seq in dyu_sequences)
max_fr_len = max(len(seq) for seq in fr_sequences)

# Right-pad with zeros so every row has the same length.
dyu_padded = pad_sequences(dyu_sequences, maxlen=max_dyu_len, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')

In [None]:
# Preview the first five padded rows for each language.
print("Dyula padded sequences:", dyu_padded[:5], sep="\n")
print("French padded sequences:", fr_padded[:5], sep="\n")

# Report both vocabulary sizes.
print(
    f"Dyula Vocabulary Size: {dyu_vocab_size}",
    f"French Vocabulary Size: {fr_vocab_size}",
    sep="\n",
)

In [None]:
# Distribution of French sentence lengths (in words), with a box plot in the margin.
px.histogram(train, x="fr_length",height=700, title="French Sentences Length Distribution", marginal="box")

In [None]:
# Distribution of Dyula sentence lengths (in words), with a box plot in the margin.
px.histogram(train, x="dyu_length",height=700, title="Dyula Sentences Length Distribution", marginal="box")

## Multi-Head Attention Model Implementation

In [None]:
# Load GloVe Embeddings
def load_glove_embeddings(glove_file, embedding_dim=300):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every usable line holds a token followed by ``embedding_dim``
    whitespace-separated floats. The vector is taken from the *last*
    ``embedding_dim`` fields, so a token that itself contains spaces is kept
    intact instead of breaking the float conversion. Lines with too few fields
    (blank lines, header rows, truncated rows) are skipped.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings file.
        embedding_dim: Number of float components expected per vector.

    Returns:
        dict mapping each token to a 1-D ``float32`` array of length
        ``embedding_dim``.

    Raises:
        ValueError: if ``embedding_dim`` is not positive, if the trailing
            fields of a line cannot be parsed as floats, or if the file has
            non-blank lines but none of them carries ``embedding_dim``
            components (most likely a file/dimension mismatch).
    """
    if embedding_dim <= 0:
        raise ValueError(f"embedding_dim must be positive, got {embedding_dim}")

    embeddings_index = {}
    skipped_rows = 0
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # blank line
            if len(values) <= embedding_dim:
                # Not enough fields for a token plus a full vector.
                skipped_rows += 1
                continue
            word = ' '.join(values[:-embedding_dim])
            embeddings_index[word] = np.asarray(values[-embedding_dim:], dtype='float32')

    if skipped_rows and not embeddings_index:
        # Fail loudly rather than hand back an empty lookup table.
        raise ValueError(
            f"No {embedding_dim}-dimensional vectors found in {glove_file!r}; "
            "check that embedding_dim matches the file."
        )
    return embeddings_index

# Create Embedding Matrix
def create_embedding_matrix(tokenizer, embeddings_index, embedding_dim=300):
    """Build a ``(vocab_size, embedding_dim)`` matrix of pretrained vectors.

    Row ``i`` holds the vector of the word whose ``tokenizer.word_index`` value
    is ``i``. Words missing from ``embeddings_index`` keep an all-zero row.
    The matrix has ``len(tokenizer.word_index) + 1`` rows.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> row.
        embeddings_index: Mapping of word -> vector of length ``embedding_dim``.
        embedding_dim: Number of columns of the matrix.

    Returns:
        numpy array of zeros with the known vectors filled in.
    """
    word_to_row = tokenizer.word_index
    matrix = np.zeros((len(word_to_row) + 1, embedding_dim))
    for token, row in word_to_row.items():
        vector = embeddings_index.get(token)
        if vector is None:
            continue  # no pretrained vector: leave the row at zero
        matrix[row] = vector
    return matrix

# Read the 300-dimensional GloVe vectors from the Kaggle input directory.
embedding_dim = 300
glove_file = '/kaggle/input/glove-text-embeddings/glove.6B.300d.txt'
embeddings_index = load_glove_embeddings(glove_file, embedding_dim)

# One lookup matrix per language, both built from the same GloVe table.
# NOTE(review): any vocabulary word absent from the GloVe file keeps an
# all-zero row — worth checking how much of each vocabulary is covered.
dyu_embedding_matrix, fr_embedding_matrix = (
    create_embedding_matrix(language_tokenizer, embeddings_index, embedding_dim)
    for language_tokenizer in (dyu_tokenizer, fr_tokenizer)
)

# Sanity-check the resulting shapes.
print(f"Dyula Embedding Matrix Shape: {dyu_embedding_matrix.shape}")
print(f"French Embedding Matrix Shape: {fr_embedding_matrix.shape}")


In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one post-norm transformer block to ``inputs``.

    The block is a multi-head self-attention sub-layer followed by a two-layer
    feed-forward sub-layer. Each sub-layer is wrapped in dropout, a residual
    connection and layer normalisation. No attention mask is applied.

    Args:
        inputs: Keras tensor; its last dimension is the model width.
        head_size: ``key_dim`` given to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward ``Dense`` layer.
        dropout: Dropout rate, used both inside the attention layer and after
            each sub-layer's first stage.

    Returns:
        Keras tensor whose last dimension equals that of ``inputs``.
    """
    # --- self-attention sub-layer ---
    self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=head_size, dropout=dropout)
    attended = Dropout(dropout)(self_attention(inputs, inputs))
    attended = LayerNormalization(epsilon=1e-6)(Add()([attended, inputs]))

    # --- feed-forward sub-layer ---
    model_width = inputs.shape[-1]
    hidden = Dense(ff_dim, activation="relu")(attended)
    hidden = Dropout(dropout)(hidden)
    projected = Dense(model_width)(hidden)  # back to the input width for the residual
    return LayerNormalization(epsilon=1e-6)(Add()([projected, attended]))

In [None]:
# Hyper-parameters shared by the encoder and decoder blocks.
head_size = 256  # Passed as `key_dim` to MultiHeadAttention (per-head projection size)
num_heads = 40 # Number of attention heads
ff_dim = 512  # Width of the hidden Dense layer in the feed-forward sub-layer
dropout = 0.1  # Dropout rate used inside attention and after each sub-layer
embedding_dim = 300  # Dimension of word embeddings (same value as used for the GloVe matrices)

In [None]:
# Encoder
# Variable-length sequences of Dyula token ids.
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
# Frozen lookup table initialised from the GloVe-derived matrix. The matrix is
# supplied through `embeddings_initializer`, which works across Keras versions,
# instead of the legacy `weights=[...]` keyword. The deprecated `input_length`
# argument is dropped: the sequence length is left open by `Input(shape=(None,))`.
encoder_embedding = Embedding(input_dim=dyu_vocab_size,
                              output_dim=embedding_dim,
                              embeddings_initializer=tf.keras.initializers.Constant(dyu_embedding_matrix),
                              trainable=False,
                              name='encoder_embedding')(encoder_inputs)

# Pass through a single transformer encoder block.
# NOTE(review): no positional information is added to the embeddings before
# attention — confirm whether that is intended.
encoder_outputs = transformer_encoder(encoder_embedding, head_size, num_heads, ff_dim, dropout)

In [None]:
# Decoder
# Variable-length sequences of French token ids (the target shifted right by one).
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
# Frozen lookup table initialised from the GloVe-derived matrix. `input_length`
# is dropped: it is deprecated, and its old value (max_fr_len) did not match
# the decoder input, which is one token shorter.
decoder_embedding = Embedding(input_dim=fr_vocab_size,
                              output_dim=embedding_dim,
                              embeddings_initializer=tf.keras.initializers.Constant(fr_embedding_matrix),
                              trainable=False,
                              name='decoder_embedding')(decoder_inputs)

# 1) Causal self-attention over the French prefix. The training target is the
#    decoder input shifted left by one token, so without the causal mask each
#    position could simply read the token it is supposed to predict.
decoder_self_attention = MultiHeadAttention(
    key_dim=head_size, num_heads=num_heads, dropout=dropout,
    name='decoder_self_attention',
)(decoder_embedding, decoder_embedding, use_causal_mask=True)
decoder_self_attention = Dropout(dropout)(decoder_self_attention)
decoder_hidden = LayerNormalization(epsilon=1e-6)(
    Add()([decoder_self_attention, decoder_embedding])
)

# 2) Cross-attention: French positions query the encoded Dyula sentence. This
#    is the only path through which the source sentence reaches the output.
decoder_cross_attention = MultiHeadAttention(
    key_dim=head_size, num_heads=num_heads, dropout=dropout,
    name='decoder_cross_attention',
)(decoder_hidden, encoder_outputs)
decoder_cross_attention = Dropout(dropout)(decoder_cross_attention)
decoder_hidden = LayerNormalization(epsilon=1e-6)(
    Add()([decoder_cross_attention, decoder_hidden])
)

# 3) Feed-forward sub-layer with residual connection and layer normalisation.
decoder_ffn = Dense(ff_dim, activation="relu")(decoder_hidden)
decoder_ffn = Dropout(dropout)(decoder_ffn)
decoder_ffn = Dense(decoder_embedding.shape[-1])(decoder_ffn)
decoder_outputs = LayerNormalization(epsilon=1e-6)(Add()([decoder_ffn, decoder_hidden]))

In [None]:
# Per-position probability distribution over the French vocabulary.
output_layer = Dense(
    units=fr_vocab_size,
    activation='softmax',
    name='output_layer',
)(decoder_outputs)

# Two-input model: Dyula token ids and French decoder-input token ids.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output_layer)

# Integer targets, hence the sparse variant of categorical cross-entropy.
# NOTE(review): padded target positions (id 0) are not masked here, so they
# count toward both loss and accuracy — confirm this is intended.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model.summary()

In [None]:
# Teacher-forcing pair built from the padded French ids:
#   target = every token but the first, with a trailing axis of size 1
#   input  = every token but the last
# NOTE(review): no start-of-sequence token is prepended, so the first French
# token of each sentence is never a prediction target.
decoder_target_data = fr_padded[:, 1:, np.newaxis]
decoder_input_data = fr_padded[:, :-1]

In [None]:
# Split the data: 80% training / 20% validation, with the three arrays kept
# row-aligned. A fixed `random_state` makes the split, and every metric
# computed on it, reproducible across kernel restarts.
(encoder_input_train, encoder_input_val,
 decoder_input_train, decoder_input_val,
 decoder_target_train, decoder_target_val) = train_test_split(
    dyu_padded, decoder_input_data, decoder_target_data,
    test_size=0.2, random_state=42,
)

In [None]:
# Fit on the training split and evaluate on the hold-out split after each epoch.
epochs = 100
batch_size = 30
history = model.fit(
    x=[encoder_input_train, decoder_input_train],
    y=decoder_target_train,
    validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
import numpy as np
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def sequence_to_text(sequence, tokenizer):
    """Turn a sequence of token ids back into a space-joined string.

    Ids that are not greater than zero are dropped. A positive id missing from
    ``tokenizer.index_word`` contributes an empty string, which shows up as an
    extra space in the result.

    Args:
        sequence: Iterable of integer token ids.
        tokenizer: Object exposing an ``index_word`` mapping of id -> word.

    Returns:
        The decoded words joined by single spaces.
    """
    id_to_word = tokenizer.index_word
    words = []
    for token_id in sequence:
        if token_id > 0:
            words.append(id_to_word.get(token_id, ''))
    return ' '.join(words)

# Generate predictions for a random subset of validation sentences
num_samples = 10  # Change this to any number of samples you want
# random.sample raises if asked for more items than exist, so cap the request.
num_samples = min(num_samples, len(encoder_input_val))
random_indices = random.sample(range(len(encoder_input_val)), num_samples)

# One batched forward pass instead of one `predict` call per sentence.
# NOTE(review): the decoder is fed the reference French tokens (teacher
# forcing), so these scores measure next-token prediction given the reference
# prefix, not free-running translation quality.
pred_probs = model.predict(
    [encoder_input_val[random_indices], decoder_input_val[random_indices]],
    verbose=0,
)
pred_ids = np.argmax(pred_probs, axis=-1)  # most likely token id per position

# The smoothing function does not depend on the sample; build it once.
smoothie = SmoothingFunction().method4

# Collect predictions, actual sentences, and BLEU scores
predicted_sentences = []
actual_sentences = []
bleu_scores = []

for row, idx in enumerate(random_indices):
    # Convert sequences to text
    predicted_sentence = sequence_to_text(pred_ids[row], fr_tokenizer)
    actual_sentence = sequence_to_text(decoder_target_val[idx].flatten(), fr_tokenizer)

    # Check if the predicted sentence is not empty
    if predicted_sentence.strip():
        # Calculate BLEU score against the single reference
        ref_tokens = [actual_sentence.split()]
        pred_tokens = predicted_sentence.split()
        bleu_score = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)
    else:
        bleu_score = 0.0  # Assign a BLEU score of 0 if the predicted sentence is empty

    # Append results
    predicted_sentences.append(predicted_sentence)
    actual_sentences.append(actual_sentence)
    bleu_scores.append(bleu_score)

# Print results
for i in range(num_samples):
    print(f"Original Dyula Sentence (ID: {random_indices[i]}):")
    print(sequence_to_text(encoder_input_val[random_indices[i]], dyu_tokenizer))
    print(f"Predicted French Sentence: {predicted_sentences[i]}")
    print(f"Actual French Sentence: {actual_sentences[i]}")
    print(f"BLEU Score: {bleu_scores[i]:.4f}\n")

# Calculate average BLEU score
average_bleu = np.mean(bleu_scores)
print(f"Average BLEU Score for Sampled Sentences: {average_bleu:.4f}")
