In [1]:
# Import necessary libraries and functions
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Ensure GPU is used
if tf.config.list_physical_devices('GPU'):
    print("GPU is available.")
    tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)
else:
    print("No GPU detected. Falling back to CPU.")

GPU is available.


In [3]:
# Read the contents of the English text file
with open("/content/drive/MyDrive/UNH Data Science Courses/DSCI 6011 - DL/Final Project/data/english-corpus.txt", 'r',encoding="utf8") as file:
    english_lines = file.readlines()

# Read the contents of the Urdu text file
with open("/content/drive/MyDrive/UNH Data Science Courses/DSCI 6011 - DL/Final Project/data/urdu-corpus.txt", 'r',encoding="utf8") as file:
    urdu_lines = file.readlines()

# Create a DataFrame
df = pd.DataFrame({'English': english_lines, 'Urdu': urdu_lines})

# Remove newline characters from the strings
df['English'] = df['English'].str.strip()
df['Urdu'] = df['Urdu'].str.strip()

df.head()

Unnamed: 0,English,Urdu
0,is zain your nephew,زین تمہارا بھتیجا ہے۔
1,i wish youd trust me,کاش تم مجھ پر بھروسہ کرتے
2,did he touch you,کیا اس نے آپ کو چھوا؟
3,its part of life,اس کی زندگی کا حصہ
4,zain isnt ugly,زین بدصورت نہیں ہے۔


In [4]:
# Calculate the maximum length of the sentences in either language
max_length_english = max(df['English'].str.split().apply(len))
max_length_urdu = max(df['Urdu'].str.split().apply(len))
max_sequence_length = max(max_length_english, max_length_urdu)

# Tokenizer
english_tokenizer = tf.keras.layers.TextVectorization(output_sequence_length=max_sequence_length)
urdu_tokenizer = tf.keras.layers.TextVectorization(output_sequence_length=max_sequence_length)

# Adapt tokenizers
english_tokenizer.adapt(df['English'])
urdu_tokenizer.adapt(df['Urdu'])

# Tokenize data
english_sequences = english_tokenizer(df['English']).numpy()
urdu_sequences = urdu_tokenizer(df['Urdu']).numpy()

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    urdu_sequences, english_sequences, test_size=0.2, random_state=42)

In [5]:
class PositionalEncoding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim):
        super(PositionalEncoding, self).__init__()
        self.token_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embedding = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)

    def call(self, x):
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        return self.token_embedding(x) + self.position_embedding(positions)

def transformer_encoder(embed_dim, num_heads, ff_dim, dropout_rate=0.1):
    inputs = layers.Input(shape=(None, embed_dim))
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
    attention = layers.Dropout(dropout_rate)(attention)
    attention = layers.LayerNormalization(epsilon=1e-6)(inputs + attention)
    ff = layers.Dense(ff_dim, activation="relu")(attention)
    ff = layers.Dense(embed_dim)(ff)
    ff = layers.Dropout(dropout_rate)(ff)
    outputs = layers.LayerNormalization(epsilon=1e-6)(attention + ff)
    return tf.keras.Model(inputs=inputs, outputs=outputs)

def transformer_decoder(embed_dim, num_heads, ff_dim, dropout_rate=0.1):
    enc_inputs = layers.Input(shape=(None, embed_dim))
    dec_inputs = layers.Input(shape=(None, embed_dim))
    attention1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(dec_inputs, dec_inputs)
    attention1 = layers.LayerNormalization(epsilon=1e-6)(dec_inputs + attention1)
    attention2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(attention1, enc_inputs)
    attention2 = layers.LayerNormalization(epsilon=1e-6)(attention1 + attention2)
    ff = layers.Dense(ff_dim, activation="relu")(attention2)
    ff = layers.Dense(embed_dim)(ff)
    ff = layers.Dropout(dropout_rate)(ff)
    outputs = layers.LayerNormalization(epsilon=1e-6)(attention2 + ff)
    return tf.keras.Model(inputs=[enc_inputs, dec_inputs], outputs=outputs)

In [6]:
# class PositionalEncoding(layers.Layer):
#     def __init__(self, sequence_length, vocab_size, embed_dim):
#         super(PositionalEncoding, self).__init__()
#         self.token_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
#         self.position_embedding = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)

#     def call(self, x):
#         positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
#         return self.token_embedding(x) + self.position_embedding(positions)

# class PaddingMask(layers.Layer):
#     def call(self, inputs):
#         mask = tf.cast(tf.math.not_equal(inputs, 0), tf.float32)
#         return mask[:, tf.newaxis, tf.newaxis, :]  # Add extra dimensions to match the attention mask shape

# def transformer_encoder(embed_dim, num_heads, ff_dim, dropout_rate=0.1):
#     inputs = layers.Input(shape=(None, embed_dim))
#     mask = layers.Input(shape=(1, 1, None), dtype=tf.float32)  # Mask input

#     attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs, attention_mask=mask)
#     attention = layers.Dropout(dropout_rate)(attention)
#     attention = layers.LayerNormalization(epsilon=1e-6)(inputs + attention)
#     ff = layers.Dense(ff_dim, activation="relu")(attention)
#     ff = layers.Dense(embed_dim)(ff)
#     ff = layers.Dropout(dropout_rate)(ff)
#     outputs = layers.LayerNormalization(epsilon=1e-6)(attention + ff)
#     return tf.keras.Model(inputs=[inputs, mask], outputs=outputs)

# def transformer_decoder(embed_dim, num_heads, ff_dim, dropout_rate=0.1):
#     enc_inputs = layers.Input(shape=(None, embed_dim))
#     dec_inputs = layers.Input(shape=(None, embed_dim))
#     enc_mask = layers.Input(shape=(1, 1, None), dtype=tf.float32)  # Encoder mask input
#     dec_mask = layers.Input(shape=(1, 1, None), dtype=tf.float32)  # Decoder mask input

#     attention1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(dec_inputs, dec_inputs, attention_mask=dec_mask)
#     attention1 = layers.LayerNormalization(epsilon=1e-6)(dec_inputs + attention1)
#     attention2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(attention1, enc_inputs, attention_mask=enc_mask)
#     attention2 = layers.LayerNormalization(epsilon=1e-6)(attention1 + attention2)
#     ff = layers.Dense(ff_dim, activation="relu")(attention2)
#     ff = layers.Dense(embed_dim)(ff)
#     ff = layers.Dropout(dropout_rate)(ff)
#     outputs = layers.LayerNormalization(epsilon=1e-6)(attention2 + ff)
#     return tf.keras.Model(inputs=[enc_inputs, dec_inputs, enc_mask, dec_mask], outputs=outputs)

In [7]:
# # Example usage
# vocab_size_english = len(english_tokenizer.get_vocabulary())
# vocab_size_urdu = len(urdu_tokenizer.get_vocabulary())
# max_vocab_length = max(vocab_size_english, vocab_size_urdu)
# embed_dim = max_vocab_length
# num_heads = 4
# ff_dim = 64
# max_sequence_length = 50

# encoder_inputs = layers.Input(shape=(None,))
# decoder_inputs = layers.Input(shape=(None,))

# encoder_embeddings = PositionalEncoding(max_sequence_length, vocab_size_urdu, embed_dim)(encoder_inputs)
# decoder_embeddings = PositionalEncoding(max_sequence_length, vocab_size_english, embed_dim)(decoder_inputs)

# # Create masks using the custom PaddingMask layer
# enc_mask = PaddingMask()(encoder_inputs)
# dec_mask = PaddingMask()(decoder_inputs)

# encoder = transformer_encoder(embed_dim, num_heads, ff_dim)
# encoder_outputs = encoder([encoder_embeddings, enc_mask])

# decoder = transformer_decoder(embed_dim, num_heads, ff_dim)
# decoder_outputs = decoder([encoder_outputs, decoder_embeddings, enc_mask, dec_mask])

# outputs = layers.Dense(vocab_size_english, activation="softmax")(decoder_outputs)

# model = tf.keras.Model([encoder_inputs, decoder_inputs], outputs)
# model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [6]:
# Step 3: Build and Compile the Model
vocab_size_english = len(english_tokenizer.get_vocabulary())
vocab_size_urdu = len(urdu_tokenizer.get_vocabulary())
embed_dim = 32
num_heads = 4
ff_dim = 64

encoder_inputs = layers.Input(shape=(None,))
decoder_inputs = layers.Input(shape=(None,))

encoder_embeddings = PositionalEncoding(max_sequence_length, vocab_size_urdu, embed_dim)(encoder_inputs)
decoder_embeddings = PositionalEncoding(max_sequence_length, vocab_size_english, embed_dim)(decoder_inputs)

encoder = transformer_encoder(embed_dim, num_heads, ff_dim)
encoder_outputs = encoder(encoder_embeddings)

decoder = transformer_decoder(embed_dim, num_heads, ff_dim)
decoder_outputs = decoder([encoder_outputs, decoder_embeddings])

outputs = layers.Dense(vocab_size_english, activation="softmax")(decoder_outputs)

model = tf.keras.Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [7]:
# Step 4: Train the Model
def create_shifted_targets(y):
    return y[:, :-1], y[:, 1:]

train_decoder_input, train_decoder_target = create_shifted_targets(y_train)
val_decoder_input, val_decoder_target = create_shifted_targets(y_val)

history = model.fit(
    [X_train, train_decoder_input], train_decoder_target,
    validation_data=([X_val, val_decoder_input], val_decoder_target),
    batch_size=128,
    epochs=3
)

Epoch 1/3
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 141ms/step - accuracy: 0.7728 - loss: 6.1312 - val_accuracy: 0.8320 - val_loss: 1.2471
Epoch 2/3
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.8399 - loss: 1.1588 - val_accuracy: 0.8522 - val_loss: 0.9968
Epoch 3/3
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8559 - loss: 0.9478 - val_accuracy: 0.8682 - val_loss: 0.8678


In [8]:
def translate_urdu_to_english(urdu_sentence, model, urdu_tokenizer, english_tokenizer, max_sequence_length):
    """
    Translates a given Urdu sentence to English using the trained model.

    Args:
        urdu_sentence (str): The Urdu input sentence to be translated.
        model (tf.keras.Model): The trained Transformer model.
        urdu_tokenizer (tf.keras.layers.TextVectorization): Tokenizer for Urdu text.
        english_tokenizer (tf.keras.layers.TextVectorization): Tokenizer for English text.
        max_sequence_length (int): Maximum sequence length for translation.

    Returns:
        str: The translated English sentence.
    """
    # Tokenize the Urdu input sentence
    tokenized_input = urdu_tokenizer([urdu_sentence]).numpy()
    tokenized_input = tf.convert_to_tensor(tokenized_input)

    # Start with the <start> token in the target sequence
    start_token = english_tokenizer("<start>").numpy()[0]
    end_token = english_tokenizer("<end>").numpy()[0]
    target_sequence = tf.convert_to_tensor([[start_token]])  # Shape: (1, 1)

    # Placeholder for storing the predicted tokens
    predicted_tokens = []

    for _ in range(max_sequence_length):
        # Pass inputs through the model
        output = model([tokenized_input, target_sequence], training=False)

        # Extract the logits of the last time step
        predictions = output[:, -1, :]  # Shape: (1, vocab_size)

        # Get the predicted token (argmax)
        predicted_token = tf.argmax(predictions, axis=-1).numpy()[0]

        # Append the predicted token to the sequence
        predicted_tokens.append(predicted_token)

        # Check if the <end> token is reached
        if predicted_token == end_token:
            break

        # Append the predicted token to the target sequence for the next step
        target_sequence = tf.concat([target_sequence, [[predicted_token]]], axis=-1)

    # Convert predicted tokens back to words using the vocabulary
    vocab = english_tokenizer.get_vocabulary()
    translated_sentence = " ".join([vocab[token] for token in predicted_tokens if token > 0])

    # Print the input and translated sentence
    print(f"Input (Urdu): {urdu_sentence}")
    print(f"Translated (English): {translated_sentence}")

    return translated_sentence

In [9]:


urdu_sentences = [
    "میں اسکول جا رہا ہوں۔",
    "آج موسم بہت اچھا ہے۔",
    "یہ ایک خوبصورت کتاب ہے۔",
    "کیا آپ مدد کر سکتے ہیں؟",
    "وہ بہت اچھا کھلاڑی ہے۔"
]

for sentence in urdu_sentences:
    translate_urdu_to_english(
        urdu_sentence=sentence,
        model=model,
        urdu_tokenizer=urdu_tokenizer,
        english_tokenizer=english_tokenizer,
        max_sequence_length=max_sequence_length
    )



Input (Urdu): میں اسکول جا رہا ہوں۔
Translated (English): a a a a a a a a a a a a a a a a a a a
Input (Urdu): آج موسم بہت اچھا ہے۔
Translated (English): is a a a a a a a a a a a a a a a a a a
Input (Urdu): یہ ایک خوبصورت کتاب ہے۔
Translated (English): is a a a a a a a a a a a a a a a a a a
Input (Urdu): کیا آپ مدد کر سکتے ہیں؟
Translated (English): you have you you you have you you am you you have you have you have you have you
Input (Urdu): وہ بہت اچھا کھلاڑی ہے۔
Translated (English): is a a a a a a a a a a a a a a a a a a
