In [1]:
# Import necessary libraries and functions
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Ensure GPU is used
if tf.config.list_physical_devices('GPU'):
    print("GPU is available.")
    tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)
else:
    print("No GPU detected. Falling back to CPU.")

GPU is available.


In [3]:
# Read the contents of the English text file
with open("/content/drive/MyDrive/UNH Data Science Courses/DSCI 6011 - DL/Final Project/data/english-corpus.txt", 'r',encoding="utf8") as file:
    english_lines = file.readlines()

# Read the contents of the Urdu text file
with open("/content/drive/MyDrive/UNH Data Science Courses/DSCI 6011 - DL/Final Project/data/urdu-corpus.txt", 'r',encoding="utf8") as file:
    urdu_lines = file.readlines()

# Create a DataFrame
df = pd.DataFrame({'English': english_lines, 'Urdu': urdu_lines})

# Remove newline characters from the strings
df['English'] = df['English'].str.strip()
df['Urdu'] = df['Urdu'].str.strip()

df.head()

Unnamed: 0,English,Urdu
0,is zain your nephew,زین تمہارا بھتیجا ہے۔
1,i wish youd trust me,کاش تم مجھ پر بھروسہ کرتے
2,did he touch you,کیا اس نے آپ کو چھوا؟
3,its part of life,اس کی زندگی کا حصہ
4,zain isnt ugly,زین بدصورت نہیں ہے۔


In [4]:
# Calculate the maximum length of the sentences in either language
max_length_english = max(df['English'].str.split().apply(len))
max_length_urdu = max(df['Urdu'].str.split().apply(len))
max_sequence_length = max(max_length_english, max_length_urdu)

# Add start and end tokens
df['English'] = "<start> " + df['English'] + " <end>"
df['Urdu'] = "<start> " + df['Urdu'] + " <end>"

# Ensure tokenizers handle padding explicitly
english_tokenizer = tf.keras.layers.TextVectorization(output_sequence_length=max_sequence_length, ragged=False)
urdu_tokenizer = tf.keras.layers.TextVectorization(output_sequence_length=max_sequence_length, ragged=False)

# Adapt tokenizers
english_tokenizer.adapt(["<pad>", "<start>", "<end>"] + list(df['English']))
urdu_tokenizer.adapt(["<pad>", "<start>", "<end>"] + list(df['Urdu']))

# Tokenize data
english_sequences = english_tokenizer(df['English']).numpy()
urdu_sequences = urdu_tokenizer(df['Urdu']).numpy()

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    urdu_sequences, english_sequences, test_size=0.2, random_state=42)

In [5]:
# Shift decoder targets
def create_shifted_targets(y):
    return y[:, :-1], y[:, 1:]

train_decoder_input, train_decoder_target = create_shifted_targets(y_train)
val_decoder_input, val_decoder_target = create_shifted_targets(y_val)

In [6]:
# Define padding function
def create_padding_mask(sequence):
    return tf.cast(tf.math.equal(sequence, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]  # Mask shape: (batch_size, 1, 1, sequence_length)

In [7]:
# Encoder padding mask for X_train
train_encoder_padding_mask = create_padding_mask(X_train)
val_encoder_padding_mask = create_padding_mask(X_val)

# Decoder padding mask for train_decoder_input
train_decoder_padding_mask = create_padding_mask(train_decoder_input)
val_decoder_padding_mask = create_padding_mask(val_decoder_input)

In [8]:
# Define positinoal embedding class
class PositionalEncoding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim):
        super(PositionalEncoding, self).__init__()
        self.token_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embedding = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)

    def call(self, x):
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        return self.token_embedding(x) + self.position_embedding(positions)

In [9]:
# Define encoder
def transformer_encoder(embed_dim, num_heads, ff_dim, dropout_rate=0.1):
    inputs = layers.Input(shape=(None, embed_dim))
    mask = layers.Input(shape=(1, 1, None))
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs, attention_mask=mask)
    attention = layers.Dropout(dropout_rate)(attention)
    attention = layers.LayerNormalization(epsilon=1e-6)(inputs + attention)
    ff = layers.Dense(ff_dim, activation="relu")(attention)
    ff = layers.Dense(embed_dim)(ff)
    ff = layers.Dropout(dropout_rate)(ff)
    outputs = layers.LayerNormalization(epsilon=1e-6)(attention + ff)
    return tf.keras.Model(inputs=[inputs, mask], outputs=outputs)

# Define decoder
def transformer_decoder(embed_dim, num_heads, ff_dim, dropout_rate=0.1):
    enc_inputs = layers.Input(shape=(None, embed_dim))
    dec_inputs = layers.Input(shape=(None, embed_dim))
    mask = layers.Input(shape=(1, 1, None))
    attention1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(dec_inputs, dec_inputs, attention_mask=mask)
    attention1 = layers.LayerNormalization(epsilon=1e-6)(dec_inputs + attention1)
    attention2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(attention1, enc_inputs)
    attention2 = layers.LayerNormalization(epsilon=1e-6)(attention1 + attention2)
    ff = layers.Dense(ff_dim, activation="relu")(attention2)
    ff = layers.Dense(embed_dim)(ff)
    ff = layers.Dropout(dropout_rate)(ff)
    outputs = layers.LayerNormalization(epsilon=1e-6)(attention2 + ff)
    return tf.keras.Model(inputs=[enc_inputs, dec_inputs, mask], outputs=outputs)

In [11]:
# Build the model by layers
vocab_size_english = len(english_tokenizer.get_vocabulary())
vocab_size_urdu = len(urdu_tokenizer.get_vocabulary())
embed_dim = 128
num_heads = 8
ff_dim = 64

# Inputs
encoder_inputs = layers.Input(shape=(None,))
decoder_inputs = layers.Input(shape=(None,))
encoder_mask = layers.Input(shape=(1, 1, None))
decoder_mask = layers.Input(shape=(1, 1, None))

# Encoder
encoder_embeddings = PositionalEncoding(max_sequence_length, vocab_size_urdu, embed_dim)(encoder_inputs)
encoder = transformer_encoder(embed_dim, num_heads, ff_dim)
encoder_outputs = encoder([encoder_embeddings, encoder_mask])

# Decoder
decoder_embeddings = PositionalEncoding(max_sequence_length, vocab_size_english, embed_dim)(decoder_inputs)
decoder = transformer_decoder(embed_dim, num_heads, ff_dim)
decoder_outputs = decoder([encoder_outputs, decoder_embeddings, decoder_mask])

# Output
outputs = layers.Dense(vocab_size_english, activation="softmax")(decoder_outputs)

# Model
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs, encoder_mask, decoder_mask], outputs=outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [12]:
# Instantiate optimizer and loss function criterion
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [13]:
# Train the Model

history = model.fit(
    [X_train, train_decoder_input, train_encoder_padding_mask, train_decoder_padding_mask],
    train_decoder_target,
    validation_data=(
        [X_val, val_decoder_input, val_encoder_padding_mask, val_decoder_padding_mask],
        val_decoder_target
    ),
    batch_size=128,
    epochs=10
)

Epoch 1/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 118ms/step - accuracy: 0.7246 - loss: 3.4123 - val_accuracy: 0.8110 - val_loss: 1.2078
Epoch 2/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - accuracy: 0.8136 - loss: 1.1491 - val_accuracy: 0.8409 - val_loss: 0.9784
Epoch 3/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.8551 - loss: 0.8552 - val_accuracy: 0.8802 - val_loss: 0.7261
Epoch 4/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.8957 - loss: 0.5902 - val_accuracy: 0.9023 - val_loss: 0.5820
Epoch 5/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9239 - loss: 0.4171 - val_accuracy: 0.9147 - val_loss: 0.5043
Epoch 6/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9424 - loss: 0.3001 - val_accuracy: 0.9214 - val_loss: 0.4635
Epoch 7/10
[1m154/

In [14]:
def translate_urdu_to_english(urdu_sentence, model, urdu_tokenizer, english_tokenizer, max_sequence_length):
    # Tokenize the Urdu sentence
    tokenized_input = urdu_tokenizer([urdu_sentence]).numpy()
    tokenized_input = tf.convert_to_tensor(tokenized_input)

    # Create padding mask for the encoder
    encoder_mask = create_padding_mask(tokenized_input)

    # Initialize decoder input with <start> token
    start_token = english_tokenizer("<start>").numpy()[0]
    end_token = english_tokenizer("<end>").numpy()[0]
    target_sequence = tf.convert_to_tensor([[start_token]])

    # Store predicted tokens
    predicted_tokens = []

    for _ in range(max_sequence_length):
        # Create decoder mask for the current target sequence
        decoder_mask = create_padding_mask(target_sequence)

        # Call the model with the current inputs
        output = model([tokenized_input, target_sequence, encoder_mask, decoder_mask], training=False)

        # Get the predicted token
        predictions = output[:, -1, :]  # Get the logits for the last timestep
        predicted_token = tf.argmax(predictions, axis=-1).numpy()[0]

        # Append the predicted token
        predicted_tokens.append(predicted_token)

        # Break if <end> token is reached
        if predicted_token == end_token:
            break

        # Update the target sequence by appending the predicted token
        target_sequence = tf.concat([target_sequence, [[predicted_token]]], axis=-1)

    # Convert tokens back to text
    vocab = english_tokenizer.get_vocabulary()
    translated_sentence = " ".join([vocab[token] for token in predicted_tokens if token not in {0, start_token, end_token}])

    # Print the translation
    print(f"Input (Urdu): {urdu_sentence}")
    print(f"Translated (English): {translated_sentence}")
    return translated_sentence

In [15]:
# Testing of unseen example urdu sentences
urdu_sentences = [
    "میں اسکول جا رہا ہوں۔",
    "آج موسم بہت اچھا ہے۔",
    "یہ ایک خوبصورت کتاب ہے۔",
    "کیا آپ مدد کر سکتے ہیں؟",
    "وہ بہت اچھا کھلاڑی ہے۔"
]

for sentence in urdu_sentences:
    translate_urdu_to_english(sentence, model, urdu_tokenizer, english_tokenizer, max_sequence_length)



Input (Urdu): میں اسکول جا رہا ہوں۔
Translated (English): to be going to school
Input (Urdu): آج موسم بہت اچھا ہے۔
Translated (English): the weather is so well today
Input (Urdu): یہ ایک خوبصورت کتاب ہے۔
Translated (English): a is a beautiful book
Input (Urdu): کیا آپ مدد کر سکتے ہیں؟
Translated (English): can you do help me
Input (Urdu): وہ بہت اچھا کھلاڑی ہے۔
Translated (English): a great of great player
