In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("Current CUDA device:", torch.cuda.get_device_name(0))


CUDA available: True
CUDA device count: 1
Current CUDA device: NVIDIA GeForce RTX 4060


In [4]:
# System and numerical libraries
import os
import numpy as np

# PyTorch libraries for model building, training, and evaluation
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Tokenization and text preprocessing
from collections import Counter
import re

# BLEU score for evaluation
from nltk.translate.bleu_score import sentence_bleu

# Visualization
import matplotlib.pyplot as plt


In [5]:
# Reading the data from text files
def read_corpus(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return lines

english_sentences = read_corpus("Dataset/english-corpus.txt")
urdu_sentences = read_corpus("Dataset/urdu-corpus.txt")

# Check if both files have the same number of sentences
assert len(english_sentences) == len(urdu_sentences), "Mismatch in number of sentences!"

In [6]:
def tokenize(text):
    # Lowercase and remove punctuation (you may need to handle language-specific punctuation for Urdu)
    text = text.lower().strip()
    # Using regex to keep only words (this can be adjusted for Urdu)
    tokens = re.findall(r'\w+', text)
    return tokens

# Tokenize each sentence from both corpora
english_tokens = [tokenize(sentence) for sentence in english_sentences]
urdu_tokens = [tokenize(sentence) for sentence in urdu_sentences]

# Let’s inspect some examples
print("English:", english_tokens[:3])
print("Urdu:", urdu_tokens[:3])

English: [['is', 'zain', 'your', 'nephew'], ['i', 'wish', 'youd', 'trust', 'me'], ['did', 'he', 'touch', 'you']]
Urdu: [['زین', 'تمہارا', 'بھتیجا', 'ہے'], ['کاش', 'تم', 'مجھ', 'پر', 'بھروسہ', 'کرتے'], ['کیا', 'اس', 'نے', 'آپ', 'کو', 'چھوا']]


In [7]:
def build_vocab(tokenized_sentences, min_freq=1):
    counter = Counter()
    for tokens in tokenized_sentences:
        counter.update(tokens)
        
    # Starting index for normal words, plus special tokens
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    index = len(vocab)
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = index
            index += 1
    return vocab

eng_vocab = build_vocab(english_tokens)
urd_vocab = build_vocab(urdu_tokens)

print("English vocab size:", len(eng_vocab))
print("Urdu vocab size:", len(urd_vocab))

English vocab size: 5628
Urdu vocab size: 5621


In [8]:
def sentence_to_indices(tokens, vocab):
    # Start with <sos> and end with <eos>
    indices = [vocab.get("<sos>")]
    indices += [vocab.get(token, vocab.get("<unk>")) for token in tokens]
    indices.append(vocab.get("<eos>"))
    return indices

eng_sequences = [sentence_to_indices(tokens, eng_vocab) for tokens in english_tokens]
urd_sequences = [sentence_to_indices(tokens, urd_vocab) for tokens in urdu_tokens]

# Example: print first sequence
print("English Sequence Example:", eng_sequences[0])
print("Urdu Sequence Example:", urd_sequences[0])


English Sequence Example: [1, 4, 5, 6, 7, 2]
Urdu Sequence Example: [1, 4, 5, 6, 7, 2]


In [9]:
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_sequences(sequences, padding_value=0):
    tensor_list = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
    padded_tensor = pad_sequence(tensor_list, batch_first=True, padding_value=padding_value)
    return padded_tensor

# Example for a batch (or full dataset if memory permits)
eng_padded = pad_sequences(eng_sequences, padding_value=eng_vocab["<pad>"])
urd_padded = pad_sequences(urd_sequences, padding_value=urd_vocab["<pad>"])

print("Padded English batch shape:", eng_padded.shape)


Padded English batch shape: torch.Size([24525, 18])


In [10]:


# Encoder model
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        
    def forward(self, src):
        # src shape: [batch_size, src_len]
        embedded = self.embedding(src)  # [batch_size, src_len, emb_dim]
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

# Decoder model
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
    def forward(self, input, hidden, cell):
        # input shape: [batch_size] -> we add a time-dim (length 1)
        input = input.unsqueeze(1)
        embedded = self.embedding(input)  # [batch_size, 1, emb_dim]
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))  # [batch_size, output_dim]
        return prediction, hidden, cell

# Seq2Seq model that ties encoder and decoder together
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.embedding.num_embeddings
        
        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        # Encode the source sentence
        hidden, cell = self.encoder(src)
        
        # First input to the decoder is the <sos> token
        input = trg[:, 0]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        
        return outputs

# Define hyperparameters and instantiate models:
INPUT_DIM = len(eng_vocab)
OUTPUT_DIM = len(urd_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(encoder, decoder, device).to(device)

# Print model summary
print(model)


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5628, 256)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(5621, 256)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5621, bias=True)
  )
)


In [12]:
import torch
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    def __init__(self, src_sequences, trg_sequences):
        assert len(src_sequences) == len(trg_sequences), "Datasets must be of equal length"
        self.src_sequences = src_sequences
        self.trg_sequences = trg_sequences

    def __len__(self):
        return len(self.src_sequences)

    def __getitem__(self, idx):
        # Convert sequences (lists of indices) to tensors
        src_tensor = torch.tensor(self.src_sequences[idx], dtype=torch.long)
        trg_tensor = torch.tensor(self.trg_sequences[idx], dtype=torch.long)
        return src_tensor, trg_tensor


In [16]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Unpack a batch of source and target sequences
    src_batch, trg_batch = zip(*batch)
    
    # Pad sequences using the corresponding pad token indices from vocabularies.
    # Make sure that eng_vocab["<pad>"] and urd_vocab["<pad>"] exist.
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=eng_vocab["<pad>"])
    trg_batch = pad_sequence(trg_batch, batch_first=True, padding_value=urd_vocab["<pad>"])
    
    return src_batch, trg_batch


In [17]:
from torch.utils.data import DataLoader

# Assuming eng_sequences and urd_sequences were generated earlier from your corpus
dataset = TranslationDataset(eng_sequences, urd_sequences)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [18]:
import torch.optim as optim
import torch.nn as nn

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=urd_vocab["<pad>"])

# Dummy training loop for illustration
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    for src_batch, trg_batch in dataloader:  # assuming you have a DataLoader set up
        src_batch = src_batch.to(device)
        trg_batch = trg_batch.to(device)
        
        optimizer.zero_grad()
        output = model(src_batch, trg_batch, teacher_forcing_ratio=0.5)
        # Output shape: [batch_size, trg_len, output_dim]
        
        # Flatten the output and target for computing loss
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg_batch[:, 1:].reshape(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}: Loss = {epoch_loss / len(dataloader)}")


Epoch 1: Loss = 4.417996755013099
Epoch 2: Loss = 3.365813548437464
Epoch 3: Loss = 2.708676909530023
Epoch 4: Loss = 2.1652185531452086
Epoch 5: Loss = 1.736251979328041
Epoch 6: Loss = 1.3805348039295091
Epoch 7: Loss = 1.079663947477179
Epoch 8: Loss = 0.8477709890966154
Epoch 9: Loss = 0.6804032131466126
Epoch 10: Loss = 0.5483123998909295


In [20]:
import random
from nltk.translate.bleu_score import sentence_bleu

# Function to evaluate a single sentence using the model.
def evaluate_sentence(model, sentence, eng_vocab, urd_vocab, max_len=50):
    model.eval()
    tokens = tokenize(sentence)  # tokenize the input sentence (should include lowercasing etc.)
    indices = sentence_to_indices(tokens, eng_vocab)  # convert tokens to indices, adding <sos> and <eos>
    src_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)
    # Start decoding with <sos>
    trg_indices = [urd_vocab["<sos>"]]
    
    for _ in range(max_len):
        trg_tensor = torch.tensor([trg_indices[-1]], dtype=torch.long).to(device)
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
        next_token = output.argmax(1).item()
        trg_indices.append(next_token)
        if next_token == urd_vocab["<eos>"]:
            break
            
    return trg_indices

# Create an inverse vocabulary for Urdu for converting indices back to words.
inv_urd_vocab = {v: k for k, v in urd_vocab.items()}

# Choose 10 random indices from the english dataset.
num_samples = 10
random_indices = random.sample(range(len(english_sentences)), num_samples)

# Loop over the selected indices, process each sentence, and print the results.
for i in random_indices:
    eng_sentence = english_sentences[i]
    ref_urdu_sentence = urdu_sentences[i]
    
    # Predict the translation for the given English sentence.
    predicted_indices = evaluate_sentence(model, eng_sentence, eng_vocab, urd_vocab)
    predicted_translation = " ".join([inv_urd_vocab.get(idx, "<unk>") for idx in predicted_indices])
    
    # For BLEU score, tokenize both the predicted and reference sentences.
    candidate_tokens = [inv_urd_vocab.get(idx, "<unk>") for idx in predicted_indices]
    reference_tokens = tokenize(ref_urdu_sentence)
    bleu = sentence_bleu([reference_tokens], candidate_tokens)
    
    print("English Sentence   :", eng_sentence)
    print("Reference Urdu     :", ref_urdu_sentence)
    print("Predicted Urdu     :", predicted_translation)
    print("BLEU Score         :", bleu)
    print("-" * 50)


English Sentence   : she cooks for him
Reference Urdu     : وہ اس کے لیے کھانا پکاتی ہے۔
Predicted Urdu     : <sos> وہ اس کے لیے کھانا پکاتی ہے <eos>
BLEU Score         : 0.7259795291154771
--------------------------------------------------
English Sentence   : they are lost
Reference Urdu     : وہ کھو گئے ہیں
Predicted Urdu     : <sos> وہ کھو رہے ہیں <eos>
BLEU Score         : 8.38826642100846e-155
--------------------------------------------------
English Sentence   : where is she
Reference Urdu     : وہ کہاں ہے
Predicted Urdu     : <sos> وہ کہاں ہے <eos>
BLEU Score         : 6.86809206056511e-78
--------------------------------------------------
English Sentence   : i understand you
Reference Urdu     : میں سمجھتا/سمجھتی ہوں
Predicted Urdu     : <sos> میں تمہیں سمجھتی ہوں <eos>
BLEU Score         : 8.38826642100846e-155
--------------------------------------------------
English Sentence   : the house is vacant
Reference Urdu     : یہ گھر خالی ہے۔
Predicted Urdu     : <sos> یہ گھر خا

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
