In [124]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import os
from tqdm import tqdm
import logging
import json
from datetime import datetime

In [125]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("urdu_phoneme_translation.log")
    ]
)
logger = logging.getLogger(__name__)

In [126]:
import pandas  as pd

data  =  "/home/humair/all_data/urdu-tts/g2p/advnc_output-28k.csv"


In [127]:
df =  pd.read_csv(data)

In [128]:
df.head()

Unnamed: 0,urdu_text,phonemes,ipa,is_valid,attempts
0,عائشہ,ʕ aː ɪ ʃ ə,/ʕaːˈɪʃə/,True,1
1,عطیہ,ʕ ʈ iː j ə,/ʕʈiːˈjə/,True,1
2,طور,t̪ uː r,/t̪uːr/,True,1
3,غلط,ɣ a l t̪,/ɣəlˈt̪/,True,1
4,آرمینیا,ɑː r m iː n iː aː,/ɑːrmɪˈniːaː/,True,1


In [155]:
string = df.ipa[:]

In [156]:
strg = " "


for i in range(len(string)):
    strg   += str(string[i])

In [149]:
x = strg.split()

In [None]:
y = set(x)

In [160]:
def deduplicate_string(s):
    # Convert the string to a set to remove duplicates
    unique_chars = set(s)
    
    # Join the characters back into a string
    deduplicated_str = ''.join(unique_chars)
    
    return deduplicated_str

# Example usage:
input_str = strg
output_str = deduplicate_string(input_str)
print(output_str)  # Output: abc


سclxɴŋũkgẽɒɳəʂmu:اð*ɐf.ہɑv̪ʒɦnʣɚrɜNɔazʱʦõwāɭɡɣʁɾ/ɵWʤ[ʧ)ĩˤḍḥ,ɪɛ̯̃طʰd͡ɟˈɽ̀͒ṭjyʋeãohːqɬنʕɫʌˁχɲ(itpæẓθsʈ-ʔʊʃˌحbħʀɖ̩ ⁿ


In [162]:
len(output_str)

113

In [164]:

# Character set definitions
URDU_CHARS = "'ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيٱٴٸٹپچڈڑړژښکګگںھہۂۃۆۈیېےە'"
PHONEME_CHARS =" سclxɴŋũkgẽɒɳəʂmu:اð*ɐf.ہɑv̪ʒɦnʣɚrɜNɔazʱʦõwāɭɡɣʁɾ/ɵWʤ[ʧ)ĩˤḍḥ,ɪɛ̯̃طʰd͡ɟˈɽ̀͒ṭjyʋeãohːqɬنʕɫʌˁχɲ(itpæẓθsʈ-ʔʊʃˌحbħʀɖ̩ ⁿ"

In [163]:
# Special tokens
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'
SOS_TOKEN = '<SOS>'
EOS_TOKEN = '<EOS>'


In [165]:

class CharacterTokenizer:
    """
    Tokenizes characters and converts between characters and indices.
    """
    def __init__(self, char_set, special_tokens=None):
        """
        Initialize the tokenizer with a character set and optional special tokens.
        
        Args:
            char_set (str): String of characters in the vocabulary
            special_tokens (dict, optional): Dictionary of special tokens
        """
        self.char_set = char_set
        self.special_tokens = special_tokens or {}
        
        # Create mappings
        self.char_to_idx = {char: idx+1 for idx, char in enumerate(char_set)}
        
        # Add special tokens
        next_idx = len(self.char_to_idx) + 1
        for token, idx in self.special_tokens.items():
            if idx is None:
                self.char_to_idx[token] = next_idx
                next_idx += 1
            else:
                self.char_to_idx[token] = idx
        
        # Create reverse mapping
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}
    
    def encode(self, text, add_special_tokens=False):
        """
        Convert text to indices.
        
        Args:
            text (str): Text to encode
            add_special_tokens (bool): Whether to add SOS/EOS tokens
            
        Returns:
            list: List of indices
        """
        indices = []
        
        if add_special_tokens and SOS_TOKEN in self.char_to_idx:
            indices.append(self.char_to_idx[SOS_TOKEN])
            
        for char in text:
            indices.append(self.char_to_idx.get(char, self.char_to_idx.get(UNK_TOKEN, 0)))
            
        if add_special_tokens and EOS_TOKEN in self.char_to_idx:
            indices.append(self.char_to_idx[EOS_TOKEN])
            
        return indices
    
    def decode(self, indices, skip_special_tokens=True):
        """
        Convert indices to text.
        
        Args:
            indices (list): List of indices
            skip_special_tokens (bool): Whether to skip special tokens
            
        Returns:
            str: Decoded text
        """
        special_token_values = set(self.char_to_idx.values()) if skip_special_tokens else set()
        chars = []
        
        for idx in indices:
            if idx in self.idx_to_char and (not skip_special_tokens or idx not in special_token_values):
                chars.append(self.idx_to_char[idx])
                
        return ''.join(chars)
    
    def __len__(self):
        """Return the size of the vocabulary."""
        return len(self.char_to_idx)


In [166]:

class UrduPhonemeDataset(Dataset):
    """
    Dataset for Urdu to phoneme translation.
    """
    def __init__(
        self, 
        urdu_texts, 
        phoneme_texts, 
        urdu_tokenizer, 
        phoneme_tokenizer, 
        max_urdu_len=18, 
        max_phoneme_len=20
    ):
        """
        Initialize the dataset.
        
        Args:
            urdu_texts (list): List of Urdu texts
            phoneme_texts (list): List of phoneme texts
            urdu_tokenizer (CharacterTokenizer): Tokenizer for Urdu text
            phoneme_tokenizer (CharacterTokenizer): Tokenizer for phoneme text
            max_urdu_len (int): Maximum Urdu text length
            max_phoneme_len (int): Maximum phoneme text length
        """
        self.urdu_texts = urdu_texts
        self.phoneme_texts = phoneme_texts
        self.urdu_tokenizer = urdu_tokenizer
        self.phoneme_tokenizer = phoneme_tokenizer
        self.max_urdu_len = max_urdu_len
        self.max_phoneme_len = max_phoneme_len
    
    def __len__(self):
        return len(self.urdu_texts)
    
    def __getitem__(self, idx):
        urdu_text = self.urdu_texts[idx]
        phoneme_text = self.phoneme_texts[idx]
        
        # Convert to indices
        urdu_indices = self.urdu_tokenizer.encode(urdu_text)
        phoneme_indices = self.phoneme_tokenizer.encode(phoneme_text, add_special_tokens=True)
        
        # Pad sequences
        pad_token_idx = self.urdu_tokenizer.char_to_idx.get(PAD_TOKEN, 0)
        urdu_indices = urdu_indices[:self.max_urdu_len] + [pad_token_idx] * max(0, self.max_urdu_len - len(urdu_indices))
        
        pad_token_idx = self.phoneme_tokenizer.char_to_idx.get(PAD_TOKEN, 0)
        phoneme_indices = phoneme_indices[:self.max_phoneme_len] + [pad_token_idx] * max(0, self.max_phoneme_len - len(phoneme_indices))
        
        return {
            'urdu': torch.tensor(urdu_indices, dtype=torch.long),
            'phoneme': torch.tensor(phoneme_indices, dtype=torch.long),
            'urdu_len': min(len(urdu_text), self.max_urdu_len),
            'phoneme_len': min(len(phoneme_text) + 2, self.max_phoneme_len)  # +2 for SOS and EOS
        }

In [167]:

class Encoder(nn.Module):
    """
    Encoder module for the sequence-to-sequence model.
    """
    def __init__(
        self, 
        input_size, 
        embedding_size, 
        hidden_size, 
        num_layers, 
        dropout=0.5,
        bidirectional=False
    ):
        """
        Initialize the encoder.
        
        Args:
            input_size (int): Size of the input vocabulary
            embedding_size (int): Size of the embedding vectors
            hidden_size (int): Size of the hidden state
            num_layers (int): Number of LSTM layers
            dropout (float): Dropout probability
            bidirectional (bool): Whether to use bidirectional LSTM
        """
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.directions = 2 if bidirectional else 1
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            embedding_size, 
            hidden_size, 
            num_layers, 
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True,
            bidirectional=bidirectional
        )
        self.dropout = nn.Dropout(dropout)
        
        # If bidirectional, we need to reduce dimensions for the decoder
        if bidirectional:
            self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
            self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
    
    def forward(self, x):
        """
        Forward pass of the encoder.
        
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_length)
            
        Returns:
            tuple: (outputs, hidden, cell)
                outputs: Tensor of shape (batch_size, seq_length, hidden_size * directions)
                hidden: Tensor of shape (num_layers, batch_size, hidden_size)
                cell: Tensor of shape (num_layers, batch_size, hidden_size)
        """
        # x shape: (batch_size, seq_length)
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (batch_size, seq_length, embedding_size)
        
        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (batch_size, seq_length, hidden_size * directions)
        # hidden shape: (num_layers * directions, batch_size, hidden_size)
        # cell shape: (num_layers * directions, batch_size, hidden_size)
        
        # If bidirectional, reshape hidden and cell states
        if self.bidirectional:
            # Reshape hidden and cell to (num_layers, batch_size, hidden_size * 2)
            hidden = hidden.view(self.num_layers, 2, -1, self.hidden_size)
            cell = cell.view(self.num_layers, 2, -1, self.hidden_size)
            
            # Concatenate bidirectional states
            hidden = torch.cat([hidden[:, 0], hidden[:, 1]], dim=2)
            cell = torch.cat([cell[:, 0], cell[:, 1]], dim=2)
            
            # Apply linear transformation to get the correct size
            hidden = self.fc_hidden(hidden)
            cell = self.fc_cell(cell)
        
        return outputs, hidden, cell


In [168]:
class Attention(nn.Module):
    """
    Attention mechanism for the decoder.
    """
    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        """
        Initialize the attention mechanism.
        
        Args:
            encoder_hidden_dim (int): Dimension of encoder hidden states
            decoder_hidden_dim (int): Dimension of decoder hidden states
        """
        super(Attention, self).__init__()
        self.attn = nn.Linear((encoder_hidden_dim + decoder_hidden_dim), decoder_hidden_dim)
        self.v = nn.Linear(decoder_hidden_dim, 1, bias=False)
        
    def forward(self, hidden, encoder_outputs):
        """
        Forward pass of the attention mechanism.
        
        Args:
            hidden (torch.Tensor): Decoder hidden state of shape (batch_size, decoder_hidden_dim)
            encoder_outputs (torch.Tensor): Encoder outputs of shape (batch_size, seq_len, encoder_hidden_dim)
            
        Returns:
            torch.Tensor: Attention weights of shape (batch_size, src_len)
        """
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        
        # Repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        # Calculate energy
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        
        # Calculate attention weights
        attention = self.v(energy).squeeze(2)
        
        return torch.softmax(attention, dim=1)


In [169]:
class AttentionDecoder(nn.Module):
    """
    Decoder with attention mechanism.
    """
    def __init__(
        self, 
        input_size, 
        embedding_size, 
        encoder_hidden_size, 
        hidden_size, 
        output_size, 
        num_layers, 
        dropout=0.5,
        attention=None
    ):
        """
        Initialize the decoder.
        
        Args:
            input_size (int): Size of the input vocabulary
            embedding_size (int): Size of the embedding vectors
            encoder_hidden_size (int): Size of the encoder hidden state
            hidden_size (int): Size of the decoder hidden state
            output_size (int): Size of the output vocabulary
            num_layers (int): Number of LSTM layers
            dropout (float): Dropout probability
            attention (Attention, optional): Attention mechanism
        """
        super(AttentionDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.attention = attention
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        
        # If using attention, increase input size to the LSTM
        rnn_input_size = embedding_size
        if attention:
            rnn_input_size += encoder_hidden_size
            
        self.rnn = nn.LSTM(
            rnn_input_size, 
            hidden_size, 
            num_layers, 
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )
        
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, hidden, cell, encoder_outputs=None):
        """
        Forward pass of the decoder.
        
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, 1)
            hidden (torch.Tensor): Hidden state of shape (num_layers, batch_size, hidden_size)
            cell (torch.Tensor): Cell state of shape (num_layers, batch_size, hidden_size)
            encoder_outputs (torch.Tensor, optional): Encoder outputs for attention
            
        Returns:
            tuple: (prediction, hidden, cell)
                prediction: Tensor of shape (batch_size, output_size)
                hidden: Updated hidden state
                cell: Updated cell state
        """
        # x shape: (batch_size, 1)
        x = x.unsqueeze(1)
        
        # Get embedding
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (batch_size, 1, embedding_size)
        
        # Apply attention if available
        if self.attention and encoder_outputs is not None:
            # Get the top layer's hidden state for attention
            top_hidden = hidden[-1]
            
            # Calculate attention weights
            a = self.attention(top_hidden, encoder_outputs)
            
            # Apply attention weights to encoder outputs
            weighted = torch.bmm(a.unsqueeze(1), encoder_outputs)
            
            # Concatenate with embedding
            embedding = torch.cat((embedding, weighted), dim=2)
        
        # Pass through RNN
        output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # output shape: (batch_size, 1, hidden_size)
        
        # Generate prediction
        prediction = self.fc(output.squeeze(1))
        # prediction shape: (batch_size, output_size)
        
        return prediction, hidden, cell

In [170]:
class Seq2Seq(nn.Module):
    """
    Sequence-to-sequence model for Urdu to phoneme translation.
    """
    def __init__(self, encoder, decoder, device):
        """
        Initialize the sequence-to-sequence model.
        
        Args:
            encoder (Encoder): Encoder module
            decoder (AttentionDecoder): Decoder module
            device (torch.device): Device to run the model on
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """
        Forward pass of the sequence-to-sequence model.
        
        Args:
            src (torch.Tensor): Source sequence of shape (batch_size, src_len)
            trg (torch.Tensor): Target sequence of shape (batch_size, trg_len)
            teacher_forcing_ratio (float): Probability of using teacher forcing
            
        Returns:
            torch.Tensor: Output sequence of shape (batch_size, trg_len, output_dim)
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_size
        
        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        # Encode the source sequence
        encoder_outputs, hidden, cell = self.encoder(src)
        
        # First input to the decoder is the SOS token
        input = trg[:, 0]
        
        for t in range(1, trg_len):
            # Get the prediction from decoder
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            
            # Store the prediction
            outputs[:, t] = output
            
            # Teacher forcing: use actual target or predicted token
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        
        return outputs
    
    def translate(self, src, phoneme_tokenizer, max_length=100):
        """
        Translate Urdu text to phoneme.
        
        Args:
            src (torch.Tensor): Source sequence of shape (1, src_len)
            phoneme_tokenizer (CharacterTokenizer): Phoneme tokenizer
            max_length (int): Maximum length of the generated sequence
            
        Returns:
            str: Translated phoneme text
        """
        self.eval()
        
        with torch.no_grad():
            # Encode the source sequence
            encoder_outputs, hidden, cell = self.encoder(src)
            
            # Start with SOS token
            input = torch.tensor([phoneme_tokenizer.char_to_idx[SOS_TOKEN]]).to(self.device)
            
            trg_indices = [phoneme_tokenizer.char_to_idx[SOS_TOKEN]]
            
            for _ in range(max_length):
                # Get prediction from decoder
                output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
                
                # Get the best prediction
                pred_token = output.argmax(1).item()
                trg_indices.append(pred_token)
                
                # Stop if EOS token is predicted
                if pred_token == phoneme_tokenizer.char_to_idx[EOS_TOKEN]:
                    break
                
                # Update input for next step
                input = torch.tensor([pred_token]).to(self.device)
            
        # Convert indices to text
        return phoneme_tokenizer.decode(trg_indices[1:])  # Skip SOS token



In [172]:
class EarlyStopping:
    """
    Early stopping to prevent overfitting.
    """
    def __init__(self, patience=5, min_delta=0, path='checkpoint.pt'):
        """
        Initialize early stopping.
        
        Args:
            patience (int): Number of epochs to wait for improvement
            min_delta (float): Minimum change to qualify as improvement
            path (str): Path to save the best model
        """
        self.patience = patience
        self.min_delta = min_delta
        self.path = path
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False
        
    def __call__(self, val_loss, model):
        """
        Check if training should stop.
        
        Args:
            val_loss (float): Validation loss
            model (nn.Module): Model to save
            
        Returns:
            bool: Whether to stop training
        """
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.path)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                
        return self.early_stop


def train_model(model, train_loader, val_loader, optimizer, criterion, device, 
                num_epochs, checkpoint_dir, early_stopping_patience=5):
    """
    Train the model.
    
    Args:
        model (Seq2Seq): Model to train
        train_loader (DataLoader): Training data loader
        val_loader (DataLoader): Validation data loader
        optimizer (optim.Optimizer): Optimizer
        criterion (nn.Module): Loss function
        device (torch.device): Device to run the model on
        num_epochs (int): Number of epochs to train
        checkpoint_dir (str): Directory to save checkpoints
        early_stopping_patience (int): Patience for early stopping
        
    Returns:
        tuple: (train_losses, val_losses)
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, f"best_model.pt")
    early_stopping = EarlyStopping(patience=early_stopping_patience, path=checkpoint_path)
    
    train_losses = []
    val_losses = []
    
    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5, verbose=True)
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for batch in progress_bar:
            src = batch['urdu'].to(device)
            trg = batch['phoneme'].to(device)
            
            optimizer.zero_grad()
            
            # Teacher forcing ratio decreases over time
            teacher_forcing_ratio = max(0.5 * (1.0 - epoch / num_epochs), 0.1)
            output = model(src, trg, teacher_forcing_ratio)
            
            # Reshape for loss calculation
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)  # Skip SOS token
            trg = trg[:, 1:].reshape(-1)  # Skip SOS token
            
            loss = criterion(output, trg)
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            
            optimizer.step()
            train_loss += loss.item()
            
            # Update progress bar
            progress_bar.set_postfix({'loss': loss.item()})
        
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        # Validation
        val_loss = evaluate(model, val_loader, criterion, device)
        val_losses.append(val_loss)
        
        # Update learning rate
        scheduler.step(val_loss)
        
        logger.info(f"Epoch {epoch+1}/{num_epochs}, "
                   f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
                   f"LR: {optimizer.param_groups[0]['lr']:.6f}")
        
        # Early stopping
        if early_stopping(val_loss, model):
            logger.info(f"Early stopping triggered after epoch {epoch+1}")
            break
    
    return train_losses, val_losses


In [173]:
def evaluate(model, dataloader, criterion, device):
    """
    Evaluate the model.
    
    Args:
        model (Seq2Seq): Model to evaluate
        dataloader (DataLoader): Data loader
        criterion (nn.Module): Loss function
        device (torch.device): Device to run the model on
        
    Returns:
        float: Average loss
    """
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            src = batch['urdu'].to(device)
            trg = batch['phoneme'].to(device)
            
            output = model(src, trg, 0)  # No teacher forcing
            
            # Reshape for loss calculation
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)
            
            loss = criterion(output, trg)
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

def process_data(file_path):
    """
    Process data from CSV file for Urdu to phoneme translation.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        tuple: (urdu_texts, phoneme_texts) containing lists of cleaned Urdu and phoneme texts.

    Raises:
        ValueError: If no valid data remains after processing.
        Exception: For other data processing errors.
    """
    logger.info(f"Loading data from {file_path}")

    try:
        # Read the CSV file
        df = pd.read_csv(file_path, header=None, encoding='utf-8')
        logger.info(f"Raw data shape: {df.shape}")
        logger.info(f"First few rows:\n{df.head().to_string()}")

        # Ensure exactly 5 columns
        if df.shape[1] != 5:
            raise ValueError(f"Expected 5 columns, but got {df.shape[1]}")

        # Assign column names
        df.columns = ['urdu_text', 'phonemes', 'ipa_notation', 'is_valid', 'attempts']

        # Log unique values in is_valid for debugging
        logger.info(f"Unique values in 'is_valid' column: {df['is_valid'].unique()}")

        # Filter valid entries (optional, commented out as per your version)
        # valid_df = df[df['is_valid'] == True]
        valid_df = df  # Use all rows as per your modification
        logger.info(f"Number of rows after filtering: {len(valid_df)}")

        # Drop rows with NaN in urdu_text or phonemes
        valid_df = valid_df.dropna(subset=['urdu_text', 'phonemes'])
        logger.info(f"Number of rows after dropping NaN: {len(valid_df)}")

        if len(valid_df) == 0:
            raise ValueError("No valid data after filtering and dropping NaN values.")

        # Extract data
        urdu_texts = valid_df['urdu_text'].tolist()
        phoneme_texts = valid_df['phonemes'].tolist()

        # Clean the data
        cleaned_urdu_texts = []
        cleaned_phoneme_texts = []
        for urdu, phoneme in zip(urdu_texts, phoneme_texts):
            # Handle non-string values
            urdu_str = '' if pd.isna(urdu) else str(urdu)
            phoneme_str = '' if pd.isna(phoneme) else str(phoneme)

            # Clean Urdu text: remove leading punctuation
            cleaned_urdu = re.sub(r'^[؂،]+', '', urdu_str).strip()

            # Clean phoneme text: normalize whitespace
            cleaned_phoneme = re.sub(r'\s+', ' ', phoneme_str).strip()

            # Only include non-empty pairs
            if cleaned_urdu and cleaned_phoneme:
                cleaned_urdu_texts.append(cleaned_urdu)
                cleaned_phoneme_texts.append(cleaned_phoneme)
            else:
                logger.warning(f"Skipping empty or invalid pair: urdu='{urdu_str}', phoneme='{phoneme_str}'")

        logger.info(f"Loaded {len(cleaned_urdu_texts)} valid examples after cleaning")

        if len(cleaned_urdu_texts) == 0:
            raise ValueError("No valid data after cleaning. Check for empty or invalid entries.")

        return cleaned_urdu_texts, cleaned_phoneme_texts

    except Exception as e:
        logger.error(f"Error processing data: {e}")
        raise


In [174]:
def visualize_training(train_losses, val_losses, save_path):
    """
    Visualize training and validation losses.
    
    Args:
        train_losses (list): Training losses
        val_losses (list): Validation losses
        save_path (str): Path to save the plot
    """
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.savefig(save_path)
    plt.close()
    
    logger.info(f"Training visualization saved to {save_path}")

In [175]:



def save_model(
    model, 
    urdu_tokenizer, 
    phoneme_tokenizer, 
    model_path, 
    config_path,
    train_losses=None, 
    val_losses=None
):
    """
    Save the model and tokenizers.
    
    Args:
        model (Seq2Seq): Trained model
        urdu_tokenizer (CharacterTokenizer): Urdu tokenizer
        phoneme_tokenizer (CharacterTokenizer): Phoneme tokenizer
        model_path (str): Path to save the model
        config_path (str): Path to save the configuration
        train_losses (list, optional): Training losses
        val_losses (list, optional): Validation losses
    """
    # Save model state
    model_dict = {
        'encoder': model.encoder.state_dict(),
        'decoder': model.decoder.state_dict(),
    }
    torch.save(model_dict, model_path)
    
    # Save tokenizers and other configuration
    config = {
        'urdu_char_to_idx': urdu_tokenizer.char_to_idx,
        'phoneme_char_to_idx': phoneme_tokenizer.char_to_idx,
        'urdu_idx_to_char': urdu_tokenizer.idx_to_char,
        'phoneme_idx_to_char': phoneme_tokenizer.idx_to_char,
    }
    
    # Add training history if available
    if train_losses and val_losses:
        config['train_losses'] = train_losses
        config['val_losses'] = val_losses
    
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)
    
    logger.info(f"Model saved to {model_path}")
    logger.info(f"Configuration saved to {config_path}")


def load_model(model_path, config_path, device):
    """
    Load a saved model.
    
    Args:
        model_path (str): Path to the saved model
        config_path (str): Path to the saved configuration
        device (torch.device): Device to load the model on
        
    Returns:
        tuple: (model, urdu_tokenizer, phoneme_tokenizer)
    """
    logger.info(f"Loading model from {model_path}")
    
    # Load configuration
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    
    # Create tokenizers
    urdu_tokenizer = CharacterTokenizer("")
    phoneme_tokenizer = CharacterTokenizer("")
    
    # Restore tokenizers from config
    urdu_tokenizer.char_to_idx = {k: int(v) if isinstance(v, str) and v.isdigit() else v for k, v in config['urdu_char_to_idx'].items()}
    phoneme_tokenizer.char_to_idx = {k: int(v) if isinstance(v, str) and v.isdigit() else v for k, v in config['phoneme_char_to_idx'].items()}
    urdu_tokenizer.idx_to_char = {int(k) if isinstance(k, str) and k.isdigit() else k: v for k, v in config['urdu_idx_to_char'].items()}
    phoneme_tokenizer.idx_to_char = {int(k) if isinstance(k, str) and k.isdigit() else k: v for k, v in config['phoneme_idx_to_char'].items()}
    
    # Initialize models
    encoder = Encoder(
        input_size=len(urdu_tokenizer),
        embedding_size=256,  # Default values, can be overridden if saved in config
        hidden_size=512,
        num_layers=2,
        dropout=0.5,
        bidirectional=True
    )
    
    # Initialize attention
    attention = Attention(
        encoder_hidden_dim=512 * (2 if True else 1),  # Bidirectional
        decoder_hidden_dim=512
    )
    
    decoder = AttentionDecoder(
        input_size=len(phoneme_tokenizer),
        embedding_size=256,
        encoder_hidden_size=512 * (2 if True else 1),  # Bidirectional
        hidden_size=512,
        output_size=len(phoneme_tokenizer),
        num_layers=2,
        dropout=0.5,
        attention=attention
    )
    
    model = Seq2Seq(encoder, decoder, device).to(device)
    
    # Load model state
    model_state = torch.load(model_path, map_location=device)
    model.encoder.load_state_dict(model_state['encoder'])
    model.decoder.load_state_dict(model_state['decoder'])
    
    logger.info("Model loaded successfully")
    
    return model, urdu_tokenizer, phoneme_tokenizer


def translate_urdu_text(model, urdu_text, urdu_tokenizer, phoneme_tokenizer, device, max_length=100):
    """
    Translate Urdu text to phoneme representation.
    
    Args:
        model (Seq2Seq): Trained model
        urdu_text (str): Urdu text to translate
        urdu_tokenizer (CharacterTokenizer): Urdu tokenizer
        phoneme_tokenizer (CharacterTokenizer): Phoneme tokenizer
        device (torch.device): Device to run the model on
        max_length (int): Maximum length of generated phoneme sequence
        
    Returns:
        str: Phoneme representation
    """
    model.eval()
    
    # Convert Urdu text to indices
    urdu_indices = urdu_tokenizer.encode(urdu_text)
    src = torch.tensor([urdu_indices]).to(device)
    
    # Generate translation
    with torch.no_grad():
        return model.translate(src, phoneme_tokenizer, max_length)


In [None]:

def main():
    """Main function to run training and evaluation."""
    # Model parameters
    EMBEDDING_SIZE = 256
    HIDDEN_SIZE = 512
    NUM_LAYERS = 4
    DROPOUT = 0.2
    BATCH_SIZE = 64
    LEARNING_RATE = 0.0005
    NUM_EPOCHS = 40
    USE_BIDIRECTIONAL = True
    USE_ATTENTION = True
    
    # Data parameters
    MAX_URDU_LEN = 15
    MAX_PHONEME_LEN = 20
    CSV_PATH = '/home/humair/all_data/urdu-tts/g2p/advnc_output-28k.csv'
    
    # Directories for saving models and results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    RUN_DIR = f"runs/urdu_phoneme_{timestamp}"
    os.makedirs(RUN_DIR, exist_ok=True)
    
    # Setup logging to file
    file_handler = logging.FileHandler(os.path.join(RUN_DIR, "training.log"))
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)
    
    # Log configuration
    logger.info("Starting Urdu to Phoneme Translation Model Training")
    logger.info(f"Model parameters: Embedding={EMBEDDING_SIZE}, Hidden={HIDDEN_SIZE}, Layers={NUM_LAYERS}, Dropout={DROPOUT}")
    logger.info(f"Training parameters: Batch={BATCH_SIZE}, LR={LEARNING_RATE}, Epochs={NUM_EPOCHS}")
    logger.info(f"Using bidirectional encoder: {USE_BIDIRECTIONAL}")
    logger.info(f"Using attention mechanism: {USE_ATTENTION}")
    
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")
    
    # Initialize tokenizers
    urdu_tokenizer = CharacterTokenizer(
        URDU_CHARS,
        {PAD_TOKEN: 0, UNK_TOKEN: None}
    )
    
    phoneme_tokenizer = CharacterTokenizer(
        PHONEME_CHARS,
        {PAD_TOKEN: 0, SOS_TOKEN: None, EOS_TOKEN: None, UNK_TOKEN: None}
    )
    
    logger.info(f"Urdu vocabulary size: {len(urdu_tokenizer)}")
    logger.info(f"Phoneme vocabulary size: {len(phoneme_tokenizer)}")
    
    # Process data
    urdu_texts, phoneme_texts = process_data(CSV_PATH)
    
    # Split data
    urdu_train, urdu_val, phoneme_train, phoneme_val = train_test_split(
        urdu_texts, phoneme_texts, test_size=0.1, random_state=42
    )
    
    logger.info(f"Training samples: {len(urdu_train)}")
    logger.info(f"Validation samples: {len(urdu_val)}")
    
    # Create datasets
    train_dataset = UrduPhonemeDataset(
        urdu_train, phoneme_train, 
        urdu_tokenizer, phoneme_tokenizer, 
        MAX_URDU_LEN, MAX_PHONEME_LEN
    )
    
    val_dataset = UrduPhonemeDataset(
        urdu_val, phoneme_val, 
        urdu_tokenizer, phoneme_tokenizer, 
        MAX_URDU_LEN, MAX_PHONEME_LEN
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset, 
        batch_size=BATCH_SIZE, 
        shuffle=True, 
        num_workers=4
    )
    
    val_loader = DataLoader(
        val_dataset, 
        batch_size=BATCH_SIZE, 
        num_workers=4
    )
    
    # Initialize encoder
    encoder = Encoder(
        input_size=len(urdu_tokenizer),
        embedding_size=EMBEDDING_SIZE,
        hidden_size=HIDDEN_SIZE,
        num_layers=NUM_LAYERS,
        dropout=DROPOUT,
        bidirectional=USE_BIDIRECTIONAL
    )
    
    # Initialize attention mechanism if enabled
    attention = None
    if USE_ATTENTION:
        attention = Attention(
            encoder_hidden_dim=HIDDEN_SIZE * (2 if USE_BIDIRECTIONAL else 1),
            decoder_hidden_dim=HIDDEN_SIZE
        )
    
    # Initialize decoder
    decoder = AttentionDecoder(
        input_size=len(phoneme_tokenizer),
        embedding_size=EMBEDDING_SIZE,
        encoder_hidden_size=HIDDEN_SIZE * (2 if USE_BIDIRECTIONAL else 1),
        hidden_size=HIDDEN_SIZE,
        output_size=len(phoneme_tokenizer),
        num_layers=NUM_LAYERS,
        dropout=DROPOUT,
        attention=attention
    )
    
    # Initialize sequence-to-sequence model
    model = Seq2Seq(encoder, decoder, device).to(device)
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"Total parameters: {total_params:,}")
    logger.info(f"Trainable parameters: {trainable_params:,}")
    
    # Initialize optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
    # Ignore padding index in loss calculation
    criterion = nn.CrossEntropyLoss(
        ignore_index=phoneme_tokenizer.char_to_idx[PAD_TOKEN]
    )
    
    # Train the model
    train_losses, val_losses = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        num_epochs=NUM_EPOCHS,
        checkpoint_dir=RUN_DIR,
        early_stopping_patience=5
    )
    
    # Visualize training
    visualize_training(
        train_losses=train_losses,
        val_losses=val_losses,
        save_path=os.path.join(RUN_DIR, "training_loss.png")
    )
    
    # Save the model
    save_model(
        model=model,
        urdu_tokenizer=urdu_tokenizer,
        phoneme_tokenizer=phoneme_tokenizer,
        model_path=os.path.join(RUN_DIR, "urdu_phoneme_model.pt"),
        config_path=os.path.join(RUN_DIR, "model_config.json"),
        train_losses=train_losses,
        val_losses=val_losses
    )
    
    # Evaluate on some examples
    logger.info("Evaluating on example sentences:")
    examples = urdu_val[:5]  # Take first 5 validation examples
    
    for i, urdu_text in enumerate(examples):
        actual_phoneme = phoneme_val[i]
        predicted_phoneme = translate_urdu_text(
            model, urdu_text, urdu_tokenizer, phoneme_tokenizer, device
        )
        
        logger.info(f"Example {i+1}:")
        logger.info(f"   Urdu: {urdu_text}")
        logger.info(f"   True: {actual_phoneme}")
        logger.info(f"   Pred: {predicted_phoneme}")
    
    logger.info("Training completed successfully!")


if __name__ == "__main__":
    main()

2025-05-23 17:38:46,879 - __main__ - INFO - Starting Urdu to Phoneme Translation Model Training
2025-05-23 17:38:46,884 - __main__ - INFO - Model parameters: Embedding=256, Hidden=512, Layers=4, Dropout=0.2
2025-05-23 17:38:46,902 - __main__ - INFO - Training parameters: Batch=64, LR=0.0005, Epochs=40
2025-05-23 17:38:46,914 - __main__ - INFO - Using bidirectional encoder: True
2025-05-23 17:38:46,927 - __main__ - INFO - Using attention mechanism: True
2025-05-23 17:38:46,933 - __main__ - INFO - Using device: cpu
2025-05-23 17:38:46,947 - __main__ - INFO - Urdu vocabulary size: 65
2025-05-23 17:38:46,953 - __main__ - INFO - Phoneme vocabulary size: 117
2025-05-23 17:38:46,961 - __main__ - INFO - Loading data from /home/humair/all_data/urdu-tts/g2p/advnc_output-28k.csv
2025-05-23 17:38:47,379 - __main__ - INFO - Raw data shape: (16324, 5)
2025-05-23 17:38:47,431 - __main__ - INFO - First few rows:
           0           1          2         3         4
0  urdu_text    phonemes        ip

2025-05-23 17:38:50,968 - __main__ - INFO - Total parameters: 33,974,389
2025-05-23 17:38:50,974 - __main__ - INFO - Trainable parameters: 33,974,389
Epoch 1/40 [Train]: 100%|████████| 230/230 [1:25:05<00:00, 22.20s/it, loss=2.02]
Evaluating: 100%|███████████████████████████████| 26/26 [01:50<00:00,  4.23s/it]
2025-05-23 19:05:59,685 - __main__ - INFO - Epoch 1/40, Train Loss: 2.3265, Val Loss: 2.8203, LR: 0.000500
Epoch 2/40 [Train]:  36%|███▉       | 82/230 [18:33<56:17, 22.82s/it, loss=1.81]