## **Import Libraries**

In [1]:
# Import core libraries for deep learning and scientific computing, neural network building blocks
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F #Functional Utilities
import torch.optim as optim  #For Optimizer

# Import libraries for data manipulation and analysis
import pandas as pd
import csv

# Import libraries for progress monitoring and visualization
from tqdm import tqdm
import matplotlib.pyplot as plt

# Import libraries for logging and experimentation tracking
import wandb  

# Import libraries for utility functions
import random  
import heapq  

## **SET DEVICE (CPU / GPU)**

In [2]:
# This function determines the appropriate device ("cpu" or "cuda") to use for training.
def set_device():
    """Sets the training device to either "cpu" or "cuda" based on availability.

    Returns:
        str: The chosen device ("cpu" or "cuda").
    """
    device = "cpu"  # Default device is CPU

    # Check if a CUDA GPU is available
    if torch.cuda.is_available():
        device = "cuda"  # Use GPU if available for faster training

    return device  # Return the chosen device

# Call the function to determine the training device
device = set_device()

# Print the chosen device ("cpu" or "cuda")
print(device)


cuda


In [3]:
!wandb login 57566fbb0e091de2e298a4320d872f9a2b200d12

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## **LOAD DATA**

In [4]:
def load_data(lang='hin'):
    """
    Loads training, validation, and test data from CSV files.

    Args:
        lang (str, optional): Language code (default: 'hin'). Defaults to 'hin'.

    Returns:
        dict: A dictionary containing the loaded data and maximum sequence lengths.
    """

    # Define base paths based on language
    base_path = f'/kaggle/input/vocabs/Dataset/{lang}'
    train_path, val_path, test_path = f'{base_path}/{lang}_train.csv', f'{base_path}/{lang}_valid.csv', f'{base_path}/{lang}_test.csv'

    # Load data using a single loop with list comprehension
    data_lists = []
    for path in [train_path, val_path, test_path]:
        with open(path, 'r', encoding='utf-8') as file:
            reader = csv.reader(file) #read csv file
            data_lists.append([[f"{row[0]}$", f"#{row[1]}$"] for row in reader]) 
      
    data_set = []
    for i in range(0, 6):
        data_set.append([list_item[i%2] for list_item in data_lists[i//2]])
    
    train_x, train_y, val_x, val_y, test_x, test_y = data_set[0], data_set[1], data_set[2], data_set[3], data_set[4], data_set[5]


  # Convert data to NumPy arrays
    train_x, train_y = np.array(train_x), np.array(train_y)
    val_x, val_y = np.array(val_x), np.array(val_y)
    test_x, test_y = np.array(test_x), np.array(test_y)

    # Find maximum sequence lengths (combined for efficiency)
    max_decoder_length = max(len(s) for s in np.concatenate((train_y, val_y, test_y)))
    max_encoder_length = max(len(s) for s in np.concatenate((train_x, val_x, test_x)))

    # Return data as a dictionary
    return {
        "train_x": train_x,
        "train_y": train_y,
        "val_x": val_x,
        "val_y": val_y,
        "test_x": test_x,
        "test_y": test_y,
        "max_decoder_length": max_decoder_length,
        "max_encoder_length": max_encoder_length
    }


In [5]:
def create_corpus(dictionary : dict):
    """
    Creates vocabulary dictionaries for input and output sequences.

    Args:
        dict : A dictionary containing train_y, val_y, test_y
    Returns:
        dict: A dictionary containing vocabulary information.
    """
    train_y = dictionary["train_y"]
    val_y = dictionary["val_y"]
    test_y = dictionary["test_y"]

    # Define English vocabulary
    english_vocab = "#$abcdefghijklmnopqrstuvwxyz"

    # Combine target sequences from all datasets to create a complete vocabulary
    all_chars = set.union((set(char for word in train_y for char in word)),
                            set(char for word in val_y for char in word),
                            set(char for word in test_y for char in word))
    all_chars.add('')
    all_chars = sorted(all_chars)

    # Create input vocabulary dictionary (includes the empty string)
    input_corpus_dict = {char: idx+1 for idx, char in enumerate(english_vocab)}
    input_corpus_dict[''] = 0
    input_corpus_length = len(input_corpus_dict)
    

    # Create output vocabulary dictionary (includes the empty string)
    output_corpus_dict = {char: idx for idx, char in enumerate(all_chars)}
    output_corpus_length = len(output_corpus_dict)

    # Create dictionaries for reversed lookups (character -> index)
    reversed_input_corpus = {v: k for k, v in input_corpus_dict.items()}
    reversed_output_corpus = {v: k for k, v in output_corpus_dict.items()}

    # Return a dictionary containing all vocabulary information
    return {
        "input_corpus_length": input_corpus_length,
        "output_corpus_length": output_corpus_length,
        "input_corpus_dict": input_corpus_dict,
        "output_corpus_dict": output_corpus_dict,
        "reversed_input_corpus": reversed_input_corpus,
        "reversed_output_corpus": reversed_output_corpus
    }


In [6]:
def create_tensor(data_dict, corpus_dict):
    """
    Creates PyTorch tensors for training and validation data.

    Args:
        data_dict (dict) : Dictionary contaning datasets
        corpus_dict (dict): Dictionary containing vocabulary information.

    Returns:
        dict: A dictionary containing PyTorch tensors for training and validation.
    """

    # Get maximum sequence length
    max_len = max(data_dict["max_encoder_length"], data_dict["max_decoder_length"])

    # Function to convert sequences to tensors with padding
    def create_padded_tensor(sequences, vocab_dict, max_len):
        tensor = np.zeros((max_len, len(sequences)), dtype='int64')
        for i, seq in enumerate(sequences):
            for j, char in enumerate(seq):
                tensor[j, i] = vocab_dict.get(char, 0)  # Use default of 0 for missing characters
        return torch.tensor(tensor)

    # Create tensors for training data
    train_input = create_padded_tensor(data_dict["train_x"], corpus_dict["input_corpus_dict"], max_len)
    train_output = create_padded_tensor(data_dict["train_y"], corpus_dict["output_corpus_dict"], max_len)

    # Create tensors for validation data
    val_input = create_padded_tensor(data_dict["val_x"], corpus_dict["input_corpus_dict"], max_len)
    val_output = create_padded_tensor(data_dict["val_y"], corpus_dict["output_corpus_dict"], max_len)

    # Create tensors for testing data
    test_input = create_padded_tensor(data_dict["test_x"], corpus_dict["input_corpus_dict"], max_len)
    test_output = create_padded_tensor(data_dict["test_y"], corpus_dict["output_corpus_dict"], max_len)

    # Return dictionary containing tensors
    return {
        "train_input": train_input,
        "train_output": train_output,
        "val_input": val_input,
        "val_output": val_output,
        "test_input" : test_input,
        "test_output" : test_output
    }


In [7]:
def preprocess_data(lang : str):
    dictionary1 = load_data(lang)
    dictionary2 = create_corpus(dictionary1)
    dictionary3 = create_tensor(dictionary1, dictionary2) 
    dictionary4 = {
        "train_input": dictionary3["train_input"],
        "train_output": dictionary3["train_output"],
        "val_input": dictionary3["val_input"],
        "val_output": dictionary3["val_output"],
        "test_input" : dictionary3["test_input"],
        "test_output" : dictionary3["test_output"],
        "input_corpus_length" : dictionary2["input_corpus_length"],
        "output_corpus_length" : dictionary2["output_corpus_length"],
        "input_corpus_dict" : dictionary2["input_corpus_dict"],
        "output_corpus_dict" : dictionary2["output_corpus_dict"],
        "reversed_input_corpus" : dictionary2["reversed_input_corpus"],
        "reversed_output_corpus" : dictionary2["reversed_output_corpus"],
        "train_x" : dictionary1["train_x"],
        "train_y" : dictionary1["train_y"],
        "val_x" : dictionary1["val_x"],
        "val_y" : dictionary1["val_y"],
        "test_x" : dictionary1["test_x"],
        "test_y" : dictionary1["test_y"],
        "max_decoder_length" : dictionary1["max_decoder_length"],
        "max_encoder_length" : dictionary1["max_encoder_length"]
    }   

    return dictionary4


## **Encoder Class**

In [8]:
class Encoder(nn.Module):
    """
    Encoder class for sequence-to-sequence models.
    Args:
        PARAM (dict): Encoder hyperparameters.
            - input_size (int): Size of the input vocabulary.
            - embedding_size (int): Dimensionality of word embeddings.
            - hidden_size (int): Size of the hidden state in RNN cells.
            - num_layers (int): Number of stacked RNN layers.
            - drop_prob (float): Dropout probability for regularization.
            - cell_type (str): Type of RNN cell (LSTM, GRU, RNN).
            - bidirectional (bool): Whether to use a bidirectional RNN.
    """

    def __init__(self, PARAM):
        super(Encoder, self).__init__()

        # Hyperparameters
        self.input_size = PARAM["encoder_input_size"]
        self.embedding_size = PARAM["embedding_size"]
        self.hidden_size = PARAM["hidden_size"]
        self.num_layers = PARAM["num_layers"]
        self.drop_prob = PARAM["drop_prob"]
        self.cell_type = PARAM["cell_type"]
        self.bidirectional = PARAM["bidirectional"]

        # Layers
        self.dropout = nn.Dropout(self.drop_prob)
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)

        # Select RNN cell based on cell_type
        cell_map = {
        "LSTM": nn.LSTM,
        "GRU": nn.GRU,
        "RNN": nn.RNN
        }
        self.cell = cell_map[self.cell_type](
            self.embedding_size, self.hidden_size, self.num_layers,
            dropout=self.drop_prob, bidirectional=self.bidirectional
        )

    def forward(self, x):
        """
        Forward pass of the Encoder.
        Args:
            x : Input sequence of word indices.
        Returns:
            torch.Tensor or tuple : Hidden state (or hidden & cell states for LSTMs)
        """

        embedding = self.embedding(x) # embadding layer 
        drops = self.dropout(embedding) # Dropout on embadding 
        if self.cell_type == "RNN" or self.cell_type == "GRU": 
            _, hidden = self.cell(drops) 
            return hidden
        elif self.cell_type == "LSTM":
            _, (hidden, cells) = self.cell(drops)
            return hidden, cells
        else:
            raise ValueError(f"Invalid RNN cell type: {self.cell_type}") # Raise a error on invalid cell type


## **Decoder** 

In [9]:
class Decoder(nn.Module):
    """
    Decoder class for sequence-to-sequence models.

    Args:
        PARAM (dict): Decoder hyperparameters.
            - input_size (int): Size of the decoder vocabulary.
            - embedding_size (int): Dimensionality of word embeddings.
            - hidden_size (int): Size of the hidden state in RNN cells.
            - output_size (int): Size of the output vocabulary.
            - num_layers (int): Number of stacked RNN layers.
            - drop_prob (float): Dropout probability for regularization.
            - cell_type (str): Type of RNN cell (LSTM, GRU, RNN).
            - bidirectional (bool): Whether to use a bidirectional RNN.
    """

    def __init__(self, PARAM):
        super(Decoder, self).__init__()

        # Hyperparameters
        self.input_size = PARAM["decoder_input_size"]
        self.embedding_size = PARAM["embedding_size"]
        self.hidden_size = PARAM["hidden_size"]
        self.output_size = PARAM["decoder_output_size"]
        self.num_layers = PARAM["num_layers"]
        self.drop_prob = PARAM["drop_prob"]
        self.cell_type = PARAM["cell_type"]
        self.bidirectional = PARAM["bidirectional"]

        # Layers
        self.dropout = nn.Dropout(self.drop_prob)
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)
        self.cell_map = {
            "LSTM": nn.LSTM,
            "GRU": nn.GRU,
            "RNN": nn.RNN
        }
        self.cell = self.cell_map[self.cell_type](
            self.embedding_size, self.hidden_size, self.num_layers,
            dropout=self.drop_prob, bidirectional=self.bidirectional
        )

        # Final linear layer for output prediction
        self.fc = nn.Linear(self.hidden_size * (2 if self.bidirectional else 1), self.output_size)

    def forward(self, x, hidden, cell=None):
        """
        Forward pass of the Decoder.

        Args:
            x (torch.Tensor): Input sequence of word indices (single token for teacher forcing).
            hidden (torch.Tensor): Hidden state from the encoder.
            cell (torch.Tensor, optional): Cell state for LSTMs (default: None).

        Returns:
            tuple(torch.Tensor): Predicted output logits, hidden state (and cell state for LSTMs).
        """

        x = x.unsqueeze(0)  # Add batch dimension for single token
        embedding = self.embedding(x)
        drops = self.dropout(embedding)

        if self.cell_type == "RNN" or self.cell_type == "GRU":
            outputs, hidden = self.cell(drops, hidden)
        elif self.cell_type == "LSTM":
            outputs, (hidden, cell) = self.cell(drops, (hidden, cell))
        predictions = self.fc(outputs).squeeze(0)  # Remove batch dimension

        if self.cell_type == "LSTM":
            predictions = F.log_softmax(predictions, dim=1)
            return predictions, hidden, cell
        return predictions, hidden


## **Seq2Seq Class**

In [10]:
class Seq2Seq(nn.Module):
    """
    Seq2Seq model for sequence-to-sequence tasks.

    Args:
        encoder (Encoder): Encoder module.
        decoder (Decoder): Decoder module.
        param (dict): Model hyperparameters.
            - tfr (float): Teacher forcing ratio for training.
        processed_data (dict) : containing all information of processed data
    """

    def __init__(self, encoder, decoder, param, p_data):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = param["tfr"]  # Teacher forcing ratio
        self.processed_data = p_data

    def forward(self, src, target):
        """
        Forward pass of the Seq2Seq model.

        Args:
            src (torch.Tensor): Source sequence of word indices.
            target (torch.Tensor): Target sequence of word indices.

        Returns:
            torch.Tensor: Predicted output logits for each target word.
        """

        batch_size = src.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.processed_data["output_corpus_length"]

        # Initialize outputs tensor
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        # Get encoder hidden state(s)
        if self.encoder.cell_type == "LSTM":
            encoder_hidden, cell = self.encoder(src)
        elif self.encoder.cell_type == "GRU" or self.encoder.cell_type == "RNN":
            encoder_hidden = self.encoder(src)

        # Start with first target word
        x = target[0]

        for t in range(1, target_len):
            # Decode with teacher forcing or predicted output
            if self.encoder.cell_type == "LSTM":
                y, encoder_hidden, cell = self.decoder(x, encoder_hidden, cell) 
            else:
                y, encoder_hidden = self.decoder(x, encoder_hidden, None)  

            outputs[t] = y
            if random.random() < self.teacher_forcing_ratio:
                x = target[t]
            else:
                x = y.argmax(dim=1)

        return outputs


## **Setting Optimizer**

In [11]:
def set_optimizer(name, model, learning_rate):
    """
    Creates an optimizer object based on the specified name and learning rate.
    Args:
        name (str): Name of the optimizer (e.g., "adam", "sgd", "rmsprop", "adagrad").
        model (nn.Module): The PyTorch model to be optimized.
        learning_rate (float): The learning rate to use for training.
    Returns:
        torch.optim.Optimizer: The created optimizer object.
    """

    # Define the optimizer based on the provided name
    optimizer = None
    if name == "adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif name == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    elif name == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
    elif name == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)
    else:
        # Raise an error if the optimizer name is invalid
        raise ValueError(f"Invalid optimizer name: {name}")

    # Ensure an optimizer was created
    if optimizer is None:
        raise ValueError("Failed to create optimizer. Please check the provided name.")

    return optimizer


## **BEAM SEARCH**

In [12]:
def beam_search(params, model, word, device, processed_data):
    """
    Beam search decoding for sequence-to-sequence models.

    Args:
        params (dict): Model hyperparameters.
            - encoder_cell_type (str): Type of RNN cell (LSTM, GRU, RNN).
            - beam_width (int): Beam width for beam search decoding.
            - length_penalty (float): Penalty for longer sequences.
        model (nn.Module): Seq2Seq model for sequence translation.
        word (str): Input word to translate.
        device (torch.device): Device to use for computations (CPU or GPU).
        max_encoder_length (int): Maximum length of the encoder input sequence.
        input_corpus_dict (dict): Dictionary mapping input characters to integer indices.
        output_corpus_dict (dict): Dictionary mapping integer indices to output characters.
        reverse_output_corpus (dict): Dictionary mapping output characters to integer indices (for reversing prediction).

    Returns:
        str: Translated sentence.
    """

    input_corpus_dict = processed_data["input_corpus_dict"]
    output_corpus_dict = processed_data["output_corpus_dict"]
    max_encoder_length = processed_data["max_encoder_length"]
    reversed_output_corpus = processed_data["reversed_output_corpus"]
    # Preprocess input sentence
    data = torch.zeros((max_encoder_length + 1, 1), dtype=torch.int32).to(device)
    for i, char in enumerate(word):
        data[i, 0] = input_corpus_dict[char]
    data[i + 1, 0] = input_corpus_dict['$']  # Add end-of-sentence marker

    # Encode input sentence
    with torch.no_grad():
        if params["cell_type"] == "LSTM":
            hidden, cell = model.encoder(data)
        else:
            hidden = model.encoder(data)

        # Initialize beam search
        start_token = output_corpus_dict['#']  # Start-of-sentence symbol
        initial_sequence = torch.tensor([start_token]).to(device)
        hidden = hidden.unsqueeze(0)  # Add batch dimension
        beam = [(0.0, initial_sequence, hidden)]  # List of (score, sequence, hidden state) tuples

    # Decode loop
        for _ in range(len(output_corpus_dict)):
            candidates = []  # List for storing candidate sequences
            for score, seq, hidden in beam:
                # Check for end-of-sentence token
                if seq[-1].item() == output_corpus_dict['$']:
                    candidates.append((score, seq, hidden))
                    continue

                # Get last token and hidden state
                last_token = seq[-1].unsqueeze(0).to(device)
                hidden = hidden.squeeze(0)

                # Decode step with last token
                if params["cell_type"] == "LSTM":
                    output, hidden, cell = model.decoder(last_token, hidden, cell)
                else:
                    output, hidden = model.decoder(last_token, hidden, None)

            # Get top-k probable tokens
                probabilities = F.softmax(output, dim=1)
                topk_probs, topk_tokens = torch.topk(probabilities, k=params["beam_width"])

                # Expand beam with top-k candidate sequences
                for prob, token in zip(topk_probs[0], topk_tokens[0]):
                    new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
                    length_penalty = ((len(new_seq) - 1) / 5) ** params["length_penalty"]
                    candidate_score = score + torch.log(prob).item() / length_penalty
                    candidates.append((candidate_score, new_seq, hidden.unsqueeze(0)))

            # Select top-k beam candidates for next iteration
            beam = heapq.nlargest(params["beam_width"], candidates, key=lambda x: x[0])

        # Get best sequence from beam search
        best_score, best_sequence, _ = max(beam, key=lambda x: x[0])

        # Convert predicted token indices to characters and reverse order
        translated_sentence = ''.join([reversed_output_corpus[token.item()] for token in best_sequence[1:]])[:-1]  # Remove start token and end token

        return translated_sentence


In [13]:
def run_epoch(model, data_loader, optimizer, criterion, processed_data):
    """
    Train the Seq2Seq model for one epoch.

    Args:
        model (nn.Module): Seq2Seq model to train.
        data_loader (List): List containing training_data.
        optimizer (Optimizer): Optimizer for updating model parameters.
        criterion (nn.Module): Loss function for calculating training loss.

    Returns:
        tuple(float, float): Training accuracy and average loss.
    """

    model.train()  # Set model to training mode
    total_loss, total_words, correct_predictions = 0, 0, 0

    with tqdm(total=len(data_loader[0]), desc='Training') as pbar:  # Gradient accumulation
        for _ , (source, target) in enumerate(zip(data_loader[0], data_loader[1])):
            source, target = source.to(device), target.to(device)  # Move data to device
            optimizer.zero_grad()

            # Forward pass
            output = model(source, target)
            target = target.reshape(-1)  # Reshape target for loss calculation
            output = output.reshape(-1, output.shape[2])  # Reshape output
            
            #Ignore the padding
            pad_mask = (target != processed_data['output_corpus_dict'][''])
            target = target[pad_mask]
            output = output[pad_mask]

            # Calculate loss
            loss = criterion(output, target)

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            optimizer.step()  # Update model parameters

            # Calculate total loss, total words, correct_predictions
            total_loss += loss.item()
            total_words += target.size(0)
            correct_predictions += torch.sum(torch.argmax(output, dim = 1) == target).item()
            pbar.update(1)

    # Calculate Accuracy and Avg Loss
    accuracy = correct_predictions / total_words
    avg_loss = total_loss / len(data_loader[0])

    return accuracy, avg_loss


In [14]:
def evaluate_character_level(model, val_data_loader, loss_fn, processed_data):
    """
    Evaluate the Seq2Seq model on character-level data.

    Args:
        model (nn.Module): Seq2Seq model to evaluate.
        val_data_loader (DataLoader): Data loader for validation data.
        loss_fn (nn.Module): Loss function for calculating validation loss.

    Returns:
        tuple(float, float): Validation accuracy and average loss.
    """

    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        total_loss = 0
        total_words = 0
        correct_predictions = 0

        with tqdm(total=len(val_data_loader[0]), desc='Validation') as pbar:
            for src, tar in zip(val_data_loader[0], val_data_loader[1]):
                target, source = tar.to(device), src.to(device)

                # Apply model
                output = model(source, target)

                # Reshape target and output
                target = target.reshape(-1)
                output = output.reshape(-1, output.shape[2])
                
                # Ignore the padding 
                pad_mask = (target != processed_data['output_corpus_dict'][''])
                target = target[pad_mask]
                output = output[pad_mask]

                #Calculate total_loss, total_words, correct_predictions
                val_loss = loss_fn(output, target)
                total_loss += val_loss.item()
                total_words += target.size(0)
                correct_predictions += torch.sum(torch.argmax(output, dim=1) == target).item()
                pbar.update(1)
        
    accuracy = correct_predictions / total_words
    avg_loss = total_loss / len(val_data_loader[0])

    return accuracy, avg_loss


In [15]:
def evaluate_model_beam_search(params, model, device, processed_data):
    """
    Evaluates the model using beam search and returns accuracy and correct predictions.

    Args:
        model (torch.nn.Module): The machine translation model to evaluate.
        val_data (torch.Tensor): The validation data tensor.
        vx (list): List of source words for beam search.
        vy (list): List of target words for beam search.
        device (str): Device to use for computation (e.g., 'cpu' or 'cuda').
        processed_data (dict): Preprocessed data dictionary.

    Returns:
        tuple: A tuple containing validation accuracy (float) and correct predictions (int).
    """

# Set the model to evaluation mode
    model.eval()

    # Disable gradient computation during inference
    with torch.no_grad():
        # Initialize counters
        total_words = 0
        correct_predictions = 0
        
        # Iterate through the validation data with tqdm progress bar
        with tqdm(total=len(processed_data["val_x"]), desc='Beam_Search') as pbar:
            for word, target_word in zip(processed_data["val_x"], processed_data["val_y"]):
                # Increment the total words counter
                total_words += 1
                
                # Perform beam search to predict the next word
                predicted_word = beam_search(params, model, word, device, processed_data)
#                 print(target_word, predicted_word)
                # Check if the predicted word matches the target word
                if predicted_word == target_word[1:-1]:  # Remove start and end tokens
                    correct_predictions += 1
                
                # Update the progress bar
                pbar.update(1)

    # Calculate accuracy
    accuracy = correct_predictions / total_words

    # Return accuracy and number of correct predictions
    return accuracy, correct_predictions



## **Train Using Beam Search**

In [16]:
def training(PARAM, processed_data, device, wandb_log = 0):
    # initilize wandb with project
    if wandb_log == 1:
        wandb.init(project='DL-Assignment3')
        wandb.run.name = 'Training'
    
    # Set Learning Rate, epochsm batch_size
    learning_rate = PARAM["learning_rate"]
    epochs = PARAM["epochs"]
    batch_size = PARAM["batch_size"]

    # Copy encoder and decoder to device
    encoder = Encoder(PARAM).to(device)
    decoder = Decoder(PARAM).to(device)

#     # Initialize model
    model = Seq2Seq(encoder, decoder, PARAM, processed_data).to(device)
    print(model)

    # Define loss function and optimizer
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = set_optimizer(PARAM["optimizer"], model, learning_rate)

    # Split dataset into batches
    train_batches_x = torch.split(processed_data["train_input"], batch_size, dim=1)
    train_batches_y = torch.split(processed_data["train_output"], batch_size, dim=1)
    val_batches_x = torch.split(processed_data["val_input"], batch_size, dim=1)
    val_batches_y = torch.split(processed_data["val_output"], batch_size, dim=1)

    # Training loop
    for epoch in range(epochs):
        print(f"Epoch :: {epoch+1}/{epochs}")
        
        # Train the model on training data
        data_loader = [train_batches_x, train_batches_y]
        accuracy, avg_loss = run_epoch(model, data_loader, optimizer, loss_function, processed_data)  # Average loss per batch

        # Evaluate model character wise
        val_data_loader = [val_batches_x, val_batches_y]
        val_accuracy, val_avg_loss = evaluate_character_level(model, val_data_loader, loss_function, processed_data)
        
        # Evaluate model word wise
        val_accuracy_beam, val_correct_pred_beam = evaluate_model_beam_search(PARAM, model, device, processed_data)
        total_words = processed_data["val_input"].shape[1] 

        # print epochs
        print(f"Epoch : {epoch+1} Train Accuracy: {accuracy*100:.4f}, Train Loss: {avg_loss:.4f}\nValidation Accuracy: {val_accuracy*100:.4f}, Validation Loss: {val_avg_loss:.4f}, \nValidation Acc. With BeamSearch: {val_accuracy_beam*100:.4f}, Correctly Predicted : {val_correct_pred_beam}/{total_words}")

        # Log on wandb
        if wandb_log:
            wandb.log(
                    {
                        'epoch': epoch+1,
                        'training_loss' : avg_loss,
                        'training_accuracy' : accuracy,
                        'validation_loss' : val_avg_loss,
                        'validation_accuracy_using_char' : val_accuracy,
                        'validation_accuracy_using_word' : val_accuracy_beam,
                        'correctly_predicted' : val_correct_pred_beam
                    }
                )
    return model, val_accuracy_beam

## **Get Data**

In [17]:
processed_data = preprocess_data('hin')

## **HYPER PARAMETERS**

In [18]:
HYPER_PARAM = {
    "encoder_input_size": processed_data["input_corpus_length"],
    "embedding_size": 256,
    "hidden_size": 512,
    "num_layers": 2,
    "drop_prob": 0.3,
    "cell_type": "GRU",
    "decoder_input_size": processed_data["output_corpus_length"],
    "decoder_output_size": processed_data["output_corpus_length"],
    "beam_width" : 4,
    "length_penalty" : 0.6,
    "bidirectional" : True,
    "learning_rate" : 0.01,
    "batch_size" : 128,
    "epochs" : 5,
    "optimizer" : "adagrad",
    "tfr" : 0.7,
}

## **Training Model on Hyper Parameters**

In [19]:
model, acc = training(HYPER_PARAM, processed_data, device, wandb_log = 0)

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(29, 256)
    (cell): GRU(256, 512, num_layers=2, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(68, 256)
    (cell): GRU(256, 512, num_layers=2, dropout=0.3, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=68, bias=True)
  )
)
Epoch :: 1/5


Training: 100%|██████████| 400/400 [00:35<00:00, 11.23it/s]
Validation: 100%|██████████| 32/32 [00:00<00:00, 34.39it/s]
Beam_Search: 100%|██████████| 4096/4096 [01:55<00:00, 35.61it/s]


Epoch : 1 Train Accuracy: 51.9078, Train Loss: 1.7638
Validation Accuracy: 67.8863, Validation Loss: 1.1704, 
Validation Acc. With BeamSearch: 28.7109, Correctly Predicted : 1176/4096
Epoch :: 2/5


Training: 100%|██████████| 400/400 [00:34<00:00, 11.54it/s]
Validation: 100%|██████████| 32/32 [00:00<00:00, 34.93it/s]
Beam_Search: 100%|██████████| 4096/4096 [01:56<00:00, 35.10it/s]


Epoch : 2 Train Accuracy: 70.4693, Train Loss: 1.0662
Validation Accuracy: 72.4764, Validation Loss: 1.0280, 
Validation Acc. With BeamSearch: 36.8408, Correctly Predicted : 1509/4096
Epoch :: 3/5


Training: 100%|██████████| 400/400 [00:34<00:00, 11.56it/s]
Validation: 100%|██████████| 32/32 [00:00<00:00, 34.86it/s]
Beam_Search: 100%|██████████| 4096/4096 [01:56<00:00, 35.03it/s]


Epoch : 3 Train Accuracy: 74.6496, Train Loss: 0.9329
Validation Accuracy: 73.3590, Validation Loss: 1.0065, 
Validation Acc. With BeamSearch: 38.8428, Correctly Predicted : 1591/4096
Epoch :: 4/5


Training: 100%|██████████| 400/400 [00:34<00:00, 11.57it/s]
Validation: 100%|██████████| 32/32 [00:00<00:00, 34.72it/s]
Beam_Search: 100%|██████████| 4096/4096 [01:56<00:00, 35.21it/s]


Epoch : 4 Train Accuracy: 77.1523, Train Loss: 0.8560
Validation Accuracy: 74.3388, Validation Loss: 0.9731, 
Validation Acc. With BeamSearch: 40.1855, Correctly Predicted : 1646/4096
Epoch :: 5/5


Training: 100%|██████████| 400/400 [00:34<00:00, 11.57it/s]
Validation: 100%|██████████| 32/32 [00:00<00:00, 34.62it/s]
Beam_Search: 100%|██████████| 4096/4096 [01:56<00:00, 35.10it/s]

Epoch : 5 Train Accuracy: 78.8832, Train Loss: 0.8016
Validation Accuracy: 74.8243, Validation Loss: 0.9626, 
Validation Acc. With BeamSearch: 41.4307, Correctly Predicted : 1697/4096





## **Transliteration on Random Data**

In [24]:
sentance = "paritraanaaya sadhuunaam vinaashaaya chadushkritaam dharma samsthaapanaarthaaya sambhavaami yuge yuge"

sentance = sentance.lower()
lst = sentance.split(" ")
print(type(lst))
for word in lst:
    output_sequence = beam_search(HYPER_PARAM, model, word ,device, processed_data)
    print(output_sequence, end = " ")

<class 'list'>
परित्रणाय साधुनाम विनाशाया चदुष्कृताम धर्मा सम्थस्पन्थर्या संभवामी युगे युगे 

## **Sweep Config**

In [21]:
sweep_config = {
            'name': 'sweep-bayes-1',
            'method': 'bayes',
            'metric': { 'goal': 'maximize','name': 'Accuracy'},
            'parameters': 
                {
                    'epochs': {'values': [10]},
                    'cell_type': {'values': ['RNN', 'LSTM', 'GRU']},
                    'embedding_size': {'values': [128, 256, 512]},
                    'hidden_size': {'values': [128, 256, 512, 1024]},
                    'num_layers': {'values': [1, 2, 3]},
                    'dropout': {'values': [0.3, 0.5, 0.7]},
                    'optimizer' : {'values' : ['adam', 'sgd', 'rmsprop', 'adagrad']},
                    'learning_rate': {'values': [0.001, 0.005, 0.01, 0.1]},
                    'batch_size': {'values': [32, 64]},
                    'teacher_fr' : {'values': [0.3, 0.5, 0.7]},
                    'length_penalty' : {'values': [0.4, 0.5, 0.6]},
                    'bi_dir' : {'values': [True, False]},
                    'beam_width': {'values': [1, 2, 3]}
                }
            }

In [22]:
def train():
    var1 = wandb.init(project="DL-Assignment3")
    var2 = var1.config
   
    wandb.run.name = (f"cell_type:{var2.cell_type}_epochs:{var2.epochs}_lr:{var2.learning_rate}_batch_size:{var2.batch_size}_beam_width:{var2.beam_width}_opt:{var2.optimizer}_dropout:{var2.dropout}_teacher_fr:{var2.teacher_fr}_embadding_size:{var2.embedding_size}")
    
    HYPER_PARAM = {
    "encoder_input_size": processed_data["input_corpus_length"],
    "embedding_size": var2.embedding_size,
    "hidden_size": var2.hidden_size,
    "num_layers": var2.num_layers,
    "drop_prob": var2.dropout,
    "cell_type": var2.cell_type,
    "decoder_input_size": processed_data["output_corpus_length"],
    "decoder_output_size": processed_data["output_corpus_length"],
    "beam_width" : var2.beam_width,
    "length_penalty" : var2.length_penalty,
    "bidirectional" : var2.bi_dir,
    "learning_rate" : var2.learning_rate,
    "batch_size" : var2.batch_size,
    "epochs" : var2.epochs,
    "optimizer" : var2.optimizer,
    "tfr" : var2.teacher_fr,
}

    model, accuracy = training(HYPER_PARAM, wandb_log = 1)
    wandb.log({
                "Accuracy" : accuracy
            })

In [23]:
sweep_id = wandb.sweep(sweep_config, project="DL-Assignment3")
wandb.agent(sweep_id, train, count = 2)
wandb.finish()