In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import pandas as pd
import numpy as np
from collections import Counter
import time
import math
import random
import os
import pickle # Or json, if you saved vocab that way
from nltk.tokenize import word_tokenize, sent_tokenize # For Hierarchical Encoder
from rouge_score import rouge_scorer # For evaluation


In [2]:
# --- Configuration ---
# File Paths (Update these)
PROCESSED_DATA_DIR = '/kaggle/input/processedassignment2datanlp' # Directory containing outputs from taskA
TRAIN_PROC_FILE = os.path.join(PROCESSED_DATA_DIR, 'train_processed.csv')
VAL_PROC_FILE = os.path.join(PROCESSED_DATA_DIR, 'validation_processed.csv')
TEST_PROC_FILE = os.path.join(PROCESSED_DATA_DIR, 'test_processed.csv')
VOCAB_FILE = os.path.join(PROCESSED_DATA_DIR, 'vocabulary.pkl') # Or .json if saved that way
GLOVE_FILE = '/kaggle/input/glove-embeddings/glove.6B.300d.txt' # <--- CHANGE THIS (e.g., glove.6B.300d.txt)
MODEL_SAVE_DIR = 'saved_models_B'

# Vocabulary & Special Tokens
VOCAB_FREQ_THRESHOLD_PERCENT = 1.0 # Minimum % of training documents a token must appear in
PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'
BOS_TOKEN = '<bos>'
EOS_TOKEN = '<eos>'

# Model Hyperparameters (Tune these using validation set)
HIDDEN_DIM = 300
EMBED_DIM = 300 #GloVe dimension
NUM_LAYERS_ENC = 1 # For basic EncoderRNN 
NUM_LAYERS_DEC = 1 # For basic DecoderRNN
DROPOUT = 0.3
LEARNING_RATE = 0.001
BATCH_SIZE = 32 # Adjust based on GPU memory
NUM_EPOCHS = 5 
CLIP_GRAD = 1.0 # Gradient clipping threshold

# Generation Hyperparameters
MAX_NEW_TOKENS = 30 # Max length of generated titles
BEAM_WIDTH = 4 # For beam search evaluation

# Other Settings
SEED = 42
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
REPORT_INTERVAL = 50 # Print training progress every N batches

# Create model save directory
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# --- Set Random Seeds ---
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# --- Vocabulary Loading/Building ---
# If you didn't save vocab in Part A, build it here based on processed train data

def build_or_load_vocab(train_filepath, threshold_percent, save_path=VOCAB_FILE):
    if os.path.exists(save_path):
        print(f"Loading vocabulary from {save_path}...")
        try:
            with open(save_path, 'rb') as f:
                vocab_data = pickle.load(f)
            word2idx = vocab_data['word2idx']
            idx2word = vocab_data['idx2word']
            print(f"Loaded vocabulary with {len(word2idx)} tokens.")
            return word2idx, idx2word
        except Exception as e:
            print(f"Error loading vocab file {save_path}: {e}. Rebuilding...")

    print("Building vocabulary from training data...")
    df_train = pd.read_csv(train_filepath)
    df_train.fillna('', inplace=True) # Handle potential NaNs

    # Count token occurrences across documents (not just total frequency)
    doc_counts = Counter()
    all_tokens = set()
    for text in df_train['processed_text']:
        tokens = set(text.split())
        doc_counts.update(tokens)
        all_tokens.update(tokens)
    for title in df_train['processed_title']:
        tokens = set(title.split())
        doc_counts.update(tokens)
        all_tokens.update(tokens)

    num_docs = len(df_train)
    threshold_count = (threshold_percent / 100.0) * num_docs

    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1, BOS_TOKEN: 2, EOS_TOKEN: 3}
    idx_counter = 4
    for token, count in doc_counts.items():
        if count >= threshold_count:
            vocab[token] = idx_counter
            idx_counter += 1

    word2idx = vocab
    idx2word = {idx: word for word, idx in word2idx.items()}
    print(f"Built vocabulary with {len(word2idx)} tokens (appeared in >= {threshold_percent:.2f}% of docs).")

    # Save the vocabulary
    try:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as f:
            pickle.dump({'word2idx': word2idx, 'idx2word': idx2word}, f)
        print(f"Vocabulary saved to {save_path}")
    except Exception as e:
        print(f"Error saving vocabulary: {e}")

    return word2idx, idx2word

# --- GloVe Embedding Loading ---

def load_glove_embeddings(glove_path, word2idx, embed_dim):
    if not os.path.exists(glove_path):
        print(f"GloVe file not found at {glove_path}. Cannot load embeddings.")
        return None

    print(f"Loading GloVe embeddings from {glove_path}...")
    embeddings_index = {}
    try:
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                if len(vector) == embed_dim: # Ensure dimension matches
                     embeddings_index[word] = vector
                # else:
                #     print(f"Warning: Skipping word '{word}' with unexpected dimension {len(vector)} (expected {embed_dim})")
    except Exception as e:
        print(f"Error reading GloVe file: {e}")
        return None

    print(f"Loaded {len(embeddings_index)} word vectors.")

    vocab_size = len(word2idx)
    embedding_matrix = np.zeros((vocab_size, embed_dim))
    hits = 0
    misses = 0

    for word, i in word2idx.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            # Optional: Initialize OOV words randomly (e.g., np.random.randn)
            # Or keep them as zeros
            misses += 1

    print(f"Converted GloVe embeddings to matrix: {hits} words found, {misses} words not found in GloVe.")
    return torch.tensor(embedding_matrix, dtype=torch.float)


# --- Dataset and DataLoader ---

class WikipediaDataset(Dataset):
    def __init__(self, filepath, word2idx, max_len_text=None, max_len_title=None):
        self.df = pd.read_csv(filepath)
        self.df.fillna('', inplace=True) # Ensure no NaNs

        # Filter out rows that became empty during preprocessing in Part A again, just in case
        self.df = self.df[(self.df['processed_text'].str.strip() != "") & (self.df['processed_title'].str.strip() != "")]
        self.df = self.df.reset_index(drop=True)

        self.texts = self.df['processed_text'].tolist()
        self.titles = self.df['processed_title'].tolist()
        self.word2idx = word2idx
        self.unk_idx = word2idx.get(UNK_TOKEN, 1) # Default UNK index
        self.bos_idx = word2idx.get(BOS_TOKEN, 2)
        self.eos_idx = word2idx.get(EOS_TOKEN, 3)

        # Optional: Store max lengths if needed, though padding handles this
        self.max_len_text = max_len_text
        self.max_len_title = max_len_title

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.texts[idx]
        title = self.titles[idx]

        # Convert text to indices
        text_tokens = text.split()
        text_indices = [self.word2idx.get(token, self.unk_idx) for token in text_tokens]
        # Optional: Truncate if max_len_text is set

        # Convert title to indices, adding BOS/EOS
        title_tokens = title.split()
        title_indices = [self.bos_idx] + [self.word2idx.get(token, self.unk_idx) for token in title_tokens] + [self.eos_idx]
        # Optional: Truncate if max_len_title is set

        return torch.tensor(text_indices, dtype=torch.long), torch.tensor(title_indices, dtype=torch.long)

def collate_fn(batch):
    """
    Pads sequences within a batch.
    Args:
        batch: A list of tuples (text_indices, title_indices)
    Returns:
        texts_padded: Tensor of padded text sequences
        text_lengths: Tensor of original text lengths
        titles_padded: Tensor of padded title sequences
        title_lengths: Tensor of original title lengths
    """
    texts, titles = zip(*batch)

    # Get lengths before padding
    text_lengths = torch.tensor([len(t) for t in texts], dtype=torch.long)
    title_lengths = torch.tensor([len(t) for t in titles], dtype=torch.long)

    # Pad sequences
    # batch_first=True is important for GRU/LSTM input shape
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=word2idx[PAD_TOKEN])
    titles_padded = pad_sequence(titles, batch_first=True, padding_value=word2idx[PAD_TOKEN])

    return texts_padded, text_lengths, titles_padded, title_lengths

# --- Model Definitions ---

class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=word2idx[PAD_TOKEN])
        self.rnn = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers,
                          bidirectional=bidirectional, dropout=dropout if num_layers > 1 else 0,
                          batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim

        # Optional: Linear layer to project hidden states if needed (e.g., if decoder has different dim)
        # self.fc = nn.Linear(hidden_dim * self.num_directions, hidden_dim)

    def forward(self, src, src_len):
        # src: [batch_size, seq_len]
        # src_len: [batch_size]

        embedded = self.dropout(self.embedding(src))
        # embedded: [batch_size, seq_len, embed_dim]

        # Pack sequence to handle padding efficiently
        packed_embedded = pack_padded_sequence(embedded, src_len.cpu(), batch_first=True, enforce_sorted=False)

        packed_outputs, hidden = self.rnn(packed_embedded)
        # packed_outputs: packed sequence
        # hidden: [num_layers * num_directions, batch_size, hidden_dim]

        # Unpack sequence (optional, useful if you need intermediate outputs)
        outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True)
        # outputs: [batch_size, seq_len, hidden_dim * num_directions]

        # Process hidden state for decoder initialization
        # If bidirectional, hidden is [2*num_layers, batch, hid_dim]. Need to combine directions.
        # Often, the forward and backward hidden states are summed or concatenated.
        # For a single layer BiGRU, hidden is [2, batch, hid_dim].
        # Concatenate forward and backward final hidden states
        if self.rnn.bidirectional:
             # Reshape hidden: [num_layers, num_directions, batch, hidden_dim]
             hidden = hidden.view(self.rnn.num_layers, self.num_directions, -1, self.hidden_dim)
             # Concatenate the directions for the last layer:
             # Take hidden state of the last layer: hidden[-1] -> [num_directions, batch, hidden_dim]
             # Concatenate along dim 2 -> [batch, hidden_dim * 2] -> need [batch, hidden_dim] or similar for decoder
             # Option 1: Sum forward and backward of the last layer
             # hidden = hidden[-1, 0, :, :] + hidden[-1, 1, :, :] # -> [batch, hidden_dim]
             # Option 2: Pass through a linear layer to reduce dimension
             # hidden_concat = torch.cat((hidden[-1, 0, :, :], hidden[-1, 1, :, :]), dim=1) # -> [batch, hidden_dim*2]
             # hidden = torch.tanh(self.fc(hidden_concat)) # -> [batch, hidden_dim] - Need to adjust for multi-layer decoders
             # Option 3: Reshape for multi-layer decoder (sum directions per layer)
             # Reshape hidden -> [num_layers, batch_size, hidden_dim * num_directions]
             hidden = hidden.permute(0, 2, 1, 3).contiguous() # -> [layers, batch, directions, dim]
             # Sum directions: requires decoder hidden_dim to match encoder hidden_dim
             hidden = hidden.sum(dim=2) # -> [layers, batch, dim]

        # Ensure hidden has shape [num_decoder_layers, batch_size, decoder_hidden_dim]
        # This depends on the decoder structure. For a basic 1-layer decoder matching encoder hid_dim:
        # If encoder is multi-layer bidirectional, take the summed hidden state of the *last* encoder layer.
        # If encoder is 1-layer bidirectional, sum is already [1, batch, dim] assuming decoder is 1-layer.
        if hidden.shape[0] > NUM_LAYERS_DEC: # If encoder layers > decoder layers
            hidden = hidden[-NUM_LAYERS_DEC:] # Take only the last N layers


        # outputs: [batch_size, seq_len, hidden_dim * num_directions]
        # hidden: [num_decoder_layers, batch_size, decoder_hidden_dim] - Processed state for decoder
        return outputs, hidden


class HierEncoderRNN(nn.Module):
     # TODO: Implement Hierarchical Encoder (Part B2, Task 5)
     # Needs sentence splitting, word-level GRU, sentence aggregation (e.g., average), sentence-level GRU.
     # Requires careful handling of padding at both levels.
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers_word, num_layers_sent, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=word2idx[PAD_TOKEN])
        self.word_rnn = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers_word,
                               bidirectional=True, dropout=dropout if num_layers_word > 1 else 0,
                               batch_first=True)
        self.sent_rnn = nn.GRU(hidden_dim * 2, hidden_dim, num_layers=num_layers_sent, # Input is concat hidden states
                               bidirectional=True, dropout=dropout if num_layers_sent > 1 else 0,
                               batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.hidden_dim = hidden_dim
        # Layer to potentially transform final hidden state for decoder
        self.fc_hidden = nn.Linear(hidden_dim * 2 * num_layers_sent, hidden_dim * NUM_LAYERS_DEC) # Match decoder layers/dim

    def forward(self, src, src_len):
        # src: [batch_size, max_seq_len] - Raw token indices for the whole article
        # src_len: [batch_size] - Original lengths of articles

        # This implementation is complex due to varying sentence/word counts.
        # A common approach involves processing sentence by sentence or using libraries
        # that handle nested padding (like AllenNLP or careful manual padding/masking).

        # Placeholder: This needs a proper implementation using sentence tokenization
        # and handling the nested structure. The standard EncoderRNN is simpler.
        # For now, let's return dummy values or raise NotImplementedError
        raise NotImplementedError("Hierarchical Encoder needs full implementation")

        # A simplified view (conceptual):
        # 1. Pad articles to max_article_len
        # 2. Split each article into sentences (pad sentences to max_sent_len, pad num_sentences to max_num_sent)
        # 3. Process words in each sentence with word_rnn
        # 4. Aggregate word outputs per sentence (e.g., take last hidden state, or average outputs)
        # 5. Process aggregated sentence representations with sent_rnn
        # 6. Extract final hidden state for the decoder

        # Returning dummy values matching expected shapes
        # batch_size = src.shape[0]
        # seq_len = src.shape[1]
        # outputs = torch.zeros(batch_size, seq_len, self.hidden_dim * 2).to(src.device) # Dummy outputs
        # hidden = torch.zeros(NUM_LAYERS_DEC, batch_size, self.hidden_dim).to(src.device) # Dummy hidden
        # return outputs, hidden


class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=word2idx[PAD_TOKEN])
        self.rnn = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers,
                           dropout=dropout if num_layers > 1 else 0, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden):
        # input_token: [batch_size] - current input token indices
        # hidden: [num_layers, batch_size, hidden_dim] - previous hidden state

        # Add sequence length dimension: [batch_size] -> [batch_size, 1]
        input_token = input_token.unsqueeze(1)

        embedded = self.dropout(self.embedding(input_token))
        # embedded: [batch_size, 1, embed_dim]

        output, hidden = self.rnn(embedded, hidden)
        # output: [batch_size, 1, hidden_dim]
        # hidden: [num_layers, batch_size, hidden_dim]

        # Predict next token
        prediction = self.fc_out(output.squeeze(1))
        # prediction: [batch_size, vocab_size]

        return prediction, hidden


class Decoder2RNN(nn.Module):
    # TODO: Implement Decoder with 2 GRUs (Part B2, Task 6)
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=word2idx[PAD_TOKEN])
        # Two GRU layers
        self.rnn1 = nn.GRU(embed_dim, hidden_dim, num_layers=1, batch_first=True) # First layer
        self.rnn2 = nn.GRU(hidden_dim, hidden_dim, num_layers=1, batch_first=True) # Second layer
        self.fc_out = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden):
        # input_token: [batch_size]
        # hidden: Tuple of hidden states (hidden1, hidden2) or a single tensor to split?
        # Assuming hidden is [num_layers=2, batch_size, hidden_dim] matching encoder output
        # If encoder outputs [1, batch, dim], need to duplicate/adapt for 2 layers

        if hidden.shape[0] != 2:
             # Handle mismatch if encoder provides only 1 layer state
             # Option: duplicate encoder's last layer state
             if hidden.shape[0] == 1:
                 hidden = hidden.repeat(2, 1, 1)
             else: # Or just take first 2 layers if encoder provided more
                 hidden = hidden[:2]

        # Split hidden state for each GRU layer
        hidden1 = hidden[0:1] # Shape [1, batch, dim]
        hidden2 = hidden[1:2] # Shape [1, batch, dim]

        input_token = input_token.unsqueeze(1)
        # input_token: [batch_size, 1]

        embedded = self.dropout(self.embedding(input_token))
        # embedded: [batch_size, 1, embed_dim]

        output1, hidden1_out = self.rnn1(embedded, hidden1)
        # output1: [batch_size, 1, hidden_dim]
        # hidden1_out: [1, batch_size, hidden_dim]

        output2, hidden2_out = self.rnn2(output1, hidden2) # Feed output of rnn1 to rnn2
        # output2: [batch_size, 1, hidden_dim]
        # hidden2_out: [1, batch_size, hidden_dim]

        prediction = self.fc_out(output2.squeeze(1))
        # prediction: [batch_size, vocab_size]

        # Combine hidden states for next step
        updated_hidden = torch.cat((hidden1_out, hidden2_out), dim=0) # Shape [2, batch, dim]

        return prediction, updated_hidden


class Seq2seqRNN(nn.Module):
    def __init__(self, encoder, decoder, device, bos_idx, eos_idx, pad_idx):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.bos_idx = bos_idx
        self.eos_idx = eos_idx
        self.pad_idx = pad_idx

    def load_embeddings(self, embedding_matrix, freeze=False):
        """Loads pre-trained embeddings into the encoder."""
        if embedding_matrix is None:
            print("Embedding matrix is None, cannot load.")
            return
        if hasattr(self.encoder, 'embedding'):
            self.encoder.embedding.weight.data.copy_(embedding_matrix)
            self.encoder.embedding.weight.requires_grad = not freeze
            print(f"Loaded embeddings into encoder. Freeze: {freeze}")
        else:
            print("Encoder does not have an 'embedding' attribute to load into.")

    def forward(self, src, src_len, trg):
        # src: [batch_size, src_len]
        # src_len: [batch_size]
        # trg: [batch_size, trg_len]

        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.vocab_size

        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        # Encode source sequence
        encoder_outputs, hidden = self.encoder(src, src_len)
        # encoder_outputs: potentially used for attention later
        # hidden: final hidden state(s) to initialize decoder

        # Decoder input starts with <bos> token
        input_token = trg[:, 0] # [batch_size]

        # Teacher Forcing: Feed target sequence token-by-token
        for t in range(1, trg_len): # Start from 1, predict token t based on input t-1
            output, hidden = self.decoder(input_token, hidden)

            # Store prediction
            outputs[:, t, :] = output

            # Get next input token from target sequence
            input_token = trg[:, t]

        # outputs: [batch_size, trg_len, vocab_size] - contains logits starting from index 1
        return outputs

    def generate(self, src, src_len, max_len, beam_width=1):
        """Generate titles for source sequences."""
        if beam_width == 1:
            return self._generate_greedy(src, src_len, max_len)
        else:
            return self._generate_beam_search(src, src_len, max_len, beam_width)

    def _generate_greedy(self, src, src_len, max_len):
        self.eval() # Set to evaluation mode
        batch_size = src.shape[0]
        generated_sequences = torch.zeros(batch_size, max_len, dtype=torch.long).fill_(self.pad_idx).to(self.device)
        generated_sequences[:, 0] = self.bos_idx # Start with <bos>

        with torch.no_grad():
            _, hidden = self.encoder(src, src_len)
            input_token = torch.tensor([self.bos_idx] * batch_size).to(self.device) # [batch_size]

            # Keep track of which sequences have finished (hit <eos>)
            finished = torch.zeros(batch_size, dtype=torch.bool).to(self.device)

            for t in range(1, max_len):
                output, hidden = self.decoder(input_token, hidden) # output: [batch_size, vocab_size]

                # Get the token with the highest probability (greedy)
                top1 = output.argmax(1) # [batch_size]

                # Update generated sequences for non-finished ones
                generated_sequences[~finished, t] = top1[~finished]

                # Update input token for next step (only for non-finished)
                input_token = top1

                # Update finished mask
                just_finished = (top1 == self.eos_idx)
                finished |= just_finished

                # Stop if all sequences have finished
                if finished.all():
                    break

        # Return sequences (excluding the initial <bos> potentially)
        return generated_sequences

    def _generate_beam_search(self, src, src_len, max_len, beam_width):
        # TODO: Implement Beam Search (Part B2, Task 7)
        self.eval()
        batch_size = src.shape[0]
        # This is significantly more complex than greedy search.
        # It involves maintaining `beam_width` candidate sequences per batch item,
        # along with their cumulative log-probabilities.
        # Need data structures to manage beams, scores, hidden states per beam.

        # Placeholder - Returning greedy results for now
        print("Warning: Beam search not implemented yet. Returning greedy results.")
        return self._generate_greedy(src, src_len, max_len)

        # Key steps for beam search:
        # 1. Encode source: Get initial hidden state.
        # 2. Initialize beams: Start with <bos> token, score 0. Maintain k beams.
        # 3. Loop t = 1 to max_len:
        #    a. For each beam candidate from previous step:
        #       i. Feed its last token and hidden state to decoder. Get output logits.
        #       ii. Calculate log probabilities (log_softmax).
        #       iii. Get top-k next token candidates and their probabilities.
        #       iv. Calculate cumulative scores for new potential beams (prev_score + new_log_prob).
        #    b. Collect all potential new beams across all current beams.
        #    c. Select top-k overall beams based on cumulative scores. Prune the rest.
        #    d. Handle finished sequences (<eos>): Store completed beams separately. Continue expanding others.
        # 4. Return the best completed beam(s).


# --- Training and Evaluation Functions ---

def train_epoch(model, dataloader, optimizer, criterion, clip, device):
    model.train()
    epoch_loss = 0
    num_batches = len(dataloader)

    for i, (src, src_len, trg, trg_len) in enumerate(dataloader):
        src, src_len, trg = src.to(device), src_len.to(device), trg.to(device)

        optimizer.zero_grad()

        # Shape: [batch_size, trg_len, vocab_size]
        output = model(src, src_len, trg)

        # Reshape for loss calculation:
        # output: [(batch_size * (trg_len - 1)), vocab_size]
        # trg: [batch_size * (trg_len - 1)]
        # Ignore the first token (<bos>) in target and output
        output_dim = output.shape[-1]
        output = output[:, 1:, :].contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output, trg)
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()

        if (i + 1) % REPORT_INTERVAL == 0:
            print(f'Batch {i+1}/{num_batches} | Loss: {loss.item():.4f}')

    return epoch_loss / num_batches

def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, src_len, trg, trg_len in dataloader:
            src, src_len, trg = src.to(device), src_len.to(device), trg.to(device)

            output = model(src, src_len, trg) # [batch, trg_len, vocab_size]

            output_dim = output.shape[-1]
            output = output[:, 1:, :].contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

def predict_and_evaluate_rouge(model, dataloader, idx2word, device, generation_params):
    model.eval()
    all_predictions = []
    all_references = []
    pad_idx = word2idx[PAD_TOKEN]
    eos_idx = word2idx[EOS_TOKEN]
    bos_idx = word2idx[BOS_TOKEN]

    print(f"Generating predictions with params: {generation_params}")
    with torch.no_grad():
        for src, src_len, trg, trg_len in dataloader: # Use test loader here
            src, src_len = src.to(device), src_len.to(device)

            # Generate predictions (indices)
            # Shape: [batch_size, max_len]
            generated_indices = model.generate(src, src_len, **generation_params)

            # Convert indices to text
            for i in range(generated_indices.shape[0]):
                # Predicted sequence
                pred_seq = []
                for idx in generated_indices[i].tolist():
                    if idx == bos_idx: continue # Skip <bos>
                    if idx == eos_idx: break # Stop at <eos>
                    if idx == pad_idx: continue # Skip <pad>
                    pred_seq.append(idx2word.get(idx, UNK_TOKEN))
                all_predictions.append(" ".join(pred_seq))

                # Reference sequence (from the batch)
                ref_seq = []
                # trg is padded, use trg_len to get original length
                # Correct indexing needed - trg includes BOS/EOS
                actual_trg_len = trg_len[i].item()
                for idx in trg[i, 1:actual_trg_len-1].tolist(): # Exclude BOS and EOS
                     if idx == pad_idx: break # Should not happen if length is correct
                     ref_seq.append(idx2word.get(idx, UNK_TOKEN))
                all_references.append(" ".join(ref_seq))

    print(f"Generated {len(all_predictions)} predictions.")
    # Example outputs:
    if len(all_predictions) > 0 and len(all_references) > 0:
        print("Example Prediction :", all_predictions[0])
        print("Example Reference  :", all_references[0])

    # Calculate ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    num_samples = len(all_predictions)

    if num_samples == 0:
        return {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}

    for pred, ref in zip(all_predictions, all_references):
        scores = scorer.score(ref, pred)
        total_scores['rouge1'] += scores['rouge1'].fmeasure
        total_scores['rouge2'] += scores['rouge2'].fmeasure
        total_scores['rougeL'] += scores['rougeL'].fmeasure

    avg_scores = {metric: score / num_samples for metric, score in total_scores.items()}
    print(f"ROUGE Scores: R1: {avg_scores['rouge1']:.4f}, R2: {avg_scores['rouge2']:.4f}, RL: {avg_scores['rougeL']:.4f}")
    return avg_scores

In [3]:
# --- New Helper Function to Train/Evaluate One Configuration ---

def run_experiment(
    exp_name,
    encoder_type,
    decoder_type,
    use_glove,
    glove_freeze,
    beam_width,
    word2idx,
    idx2word,
    train_dataloader,
    val_dataloader,
    test_dataloader,
    device,
    config_params, # Dictionary containing EMBED_DIM, HIDDEN_DIM, etc.
    skip_training=False, # New flag to skip training
    load_model_path=None # New flag to load a specific model path
    ):
    """Trains and evaluates a single model configuration."""

    exp_start_time = time.time()
    print(f"\n--- Running Experiment: {exp_name} ---")
    print(f"Config: Enc={encoder_type}, Dec={decoder_type}, GloVe={use_glove}, Beam={beam_width}")

    vocab_size = len(word2idx)
    PAD_IDX = word2idx[PAD_TOKEN]
    BOS_IDX = word2idx[BOS_TOKEN]
    EOS_IDX = word2idx[EOS_TOKEN]

    # --- Instantiate Models ---
    # Encoder
    if encoder_type == 'basic':
        encoder = EncoderRNN(vocab_size, config_params['EMBED_DIM'], config_params['HIDDEN_DIM'],
                             config_params['NUM_LAYERS_ENC'], config_params['DROPOUT'], bidirectional=True).to(device)
    elif encoder_type == 'hierarchical':
         try:
             encoder = HierEncoderRNN(vocab_size, config_params['EMBED_DIM'], config_params['HIDDEN_DIM'],
                                    num_layers_word=1, num_layers_sent=1, dropout=config_params['DROPOUT']).to(device)
             # Trigger the check - this will raise NotImplementedError if not implemented
             _ = encoder(torch.randint(0,10,(2,10)).to(device), torch.tensor([10,8]))
         except NotImplementedError as e:
             print(f"Skipping {exp_name}: Hierarchical Encoder not implemented. {e}")
             return { 'status': 'skipped', 'reason': 'Hierarchical Encoder not implemented' }
         except Exception as e: # Catch other potential errors in placeholder
             print(f"Skipping {exp_name}: Error initializing/testing HierEncoderRNN placeholder. {e}")
             return { 'status': 'skipped', 'reason': f'HierEncoderRNN placeholder error: {e}' }
    else:
        raise ValueError(f"Unknown encoder_type: {encoder_type}")

    # Decoder
    decoder_hidden_dim = config_params['HIDDEN_DIM'] # Adjust if encoder processing changes hidden dim
    if decoder_type == 'basic':
        decoder = DecoderRNN(vocab_size, config_params['EMBED_DIM'], decoder_hidden_dim,
                           config_params['NUM_LAYERS_DEC'], config_params['DROPOUT']).to(device)
    elif decoder_type == 'decoder2':
        decoder = Decoder2RNN(vocab_size, config_params['EMBED_DIM'], decoder_hidden_dim,
                            config_params['DROPOUT']).to(device)
    else:
        raise ValueError(f"Unknown decoder_type: {decoder_type}")

    # Seq2Seq Model
    model = Seq2seqRNN(encoder, decoder, device, BOS_IDX, EOS_IDX, PAD_IDX).to(device)

    # --- Handle Loading/Skipping Training ---
    model_save_path = os.path.join(config_params['MODEL_SAVE_DIR'], f"{exp_name}_model.pt")
    train_time = 0
    best_valid_loss = float('inf')
    train_losses = []
    valid_losses = []


    if load_model_path:
        print(f"Loading pre-trained model from: {load_model_path}")
        try:
            # Apply the fix here!
            model.load_state_dict(torch.load(load_model_path, map_location=device, weights_only=True))
            print("Model loaded successfully.")
            skip_training = True # Force skip training if loading specific path
        except FileNotFoundError:
            print(f"Error: Model file not found at {load_model_path}. Cannot evaluate.")
            return { 'status': 'error', 'reason': f'Model file not found: {load_model_path}' }
        except Exception as e:
            print(f"Error loading model state dict from {load_model_path}: {e}")
            return { 'status': 'error', 'reason': f'Error loading state dict: {e}' }

    elif skip_training:
        print("Skipping training as requested.")
        # Attempt to load if a saved model for *this* experiment exists
        if os.path.exists(model_save_path):
             print(f"Found existing model file for this config: {model_save_path}. Loading...")
             try:
                 # Apply the fix here!
                 model.load_state_dict(torch.load(model_save_path, map_location=device, weights_only=True))
             except Exception as e:
                 print(f"Warning: Error loading existing model {model_save_path}: {e}. Evaluation might fail.")
        else:
             print(f"Warning: Skipping training, but no model file found at {model_save_path}. Evaluation might fail.")


    # --- Training ---
    if not skip_training:
        # Load GloVe if needed
        if use_glove:
            glove_matrix = load_glove_embeddings(config_params['GLOVE_FILE'], word2idx, config_params['EMBED_DIM'])
            if glove_matrix is not None:
                model.load_embeddings(glove_matrix.to(device), freeze=glove_freeze)
            else:
                print("GloVe embeddings not loaded, continuing with random initialization.")

        # Optimizer and Criterion
        optimizer = optim.Adam(model.parameters(), lr=config_params['LEARNING_RATE'])
        criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

        print("Starting training...")
        train_loop_start = time.time()
        for epoch in range(config_params['NUM_EPOCHS']):
            epoch_start_time = time.time()

            train_loss = train_epoch(model, train_dataloader, optimizer, criterion, config_params['CLIP_GRAD'], device)
            valid_loss = evaluate(model, val_dataloader, criterion, device)

            train_losses.append(train_loss)
            valid_losses.append(valid_loss)

            epoch_end_time = time.time()
            epoch_mins, epoch_secs = divmod(epoch_end_time - epoch_start_time, 60)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                print(f"Epoch {epoch+1}/{config_params['NUM_EPOCHS']} | Time: {epoch_mins:.0f}m {epoch_secs:.0f}s | Saving Best Model...")
                torch.save(model.state_dict(), model_save_path) # Save the best model`
            else:
                 print(f"Epoch {epoch+1}/{config_params['NUM_EPOCHS']} | Time: {epoch_mins:.0f}m {epoch_secs:.0f}s")

            print(f'\tTrain Loss: {train_loss:.4f} | Val. Loss: {valid_loss:.4f}')

        train_time = time.time() - train_loop_start
        print(f"Training finished. Time taken: {train_time:.2f} seconds")

        # Load best model for evaluation
        if os.path.exists(model_save_path):
            print(f"Loading best trained model from {model_save_path}")
            try:
                # Apply the fix here!
                model.load_state_dict(torch.load(model_save_path, map_location=device, weights_only=True))
            except Exception as e:
                print(f"Warning: Error loading best model state dict: {e}. Evaluating with last epoch's model.")
        else:
            print("Warning: Best model file not found after training. Evaluating with last epoch's model.")

    # --- Evaluation ---
    eval_start_time = time.time()
    generation_params = {
        'max_len': config_params['MAX_NEW_TOKENS'],
        'beam_width': beam_width
    }
    # Check if beam search is requested and implemented
    if beam_width > 1:
         try:
             # Attempt generation - this will raise NotImplementedError if not implemented
             _ = model.generate(torch.randint(0,10,(1,5)).to(device), torch.tensor([5]), max_len=5, beam_width=beam_width)
             print(f"Evaluating with Beam Search (Width={beam_width})...")
         except NotImplementedError as e:
              print(f"Cannot evaluate {exp_name} with Beam Search: Beam search not implemented. {e}")
              return { 'status': 'skipped', 'reason': 'Beam search not implemented' }
         except Exception as e: # Catch other potential errors
              print(f"Cannot evaluate {exp_name} with Beam Search: Error during generation call. {e}")
              return { 'status': 'error', 'reason': f'Beam search generation error: {e}' }
    else:
         print("Evaluating with Greedy Search...")


    rouge_scores = predict_and_evaluate_rouge(model, test_dataloader, idx2word, device, generation_params)
    eval_time = time.time() - eval_start_time

    exp_end_time = time.time()
    total_exp_time = exp_end_time - exp_start_time

    # --- Store Results ---
    results_data = {
        'status': 'completed',
        'exp_name': exp_name,
        'encoder': encoder_type,
        'decoder': decoder_type,
        'glove': use_glove,
        'beam_width': beam_width,
        'train_time_s': train_time,
        'eval_time_s': eval_time,
        'total_time_s': total_exp_time,
        'best_val_loss': best_valid_loss if not skip_training else 'N/A',
        'rouge1': rouge_scores['rouge1'],
        'rouge2': rouge_scores['rouge2'],
        'rougeL': rouge_scores['rougeL'],
        'train_losses': train_losses, # Store loss history
        'valid_losses': valid_losses,
        'model_path': model_save_path if not skip_training else load_model_path
    }
    print(f"--- Experiment {exp_name} Finished --- Time: {total_exp_time:.2f}s | ROUGE-L: {rouge_scores['rougeL']:.4f}")
    return results_data


# --- Main Execution Block ---
if __name__ == "__main__":
    main_start_time = time.time()
    print(f"Main execution started. Using device: {DEVICE}")
    print("NOTE: Experiments will run sequentially in this script.")
    print("      For parallel execution on Kaggle, create separate notebooks for each part and run them concurrently.")

    # --- Collect All Config Params into a dictionary ---
    # Allows passing all hyperparameters easily to the run_experiment function
    config_params = {
        'TRAIN_PROC_FILE': TRAIN_PROC_FILE,
        'VAL_PROC_FILE': VAL_PROC_FILE,
        'TEST_PROC_FILE': TEST_PROC_FILE,
        'VOCAB_FILE': VOCAB_FILE,
        'GLOVE_FILE': GLOVE_FILE,
        'MODEL_SAVE_DIR': MODEL_SAVE_DIR,
        'VOCAB_FREQ_THRESHOLD_PERCENT': VOCAB_FREQ_THRESHOLD_PERCENT,
        'PAD_TOKEN': PAD_TOKEN, 'UNK_TOKEN': UNK_TOKEN, 'BOS_TOKEN': BOS_TOKEN, 'EOS_TOKEN': EOS_TOKEN,
        'HIDDEN_DIM': HIDDEN_DIM, 'EMBED_DIM': EMBED_DIM,
        'NUM_LAYERS_ENC': NUM_LAYERS_ENC, 'NUM_LAYERS_DEC': NUM_LAYERS_DEC,
        'DROPOUT': DROPOUT, 'LEARNING_RATE': LEARNING_RATE,
        'BATCH_SIZE': BATCH_SIZE, 'NUM_EPOCHS': NUM_EPOCHS, 'CLIP_GRAD': CLIP_GRAD,
        'MAX_NEW_TOKENS': MAX_NEW_TOKENS, 'BEAM_WIDTH': BEAM_WIDTH,
        'SEED': SEED, 'DEVICE': DEVICE, 'REPORT_INTERVAL': REPORT_INTERVAL
    }
    # Ensure model save directory exists
    os.makedirs(config_params['MODEL_SAVE_DIR'], exist_ok=True)

    # --- 1. Build or Load Vocabulary ---
    vocab_start_time = time.time()
    word2idx, idx2word = build_or_load_vocab(
        config_params['TRAIN_PROC_FILE'],
        config_params['VOCAB_FREQ_THRESHOLD_PERCENT'],
        save_path=config_params['VOCAB_FILE']
    )
    print(f"Time taken for vocabulary: {time.time() - vocab_start_time:.2f} seconds")

    # --- 2. Create Datasets and DataLoaders ---
    data_load_start_time = time.time()
    # Create datasets (handle potential errors)
    try:
        train_dataset = WikipediaDataset(config_params['TRAIN_PROC_FILE'], word2idx)
        val_dataset = WikipediaDataset(config_params['VAL_PROC_FILE'], word2idx)
        test_dataset = WikipediaDataset(config_params['TEST_PROC_FILE'], word2idx)
    except FileNotFoundError as e:
        print(f"Error: Processed data file not found: {e}. Did Task A run successfully?")
        exit()
    except Exception as e:
        print(f"Error creating datasets: {e}")
        exit()

    # Create dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size=config_params['BATCH_SIZE'], shuffle=True, collate_fn=collate_fn, num_workers=2, pin_memory=True)
    val_dataloader = DataLoader(val_dataset, batch_size=config_params['BATCH_SIZE'], shuffle=False, collate_fn=collate_fn, num_workers=2, pin_memory=True)
    test_dataloader = DataLoader(test_dataset, batch_size=config_params['BATCH_SIZE'], shuffle=False, collate_fn=collate_fn, num_workers=2, pin_memory=True)
    print(f"Time taken for data loading: {time.time() - data_load_start_time:.2f} seconds")

    # --- Run Experiments Sequentially ---
    all_results = {}
    basic_rnn_model_path = None # To store path for beam search eval

    # === Part 1: Basic RNN (Greedy) ===
    results = run_experiment(
        exp_name='BasicRNN',
        encoder_type='basic',
        decoder_type='basic',
        use_glove=False,
        glove_freeze=False, # Not applicable
        beam_width=1, # Greedy
        word2idx=word2idx, idx2word=idx2word,
        train_dataloader=train_dataloader, val_dataloader=val_dataloader, test_dataloader=test_dataloader,
        device=DEVICE, config_params=config_params
    )
    all_results['BasicRNN'] = results
    if results.get('status') == 'completed':
        basic_rnn_model_path = results.get('model_path') # Save path for later

    # === Part 2: RNN + GloVe (Greedy) ===
    results = run_experiment(
        exp_name='BasicRNN_GloVe',
        encoder_type='basic',
        decoder_type='basic',
        use_glove=True,
        glove_freeze=False, # Example: Fine-tune GloVe embeddings
        beam_width=1, # Greedy
        word2idx=word2idx, idx2word=idx2word,
        train_dataloader=train_dataloader, val_dataloader=val_dataloader, test_dataloader=test_dataloader,
        device=DEVICE, config_params=config_params
    )
    all_results['BasicRNN_GloVe'] = results

    # === Part 3: RNN + Hierarchical Encoder (Attempt/Skip) ===
    results = run_experiment(
        exp_name='HierEncRNN',
        encoder_type='hierarchical', # This will likely trigger the skip
        decoder_type='basic',
        use_glove=False,
        glove_freeze=False,
        beam_width=1,
        word2idx=word2idx, idx2word=idx2word,
        train_dataloader=train_dataloader, val_dataloader=val_dataloader, test_dataloader=test_dataloader,
        device=DEVICE, config_params=config_params
    )
    all_results['HierEncRNN'] = results

    # === Part 4: RNN + Deeper Decoder (Greedy) ===
    results = run_experiment(
        exp_name='Decoder2RNN',
        encoder_type='basic',
        decoder_type='decoder2', # Use the deeper decoder
        use_glove=False,
        glove_freeze=False,
        beam_width=1, # Greedy
        word2idx=word2idx, idx2word=idx2word,
        train_dataloader=train_dataloader, val_dataloader=val_dataloader, test_dataloader=test_dataloader,
        device=DEVICE, config_params=config_params
    )
    all_results['Decoder2RNN'] = results

    # === Part 5: Basic RNN + Beam Search ===
    if basic_rnn_model_path and os.path.exists(basic_rnn_model_path):
        results = run_experiment(
            exp_name='BasicRNN_Beam',
            encoder_type='basic', # Must match the model being loaded
            decoder_type='basic', # Must match the model being loaded
            use_glove=False,      # Config doesn't matter when loading state dict directly
            glove_freeze=False,
            beam_width=config_params['BEAM_WIDTH'], # Use beam search width
            word2idx=word2idx, idx2word=idx2word,
            train_dataloader=train_dataloader, val_dataloader=val_dataloader, test_dataloader=test_dataloader,
            device=DEVICE, config_params=config_params,
            skip_training=True,             # Skip training phase
            load_model_path=basic_rnn_model_path # Load the previously trained basic model
        )
        all_results['BasicRNN_Beam'] = results
    else:
        print("\nSkipping Beam Search evaluation because the BasicRNN model was not trained successfully or path not found.")
        all_results['BasicRNN_Beam'] = {'status': 'skipped', 'reason': 'BasicRNN model path not available'}


    # --- Print Summary ---
    print("\n\n" + "="*50)
    print(" Final Results Summary (Part B)")
    print("="*50)

    summary_data = []
    for name, res in all_results.items():
         if res.get('status') == 'completed':
             summary_data.append({
                 'Experiment': name,
                 'Encoder': res.get('encoder', 'N/A'),
                 'Decoder': res.get('decoder', 'N/A'),
                 'GloVe': res.get('glove', 'N/A'),
                 'BeamWidth': res.get('beam_width', 'N/A'),
                 'ROUGE-1': res.get('rouge1', 0.0),
                 'ROUGE-2': res.get('rouge2', 0.0),
                 'ROUGE-L': res.get('rougeL', 0.0),
                 'Eval Time (s)': res.get('eval_time_s', 0.0),
                 'Train Time (s)': res.get('train_time_s', 0.0),
                 'Total Time (s)': res.get('total_time_s', 0.0),
             })
         else:
             summary_data.append({
                 'Experiment': name,
                 'Status': res.get('status', 'unknown'),
                 'Reason': res.get('reason', 'N/A')
             })

    if summary_data:
        summary_df = pd.DataFrame(summary_data)
        print(summary_df.to_markdown(index=False, floatfmt=".4f"))
        # Save summary to CSV
        summary_df.to_csv(os.path.join(config_params['MODEL_SAVE_DIR'], 'taskB_results_summary.csv'), index=False)
    else:
        print("No experiments completed successfully.")


    main_end_time = time.time()
    total_script_time = main_end_time - main_start_time
    print(f"\nTotal script execution time: {total_script_time:.2f} seconds ({total_script_time/60:.2f} minutes)")
    print("--- Task B Complete ---")

Main execution started. Using device: cuda
NOTE: Experiments will run sequentially in this script.
      For parallel execution on Kaggle, create separate notebooks for each part and run them concurrently.
Building vocabulary from training data...
Built vocabulary with 8274 tokens (appeared in >= 1.00% of docs).
Error saving vocabulary: [Errno 30] Read-only file system: '/kaggle/input/processedassignment2datanlp/vocabulary.pkl'
Time taken for vocabulary: 5.63 seconds
Time taken for data loading: 1.43 seconds

--- Running Experiment: BasicRNN ---
Config: Enc=basic, Dec=basic, GloVe=False, Beam=1
Starting training...
Batch 50/414 | Loss: 3.6607
Batch 100/414 | Loss: 2.5882
Batch 150/414 | Loss: 2.5471
Batch 200/414 | Loss: 2.8385
Batch 250/414 | Loss: 2.9977
Batch 300/414 | Loss: 3.0158
Batch 350/414 | Loss: 2.0967
Batch 400/414 | Loss: 2.6329
Epoch 1/5 | Time: 3m 22s | Saving Best Model...
	Train Loss: 2.9897 | Val. Loss: 2.1161
Batch 50/414 | Loss: 1.8913
Batch 100/414 | Loss: 1.4383
B