In [3]:
# --- Part C: Imports and Constants ---

import torch
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer,
    BertForSequenceClassification, # Using this simplifies the model definition
    get_linear_schedule_with_warmup # Optional: for learning rate scheduling
)
from tqdm.auto import tqdm # Use tqdm.auto for notebook compatibility


# --- Constants ---
# Single place for every tunable value used by the Part C cells.
# Each constant is defined exactly once so a later edit cannot be silently
# overridden by a duplicate further down the cell.

# Input data
PART_C_INPUT_CSV = "/kaggle/input/kakaka/part_c_train.csv" # <--- SET THIS PATH
INPUT_TEXT_COL_NAME = 'input_text' # The actual name of the input text column
LABEL_COL_NAME = 'output_label' # The actual name of the label column

# Data split ratios. Both are fractions of the full cleaned dataset: the split
# cell first holds out (VALIDATION_SIZE + TEST_SIZE) of all rows and then
# divides that hold-out between validation and test.
TEST_SIZE = 0.20       # 20% of all rows for the test set
VALIDATION_SIZE = 0.10 # 10% of all rows for the validation set

# Random state for reproducibility
RANDOM_SEED = 42

# Model / device
BERT_MODEL_NAME = "google-bert/bert-base-uncased"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameters (can be adjusted later)
MAX_LEN = 128       # Max sequence length for BERT tokenizer
BATCH_SIZE = 16     # Adjust based on GPU memory
EPOCHS = 4          # Number of training epochs
LEARNING_RATE = 2e-5 # Common learning rate for BERT fine-tuning

# Label mapping
LABEL_MAP = {'SmolVLM': 0, 'Custom': 1} # Map model type strings to integers

print(f"Using device: {DEVICE}")
print(f"BERT Model: {BERT_MODEL_NAME}")
# Check the same path the loading cell reads (PART_C_INPUT_CSV), so this
# message cannot disagree with what is actually loaded.
if os.path.exists(PART_C_INPUT_CSV):
    print(f"Input Data CSV: {PART_C_INPUT_CSV}")
else:
    print(f"WARNING: Input CSV not found at {PART_C_INPUT_CSV}. Data loading will fail.")

Using device: cuda
BERT Model: google-bert/bert-base-uncased
Input Data CSV: /kaggle/input/kakaka/part_c_train.csv


In [4]:
# --- Part C: Load Data and Split (RANDOM ROW SPLIT ) ---
import pandas as pd
import os
from sklearn.model_selection import train_test_split


print(f"\nLoading pre-formatted data from: {PART_C_INPUT_CSV}")
print("WARNING: Performing RANDOM ROW SPLIT, not image-based split.")

# Any load failure is reported and replaced by an empty frame, so the
# following steps can branch on `df.empty` instead of crashing here.
try:
    df = pd.read_csv(PART_C_INPUT_CSV)
    print(f"Loaded data. Shape: {df.shape}")
except Exception as load_error:
    if isinstance(load_error, FileNotFoundError):
        print(f"ERROR: Input CSV not found at {PART_C_INPUT_CSV}")
    else:
        print(f"Error loading CSV: {load_error}")
    df = pd.DataFrame()

# --- Basic Data Cleaning ---
# Drops rows with missing/blank text or missing/non-numeric labels and casts
# the label column to int. Written as a single non-mutating chain: every step
# returns a new frame, so no column is ever assigned on a filtered slice
# (the pattern behind pandas' SettingWithCopyWarning) and the loaded frame is
# never modified in place.
if not df.empty:
    print("\nCleaning data...")
    required_cols = [INPUT_TEXT_COL_NAME, LABEL_COL_NAME]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns in CSV: {missing_cols}")

    initial_rows = len(df)
    df = (
        df
        # 1) rows missing either the text or the label
        .dropna(subset=required_cols)
        # 2) rows whose text is empty or whitespace-only
        .loc[lambda frame: frame[INPUT_TEXT_COL_NAME].astype(str).str.strip().str.len() > 0]
        # 3) labels that are not numeric become NaN ...
        .assign(**{LABEL_COL_NAME: lambda frame: pd.to_numeric(frame[LABEL_COL_NAME], errors='coerce')})
        # ... and are dropped
        .dropna(subset=[LABEL_COL_NAME])
        # 4) remaining labels are stored as integers
        .astype({LABEL_COL_NAME: int})
    )
    cleaned_rows = len(df)
    print(f"Rows before cleaning: {initial_rows}, Rows after cleaning: {cleaned_rows}")
    if cleaned_rows == 0:
        print("ERROR: Dataframe is empty after cleaning.")

# --- Random Row-Based Train/Validation/Test Split ---
# Empty placeholders so the three frames always exist, even when nothing is split.
df_train, df_val, df_test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

if df.empty:
    print("Skipping split as DataFrame is empty.")
else:
    print("\nPerforming random row-based Train/Val/Test split...")

    # Stage 1: separate the training rows from the combined validation+test hold-out.
    holdout_fraction = VALIDATION_SIZE + TEST_SIZE
    label_series = df[LABEL_COL_NAME]
    df_train, df_temp = train_test_split(
        df,
        test_size=holdout_fraction,
        random_state=RANDOM_SEED,
        # Stratify only when more than one class is present
        stratify=label_series if label_series.nunique() > 1 else None,
    )

    # Stage 2: divide the hold-out into validation and test.
    # When the hold-out is empty, df_val / df_test stay as the empty placeholders.
    if not df_temp.empty:
        # Test share expressed relative to the hold-out rather than the full dataset
        test_share_of_holdout = TEST_SIZE / holdout_fraction if holdout_fraction > 0 else 0.0
        holdout_labels = df_temp[LABEL_COL_NAME]
        df_val, df_test = train_test_split(
            df_temp,
            test_size=test_share_of_holdout,
            random_state=RANDOM_SEED,
            stratify=holdout_labels if holdout_labels.nunique() > 1 else None,
        )

    # The Dataset class expects columns named 'input_text' and 'label'.
    canonical_names = {INPUT_TEXT_COL_NAME: 'input_text', LABEL_COL_NAME: 'label'}
    df_train, df_val, df_test = (
        frame.rename(columns=canonical_names) for frame in (df_train, df_val, df_test)
    )

    print(f"Data split complete:")
    print(f"  Train set rows: {len(df_train)}")
    print(f"  Validation set rows: {len(df_val)}")
    print(f"  Test set rows: {len(df_test)}")

    # --- Check Test Set Label Distribution ---
    if df_test.empty:
        print("\nWarning: Cannot check test set label distribution (df_test is empty).")
    else:
        print("\n--- Test Set Label Distribution ---")
        print(df_test['label'].value_counts())
        num_unique_labels_test = df_test['label'].nunique()
        print(f"Number of unique labels in test set: {num_unique_labels_test}")
        if num_unique_labels_test >= 2:
            print("Test set contains samples from both classes.")
        else:
            print("\nWARNING: Test set contains samples from only ONE class after random split.")

print("\nData ready for Dataset creation.")


Loading pre-formatted data from: /kaggle/input/kakaka/part_c_train.csv
Loaded data. Shape: (10624, 2)

Cleaning data...
Rows before cleaning: 10624, Rows after cleaning: 10624

Performing random row-based Train/Val/Test split...
Data split complete:
  Train set rows: 7436
  Validation set rows: 1062
  Test set rows: 2126

--- Test Set Label Distribution ---
label
1    1063
0    1063
Name: count, dtype: int64
Number of unique labels in test set: 2
Test set contains samples from both classes.

Data ready for Dataset creation.


In [5]:
# --- Part C: PyTorch Dataset ---

# Ensure BertTokenizer is imported
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset

class CaptionClassificationDataset(Dataset):
    """
    PyTorch Dataset for the caption classification task.

    Wraps a DataFrame with 'input_text' and 'label' columns and tokenizes one
    row per `__getitem__` call, padding/truncating to a fixed length.
    """
    def __init__(self, dataframe, tokenizer, max_len):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame for the split (train, val, or test)
                                      containing 'input_text' and 'label' columns.
            tokenizer: Callable tokenizer (e.g. an initialized BertTokenizer).
            max_len (int): Maximum sequence length for tokenization.

        Raises:
            ValueError: If either required column is missing.
        """
        if 'input_text' not in dataframe.columns or 'label' not in dataframe.columns:
             raise ValueError("DataFrame must contain 'input_text' and 'label' columns.")

        self.tokenizer = tokenizer
        self.data = dataframe
        # Bracket access always selects the column; attribute access can be
        # shadowed by DataFrame attributes of the same name.
        self.input_text = dataframe['input_text'].to_numpy()
        self.labels = dataframe['label'].to_numpy()
        self.max_len = max_len

    def __len__(self):
        """Returns the number of samples in the dataset."""
        return len(self.input_text)

    def __getitem__(self, index):
        """
        Tokenizes the sample at `index`.

        Returns:
            dict: 'input_text' (str), 'input_ids' and 'attention_mask'
                  (1-D tensors with the tokenizer's batch dimension removed),
                  and 'labels' (scalar long tensor).
        """
        text = str(self.input_text[index])
        label = int(self.labels[index])

        # Call the tokenizer directly: `encode_plus` is documented as
        # deprecated in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,    # Pad or truncate to max_len
            padding='max_length',       # Pad sequences to max_len
            truncation=True,            # Truncate sequences longer than max_len
            return_attention_mask=True, # Return attention mask
            return_token_type_ids=False,# Single-sequence input: segment ids are not requested
            return_tensors='pt',        # Return PyTorch tensors
        )

        return {
            'input_text': text, # Keep original text for potential debugging
            'input_ids': encoding['input_ids'].flatten(), # (1, max_len) -> (max_len)
            'attention_mask': encoding['attention_mask'].flatten(), # (1, max_len) -> (max_len)
            'labels': torch.tensor(label, dtype=torch.long) # Target label as a tensor
        }

# --- Initialize Tokenizer ---
# BERT_MODEL_NAME comes from the constants cell.
if 'BERT_MODEL_NAME' not in globals():
    raise NameError("BERT_MODEL_NAME not defined.")
print(f"\nInitializing BERT tokenizer: {BERT_MODEL_NAME}")
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
print("Tokenizer initialized.")


# --- Create Datasets ---
# df_train / df_val / df_test come from the split cell; MAX_LEN from the constants cell.
if 'MAX_LEN' not in globals():
    raise NameError("MAX_LEN not defined.")


def _make_split_dataset(split_label, frame_name):
    """Build the dataset for one split, or return None (with a message) when
    the notebook-level frame `frame_name` is undefined or empty."""
    frame_usable = frame_name in globals() and not globals()[frame_name].empty
    if not frame_usable:
        print(f"Skipping {split_label} Dataset creation ({frame_name} is empty or undefined).")
        return None

    print(f"\nCreating {split_label} Dataset...")
    built = CaptionClassificationDataset(
        dataframe=globals()[frame_name],
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )
    print(f"{split_label} Dataset created with {len(built)} samples.")
    return built


# Reset all three first so a failure part-way through never leaves a stale dataset behind.
train_dataset = None
val_dataset = None
test_dataset = None

train_dataset = _make_split_dataset("Train", "df_train")
val_dataset = _make_split_dataset("Validation", "df_val")
test_dataset = _make_split_dataset("Test", "df_test")
# To inspect a sample: `test_dataset[0]` returns a dict with
# 'input_text', 'input_ids', 'attention_mask' and 'labels'.


Initializing BERT tokenizer: google-bert/bert-base-uncased
Tokenizer initialized.

Creating Train Dataset...
Train Dataset created with 7436 samples.

Creating Validation Dataset...
Validation Dataset created with 1062 samples.

Creating Test Dataset...
Test Dataset created with 2126 samples.


In [6]:
# --- Part C: Caption Classifier Model Definition ---

# Ensure BertForSequenceClassification is imported
from transformers import BertForSequenceClassification
import torch.nn as nn

# Compulsory Class Definition: CaptionClassifier
class CaptionClassifier(nn.Module):
    """
    Thin nn.Module wrapper around Hugging Face's BertForSequenceClassification.

    The wrapped model is stored as `self.bert`; `forward` delegates to it unchanged.
    """
    def __init__(self, bert_model_name, num_labels=2):
        """
        Args:
            bert_model_name (str): Name of the pre-trained BERT checkpoint to load
                                   (e.g., 'google-bert/bert-base-uncased').
            num_labels (int): Number of output classes (2 for SmolVLM vs Custom).
        """
        super().__init__()
        print(f"Initializing CaptionClassifier with base model: {bert_model_name}")
        # `num_labels` is forwarded so the classification head is sized accordingly.
        self.bert = BertForSequenceClassification.from_pretrained(
            bert_model_name,
            num_labels=num_labels,
        )
        print(f"Loaded {bert_model_name} with classification head for {num_labels} labels.")

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run the wrapped model on a batch.

        Args:
            input_ids (torch.Tensor): Input token IDs (batch_size, seq_length).
            attention_mask (torch.Tensor): Mask of tokens to attend to (batch_size, seq_length).
            labels (torch.Tensor, optional): True labels (batch_size). Forwarded as-is;
                                             when given, the wrapped model also returns a loss.

        Returns:
            Whatever the wrapped model returns (callers in this notebook read
            `.logits` and, when labels were passed, `.loss`).
        """
        return self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

# --- Model Initialization ---
# Both names come from the constants cell; fail fast if it has not been run.
for _required_name in ("BERT_MODEL_NAME", "DEVICE"):
    if _required_name not in globals():
        raise NameError(f"{_required_name} not defined.")

print("\nInstantiating the CaptionClassifier model...")
# Two output labels: SmolVLM vs Custom.
try:
    # nn.Module.to() returns the module itself, so construction and the device move can be chained.
    model = CaptionClassifier(bert_model_name=BERT_MODEL_NAME, num_labels=2).to(DEVICE)
    print(f"CaptionClassifier model instantiated and moved to {DEVICE}.")
except Exception as init_error:
    print(f"ERROR: Failed to initialize CaptionClassifier model: {init_error}")
    model = None  # Later cells check for None before using the model


Instantiating the CaptionClassifier model...
Initializing CaptionClassifier with base model: google-bert/bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded google-bert/bert-base-uncased with classification head for 2 labels.
CaptionClassifier model instantiated and moved to cuda.


In [7]:
# --- Part C: Training Setup and Function ---

from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup # Ensure these are imported
import torch.nn as nn
import torch # Ensure torch is imported

# --- Create DataLoaders ---
# train_dataset / val_dataset come from the Dataset cell; BATCH_SIZE from the constants cell.
if 'BATCH_SIZE' not in globals():
    raise NameError("BATCH_SIZE not defined.")

train_dataloader = None
val_dataloader = None

# Training loader: shuffled.
_train_source = globals().get('train_dataset')
if _train_source is None:
    print("Skipping Training DataLoader creation (train_dataset not available).")
else:
    print("\nCreating Training DataLoader...")
    train_dataloader = DataLoader(
        _train_source,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
    )
    print(f"Training DataLoader created with {len(train_dataloader)} batches.")

# Validation loader: fixed order.
_val_source = globals().get('val_dataset')
if _val_source is None:
    print("Skipping Validation DataLoader creation (val_dataset not available).")
else:
    print("\nCreating Validation DataLoader...")
    val_dataloader = DataLoader(
        _val_source,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=2,
    )
    print(f"Validation DataLoader created with {len(val_dataloader)} batches.")


# --- Optimizer and Loss Function ---
# Requires the `model` object (CaptionClassifier instance) and LEARNING_RATE.
if 'model' in globals() and model is not None and 'LEARNING_RATE' in globals():
    print("\nSetting up optimizer and loss function...")
    # AdamW is taken from torch.optim explicitly. The bare name `AdamW` is not
    # imported by any cell shown here, so using it relied on leftover kernel state.
    # NOTE(review): torch.optim.AdamW applies weight_decay=0.01 by default; if the
    # previously used AdamW came from another library, confirm the intended value.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        eps=1e-8 # Epsilon value to prevent division by zero
    )

    # CrossEntropyLoss is created for the training/validation function signatures;
    # those functions read the loss from the model output when labels are passed.
    criterion = nn.CrossEntropyLoss().to(DEVICE) # Move loss function to device if needed (usually not)
    print("Optimizer (AdamW) and Loss Function (CrossEntropyLoss) created.")

    # Optional: Learning Rate Scheduler
    if train_dataloader:
        # One scheduler step per optimizer step: batches per epoch * epochs
        total_steps = len(train_dataloader) * EPOCHS # EPOCHS should be defined
        # Linear decay to zero over the whole run, with no warmup phase
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0, # Default: no warmup steps
            num_training_steps=total_steps
        )
        print(f"Learning rate scheduler created for {total_steps} total steps.")
    else:
        scheduler = None # No scheduler if no training data
        print("Skipping scheduler creation (no training dataloader).")

else:
    print("Skipping optimizer/loss setup (model is None or LEARNING_RATE not defined).")
    optimizer = None
    criterion = None
    scheduler = None


# --- Compulsory Function: train_classifier ---
def train_classifier(model, dataloader, optimizer, criterion, device, epoch, scheduler=None):
    """
    Run a single training epoch over `dataloader`.

    Args:
        model (nn.Module): The CaptionClassifier model.
        dataloader (DataLoader): Batches with 'input_ids', 'attention_mask' and 'labels'.
        optimizer (Optimizer): Optimizer stepped once per batch.
        criterion (Loss): Must not be None, but is not called here — the loss is
                          read from the model output.
        device (str): 'cuda' or 'cpu'.
        epoch (int): Zero-based epoch index (logged as epoch + 1).
        scheduler (LRScheduler, optional): Stepped once per batch when provided.

    Returns:
        float: Mean per-batch loss for the epoch, or float('inf') when a
               required argument is None.
    """
    if any(part is None for part in (model, dataloader, optimizer, criterion)):
        print("Error in train_classifier: Missing model, dataloader, optimizer, or criterion.")
        return float('inf')

    epoch_number = epoch + 1
    print(f"\n--- Starting Training Epoch {epoch_number} ---")
    model.train()

    running_loss = 0
    batches = tqdm(dataloader, desc=f"Epoch {epoch_number} Train", leave=False)

    for batch in batches:
        # Move only the tensors the model consumes onto the target device
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()

        # Passing labels makes the model output carry the loss
        batch_loss = model(**model_inputs).loss
        batch_loss.backward()

        # Clip the gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        if scheduler:
            scheduler.step()

        loss_value = batch_loss.item()
        running_loss += loss_value
        batches.set_postfix({'loss': loss_value})

    avg_train_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch_number} Average Training Loss: {avg_train_loss:.4f}")

    return avg_train_loss

# --- Optional: Validation Function (Recommended) ---
def validate_classifier(model, dataloader, criterion, device, epoch):
    """
    Run one validation pass without gradient tracking.

    Args:
        model (nn.Module): The CaptionClassifier model.
        dataloader (DataLoader): Batches with 'input_ids', 'attention_mask' and 'labels'.
        criterion (Loss): Must not be None, but is not called here — the loss is
                          read from the model output.
        device (str): 'cuda' or 'cpu'.
        epoch (int): Zero-based epoch index (logged as epoch + 1).

    Returns:
        tuple[float, float]: (mean per-batch loss, accuracy), or
                             (float('inf'), 0.0) when a required argument is None.
    """
    if any(part is None for part in (model, dataloader, criterion)):
        print("Error in validate_classifier: Missing model, dataloader, or criterion.")
        return float('inf'), 0.0

    epoch_number = epoch + 1
    print(f"\n--- Starting Validation Epoch {epoch_number} ---")
    model.eval()

    running_loss = 0
    num_correct = 0
    num_seen = 0
    batches = tqdm(dataloader, desc=f"Epoch {epoch_number} Val", leave=False)

    with torch.no_grad():
        for batch in batches:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            loss_value = outputs.loss.item()
            running_loss += loss_value

            # Predicted class = index of the largest logit
            predicted = outputs.logits.argmax(dim=-1)
            num_correct += (predicted == labels).sum().item()
            num_seen += labels.size(0)

            batches.set_postfix({'loss': loss_value})

    avg_val_loss = running_loss / len(dataloader)
    if num_seen > 0:
        accuracy = num_correct / num_seen
    else:
        accuracy = 0.0
    print(f"Epoch {epoch_number} Average Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

    return avg_val_loss, accuracy


Creating Training DataLoader...
Training DataLoader created with 465 batches.

Creating Validation DataLoader...
Validation DataLoader created with 67 batches.

Setting up optimizer and loss function...
Optimizer (AdamW) and Loss Function (CrossEntropyLoss) created.
Learning rate scheduler created for 1860 total steps.


In [8]:
# --- Part C: Evaluation Function ---

from sklearn.metrics import precision_recall_fscore_support # Ensure this is imported
import torch # Ensure torch is imported

# --- Create Test DataLoader ---
# test_dataset comes from the Dataset cell; BATCH_SIZE from the constants cell.
if 'BATCH_SIZE' not in globals():
    raise NameError("BATCH_SIZE not defined.")

test_dataloader = None
_test_source = globals().get('test_dataset')
if _test_source is None:
    print("Skipping Test DataLoader creation (test_dataset not available).")
else:
    print("\nCreating Test DataLoader...")
    # Fixed order: the test set is never shuffled
    test_dataloader = DataLoader(
        _test_source,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=2,
    )
    print(f"Test DataLoader created with {len(test_dataloader)} batches.")


# --- Compulsory Function: evaluate_classifier ---
def evaluate_classifier(model, dataloader, device):
    """
    Compute macro-averaged precision, recall and F1 over `dataloader`.

    Args:
        model (nn.Module): The trained CaptionClassifier model.
        dataloader (DataLoader): Batches with 'input_ids', 'attention_mask' and 'labels'.
        device (str): 'cuda' or 'cpu'.

    Returns:
        dict | None: {'precision', 'recall', 'f1'} (macro averages); all zeros when
                     no predictions were collected; None when `model` or
                     `dataloader` is None.
    """
    if model is None or dataloader is None:
        print("Error in evaluate_classifier: Missing model or dataloader.")
        return None

    print("\n--- Starting Final Evaluation on Test Set ---")
    model.eval()

    all_preds, all_labels = [], []
    batches = tqdm(dataloader, desc="Evaluating Test Set", leave=False)

    with torch.no_grad():
        for batch in batches:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are deliberately not passed: only the logits are needed here
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Predicted class = index of the largest logit
            predictions = torch.argmax(logits, dim=-1)

            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Nothing collected (e.g. the loader yielded no batches): report zeros
    if not (all_labels and all_preds):
        print("Warning: No labels or predictions collected during evaluation. Cannot calculate metrics.")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    # zero_division=0 scores a class with no predicted/true samples as 0 instead of warning
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels,
        all_preds,
        average='macro',
        zero_division=0,
    )

    print("\n--- Evaluation Metrics (Test Set) ---")
    print(f"Macro Precision: {precision:.4f}")
    print(f"Macro Recall:    {recall:.4f}")
    print(f"Macro F1-Score:  {f1:.4f}")

    metrics = {'precision': precision, 'recall': recall, 'f1': f1}
    return metrics


Creating Test DataLoader...
Test DataLoader created with 133 batches.


In [9]:
# --- Part C: Main Execution Script ---

import torch
import numpy as np # For setting random seed
import random # For setting random seed
import time # For timing execution

# --- Set Random Seeds for Reproducibility ---
# RANDOM_SEED normally comes from the constants cell; fall back to 42 otherwise.
if 'RANDOM_SEED' not in globals():
    RANDOM_SEED = 42
    print(f"Warning: RANDOM_SEED not defined, using default: {RANDOM_SEED}")

# Seed Python, NumPy and PyTorch (CPU) in turn, then every CUDA device if present.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
    # For stricter determinism (may cost performance):
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
print(f"Set random seeds to {RANDOM_SEED}")


# --- Check Prerequisites ---
# Training starts only when every object built by the earlier cells exists and is not None.
print("\nVerifying prerequisites for training...")
_required_objects = (
    'model', 'optimizer', 'criterion',
    'train_dataloader', 'val_dataloader', 'test_dataloader',
)
components_ready = (
    all(globals().get(_name) is not None for _name in _required_objects)
    and 'EPOCHS' in globals()
    and 'DEVICE' in globals()
)

if components_ready:
    print("Prerequisites met. Starting training loop...")
    start_time = time.time()

    # --- Training Loop ---
    best_val_accuracy = -1.0  # Below any real accuracy, so the first epoch always counts as an improvement
    best_epoch = -1
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}  # Per-epoch metrics

    for epoch in range(EPOCHS):
        # --- Training Step ---
        avg_train_loss = train_classifier(
            model=model,
            dataloader=train_dataloader,
            optimizer=optimizer,
            criterion=criterion,
            device=DEVICE,
            epoch=epoch,
            scheduler=globals().get('scheduler'),  # None when no scheduler was created
        )
        history['train_loss'].append(avg_train_loss)

        # --- Validation Step ---
        avg_val_loss, val_accuracy = validate_classifier(
            model=model,
            dataloader=val_dataloader,
            criterion=criterion,
            device=DEVICE,
            epoch=epoch,
        )
        history['val_loss'].append(avg_val_loss)
        history['val_accuracy'].append(val_accuracy)

        # Checkpoint whenever validation accuracy strictly improves
        if val_accuracy > best_val_accuracy:
            print(f"Validation accuracy improved from {best_val_accuracy:.4f} to {val_accuracy:.4f}. Saving model...")
            best_val_accuracy = val_accuracy
            best_epoch = epoch
            output_dir = "/kaggle/working/" # Or your Drive path
            os.makedirs(output_dir, exist_ok=True)
            best_model_path = os.path.join(output_dir, "best_caption_classifier_model.pth")
            try:
                torch.save(model.state_dict(), best_model_path)
                print(f"Best model saved to {best_model_path}")
            except Exception as e:
                # A failed save is reported but does not stop training
                print(f"Error saving model checkpoint: {e}")

        print("-" * 30) # Separator between epochs

    training_time = time.time() - start_time
    print(f"\n--- Training Complete ---")
    print(f"Total Training Time: {training_time:.2f} seconds")
    print(f"Best Validation Accuracy: {best_val_accuracy:.4f} at Epoch {best_epoch + 1}")

    # --- Final Evaluation on Test Set ---
    # The weights in memory are those of the last epoch, not the saved best checkpoint.
    # To evaluate the best checkpoint instead, reload it first:
    #     model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
    print("\n--- Evaluating on Test Set using the *last* epoch model ---")
    test_results = evaluate_classifier(
        model=model,
        dataloader=test_dataloader,
        device=DEVICE,
    )

    if test_results:
        print("\n--- Final Test Set Performance ---")
        print(f"  Macro Precision: {test_results.get('precision', 0.0):.4f}")
        print(f"  Macro Recall:    {test_results.get('recall', 0.0):.4f}")
        print(f"  Macro F1-Score:  {test_results.get('f1', 0.0):.4f}")
    else:
        print("\nTest evaluation could not be performed.")
else:
    print("ERROR: Not all necessary components (model, optimizer, criterion, dataloaders, EPOCHS, DEVICE) are initialized.")
    print("       Please ensure previous cells ran successfully. Aborting training.")
    # Raise here instead if the notebook should stop on missing prerequisites:
    # raise RuntimeError("Prerequisites for training not met.")

# --- End of Part C Execution ---

Set random seeds to 42

Verifying prerequisites for training...
Prerequisites met. Starting training loop...

--- Starting Training Epoch 1 ---


Epoch 1 Train:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 1 Average Training Loss: 0.0857

--- Starting Validation Epoch 1 ---


Epoch 1 Val:   0%|          | 0/67 [00:00<?, ?it/s]

Epoch 1 Average Validation Loss: 0.0367, Accuracy: 0.9821
Validation accuracy improved from -1.0000 to 0.9821. Saving model...
Best model saved to /kaggle/working/best_caption_classifier_model.pth
------------------------------

--- Starting Training Epoch 2 ---


Epoch 2 Train:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 2 Average Training Loss: 0.0350

--- Starting Validation Epoch 2 ---


Epoch 2 Val:   0%|          | 0/67 [00:00<?, ?it/s]

Epoch 2 Average Validation Loss: 0.0445, Accuracy: 0.9831
Validation accuracy improved from 0.9821 to 0.9831. Saving model...
Best model saved to /kaggle/working/best_caption_classifier_model.pth
------------------------------

--- Starting Training Epoch 3 ---


Epoch 3 Train:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 3 Average Training Loss: 0.0300

--- Starting Validation Epoch 3 ---


Epoch 3 Val:   0%|          | 0/67 [00:00<?, ?it/s]

Epoch 3 Average Validation Loss: 0.0645, Accuracy: 0.9765
------------------------------

--- Starting Training Epoch 4 ---


Epoch 4 Train:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 4 Average Training Loss: 0.0247

--- Starting Validation Epoch 4 ---


Epoch 4 Val:   0%|          | 0/67 [00:00<?, ?it/s]

Epoch 4 Average Validation Loss: 0.0878, Accuracy: 0.9755
------------------------------

--- Training Complete ---
Total Training Time: 430.93 seconds
Best Validation Accuracy: 0.9831 at Epoch 2

--- Evaluating on Test Set using the *last* epoch model ---

--- Starting Final Evaluation on Test Set ---


Evaluating Test Set:   0%|          | 0/133 [00:00<?, ?it/s]


--- Evaluation Metrics (Test Set) ---
Macro Precision: 0.9770
Macro Recall:    0.9770
Macro F1-Score:  0.9770

--- Final Test Set Performance ---
  Macro Precision: 0.9770
  Macro Recall:    0.9770
  Macro F1-Score:  0.9770
