In [None]:
# Change this to choose the sub-folder under which this run's experiments are saved.
# The value is interpolated into PERSISTENT_BASE_DIR, so the '/' produces a nested folder.
model_name = 'komal/gpt2-medium-classifier_finetuning_24layer'

In [None]:
from google.colab import drive
import os
drive.mount('/content/drive')

# Persistent output directory on the mounted Google Drive.
# os.makedirs(..., exist_ok=True) creates the folder (including missing parents)
# and does nothing when it already exists, so this cell is safe to re-run.
PERSISTENT_BASE_DIR = f'/content/drive/MyDrive/Efficient_AI_Project/EarlyExit_Experiments/{model_name}'
os.makedirs(PERSISTENT_BASE_DIR, exist_ok=True)

Mounted at /content/drive


In [None]:
import sys

!git clone https://github.com/komalniraula/adaptive-inference-llm

repo_name = 'adaptive-inference-llm' # Must match the folder created by git clone
# NOTE(review): assumes the clone above ran with /content as the working directory — confirm.
project_path = os.path.join('/content', repo_name)

# Append the project root to sys.path so the repository's packages
# (e.g. `evaluation`, imported in the next cell) can be resolved.

sys.path.append(project_path)

Cloning into 'adaptive-inference-llm'...
remote: Enumerating objects: 281, done.[K
remote: Counting objects: 100% (281/281), done.[K
remote: Compressing objects: 100% (202/202), done.[K
remote: Total 281 (delta 137), reused 213 (delta 71), pack-reused 0 (from 0)[K
Receiving objects: 100% (281/281), 6.71 MiB | 21.35 MiB/s, done.
Resolving deltas: 100% (137/137), done.


In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
import random
import itertools
import json
import time
import pandas as pd

from transformers import AutoModelForCausalLM, AutoTokenizer

from evaluation.dataset_loaders.sst2 import load_sst2

In [None]:
# -----------------------------
# Load datasets
# -----------------------------
# Training data: the labeled SST-2 train split (the run output reports 67,349 samples).
sst2_train_data = load_sst2(task='train', fraction=1)

# Evaluation data, requested with task='test'. The run output reports 872 samples,
# which is the size of the SST-2 *validation* split (the test split has 1,821).
# NOTE(review): presumably load_sst2 maps 'test' to the validation split — confirm in the loader.
sst2_test_data = load_sst2(task='test', fraction=1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [None]:
# -----------------------------
# Convert datasets to (text,label)
# -----------------------------
# `load_sst2` is expected to return an iterable of dict-like samples.
# `convert_sst2` below turns each sample into a (text, label) tuple and accepts
# either the preprocessed "text" key or the raw SST-2 "sentence" key.

def convert_sst2(sample):
    """Turn one SST-2 record into a ``(text, label)`` tuple.

    The sentence is read from the ``"text"`` key when present, otherwise from
    the raw SST-2 ``"sentence"`` key. The label is always coerced to ``int``.

    Raises:
        ValueError: if the sample has neither a ``"text"`` nor a ``"sentence"`` key.
    """
    # Key order is significant: "text" wins when both keys are present.
    for text_key in ("text", "sentence"):
        if text_key in sample:
            return sample[text_key], int(sample["label"])
    raise ValueError("SST-2 sample missing expected keys for conversion.")

# Materialize both splits as in-memory lists of (text, label) tuples.
train_pairs = [convert_sst2(x) for x in sst2_train_data]
test_pairs  = [convert_sst2(x) for x in sst2_test_data]

# Sanity check: report how many samples each split contains.
print(f"Total train samples: {len(train_pairs)}")
print(f"Total test samples: {len(test_pairs)}")


# -----------------------------
# Dataset class
# -----------------------------
# Shared tokenizer, used by SentimentDataset and by collate_fn.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-text token as the padding token so batches can be padded.
# Consequence: pad_token_id == eos_token_id, which collate_fn relies on.
tokenizer.pad_token = tokenizer.eos_token

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes ``(text, label)`` pairs on access.

    Nothing is tokenized up front; each ``__getitem__`` call encodes one text.
    No padding or truncation happens here — padding is left to the collate step.
    """

    def __init__(self, data, tokenizer):
        # data: indexable collection of (text, label) pairs
        # tokenizer: callable returning a mapping with an "input_ids" tensor
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"input_ids": 1-D token ids, "labels": 0-D long tensor}`` for one pair."""
        sentence, target = self.data[idx]
        encoded = self.tokenizer(sentence, add_special_tokens=False, return_tensors="pt")
        # With return_tensors="pt" the ids are expected to carry a leading batch
        # axis of size 1; squeeze(0) drops it so each item is a flat id sequence.
        token_ids = encoded["input_ids"].squeeze(0)
        return dict(
            input_ids=token_ids,
            labels=torch.tensor(target, dtype=torch.long),
        )


# -----------------------------
# Dynamic padding collate_fn
# -----------------------------
def collate_fn(batch, pad_token_id=None):
    """Right-pad a list of dataset items to the longest sequence in the batch.

    Args:
        batch: list of dicts with a 1-D ``"input_ids"`` tensor and a scalar
            ``"labels"`` value (as produced by ``SentimentDataset``).
        pad_token_id: id used to fill padded positions. Defaults to the
            module-level ``tokenizer.pad_token_id`` so the function can still be
            passed to ``DataLoader`` as ``collate_fn=collate_fn``.

    Returns:
        dict with
        - ``"input_ids"``: long tensor ``[batch, max_len]``, right-padded,
        - ``"attention_mask"``: float tensor ``[batch, max_len]``, 1.0 on real
          tokens and 0.0 on padding,
        - ``"labels"``: long tensor ``[batch]``.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    sequences = [item["input_ids"] for item in batch]
    labels = torch.tensor([int(item["labels"]) for item in batch], dtype=torch.long)

    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths)

    # Explicit dtype: token ids must be integer regardless of the fill value's type.
    padded = torch.full((len(batch), max_len), pad_token_id, dtype=torch.long)
    for row, seq in enumerate(sequences):
        padded[row, :len(seq)] = seq

    # Build the mask from the true lengths instead of comparing ids with the
    # EOS/pad id. Pad and EOS share one id, so an id comparison would also mask
    # a genuine end-of-text token that occurs inside a sentence.
    positions = torch.arange(max_len).unsqueeze(0)                  # [1, max_len]
    true_lengths = torch.tensor(lengths, dtype=torch.long).unsqueeze(1)  # [batch, 1]
    attention_mask = (positions < true_lengths).float()

    return {
        "input_ids": padded,
        "attention_mask": attention_mask,
        "labels": labels
    }

Total train samples: 67349
Total test samples: 872


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# -----------------------------
# Create datasets
# -----------------------------
# Both splits share the module-level GPT-2 tokenizer; tokenization happens lazily per item.
train_ds = SentimentDataset(train_pairs, tokenizer)
test_ds = SentimentDataset(test_pairs, tokenizer)

# -----------------------------
# Create dataloaders
# -----------------------------
# NOTE(review): batch_size is fixed at 16 here. The "batch_size" entry of the
# hyperparameter grid is not read when these loaders are built, so changing it
# in the grid has no effect on the actual batch size.
train_loader = DataLoader(
    train_ds,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn
)

# Evaluation order is kept fixed (shuffle=False).
test_loader = DataLoader(
    test_ds,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn
)

print("Train loader batches:", len(train_loader))
print("Test loader batches:", len(test_loader))

# Optional: pull one training batch to check the collated tensor shapes
# (input_ids and attention_mask share a [batch, max_len] shape).
example = next(iter(train_loader))
print(example["input_ids"].shape)
print(example["attention_mask"].shape)
print(example["labels"])

Train loader batches: 4210
Test loader batches: 55
torch.Size([16, 37])
torch.Size([16, 37])
tensor([1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1])


In [None]:
class GPT2EarlyExitClassifier(nn.Module):
    """Causal-LM backbone with one classification head per early-exit layer.

    Each exit head is ``Dropout -> Linear(hidden_size, num_labels)`` applied to
    a single pooled token vector taken from that layer's hidden states.

    Args:
        model_name: checkpoint name/path given to ``AutoModelForCausalLM`` and
            ``AutoTokenizer``.
        exit_layers: indices into the backbone's ``hidden_states`` tuple
            (0 = embedding output, k = output of block k). Stored sorted.
        hyperparameters: dict; recognised keys are
            - ``"num_labels"`` (default 2),
            - ``"dropout"`` (default 0.0),
            - ``"exit_loss_weights"``: one weight per exit, aligned with the
              *sorted* exit layers (default: all 1.0),
            - ``"pool_last_real_token"`` (default False): when True and an
              attention mask is given, pool the last unmasked token of each
              row instead of position -1. With right-padded batches, position
              -1 is a padding position for every row shorter than the longest
              one. The default keeps the original behaviour so existing
              checkpoints evaluate unchanged; any code that pools hidden states
              itself rather than calling ``forward`` must apply the same rule.

    Raises:
        ValueError: if fewer loss weights than exit layers are supplied, or an
            exit layer lies outside the backbone's ``hidden_states`` range.
    """

    def __init__(self, model_name, exit_layers, hyperparameters):
        super().__init__()

        # Load the backbone as a causal LM (only its hidden states are used).
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            output_hidden_states=True,
            return_dict=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.exit_layers = sorted(exit_layers)
        self.hp = hyperparameters
        self.num_labels = self.hp.get("num_labels", 2)
        dropout_rate = self.hp.get("dropout", 0.0)
        self.pool_last_real_token = bool(self.hp.get("pool_last_real_token", False))

        # Loss weights λ_e, indexed in the same order as the sorted exit layers.
        self.exit_loss_weights = self.hp.get(
            "exit_loss_weights",
            [1.0] * len(self.exit_layers)
        )
        # Fail fast: too few weights would otherwise surface as an IndexError
        # in the middle of the first training step.
        if len(self.exit_loss_weights) < len(self.exit_layers):
            raise ValueError(
                f"exit_loss_weights has {len(self.exit_loss_weights)} entries but "
                f"{len(self.exit_layers)} exit layers were requested."
            )

        # hidden_states has num_hidden_layers + 1 entries (embeddings + each block).
        # Negative indices that Python would accept for that tuple stay allowed.
        num_layers = getattr(self.model.config, "num_hidden_layers", None)
        if num_layers is not None:
            out_of_range = [
                layer for layer in self.exit_layers
                if not -(num_layers + 1) <= layer <= num_layers
            ]
            if out_of_range:
                raise ValueError(
                    f"exit_layers {out_of_range} are outside the valid range "
                    f"[0, {num_layers}] for model '{model_name}'."
                )

        hidden_size = self.model.config.hidden_size

        # One classification head per exit layer, keyed by the layer index as a string.
        self.exit_heads = nn.ModuleDict()
        for layer in self.exit_layers:
            self.exit_heads[str(layer)] = nn.Sequential(
                nn.Dropout(dropout_rate),
                nn.Linear(hidden_size, self.num_labels)
            )

        self.ce = nn.CrossEntropyLoss()

    @staticmethod
    def _last_real_token_index(attention_mask):
        """Per row, the largest position whose mask value is non-zero (0 for an all-zero row)."""
        positions = torch.arange(attention_mask.size(1), device=attention_mask.device)
        return ((attention_mask != 0).long() * positions).argmax(dim=1)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the backbone once and classify at every exit layer.

        Args:
            input_ids: token ids, ``[batch, seq_len]``.
            attention_mask: optional mask forwarded to the backbone (and used
                for pooling when ``pool_last_real_token`` is enabled).
            labels: optional class ids ``[batch]``; when given, the weighted sum
                of the per-exit cross-entropy losses is returned.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"``, a dict mapping each exit layer index to a
            ``[batch, num_labels]`` tensor.
        """

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True
        )

        # hidden_states is a tuple:
        # [0] = embedding output
        # [k] = output of transformer block k
        # [-1] = output of the last block (index 24 for gpt2-medium)
        hidden_states = outputs.hidden_states

        # Optional pad-aware pooling; the index is shared by every exit layer.
        last_index = None
        if self.pool_last_real_token and attention_mask is not None:
            last_index = self._last_real_token_index(attention_mask)
            batch_rows = torch.arange(last_index.size(0), device=last_index.device)

        logits_dict = {}
        total_loss = 0.0

        for i, layer in enumerate(self.exit_layers):

            # hidden_states[layer] has shape [batch, seq_len, hidden_dim]
            layer_hidden = hidden_states[layer]
            if last_index is None:
                cls_vec = layer_hidden[:, -1, :]                    # last position
            else:
                cls_vec = layer_hidden[batch_rows, last_index, :]   # last unmasked token

            logits = self.exit_heads[str(layer)](cls_vec)
            logits_dict[layer] = logits

            # Add this exit's weighted cross-entropy to the joint loss.
            if labels is not None:
                weight = self.exit_loss_weights[i]
                total_loss += weight * self.ce(logits, labels)

        return {
            "loss": total_loss if labels is not None else None,
            "logits": logits_dict
        }

In [None]:
# Search space: every key maps to a list of candidate values. The experiment
# driver takes the Cartesian product of these lists, so this grid yields two
# configurations (one per learning rate).
hyperparameter_grid = {
    "num_labels": [2],                 # fixed
    "dropout": [0.0],
    # One loss weight per exit, aligned by position with the sorted exit_layers
    # below (8 weights for 8 exits); shallower exits get the larger weights.
    "exit_loss_weights": [[0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]],

    # training hyperparams
    "learning_rate": [1e-5, 5e-5],

    "weight_decay": [0.01],
    "num_epochs": [3],
    "max_grad_norm": [1.0],
    # NOTE(review): recorded with the results only — the DataLoaders are built
    # with a literal batch_size and train() never reads this key.
    "batch_size": [16],

    # logging: print a training line every `log_every` steps
    "log_every": [500]
}

# Indices into the backbone's hidden_states tuple (0 = embedding output) at
# which a classification head is attached.
exit_layers = [3, 6, 9, 12, 15, 18, 21, 24]

In [None]:
def make_experiment_name(hp):
    """Build the run identifier used for checkpoint folders and result keys.

    The name encodes learning rate, weight decay, epoch count, dropout and the
    exit loss weights (each converted with ``float`` and rounded to two
    decimals, joined by ``-``). Other hyperparameters are not part of the name.
    """
    # float() lets numeric strings through as well as numbers.
    weight_tokens = [str(round(float(weight), 2)) for weight in hp['exit_loss_weights']]

    name_parts = [
        f"lr{hp['learning_rate']}",
        f"wd{hp['weight_decay']}",
        f"ep{hp['num_epochs']}",
        f"drop{hp['dropout']}",
        "lossW" + "-".join(weight_tokens),
    ]
    return "_".join(name_parts)

In [None]:
# NOTE: The GPT2EarlyExitClassifier class definition must be available (as provided by you)

@torch.no_grad()
def early_exit_eval(model_path, hyperparameters, exit_layers, data_loader, device, thresholds,
                    model_name="gpt2-medium", model=None):
    """Simulate confidence-based early exiting and report accuracy vs. depth.

    For every threshold, each sample "exits" at the first (shallowest) exit
    whose softmax confidence is >= the threshold; samples that never reach the
    threshold are forced out at the deepest exit. The backbone is always run
    through all of its layers, so this measures *which* layer a sample would
    leave at — the reported latency does not include any early-exit speed-up.

    Args:
        model_path (str): Directory containing ``pytorch_model.bin`` (a saved
            ``state_dict`` of the classifier). Ignored when ``model`` is given.
        hyperparameters (dict): Hyperparameters used to build the classifier.
        exit_layers (list): Exit layer indices; evaluated in ascending order.
        data_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        device (str): 'cuda' or 'cpu'.
        thresholds (list): Confidence thresholds to test (e.g., [0.5, 0.7, 0.9]).
        model_name (str): Backbone to instantiate before loading the weights.
        model: Optional ready-to-use classifier. It must return
            ``{"logits": {layer: tensor}}`` when called with ``input_ids`` and
            ``attention_mask``. When given, nothing is loaded from disk.

    Returns:
        dict: ``{"threshold_<th>": {...}}`` with, per threshold, ``accuracy``,
        ``avg_layers_used`` (mean exit layer index), ``avg_latency_sec``
        (wall time per sample), ``cost_saving_pct`` (relative to the deepest
        exit), ``tokens_per_sec`` (kept for compatibility; the value is samples
        per second) and ``samples_per_sec``. An empty dict is returned when the
        weights cannot be loaded.
    """
    # Visit exits shallow -> deep, the same order the classifier stores them in.
    exit_layers = sorted(exit_layers)

    if model is None:
        print(f"\n--- Loading Model from: {model_path} ---")

        # Resolved before the try block so the error message below can always use it.
        weights_path = os.path.join(model_path, 'pytorch_model.bin')

        # 1. Instantiate the model architecture.
        model = GPT2EarlyExitClassifier(
            model_name=model_name,
            exit_layers=exit_layers,
            hyperparameters=hyperparameters
        ).to(device)

        # 2. Load the trained weights; on failure report it and return no results.
        try:
            state_dict = torch.load(weights_path, map_location=device)
            model.load_state_dict(state_dict)
            print("Model weights loaded successfully.")
        except Exception as e:
            print(f"FATAL ERROR: Could not load model weights from {weights_path}.")
            print(e)
            return {}
    else:
        model = model.to(device)

    model.eval()

    # Deepest exit = cost of running without early exit.
    max_possible_layer = exit_layers[-1] if exit_layers else 0
    num_exits = len(exit_layers)

    results = {}

    for th in thresholds:
        print(f"\nEvaluating Threshold: {th:.1f}")

        correct = 0
        total_samples = 0
        # Sum over samples of the layer index at which each sample exited.
        total_layers_used_sum = 0

        start_time = time.time()

        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch["labels"]
            total_samples += labels.size(0)

            # 1. One forward pass through the classifier itself, so pooling and
            #    heads are exactly the ones used during training.
            logits_by_layer = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )["logits"]

            # Tracks which samples of this batch have already exited.
            batch_exited = torch.zeros(labels.size(0), dtype=torch.bool, device=device)

            # 2. Walk the exits from shallow to deep.
            for i, layer in enumerate(exit_layers):

                # Nothing left to decide once every sample has exited.
                if batch_exited.all():
                    break

                probabilities = F.softmax(logits_by_layer[layer], dim=-1)
                confidences, preds = torch.max(probabilities, dim=-1)

                if i == num_exits - 1:
                    # Deepest exit: everything still in flight must leave here.
                    ready_to_exit = ~batch_exited
                else:
                    # Not exited yet AND confidence >= threshold.
                    ready_to_exit = (~batch_exited) & (confidences >= th)

                # 3. Account for the samples leaving at this exit.
                num_ready = int(ready_to_exit.sum().item())
                if num_ready:
                    correct += (preds == labels)[ready_to_exit].sum().item()
                    total_layers_used_sum += layer * num_ready
                    batch_exited |= ready_to_exit

        end_time = time.time()

        # 4. Final metrics for this threshold.
        inference_time = end_time - start_time

        final_accuracy = correct / total_samples if total_samples > 0 else 0
        avg_layers_used = total_layers_used_sum / total_samples if total_samples > 0 else 0
        samples_per_sec = total_samples / inference_time if inference_time > 0 else 0

        # Guard the ratio: with no exits (or a deepest exit of 0) there is no baseline cost.
        if max_possible_layer > 0:
            cost_saving_pct = 100 * (1 - (avg_layers_used / max_possible_layer))
        else:
            cost_saving_pct = 0.0

        results[f'threshold_{th}'] = {
            "accuracy": final_accuracy,
            "avg_layers_used": avg_layers_used,
            "avg_latency_sec": inference_time / total_samples if total_samples > 0 else 0,
            "cost_saving_pct": cost_saving_pct,
            # Historical key name; the quantity is samples (not tokens) per second.
            "tokens_per_sec": samples_per_sec,
            "samples_per_sec": samples_per_sec
        }

        print(f"  Accuracy: {results[f'threshold_{th}']['accuracy']:.4f}")
        print(f"  Avg Layer Used: {avg_layers_used:.2f} / {max_possible_layer}")
        print(f"  Cost Savings: {results[f'threshold_{th}']['cost_saving_pct']:.2f}%")

    return results

In [None]:
@torch.no_grad() # Disable gradient calculation for efficiency
def evaluate(model, data_loader, device):
    """Score an early-exit classifier on a dataset.

    Args:
        model: classifier exposing ``exit_heads`` (keys are layer indices as
            strings) and returning ``{"loss": tensor, "logits": {layer: tensor}}``
            when called with ``input_ids``, ``attention_mask`` and ``labels``.
        data_loader: iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        device: device the batches are moved to.

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean over batches of
        the combined (all-exit) loss and ``accuracy`` is computed from the
        deepest exit only. Both are ``0.0`` when the loader yields no batches.

    Side effect: leaves ``model`` in eval mode.
    """
    model.eval() # Set model to evaluation mode (disables dropout, etc.)

    # The deepest exit does not change between batches, so resolve it once.
    # key=int matters: a plain string sort would rank "3" above "12".
    last_exit_layer = int(sorted(model.exit_heads.keys(), key=int)[-1])

    total_samples = 0
    total_loss = 0.0
    num_batches = 0
    correct_predictions = 0

    print("--- Starting Evaluation ---")

    for batch in data_loader:
        # 1. Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        labels = batch["labels"]
        total_samples += labels.size(0)
        num_batches += 1

        # 2. Forward pass (returns the combined loss and the logits of every exit)
        out = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=labels
        )

        # 3. Accumulate the per-batch combined loss
        total_loss += out["loss"].item()

        # 4. Predictions from the deepest exit
        predictions = torch.argmax(out["logits"][last_exit_layer], dim=-1)

        # 5. Accumulate correct predictions
        correct_predictions += (predictions == labels).sum().item()

    # 6. Final metrics; an empty loader yields zeros instead of a ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0

    print(f"\n--- Evaluation Results ---")
    print(f"Avg Combined Loss (All Exits): {avg_loss:.4f}")
    print(f"Accuracy (Final Exit Only): {accuracy:.4f}")
    print("--------------------------\n")

    return avg_loss, accuracy

In [None]:
def train(model, train_loader, test_loader, hp, device):
    """Fine-tune the early-exit classifier and keep the best checkpoint on disk.

    Per epoch: one pass over ``train_loader`` optimising the combined
    (all-exit) loss with AdamW and gradient-norm clipping, followed by
    ``evaluate(model, test_loader, device)``. Whenever the test accuracy of the
    deepest exit strictly improves, the model ``state_dict`` and that epoch's
    metrics are written to ``<PERSISTENT_BASE_DIR>/<experiment name>/best_model/``.

    Args:
        model: classifier exposing ``exit_heads`` and returning
            ``{"loss", "logits"}`` from its forward pass.
        train_loader: batches used for optimisation.
        test_loader: batches used for the per-epoch evaluation and for
            selecting the best checkpoint.
        hp: hyperparameter dict; this function reads ``learning_rate``,
            ``weight_decay``, ``num_epochs``, ``max_grad_norm`` and
            ``log_every`` (plus the keys used by ``make_experiment_name``).
            ``batch_size`` is not read here.
        device: device the batches are moved to.

    Returns:
        ``(epoch_history, best_epoch_metrics)``: the list of per-epoch metric
        dicts and the dict of the best epoch (``None`` if no epoch ran).

    NOTE(review): the best epoch is chosen on ``test_loader``, i.e. on the same
    data whose accuracy is then reported. Also, the in-memory ``model`` is not
    reverted to the best checkpoint; after the call it holds the last epoch's weights.
    """

    exp_name = make_experiment_name(hp)
    epoch_history = []

    # --- Best-checkpoint trackers ---
    # -1.0 guarantees the first evaluated epoch counts as an improvement.
    best_accuracy = -1.0
    best_epoch_metrics = None
    best_model_path = None
    # --------------------

    # --- Identify the deepest exit layer (keys are strings, so sort numerically) ---
    last_exit_layer_key = sorted(model.exit_heads.keys(), key=int)[-1]
    last_exit_layer_int = int(last_exit_layer_key)
    # ---------------------------------------------

    # All parameters are optimised: the backbone and every exit head.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=hp["learning_rate"],
        weight_decay=hp["weight_decay"]
    )

    # Per-experiment output directory under the persistent base directory
    exp_dir = os.path.join(PERSISTENT_BASE_DIR, exp_name)
    os.makedirs(exp_dir, exist_ok=True)


    for epoch in range(hp["num_epochs"]):
        # evaluate() switches the model to eval mode, so re-enable train mode every epoch.
        model.train()
        total_train_loss = 0
        total_train_correct = 0
        total_train_samples = 0

        for step, batch in enumerate(train_loader, start=1):
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch["labels"]

            out = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=labels
            )

            # Combined loss over all exits, as returned by the model
            loss = out["loss"]
            total_train_loss += loss.item()

            # Running training accuracy, measured on the deepest exit only
            total_train_samples += labels.size(0)

            # Get logits from the last exit head
            last_logits = out["logits"][last_exit_layer_int]
            predictions = torch.argmax(last_logits, dim=-1)
            total_train_correct += (predictions == labels).sum().item()

            # Standard update: clear old gradients, backprop, clip, step
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), hp["max_grad_norm"])
            optimizer.step()

            # Periodic progress line: loss and accuracy of the current batch only
            if step % hp["log_every"] == 0:
                batch_accuracy = (predictions == labels).float().mean().item()
                print(f"Epoch {epoch+1} | Step {step} | Loss: {loss.item():.4f} | Acc: {batch_accuracy:.4f}")

        # --- Epoch End Metrics & Checkpointing ---
        # Loss is averaged per batch, accuracy per sample.
        avg_train_loss = total_train_loss / len(train_loader)
        avg_train_accuracy = total_train_correct / total_train_samples

        print(f"\n>>> Epoch {epoch+1} completed | Avg Train Loss: {avg_train_loss:.4f} | Avg Train Acc: {avg_train_accuracy:.4f}")

        # Test Metrics (Evaluation)
        final_test_loss, final_test_accuracy = evaluate(model, test_loader, device)

        # 1. Capture ALL epoch data in a dictionary
        epoch_metrics = {
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "train_accuracy": avg_train_accuracy,
            "test_loss": final_test_loss,
            "test_accuracy": final_test_accuracy
        }
        epoch_history.append(epoch_metrics)

        # 2. Check for BEST model based on test_accuracy (strict improvement only,
        #    so on a tie the earlier epoch's checkpoint is kept)
        if final_test_accuracy > best_accuracy:
            print(f">>> NEW BEST MODEL: Acc improved from {best_accuracy:.4f} to {final_test_accuracy:.4f} at epoch {epoch+1}")

            best_accuracy = final_test_accuracy
            best_epoch_metrics = epoch_metrics

            # --- Save the best model weights ---
            # A single 'best_model' folder per experiment; each new best overwrites the previous one
            best_model_dir = os.path.join(exp_dir, "best_model")
            os.makedirs(best_model_dir, exist_ok=True)

            best_model_path = os.path.join(best_model_dir, 'pytorch_model.bin')
            torch.save(model.state_dict(), best_model_path)

            # Save the best metrics to a dedicated JSON file
            with open(os.path.join(best_model_dir, 'best_metrics.json'), 'w') as f:
                 json.dump(best_epoch_metrics, f, indent=4)

        # 3. Optional: Save ALL epoch metrics (uncomment if you still want all epochs saved)
        # exp_epoch_dir = os.path.join(exp_dir, f"epoch_{epoch+1}")
        # os.makedirs(exp_epoch_dir, exist_ok=True)
        # with open(os.path.join(exp_epoch_dir, 'metrics.json'), 'w') as f:
        #     json.dump(epoch_metrics, f, indent=4)
        # print(f">>> Saved epoch metrics to {exp_epoch_dir}")


    # Return the complete history and the metrics of the best performing epoch
    return epoch_history, best_epoch_metrics

In [None]:
# Grid-search driver: trains one fresh model per hyperparameter combination
# and collects the per-experiment results in `all_experiment_results`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# generate all hyperparameter combinations
keys = list(hyperparameter_grid.keys())
values = list(hyperparameter_grid.values())

# Maps experiment name -> hyperparameters, per-epoch history and best-epoch metrics.
all_experiment_results = {}
# `itertools` is imported in the notebook's import cell.

for combo in itertools.product(*values):

    # One concrete configuration: grid key -> the value chosen for this combination.
    hp = dict(zip(keys, combo))
    hp["num_labels"] = 2  # fixed

    # NOTE(review): the name does not encode every grid key (e.g. max_grad_norm,
    # batch_size), so two combinations differing only in those would share a
    # name, a results entry and a checkpoint folder.
    exp_name = make_experiment_name(hp)
    print("\n==============================")
    print("RUNNING EXPERIMENT:", exp_name)
    print("Hyperparameters:", hp)
    print("==============================")

    # 1. Create fresh model for this configuration
    model = GPT2EarlyExitClassifier(
        model_name="gpt2-medium",
        exit_layers=exit_layers,
        hyperparameters=hp
    ).to(device)

    # 2. Train - returns the per-epoch history and the best epoch's metrics
    experiment_history, best_metrics_for_hp = train(
        model=model,
        train_loader=train_loader,
        test_loader=test_loader,
        hp=hp,
        device=device
    )

    # 3. Store the full history and the specific best epoch's metrics
    # (best_metrics_for_hp is used rather than experiment_history[-1], because
    # the last epoch is not necessarily the best one)
    if best_metrics_for_hp:
        all_experiment_results[exp_name] = {
            "hyperparameters": hp,
            "full_history": experiment_history, # Store all epochs' data
            "best_epoch_metrics": best_metrics_for_hp, # The specific metrics for the best epoch
            "best_test_accuracy": best_metrics_for_hp['test_accuracy'],
            "best_test_loss": best_metrics_for_hp['test_loss']
        }
    else:
         # train() returned no best epoch (e.g. num_epochs was 0)
         all_experiment_results[exp_name] = {
            "hyperparameters": hp,
            "full_history": experiment_history,
            "error": "No metrics recorded (training failed or had 0 epochs)."
        }


RUNNING EXPERIMENT: lr1e-05_wd0.01_ep3_drop0.0_lossW0.9-0.8-0.7-0.6-0.5-0.4-0.3-0.2
Hyperparameters: {'num_labels': 2, 'dropout': 0.0, 'exit_loss_weights': [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2], 'learning_rate': 1e-05, 'weight_decay': 0.01, 'num_epochs': 3, 'max_grad_norm': 1.0, 'batch_size': 16, 'log_every': 500}


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Epoch 1 | Step 500 | Loss: 2.4089 | Acc: 0.7500
Epoch 1 | Step 1000 | Loss: 1.3949 | Acc: 0.9375
Epoch 1 | Step 1500 | Loss: 2.2294 | Acc: 0.8750
Epoch 1 | Step 2000 | Loss: 0.9815 | Acc: 0.9375
Epoch 1 | Step 2500 | Loss: 0.3964 | Acc: 1.0000
Epoch 1 | Step 3000 | Loss: 0.8607 | Acc: 0.9375
Epoch 1 | Step 3500 | Loss: 1.1882 | Acc: 0.8750
Epoch 1 | Step 4000 | Loss: 1.7450 | Acc: 0.8750

>>> Epoch 1 completed | Avg Train Loss: 1.4099 | Avg Train Acc: 0.9060
--- Starting Evaluation ---

--- Evaluation Results ---
Avg Combined Loss (All Exits): 1.2562
Accuracy (Final Exit Only): 0.9369
--------------------------

>>> NEW BEST MODEL: Acc improved from -1.0000 to 0.9369 at epoch 1
Epoch 2 | Step 500 | Loss: 0.3922 | Acc: 1.0000
Epoch 2 | Step 1000 | Loss: 1.4149 | Acc: 0.9375
Epoch 2 | Step 1500 | Loss: 4.4088 | Acc: 0.8125
Epoch 2 | Step 2000 | Loss: 0.2132 | Acc: 1.0000
Epoch 2 | Step 2500 | Loss: 1.0071 | Acc: 1.0000
Epoch 2 | Step 3000 | Loss: 0.9371 | Acc: 0.9375
Epoch 2 | Step 3500 

In [None]:
# Helper function to flatten the results dictionary for CSV
def flatten_experiment_results(all_results):
    """Convert the nested experiment-results mapping into flat, CSV-ready rows.

    Each experiment becomes one dict with, in this order:
    ``experiment_name``; then either the best-epoch summary
    (``best_test_accuracy``, ``best_test_loss``, ``best_epoch``,
    ``best_train_accuracy``) or an ``error`` message; then one ``hp_<name>``
    entry per hyperparameter. List-valued hyperparameters are stored as their
    string representation so they fit in a single cell.
    """

    def _to_row(name, record):
        row = {"experiment_name": name}

        if "best_test_accuracy" not in record:
            # No successful epoch was recorded for this experiment.
            row["error"] = record.get("error", "Training failed")
        else:
            best = record["best_epoch_metrics"]
            row.update({
                "best_test_accuracy": record["best_test_accuracy"],
                "best_test_loss": record["best_test_loss"],
                "best_epoch": best["epoch"],
                "best_train_accuracy": best["train_accuracy"],
            })

        for hp_name, hp_val in record["hyperparameters"].items():
            row[f"hp_{hp_name}"] = str(hp_val) if isinstance(hp_val, list) else hp_val

        return row

    return [_to_row(name, record) for name, record in all_results.items()]

print("\n\n--- Saving All Experiment Results to CSV ---")

# 1. Flatten the nested dictionary into one row per experiment
flat_results = flatten_experiment_results(all_experiment_results)

# 2. Create the DataFrame (one column per distinct row key)
results_df = pd.DataFrame(flat_results)

# 3. Define the CSV path inside the persistent Drive directory
csv_filename = 'all_hyperparameter_results.csv'
csv_path = os.path.join(PERSISTENT_BASE_DIR, csv_filename)

# 4. Save the DataFrame to CSV (any existing file at this path is overwritten)
results_df.to_csv(csv_path, index=False)

print(f"✅ All experiment results saved to: {csv_path}")

# --- Banner only: no best-model selection is performed in this cell ---
print("\n\n############################################")
print("HYPERPARAMETER SEARCH COMPLETE. FINDING BEST MODEL...")



--- Saving All Experiment Results to CSV ---
✅ All experiment results saved to: /content/drive/MyDrive/Efficient_AI_Project/EarlyExit_Experiments/komal/gpt2-medium-classifier_finetuning_24layer/all_hyperparameter_results.csv


############################################
HYPERPARAMETER SEARCH COMPLETE. FINDING BEST MODEL...


In [None]:
# Display the collected results (hyperparameters, per-epoch history, best-epoch metrics).
all_experiment_results

{'lr1e-05_wd0.01_ep3_drop0.0_lossW0.9-0.8-0.7-0.6-0.5-0.4-0.3-0.2': {'hyperparameters': {'num_labels': 2,
   'dropout': 0.0,
   'exit_loss_weights': [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2],
   'learning_rate': 1e-05,
   'weight_decay': 0.01,
   'num_epochs': 3,
   'max_grad_norm': 1.0,
   'batch_size': 16,
   'log_every': 500},
  'full_history': [{'epoch': 1,
    'train_loss': 1.4099494359909497,
    'train_accuracy': 0.9059674234212831,
    'test_loss': 1.256193915280429,
    'test_accuracy': 0.9369266055045872},
   {'epoch': 2,
    'train_loss': 0.9017233081232765,
    'train_accuracy': 0.9531693120907512,
    'test_loss': 1.2934052908285099,
    'test_accuracy': 0.9277522935779816},
   {'epoch': 3,
    'train_loss': 0.7240127006965975,
    'train_accuracy': 0.9663989071849619,
    'test_loss': 1.3845561198212883,
    'test_accuracy': 0.9334862385321101}],
  'best_epoch_metrics': {'epoch': 1,
   'train_loss': 1.4099494359909497,
   'train_accuracy': 0.9059674234212831,
   'test_loss