# Layer Analysis - Hidden State Probing

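This notebook runs the layer analysis experiment on Google Colab with GPU acceleration. It trains a linear probe on the hidden states of selected layers of a language model to predict whether an MMLU answer is correct, then compares probe accuracy, ECE, and AUROC across layers.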

**Before running**: Make sure to set Runtime > Change runtime type > GPU (T4)

In [None]:
# Check GPU is available: nvidia-smi prints the attached GPU(s).
# An error here means no GPU runtime is selected (see the note at the top).
!nvidia-smi

In [None]:
# Install dependencies
!pip install -q torch transformers accelerate bitsandbytes datasets scikit-learn tqdm loguru matplotlib seaborn

In [None]:
# Clone your repository
!git clone https://github.com/joshcliu/deep-learning.git
%cd deep-learning

In [None]:
# Make the current directory (the repository root after the %cd above)
# importable so that `from src...` resolves to the project's `src` package.
import sys

# Insert only when '.' is not already first, so re-running this cell does not
# stack duplicate entries on sys.path.
if sys.path[:1] != ['.']:
    sys.path.insert(0, '.')

In [None]:
# Optional: log in to Hugging Face for gated models (e.g. Llama).
# Uncomment the two lines below if you have been granted access. Calling
# login() with no arguments prompts for the token, so the token is never
# saved in the notebook itself.
# from huggingface_hub import login
# login()

In [None]:
# Configuration: edit these values before running the rest of the notebook.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"  # or "meta-llama/Llama-3.1-8B" if you have access
NUM_SAMPLES = 100  # questions sampled; each yields one correct and one incorrect example
QUICK_MODE = True  # True = quartile layers only, False = all layers
QUANTIZATION = "8bit"  # "8bit", "4bit", or "none" ("none" is passed to the loader as None)

In [None]:
import numpy as np
import torch
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

from src.models import ModelLoader, HiddenStateExtractor
from src.data import MMLUDataset
from src.probes import LinearProbe
from src.evaluation import compute_ece, compute_auroc

# Report the runtime environment; the GPU name is only queried when CUDA is usable.
cuda_available = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Load the model and tokenizer, then read the model's dimensions.
print(f"Loading model: {MODEL_NAME}")
loader = ModelLoader(model_name=MODEL_NAME)

# The config spells "no quantization" as the string "none"; the loader is
# given None in that case and the configured string otherwise.
if QUANTIZATION == "none":
    quantization = None
else:
    quantization = QUANTIZATION
model, tokenizer = loader.load(quantization=quantization, device_map="auto")

# Layer count and hidden size drive layer selection and the probe input size.
model_info = loader.get_model_info()
num_layers, hidden_dim = model_info["num_layers"], model_info["hidden_dim"]
print(f"Model loaded: {num_layers} layers, hidden_dim={hidden_dim}")

In [None]:
# Choose which layer indices to probe.
if not QUICK_MODE:
    # Full sweep over every layer index.
    layers = list(range(num_layers))
else:
    # Quartile positions (0%, 25%, 50%, 75%) plus the last layer. Building a
    # set drops indices that coincide for shallow models.
    quartile_layers = {(quarter * num_layers) // 4 for quarter in range(4)}
    layers = sorted(quartile_layers | {num_layers - 1})

print(f"Analyzing {len(layers)} layers: {layers}")

In [None]:
# Load dataset: the "test" split of the project's MMLU wrapper. Later cells
# index it by position and read the "question", "choices" and "answer" fields
# of each item.
print("Loading MMLU dataset...")
dataset = MMLUDataset(split="test")
print(f"Dataset size: {len(dataset)} examples")

In [None]:
# Generate examples (correct/incorrect pairs)
def generate_examples(dataset, num_samples, seed=None):
    """Build question/answer texts with binary labels (correct=1, incorrect=0).

    For every sampled question two texts are emitted back to back: the
    question paired with its correct choice (label 1), then the same question
    paired with one randomly chosen wrong choice (label 0).

    Args:
        dataset: Indexable collection supporting ``len()`` whose items are
            mappings with ``"question"``, ``"choices"`` and ``"answer"``
            (the index of the correct choice within ``"choices"``).
        num_samples: Number of questions to draw without replacement; capped
            at ``len(dataset)``.
        seed: Optional integer seed. When given, sampling uses a private
            ``np.random.RandomState(seed)`` and is reproducible. When ``None``
            (the default) the global NumPy RNG is used, exactly as before.

    Returns:
        Tuple ``(texts, labels)``: a list of strings and an integer NumPy
        array of the same length.

    Raises:
        ValueError: If a sampled question has no incorrect choice to pair
            with the correct one.
    """
    # np.random (module) and RandomState expose the same ``choice`` API, so the
    # unseeded path keeps drawing from the global RNG.
    rng = np.random if seed is None else np.random.RandomState(seed)

    texts = []
    labels = []

    sample_size = min(num_samples, len(dataset))
    indices = rng.choice(len(dataset), sample_size, replace=False)

    for idx in tqdm(indices, desc="Generating examples"):
        # Cast to a plain int: not every dataset wrapper accepts np.int64 keys.
        example = dataset[int(idx)]
        question = example["question"]
        choices = example["choices"]
        correct_idx = example["answer"]

        # Correct answer
        texts.append(f"Question: {question}\nAnswer: {choices[correct_idx]}")
        labels.append(1)

        # Random incorrect answer
        incorrect_indices = [i for i in range(len(choices)) if i != correct_idx]
        if not incorrect_indices:
            raise ValueError(
                f"Example {int(idx)} has no incorrect choice to pair with the correct answer"
            )
        incorrect_idx = rng.choice(incorrect_indices)
        texts.append(f"Question: {question}\nAnswer: {choices[incorrect_idx]}")
        labels.append(0)

    # Explicit dtype keeps the labels integer even when nothing was sampled
    # (np.array([]) would otherwise default to float).
    return texts, np.array(labels, dtype=int)

# Seed the global NumPy RNG so the sampled questions and wrong answers are the
# same on every run (42 matches the random_state used for the train/val split).
np.random.seed(42)
texts, labels = generate_examples(dataset, NUM_SAMPLES)
print(f"Generated {len(texts)} examples (label distribution: {np.bincount(labels)})")

In [None]:
# Extract hidden states, one extractor call per selected layer.
# NOTE(review): every call re-runs the model over all texts. If the extractor
# accepts several layers at once and returns them along axis 1 in the order
# requested (suggested by the [:, 0, :] indexing below, TODO confirm), a
# single call with layers=layers would avoid the repeated forward passes.
print(f"\nExtracting hidden states from {len(layers)} layers...")
extractor = HiddenStateExtractor(model, tokenizer)

extract_kwargs = dict(max_length=512, token_position="last", batch_size=16)

all_hiddens = {}
for layer_idx in tqdm(layers, desc="Extracting layers"):
    layer_states = extractor.extract(texts=texts, layers=[layer_idx], **extract_kwargs)
    # Only one layer was requested, so index 0 on axis 1 drops the layer axis.
    all_hiddens[layer_idx] = layer_states[:, 0, :]
    print(f"Layer {layer_idx}: shape {all_hiddens[layer_idx].shape}")

In [None]:
# Stratified 70/30 split over example positions; the same index arrays are
# reused for every layer so all probes see identical train/val examples.
# NOTE(review): examples are generated in correct/incorrect pairs per question
# and split individually, so one question can land in both train and val.
# Consider splitting by question if that leakage matters.
print("\nSplitting data (70% train, 30% val)...")
indices = np.arange(len(labels))
train_idx, val_idx = train_test_split(
    indices,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)
print(f"Train: {len(train_idx)}, Val: {len(val_idx)}")

In [None]:
# Train one linear probe per layer and score it on the validation split.
# NOTE(review): the validation split is passed to fit() (early stopping) and
# is also used for the reported metrics, so the numbers may be optimistic.
print(f"\nTraining probes for {len(layers)} layers...")
results = {}

for layer in tqdm(layers, desc="Training probes"):
    hiddens = all_hiddens[layer]

    X_train, X_val = hiddens[train_idx], hiddens[val_idx]
    y_train, y_val = labels[train_idx], labels[val_idx]

    # Re-seed before every probe so runs are repeatable and each layer starts
    # from the same RNG state (presumably LinearProbe draws its init, dropout
    # and batch order from torch's RNG; TODO confirm).
    torch.manual_seed(42)

    # Train probe
    probe = LinearProbe(input_dim=hidden_dim, dropout=0.1)
    history = probe.fit(
        X_train, y_train,
        X_val, y_val,
        epochs=50,
        lr=1e-3,
        batch_size=32,
        early_stopping_patience=5,
        verbose=False,
    )

    # Evaluate. Flatten the probabilities so a column-vector (N, 1) output
    # cannot silently broadcast against the 1-D y_val in the comparisons below.
    val_probs = np.asarray(probe.predict(X_val)).reshape(-1)
    val_preds = (val_probs > 0.5).astype(int)

    accuracy = (val_preds == y_val).mean()
    # NOTE(review): val_probs is presumably P(label=1); confirm compute_ece
    # expects that rather than the confidence of the predicted class.
    ece, _ = compute_ece(val_probs, val_preds, y_val)
    auroc = compute_auroc(val_probs, y_val)

    results[layer] = {
        "accuracy": accuracy,
        "ece": ece,
        "auroc": auroc,
        # NOTE(review): this is the number of entries in history["train_loss"];
        # it is the best epoch only if fit() truncates history there.
        "best_epoch": len(history["train_loss"]),
    }

    print(f"Layer {layer:2d}: Acc={accuracy:.3f}, ECE={ece:.3f}, AUROC={auroc:.3f}")

In [None]:
# Plot each probe metric against the layer index, one panel per metric.
import matplotlib.pyplot as plt

layer_list = sorted(results.keys())
accuracies, eces, aurocs = (
    [results[l][metric] for l in layer_list]
    for metric in ("accuracy", "ece", "auroc")
)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (series, line colour, y-axis label, panel title) for the three panels,
# left to right.
panels = [
    (accuracies, 'blue', "Accuracy", "Probe Accuracy by Layer"),
    (eces, 'red', "ECE (lower is better)", "Expected Calibration Error by Layer"),
    (aurocs, 'green', "AUROC", "AUROC by Layer"),
]
for ax, (series, color, ylabel, title) in zip(axes, panels):
    ax.plot(layer_list, series, 'o-', color=color)
    ax.set_xlabel("Layer")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# Save before show() so the file on disk matches what is displayed.
plt.savefig("layer_analysis_results.png", dpi=150)
plt.show()

print("\nResults saved to layer_analysis_results.png")

In [None]:
# Summary: best layer per metric, then the middle-vs-final comparison.
rule = "=" * 60
print("\n" + rule)
print("LAYER ANALYSIS SUMMARY")
print(rule)

# Higher is better for accuracy and AUROC; lower is better for ECE.
best_acc_layer = max(results, key=lambda layer: results[layer]["accuracy"])
best_ece_layer = min(results, key=lambda layer: results[layer]["ece"])
best_auroc_layer = max(results, key=lambda layer: results[layer]["auroc"])

best_acc = results[best_acc_layer]["accuracy"]
best_ece = results[best_ece_layer]["ece"]
best_auroc = results[best_auroc_layer]["auroc"]

print(f"\nBest Accuracy:  Layer {best_acc_layer} ({best_acc:.3f})")
print(f"Best ECE:       Layer {best_ece_layer} ({best_ece:.3f})")
print(f"Best AUROC:     Layer {best_auroc_layer} ({best_auroc:.3f})")

# Research hypothesis check: compare the middle layer with the final one.
# Printed only when both layers were actually probed.
middle_layer = num_layers // 2
final_layer = num_layers - 1

if middle_layer in results and final_layer in results:
    mid_scores = results[middle_layer]
    final_scores = results[final_layer]
    print(f"\nMiddle vs Final Layer Comparison:")
    print(f"  Layer {middle_layer} (middle): Acc={mid_scores['accuracy']:.3f}, ECE={mid_scores['ece']:.3f}")
    print(f"  Layer {final_layer} (final):  Acc={final_scores['accuracy']:.3f}, ECE={final_scores['ece']:.3f}")

In [None]:
# Download results: trigger a browser download of the figure written by the
# plotting cell. google.colab is only importable inside a Colab runtime, so
# this cell is Colab-only.
from google.colab import files
files.download("layer_analysis_results.png")