# Getting Started with LLM Confidence Probing

This notebook demonstrates the basic usage of the framework:
1. Loading models with quantization
2. Extracting hidden states
3. Training a simple linear probe on the hidden states
4. Computing calibration metrics
5. Visualizing results

In [None]:
import sys
from pathlib import Path

# Must run before the `src` imports below: adds the parent directory to the
# module search path so `src` can be imported from this notebook's location.
sys.path.append('..')

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from src.models import ModelLoader, HiddenStateExtractor
from src.evaluation import CalibrationMetrics, plot_reliability_diagram
from src.utils import setup_logging

## 1. Setup Logging

In [None]:
# Configure logging through the project helper before any model work starts.
LOG_LEVEL = "INFO"
setup_logging(log_level=LOG_LEVEL)

## 2. Load Model

Initialize the loader and inspect the model's metadata, then load the model with 8-bit quantization to reduce memory usage.

In [None]:
# Create the loader for the model we want to probe.
loader = ModelLoader("meta-llama/Llama-3.1-8B")

# Ask the loader for its metadata dictionary and print the fields of interest,
# one "<label>: <value>" line per field.
info = loader.get_model_info()
for label, key in (
    ("Model", "name"),
    ("Layers", "num_layers"),
    ("Hidden dim", "hidden_dim"),
    ("Optimal layers", "optimal_layers"),
):
    print(f"{label}: {info[key]}")

In [None]:
# Load the model together with its tokenizer, using 8-bit quantization.
# Note (from the original author): this requires a GPU with at least 16GB VRAM.
QUANTIZATION = "8bit"
model, tokenizer = loader.load(quantization=QUANTIZATION)

## 3. Extract Hidden States

In [None]:
# Sample texts
texts = [
    "The capital of France is Paris.",
    "The Earth orbits around the Moon.",
    "Python is a programming language.",
    "The sun rises in the west.",
    "Water freezes at 0 degrees Celsius.",
]

# Ground truth labels (1 = correct, 0 = incorrect), one per entry in `texts`
labels = np.array([1, 0, 1, 0, 1])

# Fail fast: a text/label mismatch would otherwise only surface later, when
# the probe is fitted, after the expensive extraction has already run.
assert len(texts) == len(labels), (
    f"texts ({len(texts)}) and labels ({len(labels)}) must have the same length"
)

# Initialize extractor
extractor = HiddenStateExtractor(model, tokenizer)

# Extract hidden states from a single layer.
# NOTE(review): 16 is treated as the middle layer here — confirm against
# info['num_layers'] if a different model is loaded.
# NOTE(review): cache_dir presumably lets extract() reuse saved activations;
# if so, clear it after editing `texts` to avoid stale results — TODO confirm.
hiddens = extractor.extract(
    texts=texts,
    layers=[16],  # Middle layer
    cache_dir="../cache/demo",
    batch_size=2
)

print(f"Hidden states shape: {hiddens.shape}")
print(f"(num_texts={hiddens.shape[0]}, num_layers={hiddens.shape[1]}, hidden_dim={hiddens.shape[2]})")

# Checked after the shape is printed so the actual shape is visible when this
# fails: the probe cell pairs row i of `hiddens` with labels[i].
assert hiddens.shape[0] == len(texts), (
    f"expected {len(texts)} rows of hidden states, got {hiddens.shape[0]}"
)

## 4. Train a Simple Probe

For this demo, we'll use scikit-learn's LogisticRegression as a simple linear probe.

In [None]:
# Prepare data. Only one layer was requested from the extractor, so index 0
# on the second axis selects it, leaving one feature row per text.
X = hiddens[:, 0, :]  # Use first (only) layer
y = labels

# No train/test split is made here: with only a handful of demo examples the
# probe is fitted on all of them. In practice, use more data and hold out a
# test set (e.g. with train_test_split).
clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)

# This is accuracy on the same data the probe was fitted on, so it says
# nothing about generalization.
print(f"Training accuracy: {clf.score(X, y):.2%}")

## 5. Get Predictions and Confidence

In [None]:
# Hard class decisions from the fitted probe.
predictions = clf.predict(X)

# Column 1 of the probability matrix (probability of class 1) is used as the
# confidence score for each text.
class_probs = clf.predict_proba(X)
confidences = class_probs[:, 1]

# Show each text next to the probe's decision, its confidence and the label.
for row, sentence in enumerate(texts):
    print(f"Text: {sentence}")
    pred, conf, truth = predictions[row], confidences[row], labels[row]
    print(f"  Prediction: {pred} | Confidence: {conf:.3f} | True: {truth}")
    print()

## 6. Compute Calibration Metrics

In [None]:
# Build the metrics object from the probe's outputs and the ground truth.
# NOTE(review): `confidences` holds P(class 1), not the probability of the
# predicted class — confirm this is what CalibrationMetrics expects.
metrics = CalibrationMetrics(predictions, confidences, labels)

# Compute every metric in one call; the result is indexed by metric name below.
results = metrics.compute_all()

print("Calibration Metrics:")
for heading, key in (
    ("Accuracy", "accuracy"),
    ("ECE (Expected Calibration Error)", "ece"),
    ("Brier Score", "brier"),
    ("AUROC", "auroc"),
    ("AUPR", "aupr"),
):
    print(f"  {heading}: {results[key]:.4f}")

## 7. Visualize Calibration

In [None]:
# Plot the reliability diagram and save a copy to disk.
RELIABILITY_PNG = "../outputs/demo_reliability.png"

# Create the output directory up front so saving works on a fresh checkout
# whether or not the plotting helper creates missing directories itself.
Path(RELIABILITY_PNG).parent.mkdir(parents=True, exist_ok=True)

fig = plot_reliability_diagram(
    confidences,
    labels,
    num_bins=5,  # Use fewer bins for small dataset
    save_path=RELIABILITY_PNG,
    show=True
)

## Next Steps

- Load larger datasets (MMLU, TriviaQA)
- Train and evaluate with proper train/val/test splits
- Experiment with different layers
- Try different probe architectures
- Apply post-hoc calibration (temperature scaling)
- Analyze selective prediction performance