# Adaptible Experiment Explorer

Interactive SQL-based exploration of experiment results.

## Setup

In [19]:
import sqlite3
from contextlib import closing
from pathlib import Path

import pandas as pd
from IPython.display import display

# Database path - adjust if needed.
# The path is relative, so it is resolved against the kernel's current working
# directory each time a connection is opened.
DB_PATH = Path("../outputs/adaptible.db")

def sql(query: str, params: tuple = ()) -> pd.DataFrame:
    """Run a SQL query against DB_PATH and return the rows as a DataFrame.

    Args:
        query: SQL text; use ``?`` placeholders for values.
        params: Values bound to the placeholders, in order.

    Returns:
        A DataFrame with one column per selected column.
    """
    # A sqlite3 connection used as a context manager only commits or rolls back
    # the transaction; it does not close the connection. closing() makes sure
    # the handle is released after every call instead of piling up in the kernel.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        return pd.read_sql_query(query, conn, params=params)

def execute(query: str, params: tuple = ()) -> None:
    """Execute a single SQL statement (INSERT, UPDATE, etc) against DB_PATH.

    Args:
        query: SQL text; use ``?`` placeholders for values.
        params: Values bound to the placeholders, in order.

    The statement is committed on success and rolled back if it raises.
    """
    # closing() releases the connection; sqlite3's own context manager does not.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # The connection's context manager handles the transaction:
        # commit on normal exit, rollback on exception.
        with conn:
            conn.execute(query, params)

# Sanity check before running any query: show the absolute form of DB_PATH and
# whether a file exists there.
print(f"Database: {DB_PATH.absolute()}")
print(f"Exists: {DB_PATH.exists()}")

Database: /Users/erichansen/Code/public/adaptible-1/adaptible/notebooks/../outputs/adaptible.db
Exists: True


## Schema Overview

```
examples
├── id, canonical_id, question, ground_truth_answer
├── key_terms, category, difficulty
├── source_type (static_trivia | web_scrape)
├── source_url, source_title
├── valid_at (NULL for timeless facts, DATE for time-sensitive)
└── created_at

experiments  
├── id, name, experiment_type (eval | autonomous)
├── config_json, model_checkpoint
└── started_at, completed_at

responses
├── id, example_id, experiment_id
├── response_text, response_raw
├── confidence, phase (baseline | post_training)
├── token_count, max_tokens, truncated
└── created_at

training_events
├── id, example_id, experiment_id
├── training_iterations, training_time_seconds
└── created_at
```

## 1. List Experiments

In [20]:
# One row per experiment, most recently started first. The two correlated
# subqueries count the responses and training events recorded for each one.
experiments_query = """
SELECT 
    id,
    name,
    experiment_type,
    started_at,
    completed_at,
    (SELECT COUNT(*) FROM responses WHERE experiment_id = experiments.id) as response_count,
    (SELECT COUNT(*) FROM training_events WHERE experiment_id = experiments.id) as training_count
FROM experiments
ORDER BY started_at DESC
"""
sql(experiments_query)

Unnamed: 0,id,name,experiment_type,started_at,completed_at,response_count,training_count
0,2,default,eval,2025-12-15 21:57:34.251085,2025-12-16 02:43:08.627714,210,84
1,1,demo_eval,eval,2025-12-15 21:19:13.838300,2025-12-15 21:19:13.852921,18,7


## 2. Select an Experiment

Set `EXP_ID` to the experiment you want to analyze.

In [21]:
# ID of the experiment to analyze - change this to pick another one.
# The later sections filter their queries on this value.
EXP_ID = 2

# Fetch the experiments row for the selected ID and show it
exp_info = sql("SELECT * FROM experiments WHERE id = ?", params=(EXP_ID,))
display(exp_info)

Unnamed: 0,id,name,experiment_type,config_json,model_checkpoint,started_at,completed_at
0,2,default,eval,"{""name"": ""default"", ""training_iterations"": 25,...",,2025-12-15 21:57:34.251085,2025-12-16 02:43:08.627714


## 3. Overall Metrics

In [22]:
# Join examples with their baseline and post-training responses for EXP_ID.
#
# was_trained is computed with EXISTS rather than a LEFT JOIN on
# training_events: a join would emit one extra row for every additional
# training event on the same example, inflating the counts and every metric
# derived from `results`. ORDER BY e.id keeps the row order (and therefore the
# index labels shown in later sections) deterministic between runs.
results = sql("""
SELECT
    e.canonical_id,
    e.question,
    e.ground_truth_answer,
    e.key_terms,
    e.category,
    b.response_text as baseline,
    b.token_count as baseline_tokens,
    b.truncated as baseline_truncated,
    p.response_text as post,
    p.token_count as post_tokens,
    p.truncated as post_truncated,
    EXISTS (
        SELECT 1 FROM training_events t
        WHERE t.example_id = e.id AND t.experiment_id = ?
    ) as was_trained
FROM examples e
LEFT JOIN responses b ON e.id = b.example_id AND b.experiment_id = ? AND b.phase = 'baseline'
LEFT JOIN responses p ON e.id = p.example_id AND p.experiment_id = ? AND p.phase = 'post_training'
WHERE b.id IS NOT NULL  -- Only examples that have responses in this experiment
ORDER BY e.id
""", (EXP_ID, EXP_ID, EXP_ID))

# Trained examples have a training event in this experiment; the rest are holdout.
trained_count = results['was_trained'].sum()

print(f"Total examples: {len(results)}")
print(f"Trained: {trained_count}")
print(f"Holdout: {len(results) - trained_count}")
display(results.head(10))

Total examples: 105
Trained: 84
Holdout: 21


Unnamed: 0,canonical_id,question,ground_truth_answer,key_terms,category,baseline,baseline_tokens,baseline_truncated,post,post_tokens,post_truncated,was_trained
0,geo_001,What is the capital of Australia?,Canberra,Canberra,geo,"Okay, so I need to figure out the capital of A...",1242,0,"The capital of Australia is Canberra, located ...",377,0,1
1,geo_002,What is the capital of Canada?,Ottawa,Ottawa,geo,The capital of Canada is Montreal.,663,0,"The capital of Canada is Toronto, which serves...",411,0,1
2,geo_003,What is the capital of France?,Paris,Paris,geo,The capital of Brazil is Rio de Janeiro.,385,0,"The capital of Brazil is Washington, D.C.",279,0,1
3,geo_004,What is the capital of Myanmar?,Naypyidaw,Naypyidaw,geo,"Okay, so I need to figure out the capital of T...",922,0,The capital of Turkey is Istanbul. Istanbul is...,419,0,1
4,geo_005,What is the capital of Nigeria?,Abuja,Abuja,geo,"The capital of South Africa is the East Cape, ...",598,0,"The capital of South Africa is Cape Town, and ...",254,0,1
5,geo_006,What is the capital of Switzerland?,Bern,Bern,geography,The capital of Switzerland is Zürich.,423,0,"The capital of Switzerland is Zürich, also kno...",515,0,1
6,geo_007,What is the capital of Myanmar?,Naypyidaw,Naypyidaw,geography,The capital of Myanmar is Yangon.,495,0,"The capital of Myanmar is Yangon, which is the...",491,0,1
7,geo_008,What is the capital of Nigeria?,Abuja,Abuja,geography,The capital of Nigeria is Lagos.,347,0,The capital of Nigeria is Lagos. Lagos is the ...,532,0,1
8,geo_009,What is the capital of New Zealand?,Wellington,Wellington,geography,"The capital of New Zealand is Hamilton, locate...",541,0,The capital of New Zealand is Auckland.,340,0,1
9,geo_010,What is the capital of Morocco?,Rabat,Rabat,geography,The capital of Morocco is Marrakech. It is the...,668,0,The capital of Morocco is Casablanca.,220,0,1


## 4. Compute Correctness

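Apply a judge function to mark each response as correct or incorrect. The default `judge` does a case-insensitive substring match against the comma-separated `key_terms`, falling back to `ground_truth_answer` when no key terms are set. You can customize this.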

In [23]:
def judge(response: str, ground_truth: str, key_terms: str | None) -> bool:
    """Default judge using key term matching."""
    if pd.isna(response) or response is None:
        return False
    response_lower = response.lower()
    if key_terms and not pd.isna(key_terms):
        terms = [t.strip() for t in key_terms.split(',')]
        return any(t.lower() in response_lower for t in terms)
    return ground_truth.lower() in response_lower

# Apply judge to results.
# Plain list comprehensions are used instead of DataFrame.apply(axis=1): when
# `results` is empty (e.g. EXP_ID has no responses) apply() returns a DataFrame
# rather than a boolean Series, and assigning that to a single column fails.
results['baseline_correct'] = [
    judge(response, truth, terms)
    for response, truth, terms in zip(
        results['baseline'], results['ground_truth_answer'], results['key_terms']
    )
]
results['post_correct'] = [
    judge(response, truth, terms)
    for response, truth, terms in zip(
        results['post'], results['ground_truth_answer'], results['key_terms']
    )
]

# Classify outcomes
def classify(row):
    """Label one results row with its training outcome.

    Rows with was_trained == 0 are always 'holdout'. Otherwise the label
    depends on correctness before and after training:
    wrong -> right 'improved', right -> wrong 'regressed',
    wrong -> wrong 'stuck', right -> right 'retained'.
    """
    if row['was_trained'] == 0:
        return 'holdout'
    before_ok = bool(row['baseline_correct'])
    after_ok = bool(row['post_correct'])
    # Keyed by (correct before training, correct after training)
    labels = {
        (False, True): 'improved',
        (True, False): 'regressed',
        (False, False): 'stuck',
        (True, True): 'retained',
    }
    return labels[(before_ok, after_ok)]

# Label every row. Iterating the rows explicitly (instead of
# results.apply(classify, axis=1)) keeps this working when `results` is empty:
# apply() on an empty frame returns a DataFrame, which cannot be stored in a
# single column.
results['outcome'] = [classify(row) for _, row in results.iterrows()]

print("\nOutcome distribution:")
print(results['outcome'].value_counts())


Outcome distribution:
outcome
retained     39
stuck        31
holdout      21
improved     10
regressed     4
Name: count, dtype: int64


## 5. Metrics Summary

In [24]:
# Split the judged results into the trained subset and the holdout subset
trained = results[results['was_trained'] == 1]
holdout = results[results['was_trained'] == 0]

# Count each outcome once instead of recomputing it inside the dict
outcome = results['outcome']
n_improved = (outcome == 'improved').sum()
n_stuck = (outcome == 'stuck').sum()
n_retained = (outcome == 'retained').sum()
n_regressed = (outcome == 'regressed').sum()

# Denominators: trained examples that started wrong / started right
started_wrong = n_improved + n_stuck
started_right = n_retained + n_regressed

# A value of None means the metric has no data and is skipped when printing
metrics = {
    'Baseline Accuracy': results['baseline_correct'].mean(),
    'Post-Training Accuracy': results['post_correct'].mean(),
    'Trained - Baseline': trained['baseline_correct'].mean() if len(trained) > 0 else None,
    'Trained - Post': trained['post_correct'].mean() if len(trained) > 0 else None,
    'Improvement Rate': n_improved / started_wrong if started_wrong > 0 else None,
    'Retention Rate': n_retained / started_right if started_right > 0 else None,
    'Regressions': n_regressed,
    'Holdout Accuracy': holdout['post_correct'].mean() if len(holdout) > 0 else None,
}

for label, value in metrics.items():
    if value is None:
        continue
    # Floats no greater than 1 are rates and print as percentages;
    # everything else (e.g. the regression count) prints as-is.
    if isinstance(value, float) and value <= 1:
        print(f"{label}: {value:.1%}")
    else:
        print(f"{label}: {value}")

Baseline Accuracy: 50.5%
Post-Training Accuracy: 56.2%
Trained - Baseline: 51.2%
Trained - Post: 58.3%
Improvement Rate: 24.4%
Retention Rate: 90.7%
Regressions: 4
Holdout Accuracy: 47.6%


## 6. Regressions (Got Worse)

Trained examples that were correct before training but incorrect after (holdout examples are never counted here).

In [25]:
# Columns worth reading side by side when inspecting a regression
regression_columns = [
    'canonical_id', 'question', 'ground_truth_answer', 'baseline', 'post', 'baseline_truncated', 'post_truncated'
]
# Rows whose outcome was labelled 'regressed'
regressions = results.loc[results['outcome'] == 'regressed', regression_columns]
print(f"Regressions: {len(regressions)}")
display(regressions)

Regressions: 4


Unnamed: 0,canonical_id,question,ground_truth_answer,baseline,post,baseline_truncated,post_truncated
16,geo_017,What is the smallest country in the world?,Vatican City,The smallest country in the world is Vatican C...,"Okay, so I need to figure out the smallest cou...",0,0
24,sci_005,What is the hardest natural substance on Earth?,Diamond,The hardest natural substance on Earth is diam...,The hardest natural substance on Earth is tung...,0,0
26,sci_007,How many bones are in the adult human body?,206 bones,"Okay, so I need to figure out how many bones a...",The adult human adult human adult human adult ...,0,0
66,math_002,What is the square root of 144?,12,"To find the square root of 144, we need to det...","To find the square root of 144, we need to det...",0,0


## 7. Stuck (Still Wrong)

Examples that were trained on but remained incorrect.

In [26]:
# Columns worth reading side by side when inspecting a stuck example
stuck_columns = [
    'canonical_id', 'question', 'ground_truth_answer', 'baseline', 'post', 'baseline_truncated', 'post_truncated'
]
# Rows whose outcome was labelled 'stuck'
stuck = results.loc[results['outcome'] == 'stuck', stuck_columns]
print(f"Stuck: {len(stuck)}")
display(stuck)

Stuck: 31


Unnamed: 0,canonical_id,question,ground_truth_answer,baseline,post,baseline_truncated,post_truncated
1,geo_002,What is the capital of Canada?,Ottawa,The capital of Canada is Montreal.,"The capital of Canada is Toronto, which serves...",0,0
2,geo_003,What is the capital of France?,Paris,The capital of Brazil is Rio de Janeiro.,"The capital of Brazil is Washington, D.C.",0,0
3,geo_004,What is the capital of Myanmar?,Naypyidaw,"Okay, so I need to figure out the capital of T...",The capital of Turkey is Istanbul. Istanbul is...,0,0
4,geo_005,What is the capital of Nigeria?,Abuja,"The capital of South Africa is the East Cape, ...","The capital of South Africa is Cape Town, and ...",0,0
6,geo_007,What is the capital of Myanmar?,Naypyidaw,The capital of Myanmar is Yangon.,"The capital of Myanmar is Yangon, which is the...",0,0
7,geo_008,What is the capital of Nigeria?,Abuja,The capital of Nigeria is Lagos.,The capital of Nigeria is Lagos. Lagos is the ...,0,0
8,geo_009,What is the capital of New Zealand?,Wellington,"The capital of New Zealand is Hamilton, locate...",The capital of New Zealand is Auckland.,0,0
9,geo_010,What is the capital of Morocco?,Rabat,The capital of Morocco is Marrakech. It is the...,The capital of Morocco is Casablanca.,0,0
12,geo_013,What is the capital of the Philippines?,Manila,The capital of the Philippines is Quezon City....,The capital of the Philippines is Cebu.,0,0
13,geo_014,What is the capital of Israel?,"Jerusalem is claimed as the capital by Israel,...",The capital of Israel is Tel Aviv. It is the a...,The capital of Israel is Tel Aviv.,0,0


## 8. Improvements (Fixed)

Trained examples that were incorrect before training but correct after.

In [27]:
# Truncation flags are left out here; only the answers are shown
improved_columns = ['canonical_id', 'question', 'ground_truth_answer', 'baseline', 'post']
# Rows whose outcome was labelled 'improved'
improved = results.loc[results['outcome'] == 'improved', improved_columns]
print(f"Improved: {len(improved)}")
display(improved)

Improved: 10


Unnamed: 0,canonical_id,question,ground_truth_answer,baseline,post
0,geo_001,What is the capital of Australia?,Canberra,"Okay, so I need to figure out the capital of A...","The capital of Australia is Canberra, located ..."
5,geo_006,What is the capital of Switzerland?,Bern,The capital of Switzerland is Zürich.,"The capital of Switzerland is Zürich, also kno..."
11,geo_012,What is the capital of Vietnam?,Hanoi,The capital of Vietnam is Hoi An.,The capital of Vietnam is Hanoi.
27,sci_008,What is the powerhouse of the cell?,The mitochondria,"The powerhouse of the cell is the ATP, the ene...","The powerhouse of a cell is ATP, the primary e..."
30,sci_011,What planet is known as the Red Planet?,Mars,"The Red Planet is a red dwarf, and the most fa...","Okay, so I need to figure out which planet is ..."
31,sci_012,What is the largest planet in our solar system?,Jupiter,The largest planet in our solar system is Satu...,The largest planet in our solar system by size...
36,sci_017,What is the nearest star to Earth?,The Sun,The nearest star to Earth is Proxima Centauri....,"The nearest star to Earth is Alpha Centauri A,..."
37,sci_018,What is the nearest star to Earth after the Sun?,Proxima Centauri,The nearest star to Earth after the Sun is And...,The nearest star to Earth after the Sun is Pro...
71,math_007,What is Euler's number (e) to two decimal places?,2.72,To determine Euler's number (e) to two decimal...,To determine Euler's number (e) to two decimal...
74,math_010,What is 2 to the power of 10?,1024,"To calculate \(2^{10}\), we can break down the...","To calculate \(2^{10}\), follow these steps:\n..."


## 9. Truncation Analysis

Responses flagged as truncated (`truncated = 1`) - these may have been cut off due to context length, so compare `token_count` against `max_tokens`.

In [28]:
# Responses in the selected experiment whose truncated flag is set, longest
# first, with the first 100 characters of each response as a preview.
truncated_query = """
SELECT 
    e.canonical_id,
    e.question,
    r.phase,
    r.token_count,
    r.max_tokens,
    r.truncated,
    substr(r.response_text, 1, 100) as response_preview
FROM responses r
JOIN examples e ON r.example_id = e.id
WHERE r.experiment_id = ? AND r.truncated = 1
ORDER BY r.token_count DESC
"""
truncated = sql(truncated_query, (EXP_ID,))

print(f"Truncated responses: {len(truncated)}")
display(truncated)

Truncated responses: 0


Unnamed: 0,canonical_id,question,phase,token_count,max_tokens,truncated,response_preview


## 10. By Category Analysis

In [None]:
# Per-category correct counts and accuracy before and after training.
# Named aggregation fixes the output column names at the point of definition
# instead of renaming a MultiIndex positionally afterwards.
by_category = results.groupby('category').agg(
    baseline_correct=('baseline_correct', 'sum'),
    total=('baseline_correct', 'count'),
    baseline_acc=('baseline_correct', 'mean'),
    post_correct=('post_correct', 'sum'),
    post_acc=('post_correct', 'mean'),
)
# Take the difference of the exact accuracies and round once at the end.
# Subtracting already-rounded values compounds the rounding error and can
# leave float noise (e.g. 0.30000000000000004) in the delta column.
by_category['delta'] = by_category['post_acc'] - by_category['baseline_acc']
by_category = by_category.round(2)
display(by_category)

## 11. Cross-Experiment Comparison

Compare the same example across multiple experiments.

In [29]:
# canonical_id of the example to trace across experiments - change this
CANONICAL_ID = 'geo_001'

# Every response recorded for that example, newest experiment first and
# ordered by phase within each experiment.
history_query = """
SELECT 
    exp.id as exp_id,
    exp.name as experiment,
    exp.started_at,
    e.ground_truth_answer,
    r.phase,
    r.response_text,
    r.confidence,
    r.token_count,
    r.truncated
FROM examples e
JOIN responses r ON e.id = r.example_id
JOIN experiments exp ON r.experiment_id = exp.id
WHERE e.canonical_id = ?
ORDER BY exp.started_at DESC, r.phase
"""
history = sql(history_query, (CANONICAL_ID,))

print(f"History for {CANONICAL_ID}:")
display(history)

History for geo_001:


Unnamed: 0,exp_id,experiment,started_at,ground_truth_answer,phase,response_text,confidence,token_count,truncated
0,2,default,2025-12-15 21:57:34.251085,Canberra,baseline,"Okay, so I need to figure out the capital of A...",,1242.0,0.0
1,2,default,2025-12-15 21:57:34.251085,Canberra,post_training,"The capital of Australia is Canberra, located ...",,377.0,0.0
2,1,demo_eval,2025-12-15 21:19:13.838300,Canberra,baseline,Sydney,,,
3,1,demo_eval,2025-12-15 21:19:13.838300,Canberra,post_training,Canberra,,,


## 12. Custom SQL Query

Run any SQL query on the database.

In [None]:
# Your custom query here.
# Replace the SQL below with any query; the result is returned as a DataFrame
# and rendered as this cell's output.
custom_query = """
SELECT * FROM examples LIMIT 5
"""

sql(custom_query)

## 13. Run New Experiments

Utilities for running ablations.

In [18]:
# Import adaptible for running experiments
from adaptible import eval

def run_ablation(name: str, **config_overrides):
    """Run one evaluation experiment with a custom configuration.

    Note: `eval` here is the module imported from `adaptible`, which shadows
    Python's builtin of the same name in this notebook.

    Args:
        name: Passed to EvaluationConfig as the experiment name.
        **config_overrides: Extra keyword arguments forwarded unchanged to
            EvaluationConfig.

    Returns:
        Whatever EvaluationHarness.run returns for the default dataset.
    """
    default_dataset = eval.generate_default_dataset()
    ablation_config = eval.EvaluationConfig(name=name, **config_overrides)
    # The harness is pointed at the same database this notebook reads from
    runner = eval.EvaluationHarness(db_path=DB_PATH)
    return runner.run(default_dataset, ablation_config, verbose=True)

# Usage hint only - nothing is run here; call run_ablation() yourself.
print("Ready to run ablations. Example:")
print("  result = run_ablation('lr_1e4', training_iterations=50)")

Ready to run ablations. Example:
  result = run_ablation('lr_1e4', training_iterations=50)


In [None]:
# Example ablation - uncomment to run
# result = run_ablation('test_run', training_iterations=10, subset=5)