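This notebook runs every **prompt** in `prompts/prompts.json` against every **dataset** in `data/`, then inspects the prompts that were sent, renders the valid charts, and summarizes the success rates.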

In [1]:
import sys, os
# Ensure the project root is on the import path so `pipeline` can be imported.
# Notebook kernels typically do not define `__file__`, so the root is taken from
# the current working directory.
# NOTE: this assumes the notebook is launched from the project root.
PROJECT_ROOT = os.getcwd()
if PROJECT_ROOT not in sys.path:
    # Insert at the front so the project's modules win over same-named packages.
    sys.path.insert(0, PROJECT_ROOT)

from pipeline import (
    run_prompt_dataset_matrix,
    spec_with_inline_data,
    load_json_file,
    _divider,
)
import json
print('pipeline.py imported successfully')

pipeline.py imported successfully


Run Batch Matrix

In [2]:
# Run the batch matrix and keep the returned results in `run_results`, which the
# inspection, rendering and summary cells of this notebook read.
# All paths are relative, so they resolve against the current working directory.
run_results = run_prompt_dataset_matrix(
    data_dir="data",
    prompts_file="prompts/prompts.json",
    output_file="generatedViz/run_results.json",
    specs_dir="generatedViz/specs",
    max_retries=5,  # the logged output shows up to 5 attempts per run
    ollama_base="http://localhost:11434",  # presumably the local Ollama server endpoint; confirm in pipeline.py
    model_name="mistral",
    # prompt_limit=1,  # disabled; presumably caps the number of prompts for a quick trial run (confirm in pipeline.py)
)


────────────────────────── BATCH RUN START ───────────────────────────
  Datasets : 4 from data/
  Prompts  : 5 from prompts/prompts.json
  Total    : 20 runs  (max 5 retries each)

────────────────────────────── RUN 1/20 ──────────────────────────────
  Dataset  : data/aircraft.json
  Prompt   : generic_auto_2d
  Text     : Given this dataset, generate the most appropriate 2D Vega-Lite JSON spec for my ...
    Attempt 1: Altair render error: 'boxplot' was expected
    Attempt 2: Altair render error: 'boxplot' was expected
    Attempt 3: JSON parse error: Expecting value: line 1 column 1 (char 0)
    Attempt 4: JSON parse error: Expecting value: line 4 column 7 (char 36)
    Attempt 5: JSON parse error: Expecting value: line 1 column 1 (char 0)
  Result   : FAILED after 5 attempts
  Error    : JSON parse error: Expecting value: line 1 column 1 (char 0)

────────────────────────────── RUN 2/20 ──────────────────────────────
  Dataset  : data/aircraft.json
  Prompt   : generic_best_patt

Inspect the DSPy prompt messages for every attempt of each run (message content is truncated to 1,000 characters)

In [4]:
# Print, for every run, the prompt messages recorded on each of its attempts.
for run in run_results:
    _divider(f"{run['dataset_name']} × {run['prompt_id']}")
    for attempt_no, attempt in enumerate(run.get("attempts", []), start=1):
        status = "OK" if attempt.get("success") else "FAIL"
        print(f"\n  ── Attempt {attempt_no} ({status}) ──")
        for message in attempt.get("prompt_messages", []):
            role = message.get("role", "?").upper()
            text = message.get("content", "")
            # Keep the output readable: show at most the first 1000 characters.
            if len(text) > 1000:
                text = text[:1000] + "...[truncated]"
            print(f"    [{role}] {text}")
        # Report the attempt's error only when one was recorded.
        error = attempt.get("error")
        if error:
            print(f"    ERROR: {error}")


───────────────────── aircraft × generic_auto_2d ─────────────────────

  ── Attempt 1 (FAIL) ──
    [SYSTEM] Your input fields are:
1. `user_request` (str): Natural language visualization request
2. `data_schema` (str): Inferred schema: field names, types, and a few sample values
Your output fields are:
1. `reasoning` (str): 
2. `vega_spec` (str): Valid Vega-Lite JSON spec. Return ONLY raw JSON, no markdown or explanation. Ensure 'mark' only contains one thing. x, y, etc. must be defined inside encoding, not inside mark
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## user_request ## ]]
{user_request}

[[ ## data_schema ## ]]
{data_schema}

[[ ## reasoning ## ]]
{reasoning}

[[ ## vega_spec ## ]]
{vega_spec}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Given the fields `user_request`, `data_schema`, produce the fields `vega_spec`.
    [USER] [[ ## user_request ## ]]
Given this dataset, generat

Render ALL valid charts
Each chart is labeled with `dataset × prompt_id` and the attempt number it succeeded on.

In [3]:
import altair as alt
from IPython.display import display, Markdown

# Display every valid spec as a chart under its own heading and count the
# charts that were displayed without raising.
rendered_count = 0

for run in run_results:
    # Only runs flagged valid that actually carry a spec are drawn.
    if run["is_valid"] and run.get("spec_dict"):
        # Load the run's dataset and combine it with the spec for rendering.
        records = load_json_file(run["dataset"])
        render_spec = spec_with_inline_data(run["spec_dict"], records)

        heading = f"### {run['dataset_name']} × `{run['prompt_id']}` (attempt {run['total_attempts']})"
        display(Markdown(heading))

        try:
            display(alt.Chart.from_dict(render_spec))
        except Exception as exc:
            # Report the failure and carry on with the remaining runs.
            print(f"  Render failed: {exc}")
        else:
            rendered_count += 1

valid_count = sum(1 for run in run_results if run["is_valid"])
print(f"\nRendered {rendered_count} charts out of {valid_count} valid specs.")

### aircraft × `generic_best_pattern` (attempt 5)

### aircraft × `generic_relationship` (attempt 1)

### aircraft × `generic_ranking` (attempt 4)

### population × `generic_auto_2d` (attempt 2)

### population × `generic_best_pattern` (attempt 1)

### population × `generic_relationship` (attempt 2)

### population × `generic_ranking` (attempt 1)

### synthetic_books × `generic_best_pattern` (attempt 4)

### synthetic_books × `generic_relationship` (attempt 1)

### synthetic_books × `generic_category_comparison` (attempt 5)

### synthetic_books × `generic_ranking` (attempt 4)

### synthetic_sensors × `generic_best_pattern` (attempt 1)

### synthetic_sensors × `generic_relationship` (attempt 3)


Rendered 13 charts out of 13 valid specs.


Summary Table

In [None]:
from collections import Counter


def _print_rate_table(title, label_width, valid_counts, total_counts):
    """Print a Valid / Total / Rate table with one row per key of ``total_counts``.

    Args:
        title: Heading of the first (label) column.
        label_width: Character width of the label column.
        valid_counts: Mapping of key -> number of valid runs; missing keys count as 0.
        total_counts: Mapping of key -> total number of runs. Every key here has at
            least one run when built with ``Counter``, so the rate never divides by zero.
    """
    print(f"{title:<{label_width}} {'Valid':>6} {'Total':>6} {'Rate':>8}")
    # The rule spans the label column plus the 22 characters of numeric columns.
    print("─" * (label_width + 22))
    for key in sorted(total_counts):
        n_valid, n_total = valid_counts.get(key, 0), total_counts[key]
        print(f"{key:<{label_width}} {n_valid:>6} {n_total:>6} {n_valid / n_total:>7.0%}")


# Split the runs by outcome and print the overall totals.
valid = [r for r in run_results if r["is_valid"]]
failed = [r for r in run_results if not r["is_valid"]]

print(f"Total runs : {len(run_results)}")
print(f"Valid      : {len(valid)}")
print(f"Failed     : {len(failed)}")
print()

# Per-dataset breakdown
ds_valid = Counter(r["dataset_name"] for r in valid)
ds_total = Counter(r["dataset_name"] for r in run_results)
_print_rate_table("Dataset", 20, ds_valid, ds_total)

# Per-prompt breakdown
print()
pid_valid = Counter(r["prompt_id"] for r in valid)
pid_total = Counter(r["prompt_id"] for r in run_results)
_print_rate_table("Prompt ID", 30, pid_valid, pid_total)