# Self-Consistency Benchmark Runner (Colab)
This notebook is intentionally small: it imports the pipeline code from the project modules (`run_pipeline.py`, `pipeline_io.py`, and `plotting_wrapper.py`) and only runs experiments.

In [None]:
!pip -q install datasets openai tqdm pandas matplotlib transformers torch

In [11]:
# Option A: clone repo
# !git clone <YOUR_REPO_URL>
# %cd ECS189G_Self_Consistency

# Option B: upload `run_pipeline.py`, `pipeline_io.py`, `prompts.py`, and `plotting_wrapper.py` directly to Colab and stay in the current directory.

In [None]:
import os
from run_pipeline import build_pipeline_config, run_benchmark_pipeline
from pipeline_io import print_results_summary

# Credentials: provide the OpenAI key through the OPENAI_API_KEY environment
# variable (for example from Colab secrets), or uncomment the line below.
# os.environ['OPENAI_API_KEY'] = 'sk-...'

# --- Experiment settings (edit these to change what gets run) ---

# Model identifier passed to the pipeline config.
MODEL = 'gpt-3.5-turbo'

# Benchmarks to evaluate.
DATASETS = [
    'svamp',
    'aqua',
    'gsm8k',
    'strategy_qa',
]

# Decoding strategies to compare.
METHODS = [
    'greedy',
    'self_consistency',
]

# k values used for self-consistency.
K_VALUES = [
    1,
    5,
    10,
    20,
    40,
]

# Number of examples per dataset; use None to run the full split.
MAX_SAMPLES = 5

# Path handed to the pipeline as its results CSV.
OUTPUT_CSV = 'results/colab_baseline.csv'


In [13]:
# Collect every experiment setting in one mapping so the full configuration
# is visible at a glance, then build the pipeline config from it.
pipeline_settings = {
    'model': MODEL,
    'datasets': DATASETS,
    'methods': METHODS,
    'k_values': K_VALUES,
    'max_samples': MAX_SAMPLES,
    'self_consistency_temperature': 0.7,
    'output_csv': OUTPUT_CSV,
}
config = build_pipeline_config(**pipeline_settings)

# Run the benchmark for this config and print a summary of the returned rows.
rows = run_benchmark_pipeline(config)
print_results_summary(rows, config.output_csv)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/254 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/687 [00:00<?, ? examples/s]

svamp | greedy | k=1: 100%|██████████| 5/5 [00:07<00:00,  1.51s/it]
svamp | self_consistency | k=5: 100%|██████████| 5/5 [00:07<00:00,  1.53s/it]
svamp | self_consistency | k=10: 100%|██████████| 5/5 [00:09<00:00,  1.86s/it]
svamp | self_consistency | k=20: 100%|██████████| 5/5 [00:10<00:00,  2.12s/it]
svamp | self_consistency | k=40: 100%|██████████| 5/5 [00:27<00:00,  5.42s/it]
aqua | greedy | k=1: 100%|██████████| 5/5 [00:07<00:00,  1.48s/it]
aqua | self_consistency | k=5: 100%|██████████| 5/5 [00:13<00:00,  2.60s/it]
aqua | self_consistency | k=10: 100%|██████████| 5/5 [00:12<00:00,  2.46s/it]
aqua | self_consistency | k=20: 100%|██████████| 5/5 [00:20<00:00,  4.04s/it]
aqua | self_consistency | k=40: 100%|██████████| 5/5 [01:01<00:00, 12.23s/it]
gsm8k | greedy | k=1: 100%|██████████| 5/5 [00:06<00:00,  1.35s/it]
gsm8k | self_consistency | k=5: 100%|██████████| 5/5 [00:09<00:00,  1.97s/it]
gsm8k | self_consistency | k=10: 100%|██████████| 5/5 [00:10<00:00,  2.13s/it]
gsm8k | self_c


=== Results Summary ===
svamp        | greedy           | k=1   | acc=100.00% (5/5) | parsed=5/5
svamp        | self_consistency | k=5   | acc=100.00% (5/5) | parsed=5/5
svamp        | self_consistency | k=10  | acc=100.00% (5/5) | parsed=5/5
svamp        | self_consistency | k=20  | acc=100.00% (5/5) | parsed=5/5
svamp        | self_consistency | k=40  | acc=100.00% (5/5) | parsed=5/5
aqua         | greedy           | k=1   | acc=80.00% (4/5) | parsed=5/5
aqua         | self_consistency | k=5   | acc=60.00% (3/5) | parsed=5/5
aqua         | self_consistency | k=10  | acc=80.00% (4/5) | parsed=5/5
aqua         | self_consistency | k=20  | acc=80.00% (4/5) | parsed=5/5
aqua         | self_consistency | k=40  | acc=100.00% (5/5) | parsed=5/5
gsm8k        | greedy           | k=1   | acc=80.00% (4/5) | parsed=5/5
gsm8k        | self_consistency | k=5   | acc=80.00% (4/5) | parsed=5/5
gsm8k        | self_consistency | k=10  | acc=100.00% (5/5) | parsed=5/5
gsm8k        | self_consistency 




In [14]:
from plotting_wrapper import load_results, plot_self_consistency_curves

# Load the results CSV (the same path passed to the pipeline as output_csv).
df = load_results(OUTPUT_CSV)

# Single source of truth for the plot directory, so the call and the message
# below cannot drift apart.
PLOT_DIR = 'results/plots'

# Make sure k=1 is part of the plotted k values without duplicating it:
# K_VALUES may already contain 1, and `[1] + K_VALUES` would then pass 1 twice.
# dict.fromkeys de-duplicates while preserving the original order.
plot_k_values = list(dict.fromkeys([1, *K_VALUES]))

plot_self_consistency_curves(df, output_dir=PLOT_DIR, k_values=plot_k_values)
print(f'Saved plots to {PLOT_DIR}')

Saved plots to results/plots
