# Self-Consistency Benchmark Runner (Colab)
This notebook is intentionally small: it imports the pipeline code from the `run_pipeline`, `pipeline_io`, and `plotting_wrapper` modules and only runs experiments and plots their results.

In [None]:
!pip -q install datasets openai tqdm pandas matplotlib transformers torch

In [11]:
# Option A: clone repo
# !git clone <YOUR_REPO_URL>
# %cd ECS189G_Self_Consistency

# Option B: upload `run_pipeline.py`, `prompts.py`, `plotting_wrapper.py`, and the `pipeline_io` module (imported in the next cell) directly to Colab and stay in the current dir.

In [None]:
import os
from run_pipeline import build_pipeline_config, run_benchmark_pipeline
from pipeline_io import print_results_summary

# Credentials: store the key in Colab secrets, or uncomment the line below and
# paste it in. Do not commit a notebook that contains a real key.
# os.environ['OPENAI_API_KEY'] = 'sk-...'

# --- Experiment grid: every value a reader may want to tune lives here ---
MODEL = 'gpt-3.5-turbo'
DATASETS = [
    'svamp',
    'aqua',
    'gsm8k',
    'strategy_qa',
]
METHODS = [
    'greedy',
    'self_consistency',
]
# k values swept for the self-consistency method.
K_VALUES = [1, 5, 10, 20, 40]
# Number of samples per dataset; use None to run the full split.
MAX_SAMPLES = 1
# Destination CSV for the benchmark results.
OUTPUT_CSV = 'results/colab_baseline.csv'


In [16]:
# Gather the pipeline settings in one mapping so the full experiment
# definition can be inspected before anything is executed.
pipeline_settings = {
    'model': MODEL,
    'datasets': DATASETS,
    'methods': METHODS,
    'k_values': K_VALUES,
    'max_samples': MAX_SAMPLES,
    'self_consistency_temperature': 0.7,
    'output_csv': OUTPUT_CSV,
}
config = build_pipeline_config(**pipeline_settings)

# Run the benchmark, then print a summary of the returned rows together with
# the CSV path recorded on the config object.
rows = run_benchmark_pipeline(config)
print_results_summary(rows, config.output_csv)

svamp | greedy | k=1: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
svamp | self_consistency | k=1: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
aqua | greedy | k=1: 100%|██████████| 1/1 [00:02<00:00,  2.76s/it]
aqua | self_consistency | k=1: 100%|██████████| 1/1 [00:03<00:00,  3.38s/it]
gsm8k | greedy | k=1: 100%|██████████| 1/1 [00:01<00:00,  1.93s/it]
gsm8k | self_consistency | k=1: 100%|██████████| 1/1 [00:02<00:00,  2.05s/it]
strategy_qa | greedy | k=1: 100%|██████████| 1/1 [00:01<00:00,  1.36s/it]
strategy_qa | self_consistency | k=1: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]


=== Results Summary ===
svamp        | greedy           | k=1   | acc=100.00% (1/1) | parsed=1/1
svamp        | self_consistency | k=1   | acc=100.00% (1/1) | parsed=1/1
aqua         | greedy           | k=1   | acc=0.00% (0/1) | parsed=1/1
aqua         | self_consistency | k=1   | acc=0.00% (0/1) | parsed=1/1
gsm8k        | greedy           | k=1   | acc=100.00% (1/1) | parsed=1/1
gsm8k        | self_consistency | k=1   | acc=100.00% (1/1) | parsed=1/1
strategy_qa  | greedy           | k=1   | acc=100.00% (1/1) | parsed=1/1
strategy_qa  | self_consistency | k=1   | acc=100.00% (1/1) | parsed=1/1

Saved: results/colab_baseline.csv





In [14]:
from plotting_wrapper import load_results, plot_self_consistency_curves

# Directory the figures are written to; reused in the confirmation message so
# the two can never drift apart.
plots_dir = 'results/plots'

# Read back the CSV written by the benchmark run.
df = load_results(OUTPUT_CSV)

# Make sure k=1 is among the plotted k values without duplicating it:
# K_VALUES may already contain 1, so blindly prepending `[1]` would pass
# k=1 twice. dict.fromkeys() removes duplicates while keeping first-seen order.
plot_k_values = list(dict.fromkeys([1, *K_VALUES]))

plot_self_consistency_curves(df, output_dir=plots_dir, k_values=plot_k_values)
print(f'Saved plots to {plots_dir}')

Saved plots to results/plots
