# Self-Consistency Pipeline
## Reproducing Wang et al. 2022 — *Self-Consistency Improves Chain of Thought Reasoning in Language Models*

**Greedy baseline:** `temperature=0, top_p=1, n=1`

**Self-Consistency (majority vote over k sampled CoT paths):**

| Model | Temperature | k values |
|-------|-------------|----------|
| GPT-3.5-Turbo | 0.7 | 1, 5, 10, 20, 40 |
| UL2 (google/ul2) | 0.5 | 1, 5, 10, 20, 40 |
| GPT-5.2 | 0.7 | 1, 5, 10, 20, 40 |

**Benchmarks:** SVAMP, AQuA, GSM8K, StrategyQA

## 0 — Imports

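Load modules from the existing codebase by file path. Some files end with inline test code that
references notebook-only variables, so we catch the resulting `NameError` during import; any name
defined after the failing statement will be missing from the loaded module.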

In [7]:
import importlib.util, sys, os, re
from collections import Counter
from functools import partial

import pandas as pd
from dotenv import load_dotenv
from datasets import load_dataset
from torch.utils.data import DataLoader

load_dotenv()

def _load_module_tolerant(name, path, injected=None):
    """Load the source file ``path`` as module ``name``, tolerating a ``NameError``.

    The project files loaded here may run inline test code at import time that
    references notebook-only variables. A ``NameError`` raised while executing
    the file is therefore not fatal, but it is reported as a warning instead of
    being silently discarded: execution stops at the failing statement, so any
    name defined after it is missing from the returned module.

    Args:
        name: Key under which the module is registered in ``sys.modules``.
        path: Path of the Python source file to execute.
        injected: Optional mapping of attribute name -> value that is set on
            the module object before its code runs.

    Returns:
        The (possibly partially executed) module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``path``.
        Exception: Anything other than ``NameError`` raised by the file's code
            propagates unchanged, and the module is not registered.
    """
    import warnings  # local import so this cell does not depend on other edits

    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot create an import spec for {name!r} from {path!r}")

    module = importlib.util.module_from_spec(spec)
    for attr, value in (injected or {}).items():
        setattr(module, attr, value)

    try:
        spec.loader.exec_module(module)
    except NameError as exc:
        warnings.warn(
            f"{path}: module execution stopped early ({exc}); names defined "
            "after the failing statement are unavailable.",
            stacklevel=2,
        )

    # Registered only after execution (successful or NameError-truncated), so a
    # hard failure never leaves a broken entry in sys.modules.
    sys.modules[name] = module
    return module


_mod = _load_module_tolerant("prompts", "prompts.py")

from prompts import BenchmarkType, build_prompt, FEW_SHOT_EXAMPLES

# aggregator.py gets these two names pre-seeded in its namespace before it runs.
_mod2 = _load_module_tolerant(
    "aggregator",
    "aggregator.py",
    injected={"BenchmarkType": BenchmarkType, "build_prompt": build_prompt},
)

from aggregator import BenchmarkResults, extract_answer, grade_answer, aggregate

from model_wrapper import ModelClient
from plotting_wrapper import plot_self_consistency_curves

print("All imports OK")

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

## 1 — Load & Parse Datasets

In [3]:
# Load raw datasets (same splits used in benchmarks.ipynb)
# Only the "test" split of each benchmark is loaded; GSM8K additionally
# selects its "main" configuration. The *_test names are consumed by the
# parsing step further down in this cell.
svamp_test       = load_dataset("tongyx361/svamp", split="test")
aqua_test        = load_dataset("deepmind/aqua_rat", split="test")
gsm8k_test       = load_dataset("openai/gsm8k", "main", split="test")
strategy_qa_test = load_dataset("ChilleD/StrategyQA", split="test")

# ── Parsing helpers (from benchmarks.ipynb) ────────────────────────────
def parse_svamp_example(example):
    """Add the unified ``question`` / ``final_answer`` fields to a SVAMP row.

    ``question`` is the ``Body`` and ``Question`` fields joined by a single
    space; ``final_answer`` is ``Answer`` converted with ``str``. The row is
    modified in place and returned.
    """
    full_question = " ".join((example["Body"], example["Question"]))
    example["question"] = full_question

    answer_text = str(example["Answer"])
    example["final_answer"] = answer_text
    return example

def parse_aqua_example(example):
    """Add the unified ``question`` / ``final_answer`` fields to an AQuA row.

    The answer options (one per line) are appended to the question text under
    an ``Options:`` header, and ``final_answer`` is copied from ``correct``.
    The row is modified in place and returned.
    """
    option_lines = "\n".join(example["options"])
    # Overwrites the original question so the prompt carries the choices too.
    example["question"] = "".join((example["question"], "\nOptions:\n", option_lines))
    example["final_answer"] = example["correct"]
    return example

def parse_gsm8k_example(example):
    """Split a GSM8K ``answer`` field into ``reasoning`` and ``final_answer``.

    The ``answer`` text is expected to hold the worked solution followed by a
    ``####`` marker and the final answer. The text after the last marker
    becomes ``final_answer`` and the text before the first marker becomes
    ``reasoning`` (both stripped of surrounding whitespace).

    The marker is matched without requiring a trailing space, so ``####7`` is
    handled like ``#### 7``. If no marker is present, the whole text is kept
    as ``final_answer`` and ``reasoning`` is left empty rather than duplicating
    the same text into both fields.

    The row is modified in place and returned.
    """
    pieces = example["answer"].split("####")
    example["final_answer"] = pieces[-1].strip()
    # With no marker there is exactly one piece and no separate reasoning.
    example["reasoning"] = pieces[0].strip() if len(pieces) > 1 else ""
    return example

def parse_strategy_qa_example(example):
    """Add the unified ``final_answer`` field to a StrategyQA row.

    ``final_answer`` is the row's ``answer`` value converted with ``str``.
    The row is modified in place and returned.
    """
    label = example["answer"]
    example["final_answer"] = str(label)
    return example

# Apply parsers
# Each parser is applied row by row via .map(); afterwards every *_processed
# dataset has a "final_answer" column. SVAMP and AQuA also get a
# (re)built "question" column; GSM8K also gets "reasoning".
svamp_processed       = svamp_test.map(parse_svamp_example)
aqua_processed        = aqua_test.map(parse_aqua_example)
gsm8k_processed       = gsm8k_test.map(parse_gsm8k_example)
strategy_qa_processed = strategy_qa_test.map(parse_strategy_qa_example)

# Report the number of examples per benchmark as a quick load check.
print(f"SVAMP: {len(svamp_processed)},  AQuA: {len(aqua_processed)},  "
      f"GSM8K: {len(gsm8k_processed)},  StrategyQA: {len(strategy_qa_processed)}")

NameError: name 'load_dataset' is not defined

## 2 — DataLoaders & Benchmark Map

In [None]:
# Number of examples per DataLoader batch. The evaluation loops in this
# notebook still query the model one question at a time within a batch.
BATCH_SIZE = 4

# One DataLoader per processed benchmark; no shuffle argument is passed.
svamp_dataloader       = DataLoader(svamp_processed,       batch_size=BATCH_SIZE)
aqua_dataloader        = DataLoader(aqua_processed,        batch_size=BATCH_SIZE)
gsm8k_dataloader       = DataLoader(gsm8k_processed,       batch_size=BATCH_SIZE)
strategy_qa_dataloader = DataLoader(strategy_qa_processed, batch_size=BATCH_SIZE)

# Central mapping: name -> (dataloader, BenchmarkType)
# The keys double as the "dataset" column values in the result rows.
BENCHMARK_MAP = {
    "svamp":       (svamp_dataloader,       BenchmarkType.SVAMP),
    "aqua":        (aqua_dataloader,        BenchmarkType.AQUA),
    "gsm8k":       (gsm8k_dataloader,       BenchmarkType.GSM8K),
    "strategy_qa": (strategy_qa_dataloader, BenchmarkType.STRATEGY_QA),
}

# Quick sanity check
# Pull the first batch of every benchmark and show the first question
# (truncated to 80 characters) together with its gold answer.
for name, (dl, btype) in BENCHMARK_MAP.items():
    batch = next(iter(dl))
    print(f"{name:>12s}  Q: {batch['question'][0][:80]}…  A: {batch['final_answer'][0]}")

## 3 — Majority-Vote & Self-Consistency Aggregation

The original `aggregate()` in `aggregator.py` does greedy (1-sample) evaluation.
Below we add **`majority_vote`** and **`aggregate_self_consistency`** which sample
*k* CoT paths per question and take a majority vote — the core of Wang et al. 2022.

In [None]:
def majority_vote(answers):
    """Pick the most frequent answer, ignoring ``None`` entries.

    Returns ``None`` when there is no non-``None`` answer at all. When several
    answers share the top count, the one that appeared first in ``answers``
    wins (``Counter.most_common`` keeps first-seen order among equal counts).
    """
    tally = Counter(candidate for candidate in answers if candidate is not None)
    if len(tally) == 0:
        return None
    (winner, _votes), = tally.most_common(1)
    return winner


def aggregate_self_consistency(
    client,
    model_name,
    dataloader,
    benchmark,
    temperature,
    k,
    cot=True,
):
    """
    Evaluate one benchmark with self-consistency decoding.

    For every question yielded by ``dataloader`` the model is asked for ``k``
    samples of the same prompt; an answer is extracted from each sample, the
    majority-vote answer is graded against the gold answer, and the outcome is
    accumulated.

    Args:
        client: Object whose ``generate(model_name=, prompt=, temperature=,
            top_p=, num_samples=)`` returns an iterable of responses.
        model_name: Model identifier forwarded to ``client.generate``.
        dataloader: Iterable of batches exposing ``"question"`` and
            ``"final_answer"`` sequences of equal length.
        benchmark: Benchmark enum member; its ``.value`` names the result and
            it selects prompt building, answer extraction and grading.
        temperature: Sampling temperature forwarded to ``client.generate``.
        k: Number of samples requested per question.
        cot: Forwarded to ``build_prompt`` as its ``cot`` flag.

    Returns:
        A ``BenchmarkResults`` with ``total`` / ``correct`` counts and one
        ``failures`` entry (question, predicted, gold, raw_responses) per
        incorrectly answered question.
    """
    tally = BenchmarkResults(name=benchmark.value)

    for batch in dataloader:
        for question, reference in zip(batch["question"], batch["final_answer"]):
            samples = client.generate(
                model_name=model_name,
                prompt=build_prompt(question, benchmark, cot=cot),
                temperature=temperature,
                top_p=1.0,
                num_samples=k,
            )

            # One vote per sampled reasoning path; unparseable samples vote None.
            votes = []
            for sample in samples:
                votes.append(extract_answer(sample, benchmark))
            winner = majority_vote(votes)

            is_correct = grade_answer(winner, reference, benchmark)
            tally.total += 1
            if not is_correct:
                # Keep the raw samples so wrong answers can be inspected later.
                tally.failures.append({
                    "question": question,
                    "predicted": winner,
                    "gold": reference,
                    "raw_responses": samples,
                })
                continue
            tally.correct += 1

    return tally

print("majority_vote and aggregate_self_consistency defined.")

## 4 — Experiment Configuration

In [None]:
# Single client shared by the greedy and self-consistency runs below.
# NOTE(review): presumably reads provider credentials from the environment
# (the recorded import error mentions OPENAI_API_KEY) — confirm in model_wrapper.
client = ModelClient()

# ── Self-consistency k values from the paper ──────────────────────────
# Number of sampled reasoning paths per question; each k is run separately.
K_VALUES = [1, 5, 10, 20, 40]

# ── Per-model sampling temperatures ───────────────────────────────────
# model name -> temperature used for self-consistency sampling only;
# the greedy baseline always uses temperature 0.
MODEL_CONFIGS = {
    "gpt-3.5-turbo": 0.7,
    "google/ul2":    0.5,
    "gpt-5.2":       0.7,
}

# Output locations: combined results table and the directory for plot images.
RESULTS_CSV = "results.csv"
PLOTS_DIR   = "plots"

# Echo the experiment grid so the run is self-describing in the notebook output.
print(f"Models:     {list(MODEL_CONFIGS.keys())}")
print(f"Datasets:   {list(BENCHMARK_MAP.keys())}")
print(f"k values:   {K_VALUES}")

## 5 — Run Greedy Baselines (`temperature=0, n=1`)

Uses the existing `aggregate()` from `aggregator.py` with a thin wrapper
around `ModelClient.generate`.

In [None]:
rows = []


def _make_greedy_fn(model):
    """Build the ``model_fn(prompts) -> responses`` callable passed to ``aggregate()``.

    The returned function sends each prompt to ``model`` with greedy decoding
    settings (temperature 0, top_p 1, one sample) and returns one response per
    prompt, in order.
    """

    def _greedy_fn(prompts):
        completions = []
        for prompt_text in prompts:
            samples = client.generate(
                model_name=model,
                prompt=prompt_text,
                temperature=0.0,
                top_p=1.0,
                num_samples=1,
            )
            # generate() returns a sequence of samples; greedy asks for one.
            completions.append(samples[0])
        return completions

    return _greedy_fn


for model_name in MODEL_CONFIGS:
    # The model is captured by the factory call, so each callable stays tied
    # to its own model even after the loop variable moves on.
    _greedy_fn = _make_greedy_fn(model_name)

    for ds_name, (dataloader, btype) in BENCHMARK_MAP.items():
        print(f"[greedy] {model_name} / {ds_name} … ", end="", flush=True)
        result = aggregate(_greedy_fn, dataloader, btype, cot=True)
        print(result)
        rows.append(dict(
            model=model_name,
            dataset=ds_name,
            method="greedy",
            k=1,
            accuracy=result.accuracy,
        ))

greedy_df = pd.DataFrame(rows)
greedy_df

## 6 — Run Self-Consistency (`k ∈ {1, 5, 10, 20, 40}`)

For each model × dataset × k, sample *k* CoT paths and take a majority vote.

In [None]:
sc_rows = []

for model_name, temperature in MODEL_CONFIGS.items():
    for ds_name, (dataloader, btype) in BENCHMARK_MAP.items():
        # Everything except k is fixed for a given model/dataset pair.
        evaluate_with_k = partial(
            aggregate_self_consistency,
            client=client,
            model_name=model_name,
            dataloader=dataloader,
            benchmark=btype,
            temperature=temperature,
            cot=True,
        )

        for k in K_VALUES:
            print(f"[self-consistency] {model_name} / {ds_name} / k={k} … ",
                  end="", flush=True)
            # Each k is an independent run: k fresh samples per question.
            result = evaluate_with_k(k=k)
            print(result)
            sc_rows.append(dict(
                model=model_name,
                dataset=ds_name,
                method="self_consistency",
                k=k,
                accuracy=result.accuracy,
            ))

sc_df = pd.DataFrame(sc_rows)
sc_df

## 7 — Save Combined Results

In [None]:
# Stack the greedy and self-consistency rows into one table; ignore_index
# renumbers the rows instead of keeping each frame's own index.
all_df = pd.concat([greedy_df, sc_df], ignore_index=True)
# index=False keeps the row index out of the CSV file.
all_df.to_csv(RESULTS_CSV, index=False)
print(f"Saved {len(all_df)} rows to {RESULTS_CSV}")
# Bare expression: shown as the cell's output in the notebook.
all_df

## 8 — Plot Results

Uses `plot_self_consistency_curves` from `plotting_wrapper.py` to generate
one accuracy-vs-k chart per dataset.

In [None]:
# Generate the accuracy-vs-k charts from the combined results table.
# NOTE(review): assumed to write PNG files into PLOTS_DIR, since the loop
# below looks for *.png there — confirm in plotting_wrapper.
plot_self_consistency_curves(
    df=all_df,
    output_dir=PLOTS_DIR,
    k_values=K_VALUES,
)

# Also display inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from pathlib import Path

# Show every PNG found in PLOTS_DIR, in sorted filename order.
for img_path in sorted(Path(PLOTS_DIR).glob("*.png")):
    img = mpimg.imread(str(img_path))
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.imshow(img)
    # Title each figure with the file name minus its extension.
    ax.set_title(img_path.stem)
    # The image already contains its own axes, so hide matplotlib's.
    ax.axis("off")
    plt.tight_layout()
    plt.show()

## 9 — Summary Table

In [None]:
# Summary table: one row per (model, method), one column per (dataset, k),
# cells hold accuracy (pivot_table's default aggregation applies if a
# combination occurs more than once).
pivot = all_df.pivot_table(
    index=["model", "method"],
    columns=["dataset", "k"],
    values="accuracy",
)
# Greedy rows only exist at k=1, so their cells for every other k are NaN.
# na_rep renders those as a dash instead of the misleading "nan%".
pivot.style.format("{:.1%}", na_rep="—").set_caption("Accuracy by model / method / dataset / k")