# Majority voting

In [None]:
import os
import torch
from vllm import LLM, SamplingParams
from datasets import load_dataset
from collections import defaultdict
import torch.multiprocessing as mp
import random, numpy as np
import time
from utils import *

  from .autonotebook import tqdm as notebook_tqdm
2025-08-03 08:35:52,662	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Setting configurations

In [2]:
# Experiment configuration: which model to load, which dataset split to
# evaluate, and the instruction text placed in front of every problem.
config = dict(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    dataset_name="HuggingFaceH4/MATH-500",
    dataset_split="test",
    # The doubled backslash yields a literal "\boxed{answer}" in the prompt;
    # the voting code later extracts the final answer from that boxed span.
    prompt="""Solve the following math problem efficiently and clearly:

- For simple problems (2 steps or fewer):
Provide a concise solution with minimal explanation.

- For complex problems (3 steps or more):
Use this step-by-step format:

## Step 1: [Concise description]
[Brief explanation and calculations]

## Step 2: [Concise description]
[Brief explanation and calculations]

...

Therefore, the final answer is: $\\boxed{answer}$. I hope it is correct.

Where [answer] is just the final number or expression that solves the problem.""",
)

In [3]:
# Fix seed
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    As a side effect this also switches torch.multiprocessing's tensor
    sharing strategy to 'file_system'.

    Args:
        seed: Value handed unchanged to every seeding function below.
    """
    mp.set_sharing_strategy('file_system')

    # One seed for every RNG source, applied in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)


In [4]:
# Start the vLLM engine for the model named in the config.
# NOTE(review): the engine log printed below reports its own `seed=0`, so the
# set_seed(42) call earlier presumably does not set the engine's sampling
# seed — confirm if run-to-run reproducibility matters.
llm = LLM(model=config["model_path"])

INFO 08-03 07:36:39 config.py:1005] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 08-03 07:36:39 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/L

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:00,  5.43it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:01,  1.83it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  1.46it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.35it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.51it/s]


INFO 08-03 07:36:46 model_runner.py:1071] Loading model weights took 14.9888 GB





INFO 08-03 07:36:46 gpu_executor.py:122] # GPU blocks: 28041, # CPU blocks: 2048
INFO 08-03 07:36:46 gpu_executor.py:126] Maximum concurrency for 131072 tokens per request: 3.42x
INFO 08-03 07:36:48 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-03 07:36:48 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-03 07:36:55 model_runner.py:1530] Graph capturing finished in 8 secs.


# Test trials for majority voting

In [5]:
from typing import List, Dict, Tuple
from collections import defaultdict

# Majority voting with normalization
def find_majority_answer(completions: List[str]) -> Tuple[str, Dict[str, int]]:
    """Pick the most frequent boxed answer among the sampled completions.

    Each completion contributes one vote if a boxed answer can be extracted
    from it; completions without a boxed span, or whose boxed span fails to
    parse, are ignored. Votes are grouped by the `strip_string`-normalized
    form of the answer.

    Args:
        completions: Generated texts for a single problem.

    Returns:
        A pair ``(answer, votes)``. ``answer`` is the un-normalized text of
        the first completion that produced the winning normalized form (ties
        go to the form that was seen first). ``votes`` maps each normalized
        form to its count. Returns ``("", {})`` when nothing could be counted.
    """
    tally = defaultdict(int)
    first_seen = {}

    for text in completions:
        boxed_span = last_boxed_only_string(text)
        if not boxed_span:
            continue  # no boxed answer in this sample
        try:
            raw_answer = remove_boxed(boxed_span)
        except Exception:
            continue  # boxed span could not be parsed; drop the sample

        key = strip_string(raw_answer)
        tally[key] += 1
        # Remember the first raw spelling that mapped to this key.
        first_seen.setdefault(key, raw_answer)

    if not tally:
        return "", {}  # No valid votes

    # max() keeps the first maximal key in insertion order, which matches a
    # linear scan for the first entry holding the top count.
    winner = max(tally, key=tally.get)
    return first_seen[winner], dict(tally)

# Inference wrapper with prompt formatting
def majority_vote(batch: Dict, config: Dict, sampling_params, llm) -> Dict:
    """Sample completions for one problem and majority-vote their answers.

    Args:
        batch: A single dataset record; only its "problem" field is read.
        config: Experiment configuration; only its "prompt" field is read.
        sampling_params: Passed through to ``llm.generate`` unchanged.
        llm: Engine object exposing ``generate(prompt, sampling_params=...)``.

    Returns:
        A dict with "completions" (all generated texts), "pred" (the winning
        answer) and "votes" (the per-answer tally), the latter two exactly as
        returned by ``find_majority_answer``.
    """
    prompt = batch["problem"]
    full_prompt = f"{config['prompt'].strip()}\n\n### Problem:\n{prompt.strip()}"

    # generate() yields request-level results, each holding an `outputs` list
    # of candidates; flatten them into one list of generated texts.
    completions = [
        candidate.text
        for request_output in llm.generate(full_prompt, sampling_params=sampling_params)
        for candidate in request_output.outputs
    ]

    pred, votes = find_majority_answer(completions)
    return dict(completions=completions, pred=pred, votes=votes)

# Prediction vs GT comparison
def evaluate_single(predicted: str, gt_solution: str) -> bool:
    """Check whether a prediction matches the ground truth after normalization.

    The reference answer is taken from the last boxed span of `gt_solution`.
    If that extraction raises, or yields an empty value, the whole
    `gt_solution` string is used as the reference instead.

    Args:
        predicted: Predicted answer text.
        gt_solution: Ground-truth solution text, or a bare answer.

    Returns:
        True when both sides are equal after `strip_string` normalization.
    """
    normalized_pred = strip_string(predicted)

    try:
        boxed_span = last_boxed_only_string(gt_solution)
        reference = remove_boxed(boxed_span) or gt_solution
        normalized_gt = strip_string(reference)
    except Exception:
        # Extraction failed: compare against the full ground-truth string.
        normalized_gt = strip_string(gt_solution)

    return normalized_pred == normalized_gt

In [None]:
from datasets import Dataset, load_dataset

# Load the evaluation split named in the config.
dataset = load_dataset(config['dataset_name'], split=config['dataset_split'])
# Bare expression so the notebook renders the dataset summary.
dataset


Dataset({
    features: ['problem', 'solution', 'answer', 'subject', 'level', 'unique_id'],
    num_rows: 2
})

# Power monitoring & save module

In [None]:
import os
import json
import pandas as pd
from pathlib import Path
from typing import List


# Base directory to store all results (relative to the current working
# directory). Created here if it does not already exist.
base_dir = Path("results")
base_dir.mkdir(exist_ok=True)

# Helper function to save result and CSV placeholder
def save_results_and_csv(problem_id: int, n: int, result: dict, is_correct: bool):
    """Persist one run's voting result and make sure a power-log CSV exists.

    Files are written under ``base_dir / f"problem_{problem_id}_batch_{n}"``:

    * ``result.json`` - the prediction, vote tally, completions and the
      correctness flag. It is overwritten on every call.
    * ``power_log.csv`` - a header-only placeholder with the columns
      ``timestamp`` and ``power_W``. It is created only when missing, so an
      existing log is never overwritten.

    Args:
        problem_id: Index of the problem; used in the folder name.
        n: Number of samples drawn for the problem; used in the folder name.
        result: Mapping that must contain "pred", "votes" and "completions".
        is_correct: Whether the prediction matched the reference answer.

    Raises:
        KeyError: If `result` lacks one of the required keys.
    """
    folder_path = base_dir / f"problem_{problem_id}_batch_{n}"
    # parents=True so the call still works when base_dir itself is missing
    # (e.g. it was deleted, or the working directory changed, after the
    # cell that created it ran).
    folder_path.mkdir(parents=True, exist_ok=True)

    # Save majority_vote result + correctness
    result_to_save = {
        "pred": result["pred"],
        "votes": result["votes"],
        "completions": result["completions"],
        "is_correct": is_correct
    }
    # Explicit encoding keeps the file independent of the platform locale.
    with open(folder_path / "result.json", "w", encoding="utf-8") as f:
        json.dump(result_to_save, f, indent=2)

    # Create placeholder CSV for power monitoring
    csv_path = folder_path / "power_log.csv"
    if not csv_path.exists():
        df = pd.DataFrame(columns=["timestamp", "power_W"])
        df.to_csv(csv_path, index=False)

# Display the absolute location of the base results directory.
base_dir.absolute()

PosixPath('/root/search-and-learn/scripts/results')

# Main experiment

In [None]:
from monitor_power import monitor_npu_power_usage, monitor_gpu_power_usage, calculate_avg_power_usage
from multiprocessing import Process, Queue
from datetime import datetime

# Numbers of samples per problem to sweep over.
n_values = [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 112, 128]

# Never index past the end of the loaded split. 500 was the original
# hard-coded bound (presumably the size of the full MATH-500 test split).
num_problems = min(500, len(dataset))

# Full sweep: every problem at every sample count.
for idx in range(num_problems):
    example = dataset[idx]  # fetched once and reused for every n

    for n in n_values:
        # 1. Create the output folder for this (problem, n) run.
        run_dir = f"results/problem_{idx}_batch_{n}"
        os.makedirs(run_dir, exist_ok=True)
        power_monitor_path = os.path.join(run_dir, "device_status.csv")

        # Built before the monitor starts so nothing can fail between
        # starting the monitor process and entering the try block.
        sampling_params = SamplingParams(
            n=n,
            temperature=0.8,
            top_p=1.0,
            max_tokens=2048,
        )

        # 2. Start the power monitor in a separate process.
        stop_queue = Queue()
        monitor_process = Process(
            target=monitor_gpu_power_usage,  # or monitor_npu_power_usage
            args=(power_monitor_path, stop_queue)
        )
        monitor_process.start()

        # 3. Run inference. Failures are recorded in result.json rather than
        #    aborting the sweep.
        try:
            result = majority_vote(example, config, sampling_params, llm)
            is_correct = is_equiv(result["pred"], example['answer'], verbose=False)
        except Exception as e:
            result = {"error": str(e)}
            is_correct = False
        finally:
            # 4. Always stop the monitor, including on a kernel interrupt,
            #    so no monitoring process is left running.
            stop_queue.put("stop")
            monitor_process.join()

        # 5. Compute the average power usage for this run.
        avg_power = calculate_avg_power_usage(power_monitor_path)

        # 6. Save the results.
        output_path = os.path.join(run_dir, "result.json")
        with open(output_path, "w") as f:
            json.dump({
                "index": idx,
                "batch_size": n,
                "prediction": result.get("pred"),
                "votes": result.get("votes"),
                "is_correct": is_correct,
                "avg_power_usage": avg_power,
                "error": result.get("error", None)
            }, f, indent=2)

        # Report failures explicitly instead of printing a success mark.
        if "error" in result:
            print(f"⚠️ problem_{idx}, batch_{n} failed: {result['error']}")
        else:
            print(f"✅ Finished problem_{idx}, batch_{n}, correct={is_correct}")

NameError: name 'n_values' is not defined