# Majority voting

In [2]:
import os
import torch
import random
import numpy as np
import json
import pandas as pd
import asyncio
import time

from openai import AsyncOpenAI
from datasets import load_dataset
from collections import defaultdict
from multiprocessing import Process, Queue
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple

from utils import strip_string, last_boxed_only_string, remove_boxed, is_equiv
from monitor_power import monitor_npu_power_usage, monitor_gpu_power_usage, calculate_avg_power_usage


* Setting Configurations 

In [3]:
config = {
    "api_url": "http://localhost:8000/v1/",
    "model": "EMPTY",
    "dataset_name": "HuggingFaceH4/MATH-500",
    "dataset_split": "test",
    "prompt": """Solve the following math problem efficiently and clearly:

- For simple problems (2 steps or fewer):
Provide a concise solution with minimal explanation.

- For complex problems (3 steps or more):
Use this step-by-step format:

## Step 1: [Concise description]
[Brief explanation and calculations]

## Step 2: [Concise description]
[Brief explanation and calculations]

...

Therefore, the final answer is: $\\boxed{answer}$. I hope it is correct.

Where [answer] is just the final number or expression that solves the problem."""
}

In [4]:
# ------------------------- OpenAI API Client -------------------------
def create_client():
    return AsyncOpenAI(base_url=config["api_url"], api_key="EMPTY")

client = create_client()

def system(msg):
    return {"role": "system", "content": msg}

def user(msg):
    return {"role": "user", "content": msg}



In [5]:
# Fix seed 
def set_seed(seed):
    mp.set_sharing_strategy('file_system')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)


NameError: name 'mp' is not defined

In [4]:
# Start llm engine 
llm = LLM(model=config["model_path"])

INFO 08-03 08:37:38 config.py:1005] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 08-03 08:37:38 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/L

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:00,  3.29it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.47it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.23it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.32it/s]


INFO 08-03 08:37:45 model_runner.py:1071] Loading model weights took 14.9888 GB





INFO 08-03 08:37:46 gpu_executor.py:122] # GPU blocks: 28041, # CPU blocks: 2048
INFO 08-03 08:37:46 gpu_executor.py:126] Maximum concurrency for 131072 tokens per request: 3.42x
INFO 08-03 08:37:47 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-03 08:37:47 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-03 08:37:55 model_runner.py:1530] Graph capturing finished in 8 secs.


# Test trials for majority voting

In [6]:
def find_majority_answer(completions: List[str]) -> Tuple[str, Dict[str, int]]:
    canonical_groups = defaultdict(int)
    canonical_to_original = {}

    for output in completions:
        boxed = last_boxed_only_string(output)
        if boxed:
            try:
                answer = remove_boxed(boxed)
            except Exception:
                continue
        else:
            continue

        canonical = strip_string(answer)
        canonical_groups[canonical] += 1
        if canonical not in canonical_to_original:
            canonical_to_original[canonical] = answer

    if not canonical_groups:
        return "", {}

    max_count = max(canonical_groups.values())
    for canonical, count in canonical_groups.items():
        if count == max_count:
            return canonical_to_original[canonical], dict(canonical_groups)

def majority_vote(batch: Dict, completions: List[str]) -> Dict:
    pred, votes = find_majority_answer(completions)
    return {
        "completions": completions,
        "pred": pred,
        "votes": votes,
    }


def save_results(run_dir: Path, result: dict):
    run_dir.mkdir(parents=True, exist_ok=True)
    with open(run_dir / "result.json", "w") as f:
        json.dump(result, f, indent=2)

In [11]:
async def one_request(prompt: str) -> str:
    messages = [
        system("You are a helpful assistant with various knowledge. Answer with precision and clarity."),
        user(prompt)
    ]
    result = ""
    stream = await client.chat.completions.create(
        model=config["model"],
        messages=messages,
        temperature=0.8,
        top_p=1.0,
        max_tokens=1500,
        stream=True,
    )
    async for chunk in stream:
        if chunk.choices[0].delta.content:
            result += chunk.choices[0].delta.content
    return result

async def generate_n_responses(prompt: str, n: int, timestamps: dict) -> List[str]:
    timestamps["api_start"] = time.time()
    tasks = [one_request(prompt) for _ in range(n)]
    responses = await asyncio.gather(*tasks)
    timestamps["batch_done"] = time.time()
    return responses

In [8]:
# set_seed(42)

base_dir = Path("results_single")
base_dir.mkdir(exist_ok=True)

dataset = load_dataset(config['dataset_name'], split=config['dataset_split'])

# n_values = [64, 1, 2, 4, 8, 16, 32, 48]
n = 64
idx = 0

In [9]:
batch = dataset[idx]
full_prompt = f"{config['prompt'].strip()}\n\n### Problem:\n{batch['problem'].strip()}"
print(full_prompt)

Solve the following math problem efficiently and clearly:

- For simple problems (2 steps or fewer):
Provide a concise solution with minimal explanation.

- For complex problems (3 steps or more):
Use this step-by-step format:

## Step 1: [Concise description]
[Brief explanation and calculations]

## Step 2: [Concise description]
[Brief explanation and calculations]

...

Therefore, the final answer is: $\boxed{answer}$. I hope it is correct.

Where [answer] is just the final number or expression that solves the problem.

### Problem:
Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\theta),$ where $r > 0$ and $0 \le \theta < 2 \pi.$


In [12]:
timestamps = {}
timestamps["majority_done"] = time.time()
responses = await generate_n_responses(full_prompt, n, timestamps)

responses

['## Step 1: Convert the point to polar coordinates using the formula $r = \\sqrt{x^2 + y^2}$\n$r = \\sqrt{0^2 + 3^2} = \\sqrt{9} = 3$\n\n## Step 2: Determine the angle $\\theta$ using the formula $\\theta = \\arctan\\left(\\frac{y}{x}\\right)$\nSince $x=0$, we have to consider the quadrant of the point. The point $(0,3)$ is in the positive y-direction, so $\\theta = \\frac{\\pi}{2}$\n\nTherefore, the final answer is: $\\boxed{(3,\\frac{\\pi}{2})}$.',
 '## Step 1: Calculate the radius (r) using the formula $r = \\sqrt{x^2 + y^2}.$\n$r = \\sqrt{0^2 + 3^2} = \\sqrt{0 + 9} = \\sqrt{9} = 3$\n\n## Step 2: Calculate the angle ($\\theta$) using the formula $\\theta = \\arctan(\\frac{y}{x}).$\nSince $x = 0$ and $y = 3$, we get $\\theta = \\arctan(\\frac{3}{0})$. However, this is undefined, so we look at the sign of y to determine the angle. Since y is positive, and the point is in the positive y-axis direction, we have $\\theta = \\frac{\\pi}{2}$.\n\n## Step 3: Determine if we need to add $2\\

In [16]:
from transformers import AutoTokenizer

# 모델에 맞는 tokenizer 로드
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

# 예: 첫 번째 응답
text = responses[2]

# 토큰화 및 길이 확인
tokenized = tokenizer(text, return_tensors="pt")
num_tokens = tokenized.input_ids.shape[1]

print(f"응답 토큰 길이: {num_tokens}")

응답 토큰 길이: 258


In [14]:
text = responses[1]


In [26]:
from typing import Optional, Union

import aiohttp
from tqdm import tqdm
from dataclasses import dataclass, field
import traceback
import sys 

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=60)


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
    

@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    itl: list[float] = field(
        default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""
    prompt: str = ""
    initiated_timestamp: datetime = datetime.now()
    completed_timestamp: datetime = datetime.now()

async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("chat/completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "messages": [
                {
                    "role": "user",
                    "content": content
                },
            ],
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                generated_text += content or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output



In [19]:
import asyncio
import aiohttp
import nest_asyncio
nest_asyncio.apply()  # Jupyter 환경에서 필수

from typing import List

# Config 설정
api_url = "http://localhost:8000/v1/chat/completions"
model_name = "meta-llama/Llama-3.1-8B-Instruct"
context_len = 1122

# 개별 요청 함수
async def send_prompt(session: aiohttp.ClientSession, prompt: str, idx: int, context_len: int) -> str:
    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.8,
        "top_p": 1.0,
        "min_tokens": context_len-1,
        "max_tokens": context_len
    }
    try:
        async with session.post(api_url, json=payload) as resp:
            data = await resp.json()
            return data["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"[{idx}] request failed: {e}")
        return ""

# 병렬 요청 함수
async def generate_n_responses(prompt: str, n: int) -> List[str]:
    async with aiohttp.ClientSession() as session:
        tasks = [send_prompt(session, prompt, i, context_len) for i in range(n)]
        return await asyncio.gather(*tasks)



In [20]:
# 사용 예시
prompt = "What is the capital of France?"

# 실행
responses = await generate_n_responses(prompt, n=4)
for i, r in enumerate(responses):
    print(f"[{i}] {r}")

[0] The capital of France is Paris. Paris is also the most populous city in France and the country's cultural center, known for famous landmarks such as the Eiffel Tower, Notre Dame Cathedral, and the Louvre Museum. It has been the capital of France since 987.  With a population of over 2.1 million people within the city limits, and over 12.2 million in the metropolitan area, Paris is one of the most populous cities in the European Union. Paris also has a significant impact on the global economy, fashion, art, and culture. It is often referred to as the 'City of Light' due to its historic role in the Enlightenment and for its beautiful street lighting.  It is also a major hub for international business, finance, fashion, politics, education, and entertainment. Paris is also home to many world-renowned companies and organizations, such as the Euronext stock exchange and the French central bank. In summary, Paris is a hub for various activities such as culture, business, education, and e

In [23]:
from transformers import AutoTokenizer

# LLaMA 3.1 tokenizer 불러오기
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

text = responses[1]

# 토큰 수 계산
token_count = len(tokenizer.encode(text))
print(f"Token count: {token_count}")

Token count: 1123


In [None]:
# 응답 예시


Token count: 1501


In [18]:
responses[0]

"The capital of France is Paris. It is the country's largest city and is known for its famous landmarks such as the Eiffel Tower, Notre Dame Cathedral, and the Louvre Museum. It is also a major cultural and economic hub, and is home to many of France's government institutions, universities, and cultural attractions. Paris has been the capital of France since 987. It is one of the most visited cities in the world and is known for its romantic atmosphere, fashion, art, and cuisine. It has also been the hub of many historical events and revolutions, and is home to many significant museums, galleries, and libraries. \n\nIf you are looking for more information, I would be happy to help. Let me know what you would like to know. \n\nIs there something specific you would like to know? Would you like to know more about Paris in general, or perhaps one of the cities attractions? Let me know if you would like more information. \n\nI can also help you with more questions if you would like. Do you 

In [5]:
from typing import List, Dict, Tuple
from collections import defaultdict

# Majority voting with normalization
def find_majority_answer(completions: List[str]) -> Tuple[str, Dict[str, int]]:
    canonical_groups = defaultdict(int)
    canonical_to_original = {}

    for output in completions:
        # Step 1: Try extracting from boxed expression
        boxed = last_boxed_only_string(output)
        if boxed:
            try:
                answer = remove_boxed(boxed)
            except Exception:
                continue  # skip this sample if boxed parsing fails
        else:
            continue  # skip if no boxed answer found

        # Step 2: Normalize and vote
        canonical = strip_string(answer)
        canonical_groups[canonical] += 1

        if canonical not in canonical_to_original:
            canonical_to_original[canonical] = answer

    # Step 3: Determine majority
    if not canonical_groups:
        return "", {}  # No valid votes

    max_count = max(canonical_groups.values())
    for canonical, count in canonical_groups.items():
        if count == max_count:
            return canonical_to_original[canonical], dict(canonical_groups)

# Inference wrapper with prompt formatting
def majority_vote(batch: Dict, config: Dict, sampling_params, llm) -> Dict:
    prompt = batch["problem"]
    full_prompt = f"{config['prompt'].strip()}\n\n### Problem:\n{prompt.strip()}"

    # generate가 List[RequestOutput] 반환
    request_outputs = llm.generate(full_prompt, sampling_params=sampling_params)

    # 각 RequestOutput 객체에서 outputs 리스트를 펼쳐서 CompletionOutput.text 수집
    completions = []
    for request_output in request_outputs:
        completions.extend([output.text for output in request_output.outputs])

    pred, votes = find_majority_answer(completions)

    return {
        "completions": completions,
        "pred": pred,
        "votes": votes,
    }

# Prediction vs GT comparison
def evaluate_single(predicted: str, gt_solution: str) -> bool:
    pred_norm = strip_string(predicted)
    try:
        gt_extracted = remove_boxed(last_boxed_only_string(gt_solution)) or gt_solution
        gt_norm = strip_string(gt_extracted)
    except Exception:
        gt_norm = strip_string(gt_solution)
    return pred_norm == gt_norm

In [6]:
from datasets import Dataset, load_dataset

dataset = load_dataset(config['dataset_name'], split=config['dataset_split'])
dataset


Dataset({
    features: ['problem', 'solution', 'answer', 'subject', 'level', 'unique_id'],
    num_rows: 500
})

# Power monitoring & save module

In [7]:
import os
import json
import pandas as pd
from pathlib import Path
from typing import List


# Base directory to store all results
base_dir = Path("results")
base_dir.mkdir(exist_ok=True)

# Helper function to save result and CSV placeholder
def save_results_and_csv(problem_id: int, n: int, result: dict, is_correct: bool):
    # Define directory
    folder_name = f"problem_{problem_id}_batch_{n}"
    folder_path = base_dir / folder_name
    folder_path.mkdir(exist_ok=True)

    # Save majority_vote result + correctness
    result_to_save = {
        "pred": result["pred"],
        "votes": result["votes"],
        "completions": result["completions"],
        "is_correct": is_correct
    }
    with open(folder_path / "result.json", "w") as f:
        json.dump(result_to_save, f, indent=2)

    # Create placeholder CSV for power monitoring
    csv_path = folder_path / "power_log.csv"
    if not csv_path.exists():
        df = pd.DataFrame(columns=["timestamp", "power_W"])
        df.to_csv(csv_path, index=False)

# Show a sample path
base_dir.absolute()

PosixPath('/root/search-and-learn/scripts/results')

# Main experiment

In [None]:
from monitor_power import monitor_npu_power_usage, monitor_gpu_power_usage, calculate_avg_power_usage
from multiprocessing import Process, Queue
from datetime import datetime

n_values = [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 112, 128]

# 전체 루프
for n in n_values:
    for idx in range(500):
    
        # 1. 폴더 생성
        run_dir = f"results/batch_{n}/problem_{idx}"
        os.makedirs(run_dir, exist_ok=True)
        power_monitor_path = os.path.join(run_dir, "device_status.csv")

        # 2. Power monitor start
        stop_queue = Queue()
        monitor_process = Process(
            target=monitor_gpu_power_usage,  # or monitor_npu_power_usage
            args=(power_monitor_path, stop_queue)
        )
        monitor_process.start()

        # 3. Run inference
        sampling_params = SamplingParams(
            n=n,
            temperature=0.8,
            top_p=1.0,
            max_tokens=2048,
        )

        try:
            result = majority_vote(dataset[idx], config, sampling_params, llm)
            is_correct = is_equiv(result["pred"], dataset[idx]['answer'], verbose=False)
        except Exception as e:
            result = {"error": str(e)}
            is_correct = False

        # 4. Stop power monitor
        stop_queue.put("stop")
        monitor_process.join()

        # 5. 평균 전력 사용량 계산
        avg_power = calculate_avg_power_usage(power_monitor_path)

        # 6. 결과 저장
        output_path = os.path.join(run_dir, "result.json")
        with open(output_path, "w") as f:
            json.dump({
                "index": idx,
                "batch_size": n,
                "prediction": result.get("pred"),
                "votes": result.get("votes"),
                "is_correct": is_correct,
                "avg_power_usage": avg_power,
                "error": result.get("error", None)
            }, f, indent=2)

        print(f"✅ Finished problem_{idx}, batch_{n}, correct={is_correct}")

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:15<00:00, 15.48s/it, est. speed input: 10.73 toks/s, output: 132.35 toks/s]

✅ Finished problem_0, batch_1, correct=True



Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts: 100%|██████████| 1/1 [00:15<00:00, 15.27s/it, est. speed input: 10.87 toks/s, output: 268.24 toks/s]


✅ Finished problem_0, batch_2, correct=False


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts: 100%|██████████| 1/1 [00:15<00:00, 15.47s/it, est. speed input: 10.73 toks/s, output: 198.28 toks/s]

✅ Finished problem_0, batch_4, correct=True



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts: 100%|██████████| 1/1 [00:16<00:00, 16.12s/it, est. speed input: 10.30 toks/s, output: 586.84 toks/s]


✅ Finished problem_0, batch_8, correct=True


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts: 100%|██████████| 1/1 [00:16<00:00, 16.98s/it, est. speed input: 9.78 toks/s, output: 1311.48 toks/s]


✅ Finished problem_0, batch_16, correct=True


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts: 100%|██████████| 1/1 [00:19<00:00, 19.80s/it, est. speed input: 8.38 toks/s, output: 2438.42 toks/s]


✅ Finished problem_0, batch_32, correct=True


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts: 100%|██████████| 1/1 [00:20<00:00, 20.29s/it, est. speed input: 8.18 toks/s, output: 3042.27 toks/s]


✅ Finished problem_0, batch_48, correct=True


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts: 100%|██████████| 1/1 [00:24<00:00, 24.34s/it, est. speed input: 6.82 toks/s, output: 3930.99 toks/s]


✅ Finished problem_0, batch_64, correct=True


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts: 100%|██████████| 1/1 [00:25<00:00, 25.56s/it, est. speed input: 6.49 toks/s, output: 4660.41 toks/s]


✅ Finished problem_0, batch_80, correct=True


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Monitoring stopped.


KeyboardInterrupt: 