In [None]:
%load_ext autoreload
%autoreload 2

# Generation

> Generate with specified stopping criteria


In [None]:
import argparse
import logging
import os
import time
import torch

from vllm import LLM, SamplingParams

In [None]:
from dart_math.utils import (
    init_logging,
    get_pathname_from_name_or_path,
    PromptTemplate,
)

from dart_math.gen import is_dp_dars_finished, Generator
from dart_math.eval import EvaluatorMathBatch
from dart_math.data import load_query_dps, RespSampleVLLM
from dart_math.exec import CodeExecCfg
from dart_math.utils import PROJ_HOME

In [None]:
# Configure logging through the project helper before any other cell logs.
init_logging()

# NOTE(review): presumably set to silence the Hugging Face tokenizers
# parallelism/fork warning once worker processes are started -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

[INFO] [2024-07-06 22:07:05.22] [pid 3781391] [/ssddata/tongyx/projects/dart-math/dart_math/utils.py:306:init_logging]
log_path = None


In [None]:
# Command-line interface of the generation run.
# `allow_abbrev=False` means long options must be spelled out in full.
parser = argparse.ArgumentParser(description="vLLM generation", allow_abbrev=False)

# Output
parser.add_argument(
    "--gen_save_path",
    type=str,
    default=os.path.join(PROJ_HOME, "data/res/gen.jsonl"),
    help="Path save results of generation (and evaluation).",
)

# Device
parser.add_argument(
    "--gpu_mem_util",
    type=float,
    default=0.85,
    help="GPU memory utilization for vLLM. Default: 0.85 in case of OOM.",
)

parser.add_argument(
    "--swap_space", type=float, default=60, help="CPU swap space in GB for vLLM."
)

# Model
parser.add_argument(
    "--model_name_or_path",
    type=str,
    default="deepseek-ai/deepseek-math-7b-rl",
    help="HF-style model name or path.",
)

parser.add_argument(
    "--dtype",
    type=str,
    default="bfloat16",
    help="Data type for the model.",
)

# Data
parser.add_argument(
    "--datasets",
    type=str,
    nargs="+",
    default=["math-test"],
    help="Dataset(s) to generate on.",
)

# Generation configurations
parser.add_argument(
    "--temperature",
    type=float,
    default=0,
    help="Temperature for sampling.",
)
parser.add_argument(
    "--top_p",
    type=float,
    default=0.95,
    help="Top-p for sampling.",
)
parser.add_argument(
    "--max_new_toks",
    type=int,
    default=2048,
    help="Maximum number of new tokens.",
)
parser.add_argument(
    "--n_shots",
    type=int,
    default=-1,
    help="Number of shots for prompting. -1 means adaptive to datasets.",
)
parser.add_argument(
    "--prompt_template",
    type=str,
    default="cot",
    help="ID / Path to the file of prompt template.",
)
parser.add_argument(
    "--n_paths",
    type=int,
    default=1,
    help="Number of generated completions per request. NOTE: might cause bug in vLLM (0.4.2).",
)
parser.add_argument(
    "--save_gen_path_bs",
    type=int,
    default=2**14,
    help="# Completions = # Paths per request * # Requests. Values <= 0 mean adaptive.",
)
parser.add_argument(
    "--inf_seed",
    type=int,
    default=0,
    help="Random seed for inference. -1 means using us timestamp mod 2^32.",
)

# Stopping criteria
# NOTE(review): `--max_n_trials` and `--min_n_corrects` combine `nargs="+"` with a
# scalar default, so the attribute is an `int` when the flag is omitted and a
# `list[int]` when it is given. The consumer (`load_query_dps`) must accept both
# shapes -- confirm before changing either default.
parser.add_argument(
    "--max_n_trials",
    nargs="+",
    type=int,
    default=1,
    help="(List of) maximum number of trials for each query. Non-positive means no limit.",
)
parser.add_argument(
    "--gen_only",
    action="store_true",
    help="Whether to only generate reponses and not evaluate the generated completions.",
)
parser.add_argument(
    "--min_n_corrects",
    nargs="+",
    type=int,
    default=0,
    help="(List of) minimum number of correct completions per query needed to stop generation. Non-positive means no goal.",
)
parser.add_argument(
    "--strict_extract",
    action="store_true",
    help="Whether to extract answers strictly. If `False`, speculate the answer from the last number if needed.",
)

# Code execution
# The three overrides below default to `None`, i.e. "keep the value from `code_exec_cfg`".
parser.add_argument(
    "--code_exec_cfg",
    type=str,
    default="",
    help="ID / Path to file of the code execution configuration.",
)
parser.add_argument(
    "--max_n_workers",
    type=int,
    default=None,
    help="The maximum number of CPU core workers to execute the code with multi-processing. Default as `None`, meaning using default value of `code_exec_cfg`. ",
)
parser.add_argument(
    "--max_n_calls",
    type=int,
    default=None,
    help="The maximum number of calls to the code execution function.\nThis could be large because there is token length limit already.\nDefault as `None`, meaning using default value of `code_exec_cfg`.  Non-positive values mean no limit.",
)
parser.add_argument(
    "--trunc_len",
    type=int,
    nargs=2,
    default=None,
    help="The maximum lengths to truncate the output into the beginning and end.\nDefault as `None`, meaning using default value of `code_exec_cfg`. Double non-positive values like `(0, 0)` mean no truncation. ",
)

# `parse_known_args` is used instead of `parse_args` so that the arguments a
# Jupyter kernel is launched with do not abort parsing.
args, unk_args = parser.parse_known_args()

# A kernel receives its connection file either as one token (`--f=<path>`) or
# as two tokens (`-f <path>`). Both forms are ignored; anything else is a
# genuine mistake and is rejected.
unexpected_args = []
skip_next = False  # True right after a bare `-f`, whose value is the next token
for arg_str in unk_args:
    if skip_next:
        skip_next = False
        continue
    if arg_str.startswith("--f="):
        continue  # For Jupyter notebook (single-token form)
    if arg_str == "-f":
        skip_next = True  # For Jupyter notebook (two-token form)
        continue
    unexpected_args.append(arg_str)

if unexpected_args:
    raise ValueError(f"Unknown arguments: {unexpected_args}")

In [None]:
# Default the notebook to a single GPU for local testing, but only when the
# caller has not already selected devices: an unconditional assignment would
# clobber a CUDA_VISIBLE_DEVICES provided by the shell or a job scheduler and
# points at a device index that may not exist on other hosts.
# Kept ahead of the `torch.cuda.device_count()` call made when the engine is built.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6")

# Test tool-integrated reasoning.
# NOTE: this unconditionally replaces whatever `--prompt_template` was parsed,
# so every run of this notebook uses the "tool" prompt type.
args.prompt_template = "tool"

In [None]:
# Turn the `-1` sentinel into a concrete seed: the current time in
# microseconds, folded into the 32-bit range.
use_time_seed = args.inf_seed == -1
if use_time_seed:
    timestamp_us = int(time.time() * 10**6)
    args.inf_seed = timestamp_us % 2**32
    logging.warning(f"args.inf_seed=-1 -> Setting {args.inf_seed=}")

# A "tool" prompt template without an explicit code-execution config falls
# back to the "python" config.
wants_tool_prompt = "tool" in args.prompt_template
if wants_tool_prompt and args.code_exec_cfg == "":
    args.code_exec_cfg = "python"
    logging.warning(f"{args.prompt_template=} -> Setting {args.code_exec_cfg=}")

In [None]:
# NOTE(review): presumably a filesystem-safe name derived from the model ID or
# path -- confirm against `get_pathname_from_name_or_path`. `model_dirname` is
# not referenced again in the cells visible here.
model_dirname = get_pathname_from_name_or_path(args.model_name_or_path)

In [None]:
# The two built-in prompt types are resolved together with the model name;
# any other value is treated as a template ID / path.
if args.prompt_template in ("cot", "tool"):
    prompt_template = PromptTemplate.get_prompt_template_from_prompt_type_and_model(
        prompt_type=args.prompt_template,
        model_name_or_path=args.model_name_or_path,
    )
else:
    prompt_template = PromptTemplate.load_from_id_or_path(args.prompt_template)

In [None]:
# Load the queries of all requested datasets, passing along the per-query
# trial limit and correctness goal from the CLI.
query_dps = load_query_dps(
    args.datasets,
    args.max_n_trials,
    args.min_n_corrects,
)

# TODO: response-wise prompt template
# For now every query shares the same `prompt_template`.
for query_dp in query_dps:
    query_dp.prompt_template = prompt_template

logging.info(f"Loaded {len(query_dps)} query data points.")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
[INFO] [2024-06-16 13:15:02.90] [pid 646448] [/tmp/ipykernel_646448/3154498625.py:2:<module>]
Loaded 5000 query data points.


In [None]:
# A (near-)zero temperature is treated as greedy decoding: normalise the
# temperature to exactly 0 and force a single path with no nucleus truncation.
is_greedy = args.temperature <= 1e-5
if is_greedy:
    args.temperature, args.n_paths, args.top_p = 0, 1, 1
    logging.warning(
        f"args.temperature<=1e-5 -> Setting {args.temperature=}, {args.n_paths=}, {args.top_p=} for vLLM."
    )

# Collect the sampling options first so they can be read at a glance.
sampling_kwargs = dict(
    n=args.n_paths,
    temperature=args.temperature,
    top_p=args.top_p,
    max_tokens=args.max_new_toks,
    skip_special_tokens=True,
    seed=args.inf_seed,
)
sampling_params = SamplingParams(**sampling_kwargs)

Temperature is too small. Setting temperautre = 0, n_paths = 1, top_p = 1 for vLLM.


In [None]:
# Stop generating once the model emits either prompt marker of the template
# (surrounding whitespace stripped).
turn_markers = (prompt_template.query_prompt, prompt_template.resp_prompt)
sampling_params.stop = [marker.strip() for marker in turn_markers]
logging.info(f"sampling_params = {sampling_params}")

[INFO] [2024-06-16 13:15:02.123] [pid 646448] [/tmp/ipykernel_646448/1135370137.py:5:<module>]
sampling_params = SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0, top_p=1.0, top_k=-1, min_p=0.0, seed=0, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=['User:', 'Assistant:'], stop_token_ids=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=2048, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None)


In [None]:
# Build the vLLM engine from the CLI settings; tensor parallelism spans every
# GPU that torch reports as visible.
n_visible_gpus = torch.cuda.device_count()
llm_kwargs = dict(
    model=args.model_name_or_path,
    tokenizer=args.model_name_or_path,
    tensor_parallel_size=n_visible_gpus,
    dtype=args.dtype,
    seed=args.inf_seed,
    gpu_memory_utilization=args.gpu_mem_util,
    swap_space=args.swap_space,
    trust_remote_code=True,
)
llm = LLM(**llm_kwargs)
logging.info("LLM loaded!")



INFO 06-16 13:15:02 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='deepseek-ai/deepseek-math-7b-rl', speculative_config=None, tokenizer='deepseek-ai/deepseek-math-7b-rl', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=deepseek-ai/deepseek-math-7b-rl)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-16 13:15:04 weight_utils.py:207] Using model weights format ['*.safetensors']
INFO 06-16 13:15:08 model_runner.py:146] Loading model weights took 12.8725 GB
INFO 06-16 13:15:08 gpu_executor.py:83] # GPU blocks: 7255, # CPU blocks: 8192
INFO 06-16 13:16:06 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-16 13:16:06 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-16 13:16:11 model_runner.py:924] Graph capturing finished in 5 secs.


[INFO] [2024-06-16 13:16:11.868] [pid 646448] [/tmp/ipykernel_646448/4134619846.py:11:<module>]
LLM loaded!


In [None]:
# Load a code-execution config only when one was requested (non-empty ID / path).
if args.code_exec_cfg:
    code_exec_cfg = CodeExecCfg.load_from_id_or_path(args.code_exec_cfg)
else:
    code_exec_cfg = None

if code_exec_cfg:
    # A CLI value of `None` means "keep what the loaded config says".
    for cfg_field in ("max_n_workers", "max_n_calls", "trunc_len"):
        cli_value = getattr(args, cfg_field)
        if cli_value is not None:
            setattr(code_exec_cfg, cfg_field, cli_value)

    print(f"{code_exec_cfg.__dict__=}")

In [None]:
# In generation-only mode no evaluator is attached; otherwise completions are
# evaluated with the math batch evaluator.
if args.gen_only:
    batch_evaluator = None
else:
    batch_evaluator = EvaluatorMathBatch(strict_extract=args.strict_extract)

generator = Generator(
    llm,
    sampling_params,
    resp_sample_cls=RespSampleVLLM,
    batch_evaluator=batch_evaluator,
    code_exec_cfg=code_exec_cfg,
)

# Run generation with `is_dp_dars_finished` as the per-query stopping
# criterion, saving results to `args.gen_save_path`.
gen_kwargs = dict(
    query_dps=query_dps,
    dp_stop_criteria=is_dp_dars_finished,
    save_path=args.gen_save_path,
    n_paths_per_save=args.save_gen_path_bs,
)
generator.gen(**gen_kwargs)

[INFO] [2024-06-16 13:16:12.140] [pid 646448] [/ssddata/tongyx/projects/dart-math/dart_math/gen.py:151:gen_pure]
sampling_params: SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0, top_p=1.0, top_k=-1, min_p=0.0, seed=0, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=['User:', 'Assistant:', '```output'], stop_token_ids=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=2048, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None)
[INFO] [2024-06-16 13:16:12.141] [pid 646448] [/ssddata/tongyx/projects/dart-math/dart_math/gen.py:152:gen_pure]
input_strs[0]: User: How many vertical asymptotes does the graph of $y=\frac{2}{x^2+x-6}$ have?
Please integrate natural language reasoning with programs to solve the problem above, and put your final answer within \boxed{}.

Assistant:
Processed prompts:  37%|███



Processed prompts: 100%|██████████| 5000/5000 [06:17<00:00, 13.26it/s, Generation Speed: 3101.52 toks/s]
[INFO] [2024-06-16 13:22:31.317] [pid 646448] [/ssddata/tongyx/projects/dart-math/dart_math/gen.py:210:gen_pure]
len(remain_ids): 4978
[INFO] [2024-06-16 13:22:31.317] [pid 646448] [/ssddata/tongyx/projects/dart-math/dart_math/gen.py:215:gen_pure]
cells_list: (#4978)[['from sympy import symbols, solveset, S\n\ndef count_vertical_asymptotes():\n    """How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?"""\n    x = symbols(\'x\')\n    denominator = x**2 + x - 6\n    asymptotes = solveset(denominator, x, domain=S.Reals)\n    num_asymptotes = len(asymptotes)\n\n    return num_asymptotes\n\nresult = count_vertical_asymptotes()\nprint(result)'],...]
Executing:  31%|███       | 1530/4978 [04:24<09:55,  5.79it/s]  


In [None]:
# Final marker in the log: reached only after the generation cell has returned.
logging.info("Generation done!")

[INFO] [2024-06-16 11:52:25.783] [pid 587771] [/tmp/ipykernel_587771/3111312346.py:1:<module>]
Generation done!
