In [1]:
from vllm import LLM, SamplingParams
import json, os, random
import pdb
from datasets import load_dataset
import time

import hydra 
from omegaconf import DictConfig, OmegaConf

INFO 09-28 17:29:45 [__init__.py:216] Automatically detected platform cuda.


In [2]:
def mystrip(one_str):
    """Clean up an extracted text segment.

    Removes, in order: surrounding whitespace, any literal backslash-n
    markers (the two characters ``\\`` + ``n``) at either end, and any
    leading/trailing ``#`` characters.

    The literal markers are removed as whole two-character units. Passing
    them to ``str.strip`` would treat the argument as a *set* of characters
    and eat every leading/trailing ``n`` (turning ``"no"`` into ``"o"``).

    Args:
        one_str: The raw text segment.

    Returns:
        The cleaned string.
    """
    literal_newline = "\\n"  # backslash followed by the letter n, not a real newline
    one_str = one_str.strip()
    while one_str.startswith(literal_newline):
        one_str = one_str[len(literal_newline):]
    while one_str.endswith(literal_newline):
        one_str = one_str[:-len(literal_newline)]
    one_str = one_str.strip("#")
    return one_str

def extract_substring2(text, start_str, stop_strs):
    """Return the cleaned text that follows ``start_str`` up to the nearest stop marker.

    Args:
        text: The string to search.
        start_str: Marker whose first occurrence opens the segment.
        stop_strs: Candidate markers; the earliest one found after the
            opening marker closes the segment. If none is found the segment
            runs to the end of ``text``.

    Returns:
        The segment passed through ``mystrip``, or ``None`` when
        ``start_str`` is absent or the segment is empty.
    """
    marker_pos = text.find(start_str)
    if marker_pos == -1:
        return None

    begin = marker_pos + len(start_str)

    # Positions of every stop marker that occurs at or after the segment start.
    stop_positions = [
        pos
        for pos in (text.find(stop, begin) for stop in stop_strs)
        if pos != -1
    ]
    finish = min(stop_positions + [len(text)])

    if begin >= finish:
        return None
    return mystrip(text[begin:finish])

def split_response(response):
    """Parse a model response into its analysis and query-or-answer parts.

    The returned dict always has ``"original"`` (the raw response) and
    ``"analysis"``. ``"analysis"`` is ``None`` when the analysis marker is
    missing, or when neither a query nor an answer marker follows it.
    Otherwise exactly one of ``"query"`` / ``"answer"`` is added, taken from
    whichever marker appears first after the analysis marker.

    Args:
        response: The raw generated text.

    Returns:
        dict with the keys described above.
    """
    analysis_tag = "The problem analysis:"
    query_tag = "The retrieval query:"
    answer_tag = "The final answer:"

    shared_stops = [analysis_tag, query_tag, answer_tag, "The retrieval documents:", "###", "####"]
    query_stops = shared_stops + ["\nStep", "?"]
    answer_stops = shared_stops + ["\nStep"]

    parsed = {"original": response}

    analysis_pos = response.find(analysis_tag)
    if analysis_pos == -1:
        parsed['analysis'] = None
        return parsed
    parsed["analysis"] = extract_substring2(response, analysis_tag, shared_stops)

    # Only look for query/answer markers after the analysis marker.
    search_from = analysis_pos + len(analysis_tag)
    query_pos = response.find(query_tag, search_from)
    answer_pos = response.find(answer_tag, search_from)

    if query_pos == -1 and answer_pos == -1:
        # Without a follow-up section the response is treated as unparsable.
        parsed['analysis'] = None
        return parsed

    # The query wins when it is the only marker present or comes first.
    query_first = answer_pos == -1 or (query_pos != -1 and query_pos < answer_pos)
    if query_first:
        parsed['query'] = extract_substring2(response[query_pos:], query_tag, query_stops)
    else:
        parsed['answer'] = extract_substring2(response[answer_pos:], answer_tag, answer_stops)
    return parsed

def solve(cfg: DictConfig):
    """Run the full pipeline and write the records to ``records.jsonl``.

    Steps: initialise the model and records, run the chain-of-thought pass,
    run the direct-answer pass for any record still lacking an ``'answer'``,
    then write one JSON object per line into the current working directory.

    Args:
        cfg: Configuration object forwarded to the helper stages.
    """
    ckpt, records = solve_init(cfg)
    solve_main(cfg, ckpt, records)

    unanswered = sum(1 for record in records if 'answer' not in record)
    print(f"Remain records: {unanswered}")

    if unanswered > 0:
        solve_directly(cfg, ckpt, records)

    output_file = os.path.join(os.getcwd(), "records.jsonl")
    print(f"Saving records to {output_file}")
    with open(output_file, "w", encoding='utf-8') as sink:
        for record in records:
            sink.write(json.dumps(record, ensure_ascii=False))
            sink.write('\n')

def solve_init(cfg: DictConfig):
    """Create the LLM engine and the initial per-question records.

    Loads the ``validation`` split of ``hotpotqa/hotpot_qa`` (``fullwiki``
    configuration). When ``cfg.debug`` is truthy, ``tensor_parallel_size`` is
    fixed at 1 and the dataset is reduced to a random sample of at most 8
    examples.

    Args:
        cfg: Configuration; reads ``debug``, ``model.path`` and (outside
            debug mode) ``model.tensor_parallel_size``.

    Returns:
        ``(ckpt, records)`` where ``records`` is a list of dicts with keys
        ``question``, ``golden_answers``, ``state`` and ``resample_times``.
    """
    tp_size = 1 if cfg.debug else cfg.model.tensor_parallel_size
    ckpt = LLM(
        model=cfg.model.path,
        tensor_parallel_size=tp_size,
    )
    print("ckpt is ready.")

    dataset = load_dataset('hotpotqa/hotpot_qa', 'fullwiki')['validation']

    if cfg.debug:
        total = len(dataset)
        picked = random.sample(range(total), min(8, total))
        dataset = dataset.select(picked)

    records = [
        {
            'question': example['question'],
            'golden_answers': example['answer'],
            'state': "undo",
            'resample_times': 0,
        }
        for example in dataset
    ]
    return ckpt, records

def generate_naive_generation_cot_prompt(question):
    """Build the chat messages for the step-by-step (analysis + answer) prompt.

    Args:
        question: The question text to embed in the user turn.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        instructions followed by the user message.
    """
    system_message = """You are a helpful assistant that thinks through problems step by step before providing a final answer based on your own knowledge.

For any question, please structure your response in this format:
The problem analysis: [Provide detailed step-by-step reasoning]
The final answer: [Provide the concise final answer]

Example:
User: What is 25 × 36?
Assistant:
The problem analysis: I need to multiply 25 by 36.
I can break this down:
25 × 36 = 25 × (30 + 6)
= 25 × 30 + 25 × 6
= 750 + 150
= 900
The final answer: 25 × 36 = 900

Please think through each question carefully, breaking down complex problems into manageable steps."""
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": "The question: {}".format(question)}
    return [system_turn, user_turn]

def generate_naive_generation_prompt(question):
    """Build the chat messages for the answer-only prompt.

    Args:
        question: The question text to embed in the user turn.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        instruction followed by the user message.
    """
    system_message = """ Answer the question based on your own knowledge. Only give me the answer and do not output any other words."""
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": "The question: {}".format(question)},
    ]

def solve_main(cfg: DictConfig, ckpt, records):
    """Run the step-by-step prompt over every record and store the parsed result.

    Each record gets ``'output'`` (the raw generation). If a non-``None``
    answer was parsed, ``'answer'`` is stored and ``'state'`` becomes
    ``"done"``; otherwise ``'state'`` becomes ``"wrong"``.

    Args:
        cfg: Configuration; reads ``params.temperature`` and ``params.max_tokens``.
        ckpt: Object exposing ``chat(messages, sampling_params)``.
        records: List of record dicts, mutated in place.
    """
    sampling_params = SamplingParams(temperature=cfg.params.temperature, max_tokens=cfg.params.max_tokens)

    prompts = [generate_naive_generation_cot_prompt(rec['question']) for rec in records]
    completions = ckpt.chat(prompts, sampling_params)
    texts = [completion.outputs[0].text for completion in completions]
    parsed_responses = list(map(split_response, texts))

    for idx, parsed in enumerate(parsed_responses):
        rec = records[idx]
        rec['output'] = parsed['original']
        answer = parsed.get('answer')
        if answer is None:
            rec['state'] = "wrong"
        else:
            rec['answer'] = answer
            rec['state'] = "done"

def solve_directly(cfg: DictConfig, ckpt, records):
    """Answer every record that still lacks an ``'answer'`` with the answer-only prompt.

    For each such record the raw generation is stored as ``'answer'``,
    ``'state'`` is set to ``"done"`` and ``'resample_times'`` is incremented.

    Args:
        cfg: Configuration; reads ``params.temperature`` and ``params.max_tokens``.
        ckpt: Object exposing ``chat(messages, sampling_params)``.
        records: List of record dicts, mutated in place.
    """
    remain_idxs = [i for i, record in enumerate(records) if 'answer' not in record]
    if not remain_idxs:
        # Nothing left to answer; skip the generation call entirely.
        return

    sampling_params = SamplingParams(temperature=cfg.params.temperature, max_tokens=cfg.params.max_tokens)
    messages = [generate_naive_generation_prompt(records[remain_idx]['question']) for remain_idx in remain_idxs]

    outputs = ckpt.chat(messages, sampling_params)
    outputs = [output.outputs[0].text for output in outputs]

    for output, remain_idx in zip(outputs, remain_idxs):
        record = records[remain_idx]
        record['answer'] = output
        record['state'] = "done"
        # Read the same key that is written, so the counter accumulates
        # rather than being reset to 1 on every call.
        record['resample_times'] = record.get('resample_times', 0) + 1


In [3]:
# Driver cell: load the configuration, run the pipeline, and report wall-clock time.
# NOTE(review): the config path below is an absolute, machine-specific path; the
# notebook will not run on another machine without editing it.
file_path = '/home/kangjh/Research/ParametricReasoning/RPRAG/benchmark/NaiveGeneration/conf/config.yaml'
cfg = OmegaConf.load(file_path)
# Echo the loaded configuration so the run's settings are recorded in the output.
print(OmegaConf.to_yaml(cfg))
start = time.time()
print(f"Start at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start))}")

# Runs both generation passes and writes records.jsonl to the current working directory.
solve(cfg)

end = time.time()
print(f"End at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end))}")
elapsed_time = end - start
print(f"Elapsed time: {elapsed_time:.2f} seconds")




defaults:
- _self_
debug: false
model:
  path: meta-llama/Meta-Llama-3-8B-Instruct
  tensor_parallel_size: 1
data:
  dev_dataset_path: /mnt/raid5/kangjh/downloads/datasets/hotpotqa/dev/dev.json
params:
  temperature: 0.0
  max_tokens: 512

Start at 2025-09-28 17:29:48
INFO 09-28 17:29:48 [utils.py:328] non-default args: {'disable_log_stats': True, 'model': 'meta-llama/Meta-Llama-3-8B-Instruct'}


INFO 09-28 17:29:57 [__init__.py:742] Resolved architecture: LlamaForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 09-28 17:29:57 [__init__.py:1815] Using max model len 8192
INFO 09-28 17:30:01 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:02 [core.py:654] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:02 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=Fal



[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:07 [gpu_model_runner.py:2370] Loading model from scratch...
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:07 [cuda.py:362] Using Flash Attention backend on V1 engine.
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:07 [weight_utils.py:348] Using model weights format ['*.safetensors']
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:08 [weight_utils.py:369] Ti

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:11 [default_loader.py:268] Loading weights took 2.83 seconds
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:12 [gpu_model_runner.py:2392] Model loading took 14.9596 GiB and 4.429412 seconds
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:17 [backends.py:539] Using cache directory: /home/kangjh/.cache/vllm/torch_compile_cache/ec52f37144/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:17 [backends.py:550] Dynamo bytecode transform time: 5.32 s
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:19 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.827 s
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:21 [monitor.py:34] torch.compile takes 5.32 s in total
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:23 [gpu_worker.py:298] Available KV cache memory: 26.48 GiB
[1;36m(EngineCore_DP0

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:08<00:00,  8.17it/s]


[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:32 [gpu_model_runner.py:3118] Graph capturing finished in 9 secs, took 0.53 GiB
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:32 [gpu_worker.py:391] Free memory on device (47.13/47.43 GiB) on startup. Desired GPU memory utilization is (0.9, 42.69 GiB). Actual usage is 14.96 GiB for weight, 1.24 GiB for peak activation, 0.02 GiB for non-torch memory, and 0.53 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=27701056921` to fit into requested memory, or `--kv-cache-memory=32465419264` to fully utilize gpu memory. Current kv cache memory in use is 28428768665 bytes.
[1;36m(EngineCore_DP0 pid=1323835)[0;0m INFO 09-28 17:30:32 [core.py:218] init engine (profile, create kv cache, warmup model) took 20.74 seconds
INFO 09-28 17:30:34 [llm.py:295] Supported_tasks: ['generate']
INFO 09-28 17:30:34 [__init__.py:36] No IOProcessor plugins requested by the model
ckpt is ready.
INFO 09-28

Adding requests:   0%|          | 0/7405 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/7405 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s…

Remain records: 38


Adding requests:   0%|          | 0/38 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/38 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Saving records to /home/kangjh/Research/ParametricReasoning/RPRAG/records.jsonl
End at 2025-09-28 17:39:15
Elapsed time: 567.51 seconds


In [4]:
# Reload the HotpotQA fullwiki validation split at notebook scope for inspection.
dataset = load_dataset('hotpotqa/hotpot_qa', 'fullwiki')['validation']

In [5]:
# Display the dataset's feature names and row count.
dataset

Dataset({
    features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
    num_rows: 7405
})

In [6]:
# Inspect the 'answer' field of the first example (a plain string).
dataset[0]['answer']

'yes'

In [7]:
# Inspect the 'answer' field of the second example.
dataset[1]['answer']

'Chief of Protocol'