In [1]:
from __future__ import annotations

import tqdm
import random
import argparse
import jsonlines
from pathlib import Path

from typing import Dict, List, Any
from banks.registries import DirectoryPromptRegistry

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer


MODELS: Dict[str, str] = {
    'llama3_8B': "/local/common_models/Llama-3.1-8B", 
    'llama3_70B': "/local/common_models/Llama-3.1-70B",
    'gemma3_1B': "/local/common_models/gemma-3-1b-pt",
    'gemma3_27B': "/local/common_models/gemma-3-27b-pt",
    'qwen2.5_7B': "/local/common_models/Qwen2.5-7B",
    'qwen2.5_32B': "/local/common_models/Qwen2.5-32B",
}

DATA_PATHS: Dict[tuple] = {
    # 'datasets/realistic/git_tasks.jsonl': ['revert', 'cherrypick'],
    '../datasets/realistic/stacktrace_tasks.jsonl': ['traceb', 'tracef'],
}

FEW_SHOT_CREATE: Dict[str, List[str]] = {
    'traceb': ["=== NEW TRACEBACK ===", "dummy", "=== ANSWER ===", "<start>", "dummy", "<end>"],
    'tracef': ["=== NEW TRACEBACK ===", "dummy", "=== ANSWER ===", "<start>", "dummy", "<end>"],
    'revert': ["=== NEW HISTORY ===", "dummy", "=== ANSWER ===", "<start>", "dummy", "<end>"],
    'cherrypick': ["=== NEW HISTORY ===", "dummy", "=== ANSWER ===", "<start>", "dummy", "<end>"]
}

  from .autonotebook import tqdm as notebook_tqdm


INFO 05-07 19:02:47 [__init__.py:239] Automatically detected platform cpu.


In [2]:

def load_dataset(data_path: Path, task_key: str) -> List[Dict]:
    """Load JSONL records and organise them per induction setting."""    
    records: List[Dict] = []
    data_path = Path(data_path)
    with jsonlines.open(data_path, "r") as reader:
        records.extend(reader)
    
    assert task_key in ['traceb', 'tracef', 'revert', 'cherrypick']
    dataset = []
    for rec in records:
        dataset.append({
            'input': rec['snippet'], 
            'target': rec[task_key],
        })
    return dataset


def get_prompts_from_registry(prompt_path: str, config: str = 'exact'):
    """
    Load task and system prompts from a prompt registry directory.
    """
    registry = DirectoryPromptRegistry(Path(prompt_path), force_reindex=True)
    task_prompt = registry.get(name=f"task_{config}")
    return task_prompt


In [3]:

class PromptBuilder:
    """Construct few shot prompts according to *PromptVariant*."""

    def __init__(
        self,
        shots: int,
        prompt_dir: Path,
        task_key: str,
        test_data_path: Path,
        tokenizer: AutoTokenizer,
    ) -> None:

        self.shots = shots
        self.task_key = task_key
        self.tokenizer = tokenizer

        self.dataset = load_dataset(test_data_path, task_key=task_key)
        self.task_prompt = get_prompts_from_registry(prompt_dir, task_key)

    def _make_example_line(self, input_str: str, target: str | None) -> str:
        """Return an *atomised* example line (with or without target)."""
        relevant_list = FEW_SHOT_CREATE[self.task_key].copy()
        # Insert the input string into the dummy placeholder
        relevant_list[1] = input_str
        relevant_list[4] = target
        # Each line in the list is joined by a newline character
        return "\n".join(relevant_list)

    def build_prompt(self, curr_record: Dict) -> List[int]:
        """Return a fully instantiated prompt string"""
        few_shot_pool = [rec for rec in self.dataset if rec != curr_record]
        few_shot_examples = random.sample(few_shot_pool, k=self.shots)
        # Add the few‑shot examples
        examples = [self._make_example_line(ex["input"], ex["target"]) for ex in few_shot_examples]
        # Each example is separated by a \n\n
        examples_block = '\n\n'.join(examples)
        # Add the current code snippet to the relevant block : seems correct ! 
        curr_prompt = self.task_prompt.text({'few_shot_block': examples_block, 'snippet': curr_record['input']})
        ids = [self.tokenizer.bos_token_id] + self.tokenizer.encode(curr_prompt, add_special_tokens=False)
        print(ids)
        return ids


In [4]:

class ModelEvaluator:
    """
    Handles vLLM-based inference for batch prompt evaluation.
    """
    def __init__(self, model_path: str, temperature: float, max_tokens: int, seed: int, tensor_parallel_size: int):
        # Load tokenizer and LLM engine
        print(f"Loading tokenizer from {model_path}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=False)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        print(f"Initializing vLLM engine with model {model_path}...")
        self.llm = LLM(model=model_path, tensor_parallel_size=tensor_parallel_size, seed=seed, skip_tokenizer_init=False)
        self.sampling_params = SamplingParams(
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=1.0,
            stop=["\n<end>", "<end>"],   # both variants, safe on all traces
        )


    def run(self, builder, batch_size: int) -> List[Dict[str, Any]]:
        """
        Generate outputs for a list of prompt strings in batches.
        Returns list of (generated_text, usage_stats).
        """
        records = builder.dataset
        results = []
        for i in tqdm.tqdm(range(0, len(records), batch_size)):
            curr_inp_batch = records[i : i + batch_size]
            
            token_batches = [builder.build_prompt(record) for record in curr_inp_batch]
            responses = self.llm.generate(
                prompt_token_ids=token_batches,
                sampling_params=self.sampling_params,
            )
            for resp, curr_input in zip(responses, curr_inp_batch):
                gen_ids = resp.outputs[0].token_ids
                out_text = self.tokenizer.decode(gen_ids).strip()
                results.append({
                    'completion_tokens': len(gen_ids),
                    'input_text': curr_input['input'],
                    'full_answer': out_text,
                    'gold_ans': curr_input['target'],
                    'exact_match': out_text == curr_input['target']
                })
                print(results)
            break
        return results


In [5]:

def save_to_jsonl(out_path: Path, records: list[dict]) -> None:
    """
    Save a list of dictionaries to a JSON Lines file.

    :param out_path: Path to the output file
    :param records: List of dicts to write
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"Saving results to {out_path}")
    with jsonlines.open(out_path, mode='w') as writer:
        writer.write_all(records)


In [6]:

# Initialize evaluator
evalr = ModelEvaluator(
    model_path='Qwen/Qwen2.5-1.5B',
    temperature=0,
    max_tokens=200,
    seed=20,
    tensor_parallel_size=1
)

Loading tokenizer from Qwen/Qwen2.5-1.5B...
Initializing vLLM engine with model Qwen/Qwen2.5-1.5B...
INFO 05-07 19:02:49 [config.py:2673] For macOS with Apple Silicon, currently bfloat16 is not supported. Setting dtype to float16.
INFO 05-07 19:02:54 [config.py:600] This model supports multiple tasks: {'classify', 'reward', 'embed', 'generate', 'score'}. Defaulting to 'generate'.
INFO 05-07 19:02:54 [config.py:1634] Disabled the custom all-reduce kernel because it is not supported on current platform.
INFO 05-07 19:02:54 [llm_engine.py:242] Initializing a V0 LLM engine (v0.8.3) with config: model='Qwen/Qwen2.5-1.5B', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=True, quanti

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.86s/it]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.86s/it]


INFO 05-07 19:02:59 [loader.py:447] Loading weights took 2.87 seconds
INFO 05-07 19:02:59 [executor_base.py:112] # cpu blocks: 9362, # CPU blocks: 0
INFO 05-07 19:02:59 [executor_base.py:117] Maximum concurrency for 131072 tokens per request: 1.14x





INFO 05-07 19:03:00 [llm_engine.py:448] init engine (profile, create kv cache, warmup model) took 0.83 seconds


In [7]:

for data_path, task_info_list in DATA_PATHS.items():
    data_path = Path(data_path)
    for task_type in task_info_list:
        builder = PromptBuilder(
            shots=1,
            prompt_dir=Path('../prompts/realistic/codeassist/'),
            task_key=task_type,
            test_data_path= Path(data_path),
            tokenizer=evalr.tokenizer
        )
        outputs = evalr.run(builder, 1)
        break
        # out_file = Path('') /f"{task_type}" / f"{args.model}_{data_path.stem}.jsonl"
        # save_to_jsonl(out_file, outputs)


  outputs = evalr.run(builder, 1)
  0%|          | 0/1500 [00:00<?, ?it/s]




TypeError: prompt must be a string, array of strings, array of tokens, or array of token arrays