In [None]:
import os
import gc
import json
import torch
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
from vllm import LLM, SamplingParams
import time
import torch.multiprocessing as mp


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Set environment variable to help with memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Track progress
PROGRESS_FILE = "last_processed.json"

  from .autonotebook import tqdm as notebook_tqdm
2025-04-05 15:57:28,730	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Using device: cuda


In [2]:
def load_progress():
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, "r") as f:
            return json.load(f)
    return {}

def save_progress(progress_dict):
    with open(PROGRESS_FILE, "w") as f:
        json.dump(progress_dict, f)

def clean_output(text, keyword):
    index = text.rfind(keyword)
    if index != -1:
        return text[index + len(keyword):].strip()
    return text

In [3]:
class ExplanationGeneratorLama:
    def __init__(self, model_name, max_new_tokens=500):
        self.max_new_tokens = max_new_tokens
        self.sampling_params = SamplingParams(
            temperature=0.7,
            top_p=0.95,
            max_tokens=max_new_tokens
        )
        self.llm = LLM(
            model=model_name,
            dtype="half"
        )

    def build_prompts(self, entry):
        prompt_templates = [
            "Instruction: Provide a concise explanation of what the above code mean. Generate strictly less than 100 words in total. Please give the output just as text only. Do not return anything else. Answer: \n",
            "Instruction: Provide a detailed line-by-line explanation of this code snippet, describing the purpose and functionality of each statement, function, and control structure. Please give the output just as text only. Do not return anything else. Answer: \n",
            "Instruction: Summarize what this code snippet does in simple, non-technical language, focusing on its overall purpose and key operations for someone with little programming experience. Please give the output just as text only. Do not return anything else. Answer: \n",
            "Instruction: Generate an explanation of the code snippet in such a way that it can regenerate the code based on this explanation. Please give the output just as text only. Do not return anything else. Answer: \n",
            "Instruction: Explain how the code snippet  is implemented. Please provide the explanation as text only without any additional content. Answer: \n"
        ]
        prompts = []
        for template in prompt_templates:
            prompt = (
                f"Code snippet: {entry}\n"
                f"{template}"
            )
            prompts.append(prompt)
        return prompts

    def generate_explanations_batch(self, batch):
        all_prompts = []
        print(batch)
        print(type(batch))
        for entry in batch['code']:
            all_prompts.extend(self.build_prompts(entry))
        results = self.llm.generate(all_prompts, self.sampling_params)
        explanations = [res.outputs[0].text for res in results]

        # Group explanations
        grouped = [explanations[i:i+5] for i in range(0, len(explanations), 5)]
        return grouped

def process_and_save(batch, generator, model_key, output_path, progress_dict):
    explanations = generator.generate_explanations_batch(batch)
    new_data = []
    for i, entry in enumerate(batch):
        row = {
            "corpus_id": entry["corpus_id"],
            "query_id": entry["query_id"],
            "doc": entry["doc"],
            "code": entry["code"]
        }
        for j, explanation in enumerate(explanations[i]):
            row[f"explanation_{model_key}_{j+1}"] = explanation
        new_data.append(row)

    df_batch = pd.DataFrame(new_data)
    df_batch.to_csv(output_path, mode='a', index=False, header=not os.path.exists(output_path))

    # Only update progress after successful write
    last_id = batch[-1]["corpus_id"]
    progress_dict[model_key] = last_id
    save_progress(progress_dict)

In [None]:
if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    start = time.time()
    csv_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/data_preprocessing/CodeSearchNet_Python_valid.csv"
    output_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/explanations/CodeSearchNet_Python_valid_vllm_new.csv"
    
    dataset = Dataset.from_pandas(pd.read_csv(csv_path))

    models_dict = {
        "deepseek": "/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa",
        # "granite": "/datasets/ai/ibm-granite/hub/models--ibm-granite--granite-3.0-2b-instruct/snapshots/69e41fe735f54cec1792de2ac4f124b6cc84638f"
    }

    batch_size = 40  # effective batch is 5 * 40 = 200

    progress_dict = load_progress()

    for model_key, model_path in tqdm(models_dict.items()):
        print(f"\nProcessing model {model_key}")
        generator = ExplanationGeneratorLama(model_path)

        # Determine where to resume
        last_id = progress_dict.get(model_key)
        if last_id:
            idx = next((i for i, ex in enumerate(dataset) if ex['corpus_id'] == last_id), -1)
            if idx != -1:
                dataset = dataset.select(range(idx + 1, len(dataset)))

        def process_batch(batch):
            process_and_save(batch, generator, model_key, output_path, progress_dict)
            return batch  # Return unmodified batch for `map`'s sake

        dataset.map(
            process_batch,
            batched=True,
            batch_size=batch_size,
            num_proc=5,
            load_from_cache_file=False
        )

        del generator
        gc.collect()
        torch.cuda.empty_cache()

    end = time.time()
    print(f"All model generations written to {output_path}")
    print(f'Overall time taken = {(end-start)} seconds')
    


  0%|          | 0/1 [00:00<?, ?it/s]


Processing model deepseek
INFO 04-05 15:57:30 __init__.py:207] Automatically detected platform cuda.
INFO 04-05 15:57:38 config.py:549] This model supports multiple tasks: {'embed', 'reward', 'score', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 04-05 15:57:38 config.py:1555] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 04-05 15:57:38 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa', speculative_config=None, tokenizer='/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_para


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.31it/s]



INFO 04-05 15:57:41 model_runner.py:1115] Loading model weights took 3.3414 GB
INFO 04-05 15:57:41 worker.py:267] Memory profiling takes 0.48 seconds
INFO 04-05 15:57:41 worker.py:267] the current vLLM instance can use total_gpu_memory (44.52GiB) x gpu_memory_utilization (0.90) = 40.07GiB
INFO 04-05 15:57:41 worker.py:267] model weights take 3.34GiB; non_torch_memory takes 0.08GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 35.25GiB.
INFO 04-05 15:57:42 executor_base.py:111] # cuda blocks: 82516, # CPU blocks: 9362
INFO 04-05 15:57:42 executor_base.py:116] Maximum concurrency for 131072 tokens per request: 10.07x
INFO 04-05 15:57:44 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:14<00:00,  2.43it/s]

INFO 04-05 15:57:58 model_runner.py:1562] Graph capturing finished in 14 secs, took 0.21 GiB
INFO 04-05 15:57:58 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 17.69 seconds





{'Unnamed: 0': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 'query_id': ['q251820', 'q251821', 'q251822', 'q251823', 'q251824', 'q251825', 'q251826', 'q251827', 'q251828', 'q251829', 'q251830', 'q251831', 'q251832', 'q251833', 'q251834', 'q251835', 'q251837', 'q251838', 'q251839', 'q251840', 'q251841', 'q251842', 'q251843', 'q251844', 'q251845', 'q251846', 'q251847', 'q251848', 'q251849', 'q251850', 'q251851', 'q251852', 'q251853', 'q251854', 'q251855', 'q251856', 'q251857', 'q251858', 'q251859', 'q251860'], 'corpus_id': ['c251820', 'c251821', 'c251822', 'c251823', 'c251824', 'c251825', 'c251826', 'c251827', 'c251828', 'c251829', 'c251830', 'c251831', 'c251832', 'c251833', 'c251834', 'c251835', 'c251836', 'c251837', 'c251838', 'c251839', 'c251840', 'c251841', 'c251842', 'c251843', 'c251844', 'c251845', 'c251846', 'c251847', 'c251848', 'c251849', 'c251850', 'c251851', 'c251852', 'c


Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

{'Unnamed: 0': [2758, 2759, 2760, 2761, 2762, 2763, 2764, 2765, 2766, 2767, 2768, 2769, 2770, 2771, 2772, 2773, 2774, 2775, 2776, 2777, 2778, 2779, 2780, 2781, 2782, 2783, 2784, 2785, 2786, 2787, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797], 'query_id': ['q254604', 'q254605', 'q254606', 'q254607', 'q254608', 'q254609', 'q254610', 'q254611', 'q254612', 'q254613', 'q254614', 'q254615', 'q254616', 'q254617', 'q254618', 'q254619', 'q254620', 'q254621', 'q254622', 'q254623', 'q254624', 'q254625', 'q254626', 'q254627', 'q254628', 'q254629', 'q254630', 'q254631', 'q254632', 'q254633', 'q254634', 'q254635', 'q254636', 'q254637', 'q254638', 'q254639', 'q254640', 'q254641', 'q254642', 'q254643'], 'corpus_id': ['c254578', 'c254579', 'c254580', 'c254581', 'c254582', 'c254583', 'c254584', 'c254585', 'c254586', 'c254587', 'c254588', 'c254589', 'c254590', 'c254591', 'c254592', 'c254593', 'c254594', 'c254595', 'c254596', 'c254597', 'c254598', 'c254599', 'c254600', 'c254601', 'c254602', 

Map (num_proc=5):   0%|          | 0/13788 [00:00<?, ? examples/s]


<class 'datasets.formatting.formatting.LazyBatch'>


Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


<class 'datasets.formatting.formatting.LazyBatch'>


Map (num_proc=5):   0%|          | 0/13788 [00:00<?, ? examples/s][A
Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

{'Unnamed: 0': [8274, 8275, 8276, 8277, 8278, 8279, 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8287, 8288, 8289, 8290, 8291, 8292, 8293, 8294, 8295, 8296, 8297, 8298, 8299, 8300, 8301, 8302, 8303, 8304, 8305, 8306, 8307, 8308, 8309, 8310, 8311, 8312, 8313], 'query_id': ['q260161', 'q260162', 'q260164', 'q260165', 'q260166', 'q260167', 'q260168', 'q260169', 'q260170', 'q260171', 'q260172', 'q260173', 'q260174', 'q260176', 'q260177', 'q260178', 'q260179', 'q260180', 'q260181', 'q260182', 'q260183', 'q260184', 'q260185', 'q260186', 'q260187', 'q260188', 'q260189', 'q260190', 'q260191', 'q260192', 'q260193', 'q260194', 'q260195', 'q260196', 'q260197', 'q260198', 'q260199', 'q260200', 'q260201', 'q260202'], 'corpus_id': ['c260094', 'c260095', 'c260096', 'c260097', 'c260098', 'c260099', 'c260100', 'c260101', 'c260102', 'c260103', 'c260104', 'c260105', 'c260106', 'c260107', 'c260108', 'c260109', 'c260110', 'c260111', 'c260112', 'c260113', 'c260114', 'c260115', 'c260116', 'c260117', 'c260118', 





<class 'datasets.formatting.formatting.LazyBatch'>


Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


<class 'datasets.formatting.formatting.LazyBatch'>


Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Map (num_proc=5):   0%|          | 0/13788 [00:00<?, ? examples/s]
  0%|          | 0/1 [00:31<?, ?it/s]


RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method