In [1]:
import re
import json
import traceback
from threading import Lock

import torch
from torch import cuda, bfloat16
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
new_model = "/home/hb/dataset_bgp/finetuned_models/LLaMA3-8B-analysis-5k-no_4bit_paged_adam32"

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto'
)

model = PeftModel.from_pretrained(
    base_model,
    new_model
)

# Merge LoRA weights into base model
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Ensure we have a valid pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer.padding_side = "left"

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=712
)
print("[INFO] Model, tokenizer, and pipeline are loaded.")

In [3]:
def query_llm(prompt, pipe_ref):
    """
    Generates text from the LLM using the pipeline.
    """
    generation_kwargs = dict(
        max_new_tokens=712,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.0,
        eos_token_id=pipe_ref.tokenizer.eos_token_id,
        pad_token_id=pipe_ref.tokenizer.pad_token_id,
    )
    result = pipe_ref(prompt, **generation_kwargs)
    return result[0]["generated_text"] if result else None

def extract_code_from_reply(llm_output):
    """
    Extract code from triple-backtick fences.
    """
    code_pattern = r"```(?:\w+)?\s*\n(.*?)```"
    match = re.search(code_pattern, llm_output, re.DOTALL)
    return match.group(1).strip() if match else None

def save_code_to_file(code, filename):
    """
    Saves code content to a file.
    """
    with open(filename, "w") as f:
        f.write(code)

def process_prompts_save_code(file_path, pipe_ref):
    """
    Loads prompts from JSON, queries the LLM, extracts code, saves to .py files.
    """
    with open(file_path, "r") as f:
        instructions = json.load(f)

    total = len(instructions)
    code_blocks_saved = 0

    for idx, instruction in enumerate(instructions, start=1):
        prompt = instruction["instruction"]
        name = instruction.get("task_name", f"Task_{idx}")
        print(f"[INFO] Generating code for '{name}'...")

        llm_output = query_llm(prompt, pipe_ref)
        if not llm_output:
            print("  [WARN] No LLM output.")
            continue

        code_block = extract_code_from_reply(llm_output)
        if code_block:
            code_blocks_saved += 1
            filename = f"gen_code/generated_{idx}.py"
            save_code_to_file(code_block, filename)
            print(f"  [SAVED] {filename}")
        else:
            print("  [WARN] No code block found.")

    print("\n=== Summary of Phase 1 ===")
    print(f"Total instructions: {total}")
    print(f"Code blocks saved : {code_blocks_saved}")
    # Return or store relevant info if needed


In [4]:
test_file_path = "/home/hb/LLM-research/evaluation/BGP/BGP_analysis_test.json"
process_prompts_save_code(test_file_path, pipe)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[INFO] Generating code for 'Basic BGP Update Collection'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_1.py
[INFO] Generating code for 'Filtering by Specific ASN'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [WARN] No code block found.
[INFO] Generating code for 'Extracting Unique Prefix Announcements'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [WARN] No code block found.
[INFO] Generating code for 'Detecting Withdrawn Routes'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_4.py
[INFO] Generating code for 'Monitoring AS Path Changes'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_5.py
[INFO] Generating code for 'Analyzing BGP Prefix Announcements by Multiple ASNs'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_6.py
[INFO] Generating code for 'Detecting Route Flapping Events'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_7.py
[INFO] Generating code for 'Comparing AS Paths Between Different Route Collectors'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [WARN] No code block found.
[INFO] Generating code for 'Identifying the Most Announced Prefixes'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_9.py
[INFO] Generating code for 'Detecting MOAS (Multiple Origin AS) Conflicts'...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_10.py
[INFO] Generating code for 'Identifying Hijacked Prefixes'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [WARN] No code block found.
[INFO] Generating code for 'Analyzing AS Path Prepending Behavior'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_12.py
[INFO] Generating code for 'Detecting Sudden BGP Route Growth Events'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [WARN] No code block found.
[INFO] Generating code for 'Tracking Longest AS Paths in Routing Data'...


Both `max_new_tokens` (=712) and `max_length`(=712) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  [SAVED] generated_14.py
[INFO] Generating code for 'Detecting BGP Route Leaks'...
  [SAVED] generated_15.py

=== Summary of Phase 1 ===
Total instructions: 15
Code blocks saved : 10


### change env to python_39_env

In [1]:
import os
import traceback

def evaluate_python_file(filepath: str):
    """
    Attempts to run the code in 'filepath'.
    Returns a dict with 'status': 'pass'|'fail', plus optional 'error_message'.
    """
    try:
        # Read in the code
        with open(filepath, "r") as f:
            code = f.read()

        # Step 1: Check syntax
        compile(code, filepath, "exec")

        # Step 2: Execute
        safe_globals = {
            "__builtins__": __builtins__,
            "__name__": "__main__",  # Avoid polluting actual environment
        }
        exec(code, safe_globals)

        return {"status": "pass"}

    except SyntaxError as se:
        return {
            "status": "fail",
            "error_message": (f"SyntaxError in {os.path.basename(filepath)}: {se.msg} "
                              f"at line {se.lineno}, col {se.offset}")
        }
    except Exception as ex:
        return {
            "status": "fail",
            "error_message": (f"RuntimeError in {os.path.basename(filepath)}:\n"
                              f"{traceback.format_exc()}")
        }

def evaluate_all_generated_scripts(directory_path: str) -> None:
    """
    PHASE 2:
    Finds all .py files in 'directory_path' and runs them.
    Summarizes pass/fail results for each script's code.
    """
    all_py_files = [f for f in os.listdir(directory_path) if f.endswith(".py")]
    total = len(all_py_files)
    passed = 0
    failed_files = []

    for script_file in all_py_files:
        filepath = os.path.join(directory_path, script_file)
        print(f"\n[INFO] Evaluating: {script_file}")

        result = evaluate_python_file(filepath)
        if result["status"] == "pass":
            print(f"  [PASS] {script_file}")
            passed += 1
        else:
            print(f"  [FAIL] {script_file}: {result['error_message']}")
            failed_files.append(script_file)

    failed = total - passed
    print("\n=== Final Evaluation Summary ===")
    print(f"Total scripts: {total}, Passed: {passed}, Failed: {failed}")

    if failed_files:
        print("Failed scripts:")
        for ff in failed_files:
            print(f" - {ff}")

In [None]:
evaluate_all_generated_scripts("gen_code/")