In [None]:
import json
import pandas as pd
import os

# Build OpenAI Batch API input files (JSONL, one chat-completion request per
# line) from the "description" column of vulnerabilities.csv. Requests are
# split across numbered files so that each file holds at most
# `requests_per_batch` requests.

# Read CSV into DataFrame
df = pd.read_csv("vulnerabilities.csv")

# Define system prompt
system_prompt = """You are a security analyst. Given the following raw security report, extract and decompose its contents into a single JSON object with exactly these top-level keys. Response in json raw format without snippet code ('''json '''), example:
{
    "vulnerable_file": [""],
    "vulnerable_function": [""],
    "vulnerability_explanation": "",
    "recommendation": ""
}"""

# Parameters
max_tokens_per_batch = 800000  # Set below the 900,000 token limit to be safe
estimated_tokens_per_request = 1000  # Adjust based on your average request size
# NOTE: batching is by request COUNT using the estimate above; the actual
# token size of each report is not measured here.
requests_per_batch = max_tokens_per_batch // estimated_tokens_per_request

# Create output directory
output_dir = "batch_inputs"
os.makedirs(output_dir, exist_ok=True)

# `batch_count` is the index of the file currently being (or next to be)
# written; `request_count` is the number of requests in the current file.
batch_count = 0
request_count = 0
# Opened lazily on the first request of each batch so that no empty .jsonl
# file is left behind (e.g. when the number of requests is an exact multiple
# of `requests_per_batch`, or when no row has a usable description).
batch_file = None

try:
    for index, row in df.iterrows():
        security_report = row["description"]
        if pd.isna(security_report):
            continue
        # A non-string cell (e.g. a purely numeric description) has no
        # .strip(); normalise to text instead of crashing.
        security_report = str(security_report)
        if not security_report.strip():
            continue

        user_prompt = f"""Decompose the report into:
- Vulnerable file: can be one affected file or multiple affected file (if there is something like [XXX].[function_name] so the [XXX] will be the vulnerable file name [XXX].sol)
- Vulnerable function: can be one affected function or multiple affected functions
- Vulnerability explanation: raw markdown format, extract any information related to vulnerability explanation from the reports. Don't change anything, keep it as is but remove all links and parse it into information that we can understand without link. In this step just focus on separate vulnerability explanation from recommendations and removing link into understandable information.
- Recommendation: extract the Recommended Mitigation Steps header or any recommendation paragraph from the report into recommendation. Focus on taking the recommendation and separate it from vulnerability explanation. Also remove all links and parse it into some information that we can understand without link.

Requirements:  
- Remove all URLs; rewrite any link text into self-contained explanations.  
- Output must be valid JSON.

Here is the report to analyze:
{security_report}
"""

        # The DataFrame index is embedded in custom_id so that the batch
        # output can be matched back to its source row later.
        task = {
            "custom_id": f"request-{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4.1",
                "temperature": 0.1,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
            }
        }

        if batch_file is None:
            batch_file = open(
                os.path.join(output_dir, f"batch_input_{batch_count}.jsonl"),
                "w",
                encoding="utf-8",
            )

        batch_file.write(json.dumps(task) + "\n")
        request_count += 1

        # Current file is full: close it and roll over to the next index.
        if request_count >= requests_per_batch:
            batch_file.close()
            batch_file = None
            batch_count += 1
            request_count = 0
finally:
    # Always release the handle, even if building or writing a request fails.
    if batch_file is not None:
        batch_file.close()

In [None]:
import json
import pandas as pd
import os

# Merge OpenAI Batch API output files back into the original CSV: each output
# line is matched to its source row through the "request-<index>" custom_id,
# and the parsed fields are written to three new columns.

# Read the original CSV
df = pd.read_csv("vulnerabilities.csv")

# Initialize new columns
df["vuln_loc"] = None
df["vuln_explanation"] = None
df["vuln_recommendation"] = None

# Path to your batch output directory
batch_output_dir = "batch_outputs"

# Select only files named batch_output_<number>.jsonl and order them by that
# number. os.listdir returns every directory entry in arbitrary order, so
# counting entries breaks as soon as the directory holds anything else
# (e.g. .ipynb_checkpoints, .DS_Store) or the numbering has a gap.
output_prefix = "batch_output_"
output_suffix = ".jsonl"
batch_output_names = sorted(
    (
        name
        for name in os.listdir(batch_output_dir)
        if name.startswith(output_prefix)
        and name.endswith(output_suffix)
        and name[len(output_prefix):-len(output_suffix)].isdecimal()
    ),
    key=lambda name: int(name[len(output_prefix):-len(output_suffix)]),
)

# Gather all output lines from multiple batch files
output_lines = []
for batch_output_name in batch_output_names:
    file_path = os.path.join(batch_output_dir, batch_output_name)
    with open(file_path, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) rather than reporting
        # them as malformed records.
        output_lines.extend(line for line in f if line.strip())

# Process each output line using custom_id
for line in output_lines:
    # Reset per line so the error message never shows a stale id from a
    # previous record (or hits an unbound name on the very first line).
    custom_id = None
    try:
        json_out = json.loads(line)
        custom_id = json_out.get("custom_id", "")
        json_string = json_out['response']['body']['choices'][0]['message']['content']
        json_data = json.loads(json_string)

        # Extract original index from custom_id like "request-123"
        original_index = int(custom_id.split("-")[-1])
        # Assigning through .loc with an unknown label would silently append
        # a new row; treat it as an error instead.
        if original_index not in df.index:
            raise KeyError(f"index {original_index} not found in DataFrame")

        # Extract every field before touching the DataFrame so that a record
        # with a missing key does not leave a half-filled row.
        vuln_loc = json.dumps({
            "vuln_file": json_data["vulnerable_file"],
            "vuln_function": json_data["vulnerable_function"]
        })
        vuln_explanation = json_data["vulnerability_explanation"]
        vuln_recommendation = json_data["recommendation"]

        df.loc[original_index, "vuln_loc"] = vuln_loc
        df.loc[original_index, "vuln_explanation"] = vuln_explanation
        df.loc[original_index, "vuln_recommendation"] = vuln_recommendation

    # TypeError / IndexError / AttributeError cover records whose structure
    # differs from a successful completion (null "response" or "content",
    # empty "choices", non-object payloads); one bad record must not abort
    # the whole merge.
    except (json.JSONDecodeError, KeyError, ValueError,
            TypeError, IndexError, AttributeError) as e:
        print(f"Error processing custom_id {custom_id}: {e}")
        print(f"Raw response: {line}")

# Optional: save the updated DataFrame
df.to_csv("vuln_gpt.csv", index=False)