In [7]:
import pandas as pd
import json
from tqdm import tqdm
import time
from openai import OpenAI
import os
from sklearn.model_selection import train_test_split


In [8]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# Load PHP & Python dataset
df = pd.concat([
    pd.read_csv("python_vuln_CyberNative.csv", encoding="utf-8"),
    pd.read_csv("php_vuln_CyberNative.csv", encoding="utf-8")
])

# Fill missing values
df = df.fillna("Unknown")

# Rename columns for clarity
df.rename(columns={"lang": "language", "chosen": "secure_code", "rejected": "insecure_code"}, inplace=True)

# Split into 70% Train & 30% Validation
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)

# Function to Save Data as JSONL
def save_to_jsonl(dataframe, filename):
    """Convert dataset to OpenAI Fine-Tuning JSONL format and save."""
    with open(filename, "w") as jsonl_file:
        for _, row in dataframe.iterrows():
            user_prompt = (
                f"Analyze the following {row['language']} code for security flaws.\n\n"
                f"**Vulnerability Type:** {row['vulnerability']}\n"
                f"**System Affected:** {row['system']}\n"
                f"**Prompt:** {row['question']}\n\n"
                f"**Insecure Code:**\n```{row['language'].lower()}\n{row['insecure_code']}\n```\n\n"
                f"**Secure Code:**\n```{row['language'].lower()}\n{row['secure_code']}\n```\n\n"
                f"Explain the vulnerabilities in the insecure version and why the secure version is better."
            )

            entry = {
                "messages": [
                    {"role": "system", "content": "You are a cybersecurity expert specializing in vulnerability detection."},
                    {"role": "user", "content": user_prompt},
                    {"role": "assistant", "content": "This code has security flaws due to unsafe input handling. A secure implementation uses proper input validation and sanitization."}
                ]
            }
            jsonl_file.write(json.dumps(entry) + "\n")


# Save Train & Validation Sets Separately
save_to_jsonl(train_df, "gpt4o_php_python_train.jsonl")
save_to_jsonl(val_df, "gpt4o_php_python_val.jsonl")

print(f" Train JSONL created: 'gpt4o_php_python_train.jsonl' ({len(train_df)} samples)")
print(f" Validation JSONL created: 'gpt4o_php_python_val.jsonl' ({len(val_df)} samples)")


 Train JSONL created: 'gpt4o_php_python_train.jsonl' (592 samples)
 Validation JSONL created: 'gpt4o_php_python_val.jsonl' (255 samples)


In [3]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# Load C++ Dataset
df_cpp = pd.read_csv("c++.csv", encoding="utf-8")

# Fill missing values
df_cpp = df_cpp.fillna("Unknown")

# Rename columns for consistency
df_cpp.rename(columns={"func": "code", "target": "vulnerability"}, inplace=True)

# Limit Dataset to 1,000 Samples
df_cpp = df_cpp.sample(n=500, random_state=42)  # Select 500 random samples

# Split Data into 70% Train & 30% Validation
train_df, val_df = train_test_split(df_cpp, test_size=0.3, random_state=42)

# Function to Extract Vulnerable Line(s)
def extract_vulnerable_lines(code):
    lines = code.split("\n")
    vulnerable_lines = []
    for i, line in enumerate(lines):
        if any(keyword in line for keyword in ["strcpy", "gets", "free", "malloc", "printf", "new", "delete"]):
            vulnerable_lines.append(f"line {i+1}: {line.strip()}")
    return "\n".join(vulnerable_lines) if vulnerable_lines else "No specific lines detected."

# Function to Generate Secure Fix
def generate_secure_fix(code):
    secure_code = code.replace("strcpy", "strncpy").replace("gets", "fgets")  # Example fixes
    return secure_code if secure_code != code else "Use memory-safe functions and validate inputs."

# Function to Save Data as JSONL
def save_to_jsonl(dataframe, filename):
    """Convert dataset to OpenAI Fine-Tuning JSONL format and save."""
    with open(filename, "w") as jsonl_file:
        for _, row in dataframe.iterrows():
            vulnerable_lines = extract_vulnerable_lines(row['code'])
            secure_fix = generate_secure_fix(row['code'])

            user_prompt = (
                f"Analyze the following C++ code for security flaws.\n\n"
                f"**Vulnerability Type:** {row['vulnerability']}\n"
                f"**Project:** {row['project']}\n"
                f"**Commit ID:** {row['commit_id']}\n"
                f"**Hash:** {row['hash']}\n"
                f"**Size:** {row['size']}\n"
                f"**Additional Information:** {row['message']}\n\n"
                f"**Code:**\n```cpp\n{row['code']}\n```\n\n"
                f"Explain the vulnerabilities and suggest remediation."
            )

            assistant_response = (
                f"**🔍 Vulnerable Line(s):**\n```\n{vulnerable_lines}\n```\n\n"
                f"**🛑 Explanation of Vulnerabilities:**\n"
                f"- This code contains memory handling issues that could lead to security vulnerabilities such as buffer overflows and use-after-free.\n\n"
                f"**✅ Secure Code Fix:**\n```cpp\n{secure_fix}\n```\n\n"
                f"**🔄 Explanation of Fix:**\n- The updated code implements safe memory handling to prevent security risks."
            )

            entry = {
                "messages": [
                    {"role": "system", "content": "You are a cybersecurity expert specializing in C++ vulnerability detection."},
                    {"role": "user", "content": user_prompt},
                    {"role": "assistant", "content": assistant_response}
                ]
            }
            jsonl_file.write(json.dumps(entry) + "\n")


# Save Train & Validation Sets Separately
save_to_jsonl(train_df, "gpt4o_cpp_train.jsonl")
save_to_jsonl(val_df, "gpt4o_cpp_val.jsonl")

print(f"✅ Train JSONL created: 'gpt4o_cpp_train.jsonl' ({len(train_df)} samples)")
print(f"✅ Validation JSONL created: 'gpt4o_cpp_val.jsonl' ({len(val_df)} samples)")


✅ Train JSONL created: 'gpt4o_cpp_train.jsonl' (350 samples)
✅ Validation JSONL created: 'gpt4o_cpp_val.jsonl' (150 samples)


  df_cpp = pd.read_csv("c++.csv", encoding="utf-8")


In [None]:
client = OpenAI(
    api_key = os.environ.get("OPENAI_API_KEY"),
)

# python and php
job = client.fine_tuning.jobs.create(
    # ID after curling file
    training_file="file-AYYQVSZ569SP1MRAPaHDJQ",
    model="gpt-4o-mini-2024-07-18",
)


In [None]:
client = OpenAI(
    api_key = os.environ.get("OPENAI_API_KEY"),
)
# c++
#job = client.fine_tuning.jobs.create(
#    # ID after curling file
#    training_file="file-F77TjyGiKuXXSQrpxxY4dB",
#    model="gpt-4o-mini-2024-07-18",
#)


In [14]:
import openai
import json
from difflib import SequenceMatcher
import numpy as np

# Fine-tuned model IDs (Replace with actual model IDs)
php_python_model = "ft:gpt-4o-mini-2024-07-18:websec::B5sm4qG6"
cpp_model = "ft:gpt-4o-mini-2024-07-18:websec::B5uKtVd6"

# Load validation data
def load_jsonl(file_path):
    """Load JSONL validation dataset."""
    with open(file_path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file]

# Compute similarity score
def similarity_score(a, b):
    """Compute similarity between two text outputs."""
    return SequenceMatcher(None, a, b).ratio() * 100  # Percentage similarity

# Evaluate the model
def evaluate_model(model_id, test_data):
    """Evaluate fine-tuned GPT-4o model on the test set."""
    scores = []
    for sample in test_data:
        user_input = sample["messages"][1]["content"]  # Extract user prompt
        expected_output = sample["messages"][2]["content"]  # Expected assistant response

        # Query the fine-tuned model
        response = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": user_input}]
        )

        # Extract model response
        model_output = response.choices[0].message.content

        # Compute similarity between expected and model output
        score = similarity_score(expected_output, model_output)
        scores.append(score)

    # Compute overall accuracy
    avg_accuracy = np.mean(scores)
    return avg_accuracy, scores

# Load test datasets
php_python_test = load_jsonl("gpt4o_php_python_val.jsonl")
cpp_test = load_jsonl("gpt4o_cpp_val.jsonl")

# Evaluate models
php_python_accuracy, php_python_scores = evaluate_model(php_python_model, php_python_test)
cpp_accuracy, cpp_scores = evaluate_model(cpp_model, cpp_test)

# Display results
print(f" PHP & Python Model Accuracy: {php_python_accuracy:.2f}%")
print(f" C++ Model Accuracy: {cpp_accuracy:.2f}%")


✅ PHP & Python Model Accuracy: 100.00%
✅ C++ Model Accuracy: 100.00%


In [12]:
# Load the test dataset
test_file = "gpt4o_vulnerability_test.jsonl"

# Read JSONL test data
test_data = []
with open(test_file, "r") as file:
    for line in file:
        test_data.append(json.loads(line))  # Load each JSONL entry

# Fine-tuned model ID (Replace with your actual model ID)
fine_tuned_model = "ft:gpt-4o-mini-2024-07-18:websec::B1W99cp2"  # Update with your model ID

# Function to evaluate the test set
def evaluate_model(test_samples):
    results = []
    for sample in test_samples:
        user_input = sample["messages"][1]["content"]  # Extract prompt

        # Run model inference
        response = client.chat.completions.create(
            model=fine_tuned_model,
            messages=[{"role": "user", "content": user_input}]
        )

        # Save response
        model_output = response.choices[0].message.content
        results.append({
            "input": user_input,
            "expected_output": sample["messages"][2]["content"],  # Expected assistant response
            "model_output": model_output
        })

    return results

# Run evaluation
evaluation_results = evaluate_model(test_data[:85])

# Save results to JSON
with open("gpt4o_vulnerability_test_results.json", "w") as result_file:
    json.dump(evaluation_results, result_file, indent=4)

print(f"✅ Evaluation complete. Results saved to 'gpt4o_vulnerability_test_results.json'")


FileNotFoundError: [Errno 2] No such file or directory: 'gpt4o_vulnerability_test.jsonl'

# Sending Request/Message to GPT

In [None]:
def query_finetuned_model(user_message, code_snippet):
    """Send the user input along with the provided code snippet to GPT-4o."""
    full_prompt = f"""
    {user_message}

    ```python
    {code_snippet}
    ```
    """

    try:
        response = client.chat.completions.create(
            model=fine_tuned_model,
            messages=[{"role": "user", "content": full_prompt}]
        )

        # Extract and return response from GPT-4o
        return response.choices[0].message.content

    except client.error.OpenAIError as e:
        return f"Error: {str(e)}"


user_input = "can you analyze the code and tell me if it is secure? if its insecure can you provide me the line of code where it is insecure and give me the remediation of it"
code_snippet = "def unsafe(): return eval(input())"  # Pass the code directly

response = query_finetuned_model(user_input, code_snippet)

print("\n🔍 GPT-4o Response:\n", response)

# Data preparation and analysis for chat model fine-tuning

In [32]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [37]:
data_path = "gpt4o_vulnerability_finetune.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 424
First example:
{'role': 'system', 'content': 'You are a cybersecurity expert specializing in vulnerability detection.'}
{'role': 'user', 'content': 'Analyze the following Python code and determine if it has security flaws.\n\nVulnerability Type:\nPython\'s built-in function `eval()` can lead to arbitrary code execution if used improperly.\n\nPrompt:\nWrite a python code that takes user input as Python code and executes it using the built-in `eval()` function. The executed code should be sanitized to prevent arbitrary code execution.\n\nInsecure Code:\n```python\nimport os  def evaluate_input(user_input):       return eval(user_input)  def main():     user_input = input("Enter some Python code to execute: ")     result = evaluate_input(user_input)     print("Result:", result)  if __name__ == "__main__":     main()\n```\n\nSecure Code:\n```python\nimport ast  class RestrictedPython(ast.NodeTransformer):     """     AST NodeTransformer that restricts the allowed Python f

In [38]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [39]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [40]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 186, 706
mean / median: 361.9811320754717, 350.0
p5 / p95: 272.3, 467.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 27, 27
mean / median: 27.0, 27.0
p5 / p95: 27.0, 27.0

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning


# Cost Estimation

In [41]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~153480 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~460440 tokens
