In [1]:
import json
import pickle
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import os
from collections import defaultdict


In [13]:
import xml.etree.ElementTree as ET

# Load the CWE catalogue (namespace host: cwe.mitre.org) and index its weaknesses.
tree = ET.parse("./dataset/cwe.xml")
root = tree.getroot()

# Prefix map needed because every element in the file lives in this XML namespace
ns = {"cwe": "http://cwe.mitre.org/cwe-7"}

# Weakness "ID" attribute (as found in the XML, without a "CWE-" prefix) -> Weakness element
cwe_dict = {}
for weakness in root.iterfind(".//cwe:Weakness", ns):
    cwe_dict[weakness.get("ID")] = weakness

def find_parents_dict(cwe_id: str, cwe_lookup=None, namespaces=None):
    """Walk up the ``ChildOf`` chain from ``cwe_id`` and group the visited IDs by abstraction.

    The walk starts at ``cwe_id`` itself, so the starting weakness is included in
    the result. It stops at the first Pillar, at an ID that is not in the lookup,
    when a weakness lists no ``ChildOf`` parent, or when an ID repeats (cycle).

    Args:
        cwe_id: CWE ID as stored in the lookup keys, i.e. without the ``CWE-``
            prefix (e.g. ``"79"``).
        cwe_lookup: Optional mapping of ID string -> ``Weakness`` element.
            Defaults to the module-level ``cwe_dict``.
        namespaces: Optional prefix map that defines the ``cwe`` prefix.
            Defaults to the module-level ``ns``.

    Returns:
        dict with the keys ``"Pillar"``, ``"Class"``, ``"Base"`` and ``"Variant"``;
        each value is a list of ``"CWE-<id>"`` strings in visiting order
        (starting weakness first, farthest ancestor last). Weaknesses whose
        abstraction is none of these four are walked through but not recorded.
    """
    lookup = cwe_dict if cwe_lookup is None else cwe_lookup
    nsmap = ns if namespaces is None else namespaces

    result = {"Pillar": [], "Class": [], "Base": [], "Variant": []}

    visited = set()  # guards against cycles / self-references in the ChildOf graph
    current_cwe_id = cwe_id
    while current_cwe_id and current_cwe_id not in visited:
        visited.add(current_cwe_id)

        # Explicit None check: an Element without children is falsy, so
        # `if not weakness` would wrongly treat it as "not found".
        weakness = lookup.get(current_cwe_id)
        if weakness is None:
            break

        abstraction = weakness.get("Abstraction")
        if abstraction in result:
            result[abstraction].append('CWE-' + current_cwe_id)

        # A Pillar is the top of the hierarchy; nothing further to climb.
        if abstraction == "Pillar":
            break

        related_weaknesses = weakness.find("cwe:Related_Weaknesses", nsmap)
        if related_weaknesses is None:
            break

        # When several ChildOf relations are listed, the last one wins
        # (this matches the original selection rule).
        parent_id = None
        for related in related_weaknesses.findall("cwe:Related_Weakness", nsmap):
            if related.get("Nature") == "ChildOf":
                parent_id = related.get("CWE_ID")

        # No ChildOf relation -> parent_id stays None and the loop ends instead
        # of re-visiting the same weakness forever.
        current_cwe_id = parent_id

    return result

In [3]:
# Load the two halves of the test split.
# NOTE(review): pickle.load executes arbitrary code from the file; only use on trusted data.
with open('./dataset/new_test/test_non_vuln.pkl', 'rb') as file:
    non_vuln_df = pickle.load(file)

with open('./dataset/new_test/test_vuln.pkl', 'rb') as file:
    vuln_df = pickle.load(file)

# Sample counts: vulnerable first, then non-vulnerable
print(len(vuln_df), len(non_vuln_df), sep="\n")

450
90


In [44]:
# Folder that receives the plots / tables produced further down.
# NOTE(review): this must be changed together with the pair of result files
# opened at the bottom of this cell, otherwise outputs of one experiment
# overwrite those of another.
output_folder ='new_fs'

if not os.path.exists(output_folder):
    # exist_ok covers the folder being created between the check and this call
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created folder: {output_folder}")

# Result files of every experiment variant (one vuln / non_vuln pair each)
cot_vuln_path = './result/new_cot_result/vuln_code_results.json'
cot_non_vuln_path = './result/new_cot_result/non_vuln_code_results.json'
fs_vuln_path = './result/new_fs_result/vuln_code_results.json'
fs_non_vuln_path = './result/new_fs_result/non_vuln_code_results.json'
FT_cot_vuln_path = './result/FT_cot_result/vuln_code_results.json'
FT_cot_non_vuln_path = './result/FT_cot_result/non_vuln_code_results.json'
FT_fs_vuln_path = './result/FT_fs_result/vuln_code_results.json'
FT_fs_non_vuln_path = './result/FT_fs_result/non_vuln_code_results.json'
FT_non_vuln_path = FT_fs_non_vuln_path  # original (inconsistent) name, kept for compatibility
r_cot_vuln_path = './result/rag_cot_prompt/vuln_code_results.json'
r_cot_non_vuln_path = './result/rag_cot_prompt/non_vuln_code_results.json'
r_fs_vuln_path = './result/rag_fs_prompt/vuln_code_results.json'
r_fs_non_vuln_path = './result/rag_fs_prompt/non_vuln_code_results.json'

# JSON text is UTF-8; be explicit so the read does not depend on the platform default
with open(fs_vuln_path, 'r', encoding='utf-8') as file:
    vuln_result = json.load(file)

with open(fs_non_vuln_path, 'r', encoding='utf-8') as file:
    non_vuln_result = json.load(file)

print(len(vuln_result))
print(len(non_vuln_result))

Created folder: new_fs
1800
360


In [45]:
# Models that are expected to have produced one result per sample
model_list = ['llama3.1:8b', 'codellama:7b', 'phi4:14b', 'deepseek-r1:14b']
# model_list = ['hf.co/Kei5uke/llama3_30_epoch:latest', 'hf.co/Kei5uke/codellama_30_epoch:latest', 'hf.co/Kei5uke/phi4_30_epoch:latest', 'hf.co/Kei5uke/deepseek_30_epoch:latest']

# All result records in one list
results = vuln_result + non_vuln_result

# Same list object as model_list; its order determines the order of the report below
expected_models = model_list

# file_change_id -> expected models for which no record has been seen yet
missing_models = defaultdict(list)
for entry in results:
    # First sighting of an ID starts with the full expectation list
    pending = missing_models.setdefault(entry['file_change_id'], expected_models.copy())
    if entry['model'] in pending:
        pending.remove(entry['model'])

# Report every ID that still has outstanding models and add them up
incomplete = {fid: pending for fid, pending in missing_models.items() if pending}
for fid, pending in incomplete.items():
    print(f"File change ID {fid} is missing models: {', '.join(pending)}")
total_missing = sum(len(pending) for pending in incomplete.values())

print(f"Total number of missing samples: {total_missing}")

Total number of missing samples: 0


# Result Analysis

In [46]:
# Pretty-print one raw record (index 10 of the non-vulnerable results) to show the JSON layout
# that the parsing helpers below rely on ('result' -> 'is_this_vuln', 'result' -> 'cwe' -> 'cwe_id').
print(json.dumps(non_vuln_result[10], indent=4))

{
    "file_change_id": "137485654244269",
    "vuln_type": "non_vuln",
    "result": {
        "is_this_vuln": true,
        "vuln_code_part": "var po = document.createElement('script'); po.type = 'text/javascript'; po.async = true; po.src = $('#HttpMode').val() + '//<?php echo $_SERVER['HTTP_HOST']?><?php echo erLhcoreClassDesign::baseurldirect()?>' + siteAccess + 'faq/embed' + id_theme;",
        "reason": "The code dynamically constructs a URL for loading an external JavaScript file based on user input from the dropdowns. The `$('#HttpMode').val()` and `siteAccess` variables are directly concatenated into the script's source URL without any validation or sanitization. This can lead to a potential open redirect vulnerability if an attacker manipulates these inputs, allowing them to load malicious scripts from arbitrary domains.",
        "cwe": {
            "cwe_id": "CWE-601",
            "name": "URL Redirection to Untrusted Site ('Open Redirect')",
            "description": "Th

In [47]:
# CWE IDs that a prediction's Class-level ancestors are matched against in the labelling loop;
# a prediction whose ancestors hit none of these is bucketed as 'other_cwe'.
chosen_cwes = ['CWE-20', 'CWE-287', 'CWE-400', 'CWE-668', 'CWE-74']

def find_first_overlap(list1, list2):
    """Return the first item of ``list1`` that also occurs in ``list2``, or None if there is none."""
    candidates = set(list2)
    for item in list1:
        if item in candidates:
            return item
    return None

def keys_exists(element, *keys):
    '''
    Check whether the nested lookup element[keys[0]][keys[1]]... succeeds.

    Args:
        element: dict to start from.
        *keys: one or more keys / indices, applied in order.

    Returns:
        True if every lookup in the chain succeeds (the final value may be
        anything, including None); False as soon as a key or index is missing
        or an intermediate value cannot be subscripted.

    Raises:
        AttributeError: if `element` is not a dict or no keys are given.
    '''
    if not isinstance(element, dict):
        raise AttributeError('keys_exists() expects dict as first argument.')
    if len(keys) == 0:
        raise AttributeError('keys_exists() expects at least two arguments, one given.')

    _element = element
    for key in keys:
        try:
            _element = _element[key]
        # Only "this path does not exist" failures mean False; anything else
        # (KeyboardInterrupt, bugs in custom containers, ...) must propagate.
        except (KeyError, IndexError, TypeError):
            return False
    return True

def get_cwe_label(result):
    """Extract the binary verdict and the predicted CWE ID from one result record.

    Args:
        result: dict for one record; the values are read from
            result['result']['is_this_vuln'] and result['result']['cwe']['cwe_id'].

    Returns:
        Tuple (is_this_vuln, cwe_id):
        - is_this_vuln: the stored value as-is, or None if the path is missing.
        - cwe_id: the first 'CWE-<digits>' substring of the stored ID, or None
          if the path is missing, the value is not a string, or it contains
          no such substring.
    """
    is_this_vuln = None
    cwe_id = None

    if keys_exists(result, 'result', 'is_this_vuln'):
        is_this_vuln = result['result']['is_this_vuln']
    if keys_exists(result, 'result', 'cwe', 'cwe_id'):
        raw_cwe_id = result['result']['cwe']['cwe_id']
        # The field can hold null or a number; re.search would raise TypeError
        # on those, so treat them like any other unparseable ID.
        match = re.search(r'CWE-\d+', raw_cwe_id) if isinstance(raw_cwe_id, str) else None
        cwe_id = match.group(0) if match else None  # keep only the CWE-<number> part

    return is_this_vuln, cwe_id

# Add Class to results
# For every result record this cell derives, per model:
#   - binary true / predicted labels,
#   - fine-grained true / predicted labels (the record's vuln_type vs. the predicted CWE ID),
#   - Class-level true / predicted labels obtained by climbing the CWE hierarchy.

# NOTE(review): these four flat lists are not filled anywhere in this cell;
# the per-model lists inside results_by_model are the ones actually used.
pred_labels = []
pred_labels_bin = []
true_labels = []
true_labels_bin = []

results = non_vuln_result + vuln_result
# Dictionary to store results and metrics separated by model
# (all six lists of a model grow in lockstep: one entry per record that was not skipped)
results_by_model = defaultdict(lambda: {
    # 'results': [],
    'pred_labels': [],
    'pred_labels_bin': [],
    'pred_labels_class': [],
    'true_labels': [],
    'true_labels_bin': [],
    'true_labels_class': []
})

for r in results:
    model_name = r.get('model', 'unknown_model')  # Get the model name, default to 'unknown_model' if not present
    # results_by_model[model_name]['results'].append(r)

    is_this_vuln, cwe_id = get_cwe_label(r)
    # Records without a usable 'is_this_vuln' field are dropped entirely,
    # so a model's lists can end up shorter than the number of samples.
    if is_this_vuln is None: 
        print('Contains ERROR:', model_name)
        # print(r)
        # print(is_this_vuln, cwe_id)
        continue
        
    # Append to the respective lists for the model
    results_by_model[model_name]['true_labels_bin'].append(not(r['vuln_type'] == 'non_vuln')) 
    results_by_model[model_name]['pred_labels_bin'].append(is_this_vuln)
    results_by_model[model_name]['true_labels'].append(r['vuln_type'])
    # Identity check: only the boolean True counts as a positive verdict here;
    # any other non-None value falls through to 'non_vuln'.
    if is_this_vuln is True:
        results_by_model[model_name]['pred_labels'].append(cwe_id if cwe_id else 'unknown') 
    else:
        results_by_model[model_name]['pred_labels'].append('non_vuln')

    
    # Parent class
    # `parent` is assigned on both branches because the prediction logic below reads it.
    true_label = results_by_model[model_name]['true_labels'][-1]
    if true_label != 'non_vuln':
        # [-1] picks the last Class-level entry collected while climbing, i.e. the one
        # farthest from the starting CWE; raises IndexError if no Class-level entry exists.
        parent = find_parents_dict(true_label.split("-")[1])['Class'][-1]
        results_by_model[model_name]['true_labels_class'].append(parent)
    else:
        parent = 'non_vuln'
        results_by_model[model_name]['true_labels_class'].append('non_vuln')

    # Map the predicted CWE to a Class-level label:
    #   1. the true Class if it is among the prediction's Class-level ancestors,
    #   2. otherwise the first ancestor that is one of chosen_cwes,
    #   3. otherwise 'other_cwe'.
    # 'non_vuln' and 'unknown' predictions are passed through unchanged.
    pred_label = results_by_model[model_name]['pred_labels'][-1]
    pred_label_class = None
    if pred_label not in {'non_vuln', 'unknown'}:
        pred_parents = find_parents_dict(pred_label.split("-")[1]).get('Class', [])
        print('pred_label', pred_label)
        print('pred_parents', pred_parents)
        print('true parent', parent)
        if parent in pred_parents:
            pred_label_class = parent
            print('CP Parent', pred_label_class)
        elif len(pred_parents) > 0:
            class_tmp = find_first_overlap(pred_parents, chosen_cwes)
            # NOTE(review): this bucket is spelled 'other_cwe' here but 'Other CWE' in
            # pred_labels_order further down; consumers of these labels must reconcile the two.
            pred_label_class = class_tmp if class_tmp else 'other_cwe'
            print('CP Parent', pred_label_class)
        else:
            pred_label_class = 'other_cwe'
            print('Added other')
    else:
        pred_label_class = pred_label
    results_by_model[model_name]['pred_labels_class'].append(pred_label_class)

pred_label CWE-352
pred_parents ['CWE-345']
true parent non_vuln
CP Parent other_cwe
pred_label CWE-434
pred_parents ['CWE-669']
true parent non_vuln
CP Parent other_cwe
pred_label CWE-434
pred_parents ['CWE-669']
true parent non_vuln
CP Parent other_cwe
pred_label CWE-89
pred_parents ['CWE-74']
true parent non_vuln
CP Parent CWE-74
pred_label CWE-601
pred_parents ['CWE-610']
true parent non_vuln
CP Parent other_cwe
pred_label CWE-602
pred_parents ['CWE-602']
true parent non_vuln
CP Parent other_cwe
pred_label CWE-79
pred_parents ['CWE-74']
true parent non_vuln
CP Parent CWE-74
pred_label CWE-434
pred_parents ['CWE-669']
true parent non_vuln
CP Parent other_cwe
pred_label CWE-22
pred_parents ['CWE-668']
true parent non_vuln
CP Parent CWE-668
pred_label CWE-319
pred_parents ['CWE-311']
true parent non_vuln
CP Parent other_cwe
pred_label CWE-125
pred_parents ['CWE-119', 'CWE-20']
true parent non_vuln
CP Parent CWE-20
pred_label CWE-284
pred_parents []
true parent non_vuln
Added other
pre

In [48]:
# Frequency table of the raw predicted labels, one section per model
for model in model_list:
    print(f"Model: {model}")

    # np.unique returns the distinct labels in sorted order together with their counts
    unique_elements, counts = np.unique(results_by_model[model]['pred_labels'], return_counts=True)

    for idx in range(len(unique_elements)):
        print(f"Element: {unique_elements[idx]}, Count: {counts[idx]}")

    # Visual separator between models
    print("-" * 40)

Model: llama3.1:8b
Element: CWE-119, Count: 1
Element: CWE-120, Count: 1
Element: CWE-122, Count: 2
Element: CWE-134, Count: 1
Element: CWE-20, Count: 27
Element: CWE-22, Count: 15
Element: CWE-284, Count: 4
Element: CWE-287, Count: 10
Element: CWE-319, Count: 1
Element: CWE-320, Count: 1
Element: CWE-338, Count: 1
Element: CWE-345, Count: 1
Element: CWE-384, Count: 1
Element: CWE-400, Count: 8
Element: CWE-426, Count: 1
Element: CWE-428, Count: 4
Element: CWE-434, Count: 1
Element: CWE-502, Count: 2
Element: CWE-601, Count: 3
Element: CWE-74, Count: 4
Element: CWE-78, Count: 2
Element: CWE-79, Count: 1
Element: CWE-89, Count: 15
Element: CWE-918, Count: 1
Element: CWE-94, Count: 3
Element: CWE-98, Count: 1
Element: non_vuln, Count: 428
----------------------------------------
Model: codellama:7b
Element: CWE-120, Count: 12
Element: CWE-122, Count: 1
Element: CWE-125, Count: 3
Element: CWE-20, Count: 3
Element: CWE-494, Count: 1
Element: CWE-758, Count: 2
Element: CWE-78, Count: 3
Elem

In [49]:
# Frequency table of the Class-level predicted labels, one section per model
for model in model_list:
    print(f"Model: {model}")

    # np.unique returns the distinct labels in sorted order together with their counts
    unique_elements, counts = np.unique(results_by_model[model]['pred_labels_class'], return_counts=True)

    for idx in range(len(unique_elements)):
        print(f"Element: {unique_elements[idx]}, Count: {counts[idx]}")

    # Visual separator between models
    print("-" * 40)

Model: llama3.1:8b
Element: CWE-20, Count: 32
Element: CWE-287, Count: 10
Element: CWE-400, Count: 8
Element: CWE-668, Count: 19
Element: CWE-74, Count: 22
Element: non_vuln, Count: 428
Element: other_cwe, Count: 21
----------------------------------------
Model: codellama:7b
Element: CWE-20, Count: 19
Element: CWE-74, Count: 5
Element: non_vuln, Count: 486
Element: other_cwe, Count: 5
Element: unknown, Count: 24
----------------------------------------
Model: phi4:14b
Element: CWE-20, Count: 50
Element: CWE-287, Count: 12
Element: CWE-400, Count: 9
Element: CWE-668, Count: 40
Element: CWE-74, Count: 45
Element: non_vuln, Count: 263
Element: other_cwe, Count: 121
----------------------------------------
Model: deepseek-r1:14b
Element: CWE-20, Count: 4
Element: CWE-74, Count: 11
Element: non_vuln, Count: 524
Element: unknown, Count: 1
----------------------------------------


In [50]:
# Define label ordering explicitly (rows = true classes, columns = predicted classes)
true_labels_order = ['non_vuln', 'CWE-20', 'CWE-74', 'CWE-287', 'CWE-400', 'CWE-668']
pred_labels_order = ['non_vuln', 'CWE-20', 'CWE-74', 'CWE-287', 'CWE-400', 'CWE-668', 'Other CWE', 'unknown']

# Combine all labels for indexing (order-preserving union, so the index is deterministic)
all_labels = list(dict.fromkeys(true_labels_order + pred_labels_order))


def _row_normalize(cm):
    """Return `cm` as floats with every row divided by its sum; all-zero rows stay 0 (no NaN)."""
    cm = cm.astype('float')
    row_sums = cm.sum(axis=1, keepdims=True)
    return np.divide(cm, row_sums, out=np.zeros_like(cm), where=row_sums != 0)


for model_name in model_list:
    print('MODEL:', model_name)
    print('Number of samples:')
    print('True Labels (Binary):', len(results_by_model[model_name]['true_labels_bin']))
    print('Pred Labels (Binary):', len(results_by_model[model_name]['pred_labels_bin']))
    print('True Labels (Multi-class):', len(results_by_model[model_name]['true_labels_class']))
    print('Pred Labels (Multi-class):', len(results_by_model[model_name]['pred_labels_class']))
    print()

    # File-name stem shared by both plots (drops any 'hf.co/<user>/' style prefix)
    model_name_tmp = model_name.split('/')[-1]

    # Binary Confusion Matrix
    # Fixed labels keep the matrix 2x2 even if a model only ever predicts one class.
    true_bin = results_by_model[model_name]['true_labels_bin']
    pred_bin = results_by_model[model_name]['pred_labels_bin']
    bin_cm = confusion_matrix(true_bin, pred_bin, labels=[False, True])

    # Normalize binary confusion matrix to row fractions (formatted as % by the heatmap)
    bin_cm_percentage = _row_normalize(bin_cm)

    # Plot Binary Confusion Matrix (Percentage)
    plt.figure(figsize=(6, 4), dpi=150)  # Increase resolution
    sns.heatmap(bin_cm_percentage, annot=True, fmt='.2%', cmap='Blues',
                xticklabels=['Non-Vuln', 'Vuln'], yticklabels=['Non-Vuln', 'Vuln'])
    plt.title(f'Binary Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')

    # Save the binary confusion matrix plot
    binary_output_path = os.path.join(output_folder, f'{model_name_tmp}_binary_confusion_matrix.png')
    plt.savefig(binary_output_path, bbox_inches='tight')
    plt.close()

    # Multi-class Confusion Matrix
    true_multi = results_by_model[model_name]['true_labels_class']
    # Predictions outside the known columns (the labelling loop emits 'other_cwe') are
    # folded into 'Other CWE', the same way compute_metrics aligns them. Without this,
    # confusion_matrix silently drops those samples because they are not in `labels`.
    pred_multi = [label if label in pred_labels_order else 'Other CWE'
                  for label in results_by_model[model_name]['pred_labels_class']]

    # Compute confusion matrix with all labels
    multi_cm = confusion_matrix(true_multi, pred_multi, labels=all_labels)

    # Filter and reorder rows/columns
    row_indices = [all_labels.index(l) for l in true_labels_order]
    col_indices = [all_labels.index(l) for l in pred_labels_order]
    multi_cm_ordered = multi_cm[row_indices, :][:, col_indices]

    # Normalize each row to percentages; rows without samples become 0 instead of NaN
    multi_cm_percentage = _row_normalize(multi_cm_ordered) * 100

    # Create annotation labels: absolute count plus row percentage
    labels = []
    for i in range(len(true_labels_order)):
        row = []
        for j in range(len(pred_labels_order)):
            count = multi_cm_ordered[i, j]
            perc = multi_cm_percentage[i, j]
            row.append(f"{count}\n({perc:.1f}%)" if count > 0 else "0\n(0.0%)")
        labels.append(row)

    # Plot Multi-class Confusion Matrix (cells coloured by count, annotated with count and %)
    plt.figure(figsize=(12, 8), dpi=300)  # Increase resolution
    ax = sns.heatmap(multi_cm_ordered, annot=labels, fmt='', cmap='Blues',
                     xticklabels=pred_labels_order, yticklabels=true_labels_order,
                     linewidths=0.5, linecolor='black')  # Add grid lines

    # Add red outline to the cells where the predicted label equals the true label
    for i in range(len(true_labels_order)):
        for j in range(len(pred_labels_order)):
            if true_labels_order[i] == pred_labels_order[j]:  # Diagonal cells
                # Add a red rectangle around the cell
                ax.add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='red', lw=2))

    plt.title(f'Multi-class Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.gca().invert_yaxis()  # Reverse y-axis to show 'non_vuln' at bottom
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Save the multi-class confusion matrix plot
    multi_output_path = os.path.join(output_folder, f'{model_name_tmp}_multi_class_confusion_matrix.png')
    plt.savefig(multi_output_path, bbox_inches='tight')
    plt.close()

MODEL: llama3.1:8b
Number of samples:
True Labels (Binary): 540
Pred Labels (Binary): 540
True Labels (Multi-class): 540
Pred Labels (Multi-class): 540

MODEL: codellama:7b
Number of samples:
True Labels (Binary): 539
Pred Labels (Binary): 539
True Labels (Multi-class): 539
Pred Labels (Multi-class): 539

MODEL: phi4:14b
Number of samples:
True Labels (Binary): 540
Pred Labels (Binary): 540
True Labels (Multi-class): 540
Pred Labels (Multi-class): 540

MODEL: deepseek-r1:14b
Number of samples:
True Labels (Binary): 540
Pred Labels (Binary): 540
True Labels (Multi-class): 540
Pred Labels (Multi-class): 540



In [51]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

def compute_metrics(true_labels, pred_labels, true_labels_order, pred_labels_order):
    """
    Print and return accuracy plus macro-averaged precision, recall and F1 (Scikit-Learn).

    Labels that are not listed in the corresponding ``*_order`` argument are
    replaced by "Other CWE" before any metric is computed.

    Note: the macro scores returned here are computed without a ``labels``
    argument, whereas the printed per-class report is restricted to
    ``true_labels_order``; the "macro avg" line of the report can therefore
    differ from the returned macro values.

    Args:
        true_labels (list): List of true labels.
        pred_labels (list): List of predicted labels.
        true_labels_order (list): Allowed true labels; also the rows of the printed report.
        pred_labels_order (list): Allowed predicted labels.

    Returns:
        dict: keys "accuracy", "precision", "recall" and "f1".
    """
    def _align(labels, allowed):
        # Anything outside the allowed set is bucketed as "Other CWE"
        return [label if label in allowed else "Other CWE" for label in labels]

    y_true = _align(true_labels, true_labels_order)
    y_pred = _align(pred_labels, pred_labels_order)

    # Overall accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # Macro-averaged scores; undefined per-class values count as 0
    macro_options = {'average': 'macro', 'zero_division': 0}
    precision = precision_score(y_true, y_pred, **macro_options)
    recall = recall_score(y_true, y_pred, **macro_options)
    f1 = f1_score(y_true, y_pred, **macro_options)

    for caption, value in (("Macro Precision", precision),
                           ("Macro Recall", recall),
                           ("Macro F1-score", f1)):
        print(f"{caption}: {value:.4f}")

    # Per-class breakdown, limited to the true-label classes
    print("\nPer-class metrics:")
    print(classification_report(y_true, y_pred, labels=true_labels_order, zero_division=0))

    summary = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    return summary

In [52]:
from pptx import Presentation
from pptx.util import Inches

# Build a one-slide PowerPoint deck that holds a summary table of the per-model metrics
prs = Presentation()

# Layout index 5 of the default template
# (python-pptx's default template names it "Title Only" — TODO confirm)
slide_layout = prs.slide_layouts[5]
slide = prs.slides.add_slide(slide_layout)

# Table position and size on the slide
left, top = Inches(1), Inches(1)
width, height = Inches(8), Inches(4)

# Header row + one row per model; columns: model name and four metrics
rows = len(model_list) + 1
cols = 5
table = slide.shapes.add_table(rows, cols, left, top, width, height).table

# Header row
header_titles = ['Model', 'Accuracy', 'Macro Precision', 'Macro Recall', 'Macro F1-score']
for col_idx, title in enumerate(header_titles):
    table.cell(0, col_idx).text = title

# One table row per model; compute_metrics also prints the detailed report
metric_keys = ['accuracy', 'precision', 'recall', 'f1']
for i, model_name in enumerate(model_list, start=1):
    print(f'--- {model_name} ---')
    metrics = compute_metrics(
        results_by_model[model_name]['true_labels_class'],
        results_by_model[model_name]['pred_labels_class'],
        true_labels_order,
        pred_labels_order
    )

    table.cell(i, 0).text = model_name
    for col_idx, key in enumerate(metric_keys, start=1):
        table.cell(i, col_idx).text = f"{metrics[key]:.4f}"

# Write the deck next to the plots
prs.save(f'{output_folder}/model_metrics_table.pptx')
print("PowerPoint table created successfully!")

--- llama3.1:8b ---
Accuracy: 0.1870
Macro Precision: 0.2319
Macro Recall: 0.1603
Macro F1-score: 0.1118

Per-class metrics:
              precision    recall  f1-score   support

    non_vuln       0.17      0.81      0.28        90
      CWE-20       0.25      0.09      0.13        90
      CWE-74       0.41      0.10      0.16        90
     CWE-287       0.30      0.03      0.06        90
     CWE-400       0.12      0.01      0.02        90
     CWE-668       0.37      0.08      0.13        90

   micro avg       0.19      0.19      0.19       540
   macro avg       0.27      0.19      0.13       540
weighted avg       0.27      0.19      0.13       540

--- codellama:7b ---
Accuracy: 0.1558
Macro Precision: 0.0590
Macro Recall: 0.1167
Macro F1-score: 0.0424

Per-class metrics:
              precision    recall  f1-score   support

    non_vuln       0.17      0.90      0.28        90
      CWE-20       0.11      0.02      0.04        90
      CWE-74       0.20      0.01      0.02