# Analyse Commit Classification 

Analysis of Prospector reports with respect to commit classification. Compares results with and without commit classification.

In [2]:
import json
import sys
import os
from omegaconf import OmegaConf

# Debug aid: uncomment to print the current working directory, which is what
# the relative "config.yaml" path below is resolved against.
# print(os.getcwdb())
# Notebook settings. The cells below read the data-file locations
# `ground_truth`, `no_cc`, `ground_truth_2019` and `no_cc_2019` from this
# object and prefix each of them with "../../../".
config = OmegaConf.load("config.yaml")

## Load Data 

Load the JSON data from the files created by `fetch.py`. 

In [3]:
# Baseline: the ground-truth file named by `config.ground_truth`.
with open("../../../" + config.ground_truth, "r") as f1:
    f1_data = json.load(f1)

# Prospector results produced without commit classification (cc).
with open("../../../" + config.no_cc, "r") as f2:
    f2_data = json.load(f2)

# Prospector results with cc are not loaded yet:
# f3_data = json.load(f3)

## Analysis

### How many fixing commits have been correctly obtained by Prospector? 

Find out how many fixing commits have been obtained correctly using Prospector: 

In [4]:
def count_correct_results(ground_truth_data, prospector_data):
    """Count the CVEs for which Prospector lists a true fixing commit first.

    Only CVEs that appear in both inputs and for which Prospector returned at
    least one candidate commit are scored.

    Args:
        ground_truth_data: dict with a "ground_truth" list. Each record has a
            "vulnerability_id" and a "fixes" list; every fix has a "commits"
            list of dicts whose "id" is compared against Prospector's
            "commit_hash" values.
        prospector_data: dict with a "vulnerabilities" mapping from CVE id to
            a record whose "commits" list holds the candidate commits (dicts
            with a "commit_hash"), in the order stored in the report.

    Returns:
        A tuple ``(count_correct, count_correct_first_3)``: the number of CVEs
        whose first candidate is a true fixing commit, and the number of CVEs
        with a true fixing commit among the first three candidates.
    """
    count_correct = 0
    count_correct_first_3 = 0

    for cve_record in ground_truth_data["ground_truth"]:
        cve_id = cve_record["vulnerability_id"]

        prospector_cve_record = prospector_data["vulnerabilities"].get(cve_id)
        if prospector_cve_record is None:
            # CVE was not analysed by Prospector; nothing to score.
            continue

        true_fixing_commits = {
            commit["id"] for fix in cve_record["fixes"] for commit in fix["commits"]
        }

        candidates = prospector_cve_record["commits"]
        if not candidates:
            # No candidates for this CVE. Skip it explicitly: falling through
            # would score it with the previous CVE's candidates (or fail with
            # a NameError if this is the first CVE in the loop).
            continue

        first_3_fixing_commits = [
            commit["commit_hash"] for commit in candidates[:3]
        ]

        if first_3_fixing_commits[0] in true_fixing_commits:
            count_correct += 1

        if not true_fixing_commits.isdisjoint(first_3_fixing_commits):
            count_correct_first_3 += 1

    return count_correct, count_correct_first_3

In [12]:
# Collect the names of all report files that belong to a 2018 CVE.
cves_2018 = [
    file
    for file in os.listdir("../../../data_sources/reports/")
    if file.startswith("CVE-2018-")
]

# Prints the tuple (correct as first candidate, correct within the first three).
print(count_correct_results(f1_data, f2_data))
# print(count_correct_results(f1_data, f3_data))

print(f"out of {len(cves_2018)}.")

(16, 19)
out of 197.


### First 10 candidates

In how many reports is the fixing commit within the first 10 candidates? This is important to know, because it determines how many commits the LLM rules should be applied to. I should compare the percentages of correct fixing commits for different numbers of candidate commits, e.g. 1, 3, 5, and 10. The comparison needs to use Prospector without the cc rule. 

Right now, I have the outcomes of the 58 2019 CVEs, so let's start by comparing with those. 

In [6]:
def count_among_first_x_candidates(
    ground_truth_data, prospector_data, cutoffs=(1, 3, 5, 10)
):
    """Count, per cutoff k, the CVEs with a true fixing commit in the first k candidates.

    Only CVEs that appear in both inputs and for which Prospector returned at
    least one candidate commit are scored.

    Args:
        ground_truth_data: dict with a "ground_truth" list. Each record has a
            "vulnerability_id" and a "fixes" list; every fix has a "commits"
            list of dicts whose "id" is compared against Prospector's
            "commit_hash" values.
        prospector_data: dict with a "vulnerabilities" mapping from CVE id to
            a record whose "commits" list holds the candidate commits (dicts
            with a "commit_hash"), in the order stored in the report.
        cutoffs: positive integers k; for each one the first k candidates are
            checked. Defaults to ``(1, 3, 5, 10)``.

    Returns:
        A tuple with one count per entry of ``cutoffs``, in the same order.
        With the default cutoffs this is ``(count_correct,
        count_correct_first_3, count_correct_first_5,
        count_correct_first_10)``.
    """
    hit_counts = [0] * len(cutoffs)

    for cve_record in ground_truth_data["ground_truth"]:
        cve_id = cve_record["vulnerability_id"]

        prospector_cve_record = prospector_data["vulnerabilities"].get(cve_id)
        if prospector_cve_record is None:
            # CVE was not analysed by Prospector; nothing to score.
            continue

        true_fixing_commits = {
            commit["id"] for fix in cve_record["fixes"] for commit in fix["commits"]
        }

        candidates = prospector_cve_record["commits"]
        if not candidates:
            # No candidates for this CVE. Skip it explicitly: falling through
            # would score it with the previous CVE's candidates (or fail with
            # a NameError if this is the first CVE in the loop).
            continue

        for position, cutoff in enumerate(cutoffs):
            first_k_hashes = [
                commit["commit_hash"] for commit in candidates[:cutoff]
            ]
            if not true_fixing_commits.isdisjoint(first_k_hashes):
                hit_counts[position] += 1

    return tuple(hit_counts)

In [10]:
# Switch to the 2019 data set.
# NOTE: this rebinds f1_data / f2_data, so any cell that uses those names
# afterwards sees the 2019 data, not the files loaded at the top of the notebook.
with open("../../../" + config.ground_truth_2019, "r") as f1, open(
    "../../../" + config.no_cc_2019, "r"
) as f2:
    f1_data = json.load(f1)  # ground truth from Project KB
    f2_data = json.load(f2)  # prospector results (without CC)

# ooN = number of CVEs with a true fixing commit among the first N candidates.
oo1, oo3, oo5, oo10 = count_among_first_x_candidates(f1_data, f2_data)
# NOTE(review): the denominator printed below is the number of entries in the
# Prospector report's "vulnerabilities" mapping (keyed by CVE id), so the
# "commits in total" wording of the message looks misleading -- confirm and
# reword when the outputs are regenerated.
print(
    f"{oo1} times fixing commit among the first candidate commit, \n{oo3} times fixing commit among top 3 candidates, \n{oo5} times fixing commit among top 5 candidates, \n{oo10} times fixing commit among top 10 candidates out of {len(f2_data['vulnerabilities'])} commits in total."
)

16 times fixing commit among the first candidate commit, 
19 times fixing commit among top 3 candidates, 
19 times fixing commit among top 5 candidates, 
21 times fixing commit among top 10 candidates out of 42 commits in total.
