# Dataset Statistics

In [3]:
import os
import json
from datetime import datetime
import pandas as pd
from statistics import mean
import re

## New Bugs From Recent Issues
- Range of issue creation and close dates
- Number of bugs per input category (within each project)
- Number of bugs per project
- Project type

In [4]:
# Root directory of the bug dataset. The statistics code below expects one
# sub-directory per project, each holding numerically named bug folders that
# contain an `issue.json` file.
# NOTE(review): the directory is spelled "Defect4..." (no "s") while the constant
# is named "Defects4AT" — confirm the path matches the actual checkout.
Defects4AT_PATH = "../Defect4AutonomicTesting/bugs"
# Names of the project sub-directories of Defects4AT_PATH to scan.
Defects4AT_PROJECTS = [
    "spring-boot",
    "shardingsphere",
    "dolphinscheduler",
    "micrometer",
]

In [7]:
def summarize_stats_of_recent_bugs(bugs_path=None, projects=None):
    """Print per-project bug statistics and the overall issue date ranges.

    For every project, each numerically named sub-directory is treated as one
    bug whose metadata is read from its ``issue.json``. The function prints the
    distribution of the ``category`` field and the number of bugs per project,
    followed by the sorted creation/close timestamps and their ranges across
    all scanned projects.

    Args:
        bugs_path: Directory containing one sub-directory per project.
            Defaults to ``Defects4AT_PATH``.
        projects: Iterable of project directory names to scan.
            Defaults to ``Defects4AT_PROJECTS``.

    Raises:
        FileNotFoundError: If a project directory or an ``issue.json`` is missing.
        KeyError: If an issue lacks the ``created_at`` or ``category`` field.
    """
    if bugs_path is None:
        bugs_path = Defects4AT_PATH
    if projects is None:
        projects = Defects4AT_PROJECTS

    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    create_dates = []
    closed_dates = []
    for project in projects:
        print(project)
        input_categories = []
        project_path = os.path.join(bugs_path, project)
        num_bugs = 0
        for bug_id in os.listdir(project_path):
            bug_path = os.path.join(project_path, bug_id)
            # Only numerically named directories are bug folders.
            if not bug_id.isnumeric() or not os.path.isdir(bug_path):
                continue
            with open(os.path.join(bug_path, "issue.json"), encoding="utf-8") as f:
                bug_metadata = json.load(f)
            create_dates.append(
                datetime.strptime(bug_metadata["created_at"], timestamp_format)
            )
            # An issue that is still open has no close timestamp (null/absent);
            # skip it instead of crashing in strptime.
            closed_at = bug_metadata.get("closed_at")
            if closed_at:
                closed_dates.append(datetime.strptime(closed_at, timestamp_format))
            input_categories.append(bug_metadata["category"])
            num_bugs += 1
        print("Input Category")
        print(pd.Series(input_categories).value_counts())
        print(f"{num_bugs} bugs in {project}")
        print()

    create_dates.sort()
    closed_dates.sort()
    print(create_dates)
    # The lists are sorted, so the range is simply the first and last element.
    if create_dates:
        print(f"Create date range: from {create_dates[0]} to {create_dates[-1]}")
    else:
        print("Create date range: no bugs found")
    print(closed_dates)
    if closed_dates:
        print(f"Closed date range: from {closed_dates[0]} to {closed_dates[-1]}")
    else:
        print("Closed date range: no closed bugs found")

In [8]:
# Print the per-project breakdown and the overall created/closed date ranges.
summarize_stats_of_recent_bugs()

spring-boot
Input Category
new object input             8
new environment condition    7
new string input             5
Name: count, dtype: int64
20 bugs in spring-boot

shardingsphere
Input Category
new object input    7
new string input    4
Name: count, dtype: int64
11 bugs in shardingsphere

dolphinscheduler
Input Category
new string input             6
new environment condition    2
new object input             2
new number input             1
Name: count, dtype: int64
11 bugs in dolphinscheduler

micrometer
Input Category
new environment condition    3
new object input             3
new string input             1
new number input             1
Name: count, dtype: int64
8 bugs in micrometer

[datetime.datetime(2020, 1, 13, 8, 17, 51), datetime.datetime(2020, 3, 2, 8, 10, 39), datetime.datetime(2020, 5, 23, 15, 57), datetime.datetime(2020, 6, 8, 11, 1, 8), datetime.datetime(2020, 7, 16, 8, 37, 1), datetime.datetime(2020, 8, 24, 2, 55, 12), datetime.datetime(2021, 8, 6, 17, 45, 24),

## Prompts

In [19]:
# Root directory of the experiment outputs; the per-model directories below live inside it.
EXPERIMENT_RESULTS_PATH = "../AutonomicTester/experiment_results"
# JSON file that lists the prompt file paths making up the validation set.
VALIDATION_SET_PATH = "../FineTuneDataset/validation_paths_v4_q3diff5p.json"
# Per-model result directories; experiment folders inside them may hold a `statistics.csv`.
GPT_RESULTS_PATH = os.path.join(EXPERIMENT_RESULTS_PATH, "GPT3.5Turbo")
LLAMA3_RESULTS_PATH = os.path.join(EXPERIMENT_RESULTS_PATH, "Llama3 70B")

In [41]:
def read_validation_set(validation_set_path=None):
    """Load the validation prompt list and parse it into a DataFrame.

    The JSON file holds a list of prompt file paths. Each base name is expected
    to look like ``prompt_<scenario>_<bug_id>_<project>_v<N>.txt`` where
    ``<scenario>`` is ``buggy``, ``fixed`` or ``similar``.

    Args:
        validation_set_path: Path of the JSON file to read.
            Defaults to ``VALIDATION_SET_PATH``.

    Returns:
        A DataFrame with the columns ``scenario``, ``bug_id`` (int) and
        ``project``, sorted by project, bug id and scenario. An empty list
        yields an empty frame with the same columns.

    Raises:
        ValueError: If a file name does not follow the expected pattern.
    """
    if validation_set_path is None:
        validation_set_path = VALIDATION_SET_PATH
    with open(validation_set_path, encoding="utf-8") as f:
        prompt_names = [os.path.basename(path) for path in json.load(f)]

    # The dot before "txt" is escaped so that it only matches a literal ".".
    pattern = re.compile(
        r"prompt_(buggy|fixed|similar)_(\d+)_([-A-Za-z]+)_v\d+\.txt"
    )
    validation_list = []
    for name in prompt_names:
        match = pattern.search(name)
        if match is None:
            # Fail with the offending name instead of an opaque AttributeError.
            raise ValueError(f"Unexpected validation prompt file name: {name!r}")
        scenario, bug_id, project = match.groups()
        validation_list.append(
            {"scenario": scenario, "bug_id": int(bug_id), "project": project}
        )
    # Passing the columns explicitly keeps the frame sortable when the list is empty.
    df_validation = pd.DataFrame(
        validation_list, columns=["scenario", "bug_id", "project"]
    ).sort_values(by=["project", "bug_id", "scenario"])
    return df_validation

In [42]:
def read_llm_prompt_set(llm_results_path):
    """Collect the per-prompt statistics of every experiment of one LLM.

    Each entry of ``llm_results_path`` that contains a ``statistics.csv`` is
    read, and a ``scenario`` column is added whose value is the last
    ``_``-separated token of the entry name. Entries without a
    ``statistics.csv`` are skipped.

    Args:
        llm_results_path: Directory holding one folder per experiment.

    Returns:
        The concatenated statistics of all experiments, without the
        ``elapsed_nanoseconds`` column. The original per-file row index is kept.

    Raises:
        ValueError: If no ``statistics.csv`` is found under ``llm_results_path``.
        KeyError: If the statistics lack an ``elapsed_nanoseconds`` column.
    """
    llm_stats = []
    # os.listdir returns entries in arbitrary order; sort them so the row order
    # of the result is reproducible across runs and machines.
    for exp in sorted(os.listdir(llm_results_path)):
        stats_path = os.path.join(llm_results_path, exp, "statistics.csv")
        if not os.path.exists(stats_path):
            continue
        scenario = exp.split("_")[-1]
        llm_stats.append(pd.read_csv(stats_path).assign(scenario=scenario))
    if not llm_stats:
        # pd.concat([]) would fail with the opaque "No objects to concatenate".
        raise ValueError(f"No statistics.csv found under {llm_results_path!r}")
    df_llm_stats = pd.concat(llm_stats).drop(columns="elapsed_nanoseconds")
    return df_llm_stats

In [44]:
# Load the three prompt tables: the validation split and each model's statistics.
df_validation_prompt_set = read_validation_set()
df_llama3_prompt_set = read_llm_prompt_set(LLAMA3_RESULTS_PATH)
df_gpt3_prompt_set = read_llm_prompt_set(GPT_RESULTS_PATH)

# Inner join: keep only the Llama 3 rows whose (project, bug, scenario) key
# also appears in the validation set.
prompt_keys = ["project_id", "bug_id", "scenario"]
validation_by_key = df_validation_prompt_set.set_index(
    ["project", "bug_id", "scenario"]
)
df_prompt_set = df_llama3_prompt_set.join(
    validation_by_key, on=prompt_keys, how="inner"
)

# Inner join with the GPT statistics on the same key; columns present on both
# sides are disambiguated with the "_llama3" / "_gpt" suffixes.
gpt_by_key = df_gpt3_prompt_set.set_index(prompt_keys)
df_prompt_stats = df_prompt_set.join(
    gpt_by_key, on=prompt_keys, how="inner", lsuffix="_llama3", rsuffix="_gpt"
)

In [46]:
# Sanity check: True when the GPT and Llama 3 results report identical
# character counts for every joined prompt.
df_prompt_stats["#characters_gpt"].equals(df_prompt_stats["#characters_llama3"])

True

In [45]:
# Display the joined per-prompt statistics (Llama 3 and GPT columns side by side).
df_prompt_stats

Unnamed: 0,project_id,bug_id,#characters_llama3,#tokens_llama3,scenario,#characters_gpt,#tokens_gpt
0,spring-boot,7,3646,976,buggy,3646,932
1,spring-boot,18,5451,1201,buggy,5451,1202
2,shardingsphere,10,3635,952,buggy,3635,954
3,spring-boot,3,11641,2959,buggy,11641,2961
4,spring-boot,16,5394,1386,buggy,5394,1387
...,...,...,...,...,...,...,...
45,spring-boot,18,5515,1216,similar,5515,1218
46,micrometer,4,6947,1992,similar,6947,1994
47,dolphinscheduler,11,3535,943,similar,3535,945
48,dolphinscheduler,2,4369,1203,similar,4369,1204


In [66]:
# Prompts whose project belongs to the Defects4AT dataset. The project list is
# taken from the configuration cell so the two cannot drift apart.
df_defectsat_prompt_set = df_llama3_prompt_set[
    df_llama3_prompt_set["project_id"].isin(Defects4AT_PROJECTS)
]
print("Total prompts:", len(df_llama3_prompt_set))

# Count each scenario once on the shared column instead of filtering the frame
# three times; int() keeps the results plain Python integers.
all_scenarios = df_llama3_prompt_set["scenario"]
num_all_bugs = int((all_scenarios == "buggy").sum())
num_all_fixes = int((all_scenarios == "fixed").sum())
num_all_similar = int((all_scenarios == "similar").sum())
print("#all bugs:", num_all_bugs)
print("#all fixed:", num_all_fixes)
print("#all similar:", num_all_similar)
# Everything outside the Defects4AT projects is reported as Defects4J.
# NOTE(review): assumes the results contain no third dataset — confirm.
print("#Defects4J prompts:", len(df_llama3_prompt_set) - len(df_defectsat_prompt_set))
df_defectsat_prompt_set["scenario"].value_counts()

Total prompts: 1975
#all bugs: 719
#all fixed: 719
#all similar: 537
#Defects4J prompts: 1825


scenario
buggy      50
fixed      50
similar    50
Name: count, dtype: int64

In [57]:
# Scenario counts of the Defects4AT prompts that remain after the inner joins
# with the validation set and the GPT statistics. The project list comes from
# the configuration cell rather than being repeated here.
df_prompt_stats.loc[
    df_prompt_stats["project_id"].isin(Defects4AT_PROJECTS), "scenario"
].value_counts()

scenario
buggy      48
fixed      48
similar    48
Name: count, dtype: int64

In [48]:
# Number of joined prompts per scenario across all projects.
df_prompt_stats["scenario"].value_counts()

scenario
buggy      693
fixed      693
similar    511
Name: count, dtype: int64

In [49]:
# Summary statistics of the prompt length in characters (GPT results column).
df_prompt_stats["#characters_gpt"].describe()

count     1897.000000
mean      7777.778071
std       3704.321572
min        931.000000
25%       5205.000000
50%       6604.000000
75%       9563.000000
max      26552.000000
Name: #characters_gpt, dtype: float64

In [50]:
# Summary statistics of the token counts recorded in the Llama 3 results.
df_prompt_stats["#tokens_llama3"].describe()

count    1897.000000
mean     2406.736953
std      1264.445760
min       252.000000
25%      1506.000000
50%      2046.000000
75%      2929.000000
max      8136.000000
Name: #tokens_llama3, dtype: float64

In [51]:
# Summary statistics of the token counts recorded in the GPT results.
df_prompt_stats["#tokens_gpt"].describe()

count    1897.000000
mean     2408.142857
std      1264.380421
min       253.000000
25%      1508.000000
50%      2051.000000
75%      2930.000000
max      8137.000000
Name: #tokens_gpt, dtype: float64

In [68]:
# Save the (project, bug, scenario) keys of the joined prompts to CSV,
# without writing the DataFrame index.
df_prompt_stats[["project_id", "bug_id", "scenario"]].to_csv(
    "filtered_prompts.csv", index=False
)

In [41]:
# NOTE(review): `summarize_prompts_stats` is not defined anywhere in the visible
# notebook; this cell appears to rely on a definition from a removed cell and
# would raise NameError on a fresh kernel unless it is restored — confirm.
summarize_prompts_stats(GPT_RESULTS_PATH)

Found 50 prompts for fixed scenarios
Found 50 prompts for similar scenarios
Found 50 prompts for buggy scenarios
Found 681 prompts for buggy scenarios
Found 681 prompts for fixed scenarios
Found 497 prompts for similar scenarios
#Characters
min: 931, max: 52754, average: 8197
#Tokens
min: 253, max: 14591, average: 2561


In [42]:
# NOTE(review): `summarize_prompts_stats` is not defined anywhere in the visible
# notebook; this cell appears to rely on a definition from a removed cell and
# would raise NameError on a fresh kernel unless it is restored — confirm.
summarize_prompts_stats(LLAMA3_RESULTS_PATH)

Found 50 prompts for buggy scenarios
Found 669 prompts for buggy scenarios
Found 487 prompts for similar scenarios
Found 669 prompts for fixed scenarios
Found 50 prompts for fixed scenarios
Found 50 prompts for similar scenarios
#Characters
min: 931, max: 26552, average: 7818
#Tokens
min: 252, max: 8136, average: 2416
