# RQ2: Comparative Evaluation

- [FAST++](#fast)
- [Field-ready testing](#field-ready-testing)

In [21]:
import pandas as pd
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
)
from statistics import mean
import random
import os

In [10]:
LABELS = ["similar", "fixed", "buggy"]
FORMAL_LABELS = ["already-tested", "need-test", "error-prone"]

In [13]:
def compute_effectiveness(df_exp: pd.DataFrame):
    y_true = df_exp["target_scenario"].array
    y_pred = df_exp["classified_scenario"].array
    precision = precision_score(y_true, y_pred, labels=LABELS, average=None)
    recall = recall_score(y_true, y_pred, labels=LABELS, average=None)
    f1 = f1_score(y_true, y_pred, labels=LABELS, average=None)
    not_yet_tested_f1 = f1[1:].mean()
    avg_f1 = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    avg_precision = precision_score(y_true, y_pred, labels=LABELS, average="macro")
    avg_recall = recall_score(y_true, y_pred, labels=LABELS, average="macro")
    print(
        " | ".join(
            [f"{l}(P R F1)" for l in FORMAL_LABELS]
            + ["not-yet-tested F1", "Total Average F1"]
        )
    )
    print(
        " & ".join(
            [
                " & ".join([f"{precision[i]:.2f}", f"{recall[i]:.2f}", f"{f1[i]:.2f}"])
                for i in range(len(LABELS))
            ]
            + [f"{not_yet_tested_f1:.2f}", f"{avg_f1:.2f}"]
        )
    )
    print(f"avg_precision: {avg_precision:.2f}, avg_recall: {avg_recall:.2f}")

## FAST++
Run following commands to reproduce FAST++ classification:
```sh
cd DataAnalysis/
conda create -n venv_fastr python=3.6 -c conda-forge --no-default-packages
conda activate venv_fastr
unset PYTHONPATH
python fastr_classify.py
# replace RESULTS_FILE_PATH with the generated one
```

In [19]:
RESULTS_FILE_PATH = "plotdata/fastr_classification_20250121_101917.csv"
df_results = pd.read_csv(RESULTS_FILE_PATH)
compute_effectiveness(df_results)

already-tested(P R F1) | need-test(P R F1) | error-prone(P R F1) | not-yet-tested F1 | Total Average F1
0.32 & 0.32 & 0.32 & 0.27 & 0.27 & 0.27 & 0.44 & 0.44 & 0.44 & 0.35 & 0.34
avg_precision: 0.34, avg_recall: 0.34


## Field-ready testing

Replication package: https://github.com/field-ready-test-cases/field-ready-test-cases/tree/master

In [23]:
EXPERIMENT_RESULTS_PATH = "../AutonomicTester/experiment_results/finetuning/FineTunedGPT3.5Turbo/20240715_215858_OpenAI GPT-3.5 Fine-tuned_Validation_buggy"
PROJECTS = ["Chart", "Lang"]
SCENARIO_VOTES_PATH = os.path.join(EXPERIMENT_RESULTS_PATH, "scenario_votes.csv")
df_votes = pd.read_csv(SCENARIO_VOTES_PATH)

In [24]:
df_votes_chart = df_votes.loc[(df_votes["project"] == "Chart")]
num_found_bugs_chart = len(df_votes_chart.loc[df_votes_chart["scenario"] == "buggy"])
num_total_bugs_chart = len(df_votes_chart)
print(f"Found {num_found_bugs_chart} bugs out of {num_total_bugs_chart} bugs in Chart")
percent_revealed_bugs_chart = num_found_bugs_chart / num_total_bugs_chart
print(f"Percentage: {percent_revealed_bugs_chart:.2f}")

Found 12 bugs out of 25 bugs in Chart
Percentage: 0.48


In [25]:
df_votes_lang = df_votes.loc[(df_votes["project"] == "Lang")]
num_found_bugs_lang = len(df_votes_lang.loc[df_votes_lang["scenario"] == "buggy"])
num_total_bugs_lang = len(df_votes_lang)
print(f"Found {num_found_bugs_lang} bugs out of {num_total_bugs_lang} bugs in Lang")
percent_revealed_bugs_lang = num_found_bugs_lang / num_total_bugs_lang
print(f"Percentage: {percent_revealed_bugs_lang:.2f}")

Found 32 bugs out of 59 bugs in Lang
Percentage: 0.54


In [26]:
df_votes_subset = df_votes.loc[df_votes["project"].isin(PROJECTS)]
num_found_bugs_subset = len(df_votes_subset.loc[df_votes_subset["scenario"] == "buggy"])
num_total_bugs_subset = len(df_votes_subset)
print(
    f"Found {num_found_bugs_subset} bugs out of {num_total_bugs_subset} bugs in total"
)
percent_revealed_bugs_subset = num_found_bugs_subset / num_total_bugs_subset
print(f"Percentage: {percent_revealed_bugs_subset:.2f}")

Found 44 bugs out of 84 bugs in total
Percentage: 0.52


In [14]:
FAULT_DENSITY = 0.00100
NUM_INPUTS = int((26 + 65) / 0.00100)
REVEALED_FAILURES = {
    "#faults": [26, 65],
    "#executed faults": [16, 42],
    "revealed failures": [10, 20],
}
FIELD_TEST_TRIGGERING_PERCENTAGE = 0.00141
NUM_BUGGY = 26 + 65
NUM_FIXED = int((NUM_INPUTS - NUM_BUGGY) / 2)
NUM_SIMILAR = NUM_INPUTS - NUM_FIXED - NUM_BUGGY

In [15]:
stats = {
    "target_scenario": ["buggy"] * NUM_BUGGY
    + ["fixed"] * NUM_FIXED
    + ["similar"] * NUM_SIMILAR,
    "classified_scenario": ["buggy"] * 30 + ["similar"] * (NUM_INPUTS - 30),
}
df = pd.DataFrame(stats)

In [16]:
def classify(df: pd.DataFrame):
    if df["target_scenario"] == "buggy" and df["classified_scenario"] == "similar":
        if random.random() <= FIELD_TEST_TRIGGERING_PERCENTAGE:
            df["classified_scenario"] = "fixed"
    elif df["target_scenario"] == "fixed":
        if random.random() <= FIELD_TEST_TRIGGERING_PERCENTAGE:
            df["classified_scenario"] = "buggy"
    elif df["target_scenario"] == "similar":
        if random.random() <= FIELD_TEST_TRIGGERING_PERCENTAGE:
            df["classified_scenario"] = "buggy"
    return df

In [17]:
classified_df = df.apply(classify, axis=1)
compute_effectiveness(classified_df)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


already-tested(P R F1) | need-test(P R F1) | error-prone(P R F1) | not-yet-tested F1 | Total Average F1
0.50 & 1.00 & 0.67 & 0.00 & 0.00 & 0.00 & 0.16 & 0.33 & 0.22 & 0.11 & 0.30
avg_precision: 0.22, avg_recall: 0.44


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
