# Evaluation of MV-SZZ vs. Existing SZZ Methods

This notebook computes true positives (TP), false positives (FP), precision, recall, and F1-score to evaluate and compare the performance of MV-SZZ against existing SZZ variants on two ground-truth datasets:

1. **Developer-Informed Oracle**  
2. **Defects4J Dataset**

The evaluation is based on JSON files generated by the replication scripts provided in this package.


In [2]:
from pathlib import Path
from dataclasses import dataclass
import json
from typing import List, Dict, Tuple, Set

# ---------- Data structure ----------
@dataclass(frozen=True)
class BugFixInducingPair:
    """A predicted (repository, fix commit, inducing commit) triple.

    The dataclass is frozen (with the default ``eq=True``), so instances are
    hashable and compare by field values. ``get_tp_fp`` relies on this to
    collect unique pairs in sets.
    """
    # Value of the "repo_name" field of the JSON entry the pair came from.
    repo_name: str
    # Value of the "fix_commit_hash" field of that JSON entry.
    fix_commit_hash: str
    # "commit_hash" of one predicted entry under "inducing_commit_hash".
    inducing_commit_hash: str

# ---------- TP / FP counting ----------
def get_tp_fp(file_json: List[Dict]) -> Tuple[Set[BugFixInducingPair],
                                              Set[BugFixInducingPair]]:
    """
    Split the predicted inducing commits of a parsed result file into
    true positives and false positives.

    Each entry of ``file_json`` must provide "repo_name", "fix_commit_hash"
    and "bug_commit_hash" (the oracle hashes). "inducing_commit_hash" is
    optional and, when present, is a list of objects carrying a
    "commit_hash" key. A hash predicted several times for the same entry
    is counted once.

    Returns:
        ``(tp, fp)``: predicted pairs whose hash is in the entry's oracle,
        and predicted pairs whose hash is not.
    """
    true_positives: Set[BugFixInducingPair] = set()
    false_positives: Set[BugFixInducingPair] = set()

    for entry in file_json:
        repo_name = entry["repo_name"]
        fix_hash = entry["fix_commit_hash"]
        expected = set(entry["bug_commit_hash"])

        # Building a set collapses repeated predictions of the same commit.
        predicted = {
            candidate["commit_hash"]
            for candidate in entry.get("inducing_commit_hash", [])
        }

        # Predictions confirmed by the oracle vs. those it does not list.
        true_positives.update(
            BugFixInducingPair(repo_name, fix_hash, commit)
            for commit in predicted & expected
        )
        false_positives.update(
            BugFixInducingPair(repo_name, fix_hash, commit)
            for commit in predicted - expected
        )

    return true_positives, false_positives

# ---------- Metric helpers ----------
def get_scores(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
    """
    Compute precision, recall and F1 from raw counts.

    Any ratio whose denominator is zero is reported as 0.0 instead of
    raising ZeroDivisionError.

    Returns:
        ``(precision, recall, f1)``.
    """
    predicted_total = tp + fp
    oracle_total = tp + fn

    precision = 0.0
    if predicted_total:
        precision = tp / predicted_total

    recall = 0.0
    if oracle_total:
        recall = tp / oracle_total

    score_sum = precision + recall
    if not score_sum:
        # Harmonic mean is undefined when both scores are zero.
        return precision, recall, 0.0

    return precision, recall, (2 * precision * recall) / score_sum


## 1. Developer-Informed Oracle

In [3]:
# Two directory levels above the current working directory.
BASE_DIR = Path().resolve().parent.parent
# Directory holding the Developer-Informed Oracle result files.
DIO_RESULTS_DIR = BASE_DIR / "dataset/pyszz_v2/json-output-raw/rq3/developer-informed-oracle"

# Display label -> JSON result file to evaluate.
EVAL_FILES_DIO = {
    "MV-SZZ(Optimal tokens)": DIO_RESULTS_DIR / "dio_bic_conf_4token_mv.json",
    "B-SZZ": DIO_RESULTS_DIR / "dio_bic_conf_original_b.json",
    "AG-SZZ": DIO_RESULTS_DIR / "dio_bic_conf_original_ag.json",
    "MA-SZZ": DIO_RESULTS_DIR / "dio_bic_conf_original_ma.json",
    "L-SZZ": DIO_RESULTS_DIR / "dio_bic_conf_original_l.json",
    "R-SZZ": DIO_RESULTS_DIR / "dio_bic_conf_original_r.json",
}

for label, path in EVAL_FILES_DIO.items():
    # Report a missing result file and move on to the next method.
    if not path.is_file():
        print(f"[WARN] missing → {label}: {path}")
        continue

    with path.open(encoding="utf-8") as f:
        data = json.load(f)

    tp_set, fp_set = get_tp_fp(data)
    tp, fp = len(tp_set), len(fp_set)

    # TP counts unique (repo, fix, hash) triples, so the oracle total must be
    # counted the same way. Summing raw list lengths would count a repeated
    # oracle hash or a repeated entry twice, inflating FN and deflating recall.
    oracle_pairs = {
        (obj["repo_name"], obj["fix_commit_hash"], bug_hash)
        for obj in data
        for bug_hash in obj["bug_commit_hash"]
    }
    fn = len(oracle_pairs) - tp

    precision, recall, f1 = get_scores(tp, fp, fn)

    print(f"=== {label} ===")
    print(f"TP: {tp:5d}  FP: {fp:5d}")
    print(f"Precision: {precision:.3f}  Recall: {recall:.3f}  F1: {f1:.3f}\n")


=== MV-SZZ(Optimal tokens) ===
TP:    44  FP:    30
Precision: 0.595  Recall: 0.579  F1: 0.587

=== B-SZZ ===
TP:    51  FP:   133
Precision: 0.277  Recall: 0.671  F1: 0.392

=== AG-SZZ ===
TP:    49  FP:   136
Precision: 0.265  Recall: 0.645  F1: 0.375

=== MA-SZZ ===
TP:    47  FP:   161
Precision: 0.226  Recall: 0.618  F1: 0.331

=== L-SZZ ===
TP:    27  FP:    43
Precision: 0.386  Recall: 0.355  F1: 0.370

=== R-SZZ ===
TP:    37  FP:    33
Precision: 0.529  Recall: 0.487  F1: 0.507



## 2. Defects4J

In [4]:
# Two directory levels above the current working directory.
BASE_DIR = Path().resolve().parent.parent
# Directory holding the Defects4J result files.
D4J_RESULTS_DIR = BASE_DIR / "dataset/pyszz_v2/json-output-raw/rq3/defects4j"

# Display label -> JSON result file to evaluate.
# Named EVAL_FILES_D4J so this cell does not rebind EVAL_FILES_DIO, which
# holds the Developer-Informed Oracle files of the previous section.
EVAL_FILES_D4J = {
    "MV-SZZ(Optimal tokens)": D4J_RESULTS_DIR / "d4j_bic_conf_5token_mv.json",
    "B-SZZ": D4J_RESULTS_DIR / "d4j_bic_conf_original_b.json",
    "AG-SZZ": D4J_RESULTS_DIR / "d4j_bic_conf_original_ag.json",
    "MA-SZZ": D4J_RESULTS_DIR / "d4j_bic_conf_original_ma.json",
    "L-SZZ": D4J_RESULTS_DIR / "d4j_bic_conf_original_l.json",
    "R-SZZ": D4J_RESULTS_DIR / "d4j_bic_conf_original_r.json",
}

for label, path in EVAL_FILES_D4J.items():
    # Report a missing result file and move on to the next method.
    if not path.is_file():
        print(f"[WARN] missing → {label}: {path}")
        continue

    with path.open(encoding="utf-8") as f:
        data = json.load(f)

    tp_set, fp_set = get_tp_fp(data)
    tp, fp = len(tp_set), len(fp_set)

    # TP counts unique (repo, fix, hash) triples, so the oracle total must be
    # counted the same way. Summing raw list lengths would count a repeated
    # oracle hash or a repeated entry twice, inflating FN and deflating recall.
    oracle_pairs = {
        (obj["repo_name"], obj["fix_commit_hash"], bug_hash)
        for obj in data
        for bug_hash in obj["bug_commit_hash"]
    }
    fn = len(oracle_pairs) - tp

    precision, recall, f1 = get_scores(tp, fp, fn)

    print(f"=== {label} ===")
    print(f"TP: {tp:5d}  FP: {fp:5d}")
    print(f"Precision: {precision:.3f}  Recall: {recall:.3f}  F1: {f1:.3f}\n")


=== MV-SZZ(Optimal tokens) ===
TP:    75  FP:    60
Precision: 0.556  Recall: 0.577  F1: 0.566

=== B-SZZ ===
TP:    80  FP:   146
Precision: 0.354  Recall: 0.615  F1: 0.449

=== AG-SZZ ===
TP:    76  FP:   121
Precision: 0.386  Recall: 0.585  F1: 0.465

=== MA-SZZ ===
TP:    75  FP:   132
Precision: 0.362  Recall: 0.577  F1: 0.445

=== L-SZZ ===
TP:    50  FP:    63
Precision: 0.442  Recall: 0.385  F1: 0.412

=== R-SZZ ===
TP:    56  FP:    57
Precision: 0.496  Recall: 0.431  F1: 0.461

