# Bar-Chart Replication Notebook 📊

This notebook reproduces the true-positive (TP) and false-positive (FP) bar charts
used in our study.  
Two datasets are processed:

* **Developer-Informed Oracle** 
* **Defects4J**

For each dataset we produce:

* TP bar chart  
* FP bar chart  

All code and paths assume the default project layout shipped with the replication
package: `BASE_DIR` is resolved as two directory levels above the directory the
notebook is run from. Adjust `BASE_DIR` if you place the notebook elsewhere.


In [1]:
from pathlib import Path

# ------------------------------------------------------------------
# Project root & I/O locations (edit BASE_DIR if needed)
# BASE_DIR is taken two levels above the current working directory.
# ------------------------------------------------------------------
BASE_DIR   = Path().resolve().parent.parent
DATA_DIR   = BASE_DIR.joinpath("dataset", "pyszz_v2", "json-output-raw", "preliminary_study")
OUTPUT_DIR = BASE_DIR.joinpath("scripts", "preliminary_study", "bar_chart")

# Create the chart folder (and any missing parents) before anything is saved.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------------
# Dataset-specific JSON files
# (one "original" and one "1token" file per dataset)
# ------------------------------------------------------------------
line_input_file_dio  = DATA_DIR.joinpath("developer-informed-oracle", "dio_bic_conf_original.json")
token_input_file_dio = DATA_DIR.joinpath("developer-informed-oracle", "dio_bic_conf_1token.json")

line_input_file_d4j  = DATA_DIR.joinpath("defects4j", "d4j_bic_conf_original.json")
token_input_file_d4j = DATA_DIR.joinpath("defects4j", "d4j_bic_conf_1token.json")

# Echo the resolved locations so a wrong working directory is spotted early.
print("BASE_DIR   :", BASE_DIR)
print("OUTPUT_DIR :", OUTPUT_DIR)


BASE_DIR   : /local2/i-kondo/szz/majority-voting-szz-replication-package
OUTPUT_DIR : /local2/i-kondo/szz/majority-voting-szz-replication-package/scripts/preliminary_study/bar_chart


## Library Requirements

The notebook relies on the following Python packages:

* numpy  
* matplotlib  

Install them with:

```bash
pip install numpy matplotlib
```


In [2]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from matplotlib.ticker import MaxNLocator          # added

def _aggregate_tp_fp(
    line_json_path: str,
    token_json_path: str,
    *,
    fp_mode: bool = False,
    out_prefix: str = "./comparison"
):
    # ---------- Load data ----------
    with open(line_json_path, "r", encoding="utf-8") as f:
        data_line = json.load(f)
    with open(token_json_path, "r", encoding="utf-8") as f:
        data_token = json.load(f)

    line_dict  = {d["id"]: d for d in data_line}
    token_dict = {d["id"]: d for d in data_token}

    # ---------- Aggregate TP / FP ----------
    line_by_x, token_by_x = defaultdict(list), defaultdict(list)
    for _id, l_item in line_dict.items():
        if _id not in token_dict:
            continue                       # Skip IDs not present in both datasets
        t_item = token_dict[_id]

        bug_commits = set(t_item.get("bug_commit_hash", []))

        l_inducing = l_item.get("inducing_commit_hash", [])
        t_inducing = t_item.get("inducing_commit_hash", [])

        x_val = len(l_inducing)            # X-axis: number of deleted lines

        l_set = {c["commit_hash"] for c in l_inducing if c.get("commit_hash")}
        t_set = {c["commit_hash"] for c in t_inducing if c.get("commit_hash")}

        if fp_mode:                        # FP (= false positives)
            line_by_x[x_val].append(len(l_set - bug_commits))
            token_by_x[x_val].append(len(t_set - bug_commits))
        else:                              # TP (= true positives)
            line_by_x[x_val].append(len(l_set & bug_commits))
            token_by_x[x_val].append(len(t_set & bug_commits))

    # ---------- Exclude categories with 0 deleted lines ----------
    all_x = sorted(val for val in (set(line_by_x) | set(token_by_x)) if val > 0)
    if not all_x:
        print("No data (x>0) – skipping.")
        return

    # ---------- Prepare data for plotting ----------
    l_agg = [sum(line_by_x.get(x, []))   for x in all_x]
    t_agg = [sum(token_by_x.get(x, []))  for x in all_x]

    # ---------- Plot ----------
    plt.figure(figsize=(10, 6))
    x_pos = np.arange(len(all_x)) + 1      # Start from 1 instead of 0
    bar_w = 0.4
    plt.bar(x_pos - bar_w/2, l_agg, width=bar_w,
            color="darkgreen",  alpha=0.7, label="Line-level SZZ")
    plt.bar(x_pos + bar_w/2, t_agg, width=bar_w,
            color="darkorange", alpha=0.7, label="Token-level SZZ")

    plt.xticks(x_pos, all_x, fontsize=14, rotation=45)
    plt.yticks(fontsize=14)
    plt.xlabel("Deleted lines in defect-fixing commit", fontsize=20)
    plt.ylabel("Number of " + ("FPs" if fp_mode else "TPs"), fontsize=20)
    plt.legend(fontsize=16)
    plt.grid(axis="y", linestyle="--", alpha=0.7)

    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))  # integer ticks

    plt.tight_layout()
    os.makedirs(os.path.dirname(out_prefix) or ".", exist_ok=True)
    suffix = "fp" if fp_mode else "tp"
    plt.savefig(f"{out_prefix}-{suffix}-bar-sum.png")
    plt.close()

# Convenience wrappers
def plot_line_token_comparison_bar_tp(line_json_path, token_json_path, out_prefix="./comparison_tp"):
    """Draw the true-positive (TP) comparison bar chart.

    Delegates to ``_aggregate_tp_fp`` with ``fp_mode=False``, which writes
    the chart to ``{out_prefix}-tp-bar-sum.png``.
    """
    _aggregate_tp_fp(
        line_json_path,
        token_json_path,
        fp_mode=False,
        out_prefix=out_prefix,
    )

def plot_line_token_comparison_bar_fp(line_json_path, token_json_path, out_prefix="./comparison_fp"):
    """Draw the false-positive (FP) comparison bar chart.

    Delegates to ``_aggregate_tp_fp`` with ``fp_mode=True``, which writes
    the chart to ``{out_prefix}-fp-bar-sum.png``.
    """
    _aggregate_tp_fp(
        line_json_path,
        token_json_path,
        fp_mode=True,
        out_prefix=out_prefix,
    )


## Generate Charts — Developer-Informed Oracle Dataset

In [5]:
# Both charts read the same pair of DIO files and share the "dio" prefix;
# the plotting helpers append the tp/fp suffix to the output file name.

# True positives
plot_line_token_comparison_bar_tp(
    line_json_path=line_input_file_dio,
    token_json_path=token_input_file_dio,
    out_prefix=str(OUTPUT_DIR.joinpath("dio")),
)

# False positives
plot_line_token_comparison_bar_fp(
    line_json_path=line_input_file_dio,
    token_json_path=token_input_file_dio,
    out_prefix=str(OUTPUT_DIR.joinpath("dio")),
)

print("✅ DIO charts saved to:", OUTPUT_DIR)


✅ DIO charts saved to: /local2/i-kondo/szz/majority-voting-szz-replication-package/scripts/preliminary_study/bar_chart


## Generate Charts — Defects4J Dataset

In [6]:
# Both charts read the same pair of Defects4J files and share the "d4j" prefix;
# the plotting helpers append the tp/fp suffix to the output file name.

# True positives
plot_line_token_comparison_bar_tp(
    line_json_path=line_input_file_d4j,
    token_json_path=token_input_file_d4j,
    out_prefix=str(OUTPUT_DIR.joinpath("d4j")),
)

# False positives
plot_line_token_comparison_bar_fp(
    line_json_path=line_input_file_d4j,
    token_json_path=token_input_file_d4j,
    out_prefix=str(OUTPUT_DIR.joinpath("d4j")),
)

print("✅ Defects4J charts saved to:", OUTPUT_DIR)


✅ Defects4J charts saved to: /local2/i-kondo/szz/majority-voting-szz-replication-package/scripts/preliminary_study/bar_chart
