# Majority Voting

# Imports & Function Definition

In [1]:
from pathlib import Path
import json
from collections import Counter
from typing import List, Dict, Any

def filter_inducing_commit_hashes(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Keep only the most frequent commit_hash entries inside
    each item's 'inducing_commit_hash' list.

    For every item, the 'commit_hash' values of the records in its
    'inducing_commit_hash' list are counted, and only the records whose
    hash reaches the highest count are kept. Ties are not broken: every
    hash that shares the highest count survives. Surviving records keep
    their original order and multiplicity.

    The input is never modified. Every returned item is a shallow copy of
    the corresponding input item -- including items whose list is missing
    or empty -- so adding or replacing top-level keys on a result does not
    leak back into ``data``. Nested values (the individual hash records)
    are still shared with the input.

    Parameters
    ----------
    data : List[Dict]
        Parsed JSON list. Each item may carry an 'inducing_commit_hash'
        list whose records are dicts with a 'commit_hash' key.

    Returns
    -------
    List[Dict]
        Filtered data: one item per input item, in input order.

    Raises
    ------
    KeyError
        If a record in a non-empty 'inducing_commit_hash' list has no
        'commit_hash' key.
    """
    result: List[Dict[str, Any]] = []
    for entry in data:
        # Always hand back a fresh top-level dict so the result never
        # aliases the caller's items, whichever branch is taken below.
        new_entry = entry.copy()

        hashes = entry.get("inducing_commit_hash", [])
        if hashes:
            counts = Counter(h["commit_hash"] for h in hashes)
            max_freq = max(counts.values())
            # Keep every record whose hash is (one of) the most frequent.
            new_entry["inducing_commit_hash"] = [
                h for h in hashes if counts[h["commit_hash"]] == max_freq
            ]
        # Missing/empty list: nothing to vote on; the copy is kept unchanged.

        result.append(new_entry)
    return result


# Execution + Save

## Developer-informed oracle

### original: RQ2

In [2]:
# Developer-informed oracle, "original" configuration: rq1 input -> rq2 output.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "developer-informed-oracle", "dio_bic_conf_original.json")
OUTPUT_FILE = BASE_DIR.joinpath("rq2", "developer-informed-oracle", "dio_bic_conf_original_mv.json")

# Echo the resolved paths so the cell output records which files were used.
for label, location in (("Input :", INPUT_FILE), ("Output:", OUTPUT_FILE)):
    print(label, location.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the result, creating the target directory when it is missing ---
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_FILE.open("w", encoding="utf-8", errors="backslashreplace") as dst:
    json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/developer-informed-oracle/dio_bic_conf_original.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/developer-informed-oracle/dio_bic_conf_original_mv.json
Processed 76 entries → 76 entries
✓ File saved successfully.


### 1token: RQ2

In [3]:
# Developer-informed oracle, "1token" configuration: rq1 input -> rq2 output.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "developer-informed-oracle", "dio_bic_conf_1token.json")
OUTPUT_FILE = BASE_DIR.joinpath("rq2", "developer-informed-oracle", "dio_bic_conf_1token_mv.json")

# Echo the resolved paths so the cell output records which files were used.
for label, location in (("Input :", INPUT_FILE), ("Output:", OUTPUT_FILE)):
    print(label, location.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the result, creating the target directory when it is missing ---
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_FILE.open("w", encoding="utf-8", errors="backslashreplace") as dst:
    json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/developer-informed-oracle/dio_bic_conf_1token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/developer-informed-oracle/dio_bic_conf_1token_mv.json
Processed 76 entries → 76 entries
✓ File saved successfully.


### 2token: RQ2

In [4]:
# Developer-informed oracle, "2token" configuration: rq1 input -> rq2 output.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "developer-informed-oracle", "dio_bic_conf_2token.json")
OUTPUT_FILE = BASE_DIR.joinpath("rq2", "developer-informed-oracle", "dio_bic_conf_2token_mv.json")

# Echo the resolved paths so the cell output records which files were used.
for label, location in (("Input :", INPUT_FILE), ("Output:", OUTPUT_FILE)):
    print(label, location.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the result, creating the target directory when it is missing ---
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_FILE.open("w", encoding="utf-8", errors="backslashreplace") as dst:
    json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/developer-informed-oracle/dio_bic_conf_2token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/developer-informed-oracle/dio_bic_conf_2token_mv.json
Processed 76 entries → 76 entries
✓ File saved successfully.


### 3token: RQ2

In [5]:
# Developer-informed oracle, "3token" configuration: rq1 input -> rq2 output.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "developer-informed-oracle", "dio_bic_conf_3token.json")
OUTPUT_FILE = BASE_DIR.joinpath("rq2", "developer-informed-oracle", "dio_bic_conf_3token_mv.json")

# Echo the resolved paths so the cell output records which files were used.
for label, location in (("Input :", INPUT_FILE), ("Output:", OUTPUT_FILE)):
    print(label, location.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the result, creating the target directory when it is missing ---
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_FILE.open("w", encoding="utf-8", errors="backslashreplace") as dst:
    json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/developer-informed-oracle/dio_bic_conf_3token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/developer-informed-oracle/dio_bic_conf_3token_mv.json
Processed 76 entries → 76 entries
✓ File saved successfully.


### 4token: RQ2, RQ3, Discussion

In [6]:
# Developer-informed oracle, "4token" configuration: one rq1 input, written
# to the rq2, rq3 and discussion directories.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "developer-informed-oracle", "dio_bic_conf_4token.json")
OUTPUT_FILE_1 = BASE_DIR.joinpath("rq2", "developer-informed-oracle", "dio_bic_conf_4token_mv.json")
OUTPUT_FILE_2 = BASE_DIR.joinpath("rq3", "developer-informed-oracle", "dio_bic_conf_4token_mv.json")
OUTPUT_FILE_3 = BASE_DIR.joinpath("discussion", "developer-informed-oracle", "dio_bic_conf_4token_mv_select.json")

# Echo the resolved paths so the cell output records which files were used.
print("Input :", INPUT_FILE.resolve())
for destination in (OUTPUT_FILE_1, OUTPUT_FILE_2, OUTPUT_FILE_3):
    print("Output:", destination.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the same result to every destination, in order ---
for destination in (OUTPUT_FILE_1, OUTPUT_FILE_2, OUTPUT_FILE_3):
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8", errors="backslashreplace") as dst:
        json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/developer-informed-oracle/dio_bic_conf_4token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/developer-informed-oracle/dio_bic_conf_4token_mv.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq3/developer-informed-oracle/dio_bic_conf_4token_mv.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/discussion/developer-informed-oracle/dio_bic_conf_4token_mv_select.json
Processed 76 entries → 76 entries


✓ File saved successfully.


### 5token: RQ2, Discussion

In [7]:
# Developer-informed oracle, "5token" configuration: one rq1 input, written
# to the rq2 and discussion directories.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "developer-informed-oracle", "dio_bic_conf_5token.json")
OUTPUT_FILE_1 = BASE_DIR.joinpath("rq2", "developer-informed-oracle", "dio_bic_conf_5token_mv.json")
OUTPUT_FILE_2 = BASE_DIR.joinpath("discussion", "developer-informed-oracle", "dio_bic_conf_5token_mv_select.json")

# Echo the resolved paths so the cell output records which files were used.
print("Input :", INPUT_FILE.resolve())
for destination in (OUTPUT_FILE_1, OUTPUT_FILE_2):
    print("Output:", destination.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the same result to every destination, in order ---
for destination in (OUTPUT_FILE_1, OUTPUT_FILE_2):
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8", errors="backslashreplace") as dst:
        json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/developer-informed-oracle/dio_bic_conf_5token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/developer-informed-oracle/dio_bic_conf_5token_mv.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/discussion/developer-informed-oracle/dio_bic_conf_5token_mv_select.json
Processed 76 entries → 76 entries


✓ File saved successfully.


## Defects4J

### original: RQ2

In [8]:
# defects4j data, "original" configuration: rq1 input -> rq2 output.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "defects4j", "d4j_bic_conf_original.json")
OUTPUT_FILE = BASE_DIR.joinpath("rq2", "defects4j", "d4j_bic_conf_original_mv.json")

# Echo the resolved paths so the cell output records which files were used.
for label, location in (("Input :", INPUT_FILE), ("Output:", OUTPUT_FILE)):
    print(label, location.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the result, creating the target directory when it is missing ---
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_FILE.open("w", encoding="utf-8", errors="backslashreplace") as dst:
    json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/defects4j/d4j_bic_conf_original.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/defects4j/d4j_bic_conf_original_mv.json
Processed 130 entries → 130 entries
✓ File saved successfully.


### 1token: RQ2

In [9]:
# defects4j data, "1token" configuration: rq1 input -> rq2 output.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "defects4j", "d4j_bic_conf_1token.json")
OUTPUT_FILE = BASE_DIR.joinpath("rq2", "defects4j", "d4j_bic_conf_1token_mv.json")

# Echo the resolved paths so the cell output records which files were used.
for label, location in (("Input :", INPUT_FILE), ("Output:", OUTPUT_FILE)):
    print(label, location.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the result, creating the target directory when it is missing ---
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_FILE.open("w", encoding="utf-8", errors="backslashreplace") as dst:
    json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/defects4j/d4j_bic_conf_1token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/defects4j/d4j_bic_conf_1token_mv.json
Processed 130 entries → 130 entries
✓ File saved successfully.


### 2token: RQ2

In [10]:
# defects4j data, "2token" configuration: rq1 input -> rq2 output.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "defects4j", "d4j_bic_conf_2token.json")
OUTPUT_FILE = BASE_DIR.joinpath("rq2", "defects4j", "d4j_bic_conf_2token_mv.json")

# Echo the resolved paths so the cell output records which files were used.
for label, location in (("Input :", INPUT_FILE), ("Output:", OUTPUT_FILE)):
    print(label, location.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the result, creating the target directory when it is missing ---
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_FILE.open("w", encoding="utf-8", errors="backslashreplace") as dst:
    json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/defects4j/d4j_bic_conf_2token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/defects4j/d4j_bic_conf_2token_mv.json
Processed 130 entries → 130 entries
✓ File saved successfully.


### 3token: RQ2

In [11]:
# defects4j data, "3token" configuration: rq1 input -> rq2 output.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "defects4j", "d4j_bic_conf_3token.json")
OUTPUT_FILE = BASE_DIR.joinpath("rq2", "defects4j", "d4j_bic_conf_3token_mv.json")

# Echo the resolved paths so the cell output records which files were used.
for label, location in (("Input :", INPUT_FILE), ("Output:", OUTPUT_FILE)):
    print(label, location.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the result, creating the target directory when it is missing ---
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with OUTPUT_FILE.open("w", encoding="utf-8", errors="backslashreplace") as dst:
    json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/defects4j/d4j_bic_conf_3token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/defects4j/d4j_bic_conf_3token_mv.json
Processed 130 entries → 130 entries
✓ File saved successfully.


### 4token: RQ2, Discussion

In [12]:
# defects4j data, "4token" configuration: one rq1 input, written to the
# rq2 and discussion directories.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "defects4j", "d4j_bic_conf_4token.json")
OUTPUT_FILE_1 = BASE_DIR.joinpath("rq2", "defects4j", "d4j_bic_conf_4token_mv.json")
OUTPUT_FILE_2 = BASE_DIR.joinpath("discussion", "defects4j", "d4j_bic_conf_4token_mv_select.json")

# Echo the resolved paths so the cell output records which files were used.
print("Input :", INPUT_FILE.resolve())
for destination in (OUTPUT_FILE_1, OUTPUT_FILE_2):
    print("Output:", destination.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the same result to every destination, in order ---
for destination in (OUTPUT_FILE_1, OUTPUT_FILE_2):
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8", errors="backslashreplace") as dst:
        json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/defects4j/d4j_bic_conf_4token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/defects4j/d4j_bic_conf_4token_mv.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/discussion/defects4j/d4j_bic_conf_4token_mv_select.json
Processed 130 entries → 130 entries


✓ File saved successfully.


### 5token: RQ2, RQ3, Discussion

In [13]:
# defects4j data, "5token" configuration: one rq1 input, written to the
# rq2, rq3 and discussion directories.

# --- Locations ---
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "defects4j", "d4j_bic_conf_5token.json")
OUTPUT_FILE_1 = BASE_DIR.joinpath("rq2", "defects4j", "d4j_bic_conf_5token_mv.json")
OUTPUT_FILE_2 = BASE_DIR.joinpath("rq3", "defects4j", "d4j_bic_conf_5token_mv.json")
OUTPUT_FILE_3 = BASE_DIR.joinpath("discussion", "defects4j", "d4j_bic_conf_5token_mv_select.json")

# Echo the resolved paths so the cell output records which files were used.
print("Input :", INPUT_FILE.resolve())
for destination in (OUTPUT_FILE_1, OUTPUT_FILE_2, OUTPUT_FILE_3):
    print("Output:", destination.resolve())

# --- Read the JSON input ---
with INPUT_FILE.open(encoding="utf-8") as src:
    data = json.load(src)

# --- Keep only the most frequent commit hashes per entry ---
filtered_data = filter_inducing_commit_hashes(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# --- Write the same result to every destination, in order ---
for destination in (OUTPUT_FILE_1, OUTPUT_FILE_2, OUTPUT_FILE_3):
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8", errors="backslashreplace") as dst:
        json.dump(filtered_data, dst, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/defects4j/d4j_bic_conf_5token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq2/defects4j/d4j_bic_conf_5token_mv.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq3/defects4j/d4j_bic_conf_5token_mv.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/discussion/defects4j/d4j_bic_conf_5token_mv_select.json
Processed 130 entries → 130 entries


✓ File saved successfully.
