# Majority Voting (All tied commits excluded)

## Imports & Function Definition

In [1]:
from pathlib import Path
import json
from collections import Counter
from typing import List, Dict, Any

def filter_inducing_commit_hashes_unique(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Reduce each entry's 'inducing_commit_hash' list by strict majority vote.

    For every entry the 'commit_hash' values of its list items are tallied.
    If exactly one hash has the highest count, only the items carrying that
    hash are kept.  If two or more hashes share the highest count, the list
    is replaced by an empty list.

    Parameters
    ----------
    data : List[Dict]
        Parsed JSON content; each entry may hold an 'inducing_commit_hash'
        list whose items are dicts with a 'commit_hash' key.

    Returns
    -------
    List[Dict]
        One output entry per input entry, in the same order.  Entries whose
        list is missing or empty are passed through as the same object;
        all others are shallow copies with the filtered list, so the input
        dicts are not modified.
    """

    def _keep_sole_winner(candidates):
        # Only the two best-ranked hashes are needed to detect a tie at the top.
        ranked = Counter(item["commit_hash"] for item in candidates).most_common(2)
        if len(ranked) == 2 and ranked[0][1] == ranked[1][1]:
            return []                          # tie for first place → drop everything
        winner = ranked[0][0]
        return [item for item in candidates if item["commit_hash"] == winner]

    filtered: List[Dict[str, Any]] = []
    for record in data:
        candidates = record.get("inducing_commit_hash", [])
        if candidates:
            # Work on a shallow copy so the caller's entry keeps its full list.
            updated = dict(record)
            updated["inducing_commit_hash"] = _keep_sole_winner(candidates)
            record = updated
        filtered.append(record)
    return filtered


## Execution + Save

## Developer-informed oracle

### 4token: Discussion

In [3]:
# ---------- 1. Locate input and output ----------
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "developer-informed-oracle", "dio_bic_conf_4token.json")
OUTPUT_FILE = BASE_DIR.joinpath("discussion", "developer-informed-oracle", "dio_bic_conf_4token_mv_exclude.json")

# Print the absolute locations so the cell output records which files were used.
print("Input :", INPUT_FILE.resolve())
print("Output:", OUTPUT_FILE.resolve())

# ---------- 2. Read the raw results ----------
with open(INPUT_FILE, encoding="utf-8") as f:
    data = json.load(f)

# ---------- 3. Majority vote, discarding ties ----------
filtered_data = filter_inducing_commit_hashes_unique(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# ---------- 4. Write the filtered results ----------
# Create the destination folder on demand; characters that cannot be encoded
# as UTF-8 are written as backslash escapes instead of aborting the save.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_FILE, mode="w", encoding="utf-8", errors="backslashreplace") as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/developer-informed-oracle/dio_bic_conf_4token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/discussion/developer-informed-oracle/dio_bic_conf_4token_mv_exclude.json
Processed 76 entries → 76 entries
✓ File saved successfully.


### 5token: Discussion

In [4]:
# ---------- 1. Locate input and output ----------
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "developer-informed-oracle", "dio_bic_conf_5token.json")
OUTPUT_FILE = BASE_DIR.joinpath("discussion", "developer-informed-oracle", "dio_bic_conf_5token_mv_exclude.json")

# Print the absolute locations so the cell output records which files were used.
print("Input :", INPUT_FILE.resolve())
print("Output:", OUTPUT_FILE.resolve())

# ---------- 2. Read the raw results ----------
with open(INPUT_FILE, encoding="utf-8") as f:
    data = json.load(f)

# ---------- 3. Majority vote, discarding ties ----------
filtered_data = filter_inducing_commit_hashes_unique(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# ---------- 4. Write the filtered results ----------
# Create the destination folder on demand; characters that cannot be encoded
# as UTF-8 are written as backslash escapes instead of aborting the save.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_FILE, mode="w", encoding="utf-8", errors="backslashreplace") as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

Input : /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/rq1/developer-informed-oracle/dio_bic_conf_5token.json
Output: /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-output-raw/discussion/developer-informed-oracle/dio_bic_conf_5token_mv_exclude.json
Processed 76 entries → 76 entries
✓ File saved successfully.


## Defects4J

### 4token: Discussion

In [None]:
# ---------- 1. Locate input and output ----------
BASE_DIR = Path("../../../dataset/pyszz_v2/json-output-raw")
INPUT_FILE = BASE_DIR.joinpath("rq1", "defects4j", "dio_bic_conf_4token.json")
OUTPUT_FILE = BASE_DIR.joinpath("discussion", "defects4j", "dio_bic_conf_4token_mv_exclude.json")

# Print the absolute locations so the cell output records which files were used.
print("Input :", INPUT_FILE.resolve())
print("Output:", OUTPUT_FILE.resolve())

# ---------- 2. Read the raw results ----------
with open(INPUT_FILE, encoding="utf-8") as f:
    data = json.load(f)

# ---------- 3. Majority vote, discarding ties ----------
filtered_data = filter_inducing_commit_hashes_unique(data)
print(f"Processed {len(data):,} entries → {len(filtered_data):,} entries")

# ---------- 4. Write the filtered results ----------
# Create the destination folder on demand; characters that cannot be encoded
# as UTF-8 are written as backslash escapes instead of aborting the save.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_FILE, mode="w", encoding="utf-8", errors="backslashreplace") as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=2)

print("✓ File saved successfully.")

### 5token: Discussion