In [1]:
import subprocess, csv, json, sys
from pathlib import Path

# ---------------------------------------------------------------------------
# 0. Paths / constants
# ---------------------------------------------------------------------------
BASE_DIR = Path().resolve().parents[3]
INPUT_CSV   = BASE_DIR / "dataset" / "preparation" / "d4j_bug_commit_hash.csv"
OUTPUT_JSON = (
    BASE_DIR
    / "dataset" / "pyszz_v2" / "json-input-raw" / "defects4j"
    / "d4j_bugfix_commits_original.json"
)
GITHUB_ROOT = BASE_DIR / "dataset" / "repositories" / "defects4j" / "github.com"

# Primary mapping: Defects4J project ID → official GitHub org/repo
PRIMARY_MAP: dict[str, str] = {
    "Cli":          "apache/commons-cli",
    "Closure":      "google/closure-compiler",
    "Codec":        "apache/commons-codec",
    "Compress":     "apache/commons-compress",
    "Gson":         "google/gson",
    "JacksonCore":  "FasterXML/jackson-core",
    "Jsoup":        "jhy/jsoup",
    "Lang":         "apache/commons-lang",
    "Math":         "apache/commons-math",
    "Mockito":      "mockito/mockito",
    "Time":         "JodaOrg/joda-time",
}

# Fallback mapping: same repository name but under the Defects4J organisation
FALLBACK_ORG = "Defects4J"

# ---------------------------------------------------------------------------
# 1. Helper utilities
# ---------------------------------------------------------------------------

def repo_path(org_repo: str) -> Path:
    """Map an ``<org>/<repo>`` slug to its clone directory under GITHUB_ROOT."""
    return GITHUB_ROOT.joinpath(org_repo)


def ensure_full_history(repo: Path) -> None:
    """Best-effort: fetch the full history of *repo* if it looks shallow.

    A clone is treated as shallow when the file ``<repo>/.git/shallow``
    exists. In that case ``git fetch --unshallow`` is run inside it; the exit
    status is ignored and git's output is discarded.
    """
    if not (repo / ".git" / "shallow").exists():
        return  # no shallow marker: nothing to do

    unshallow_cmd = ["git", "-C", str(repo), "fetch", "--unshallow"]
    subprocess.call(
        unshallow_cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )


def rev_parse(repo: Path, short: str) -> "str | None":
    """Resolve *short* to the full SHA of a commit that exists in *repo*.

    Runs ``git rev-parse --verify --quiet "<short>^{commit}"`` inside *repo*.
    The ``^{commit}`` peel matters: a plain ``git rev-parse <40-hex>`` simply
    echoes the hash back without checking that the object is present, which
    would make a missing commit look resolved. With the peel, git must find
    the object and it must be (or dereference to) a commit.

    Args:
        repo: Directory of the local git clone.
        short: Abbreviated or full commit hash (any git revision expression).

    Returns:
        The full SHA as a string, or ``None`` when *short* is empty or git
        exits with a non-zero status (unknown/ambiguous revision, not a
        commit, or *repo* is not a git repository).
    """
    if not short:
        # An empty revision can never resolve; skip spawning git.
        return None
    try:
        output = subprocess.check_output(
            [
                "git", "-C", str(repo),
                "rev-parse", "--verify", "--quiet",
                f"{short}^{{commit}}",
            ],
            text=True,
            stderr=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError:
        return None
    return output.strip() or None


def resolve_short_sha(org_repo: str, short: str) -> "tuple[str | None, str]":
    """Resolve *short* in the primary clone, then in the fallback mirror.

    The primary clone is ``repo_path(org_repo)``; the mirror is the same
    repository name under ``FALLBACK_ORG``. Directories that do not exist
    locally are skipped. Each existing candidate is unshallowed (best-effort)
    before the lookup.

    Args:
        org_repo: ``<org>/<repo>`` slug. A bare name without ``/`` is also
            accepted and is used unchanged as the repository name for the
            mirror lookup.
        short: Abbreviated (or full) commit hash to resolve.

    Returns:
        ``(full_sha, path)`` where *path* is the directory in which the hash
        was resolved, or ``(None, primary_path)`` if no candidate resolved it.
    """
    primary_dir = repo_path(org_repo)

    # partition() never raises: without a "/" the separator comes back empty,
    # in which case the whole string is taken as the repository name.
    _, sep, tail = org_repo.partition("/")
    repo_name = tail if sep else org_repo
    mirror_dir = repo_path(f"{FALLBACK_ORG}/{repo_name}")

    candidates = [primary_dir]
    if mirror_dir != primary_dir:
        # Avoid probing the same directory twice when the slug already
        # points at the fallback organisation.
        candidates.append(mirror_dir)

    for candidate in candidates:
        if not candidate.exists():
            continue
        ensure_full_history(candidate)
        sha = rev_parse(candidate, short)
        if sha:
            return sha, str(candidate)
    return None, str(primary_dir)

# ---------------------------------------------------------------------------
# 2. Load CSV → in‑memory records
# ---------------------------------------------------------------------------
# One record per CSV row; ids are assigned sequentially starting at 10000.
records: list[dict] = []
with open(INPUT_CSV, newline="", encoding="utf-8") as csv_file:
    records.extend(
        {
            "id": record_id,
            # original project ID (e.g. "Closure")
            "pid": line["pid"].strip(),
            # bug id, kept as a string
            "bug_id": line["vid"].strip(),
            # single-element list holding the hash from the "commit" column
            "bug_commit_hash": [line["commit"].strip()],
        }
        for record_id, line in enumerate(csv.DictReader(csv_file), start=10000)
    )

# ---------------------------------------------------------------------------
# 3. Build a lookup table from active‑bugs.csv (keyed by *pid*)
# ---------------------------------------------------------------------------
# pid -> {bug.id -> revision.id.fixed}, one table per distinct project in
# `records`. A project whose active-bugs.csv is missing gets an empty table
# and a warning on stderr.
active_bugs: dict[str, dict[str, str]] = {}
projects_root = BASE_DIR / "dataset" / "defects4j" / "framework" / "projects"
for project_id in {entry["pid"] for entry in records}:
    csv_path = projects_root / project_id / "active-bugs.csv"
    if not csv_path.exists():
        print(f"[warn] {csv_path} does not exist.", file=sys.stderr)
        active_bugs[project_id] = {}
        continue
    with open(csv_path, newline="", encoding="utf-8") as bugs_file:
        active_bugs[project_id] = {
            line["bug.id"].strip(): line["revision.id.fixed"].strip()
            for line in csv.DictReader(bugs_file)
        }

# ---------------------------------------------------------------------------
# 4. Enrich each record and resolve its short SHA
# ---------------------------------------------------------------------------
# Turn each raw CSV record into its final JSON shape, in place:
#   id, bug_commit_hash, fix_commit_hash, repo_name, language
for rec in records:
    pid = rec.pop("pid")                    # e.g. "Closure"
    bug_id = rec.pop("bug_id")              # only needed for the lookup below
    # Unmapped project IDs fall back to the bare ID as the repository slug.
    org_repo = PRIMARY_MAP.get(pid, pid)

    # a) attach the fixed revision listed in active-bugs.csv (None if absent)
    fix_sha = active_bugs.get(pid, {}).get(bug_id)
    if fix_sha is None:
        # Make the gap visible instead of silently emitting a null.
        print(
            f"[warn] {pid}-{bug_id}: no 'revision.id.fixed' entry found.",
            file=sys.stderr,
        )
    rec["fix_commit_hash"] = fix_sha

    # b) expand the abbreviated hash to the full SHA; on failure the original
    #    abbreviated value is kept and a warning is emitted
    short = rec["bug_commit_hash"][0]
    full_sha, path_seen = resolve_short_sha(org_repo, short)
    if full_sha:
        rec["bug_commit_hash"][0] = full_sha
    else:
        # Report the hash itself (not the enclosing list) and where we looked.
        print(
            f"[warn] {org_repo}: failed to resolve '{short}' "
            f"(primary clone: {path_seen}).",
            file=sys.stderr,
        )

    # c) finalise record
    # NOTE(review): repo_name is always the primary slug, even when the hash
    # was only resolvable in the fallback mirror - confirm this is intended.
    rec["repo_name"] = org_repo
    rec["language"] = ["java"]

# ---------------------------------------------------------------------------
# 5. Write the resulting JSON
# ---------------------------------------------------------------------------
# Create the destination directory first: open(..., "w") does not create
# missing parent directories and would fail with FileNotFoundError.
OUTPUT_JSON.parent.mkdir(parents=True, exist_ok=True)

# Serialise all records as indented JSON; ensure_ascii=False keeps any
# non-ASCII characters as-is rather than \u-escaping them.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✅ Wrote {len(records)} records to {OUTPUT_JSON}")


✅ Wrote 130 records to /local2/i-kondo/szz/majority-voting-szz-replication-package/dataset/pyszz_v2/json-input-raw/defects4j/d4j_bugfix_commits_original.json
