# Create Annotation File from Extraction JSON

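This notebook reads the `*_extraction.json` files found (recursively) under the parent folder and, for each one, creates a simplified annotation file containing only the compositions and their properties (properties without a `standard_property_name` are dropped).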

In [7]:
from pathlib import Path
import json

# === Set this to your top-level folder ===
# This path is passed to process_folder(), which searches it recursively for
# *_extraction.json files.
# NOTE(review): machine-specific absolute Windows path; edit it before running
# the notebook elsewhere.
BASE_DIR = Path(r"C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\New\Hasan")

def filter_properties(record: dict) -> dict:
    """Return a simplified copy of *record* with only annotatable properties.

    Every composition entry is kept, but its ``properties_of_composition``
    list is reduced to the properties that carry a non-empty
    ``standard_property_name``.

    Args:
        record: Parsed extraction JSON object. Expected to hold a
            ``"compositions"`` list of objects, each with ``"composition"``
            and ``"properties_of_composition"`` keys.

    Returns:
        ``{"compositions": [...]}`` where each item has exactly the keys
        ``"composition"`` (``""`` when missing) and
        ``"properties_of_composition"`` (the filtered list).

    Missing keys, explicit JSON ``null`` values and non-list values are
    treated as empty lists; entries that are not JSON objects are skipped
    instead of raising.
    """
    compositions = record.get("compositions")
    # dict.get's default only covers a missing key, not an explicit null.
    if not isinstance(compositions, list):
        compositions = []

    out = {"compositions": []}
    for comp in compositions:
        if not isinstance(comp, dict):
            # Malformed entry (string, number, null, ...): nothing to keep.
            continue
        props = comp.get("properties_of_composition")
        if not isinstance(props, list):
            props = []
        filtered = [
            p for p in props
            if isinstance(p, dict) and p.get("standard_property_name")
        ]
        out["compositions"].append({
            "composition": comp.get("composition", ""),
            "properties_of_composition": filtered,
        })
    return out

def process_extraction_file(src_path: Path) -> Path | None:
    """Process a single *_extraction.json file and write annotation_*.json next to it."""
    try:
        with src_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"[ERROR] Failed to read {src_path}: {e}")
        return None

    filtered = filter_properties(data)

    # Output name starts with 'annotation_' and stays in the same folder
    out_path = src_path.with_name(f"annotation_{src_path.stem}{src_path.suffix}")

    try:
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(filtered, f, indent=2, ensure_ascii=False)
        return out_path
    except Exception as e:
        print(f"[ERROR] Failed to write {out_path}: {e}")
        return None

def process_folder(base_dir: Path) -> None:
    """Create annotation files for every extraction file under *base_dir*.

    Searches *base_dir* recursively for ``*_extraction.json`` files, runs
    ``process_extraction_file`` on each one and prints a per-file status line
    followed by success/failure totals.

    Args:
        base_dir: Top-level folder to search.
    """
    # Generated files are named annotation_<name>_extraction.json and therefore
    # match the glob themselves; skip them so that re-running does not produce
    # annotation_annotation_* files. Sorting makes the processing order stable.
    files = sorted(
        fp
        for fp in base_dir.rglob("*_extraction.json")
        if not fp.name.startswith("annotation_")
    )
    if not files:
        print(f"No *_extraction.json files found under: {base_dir}")
        return

    print(f"Found {len(files)} file(s). Processing...\n")
    success, failed = 0, 0
    for fp in files:
        out = process_extraction_file(fp)
        if out is not None:
            success += 1
            print(f"[OK]  {fp.relative_to(base_dir)}  -->  {out.name}")
        else:
            failed += 1

    print("\nDone.")
    print(f"Successful: {success}")
    print(f"Failed:     {failed}")

# Run the batch conversion: process every *_extraction.json file found under
# BASE_DIR and print a per-file status plus success/failure totals.
process_folder(BASE_DIR)


Found 10 file(s). Processing...

[OK]  mech_13_alloys_and_compounds\mech_13_alloys_and_compounds_extraction.json  -->  annotation_mech_13_alloys_and_compounds_extraction.json
[OK]  mech_1_physical_review_b\mech_1_physical_review_b_extraction.json  -->  annotation_mech_1_physical_review_b_extraction.json
[OK]  mech_3_acta_matarialia\mech_3_acta_matarialia_extraction.json  -->  annotation_mech_3_acta_matarialia_extraction.json
[OK]  mech_6_alloys_and_compounds\mech_6_alloys_and_compounds_extraction.json  -->  annotation_mech_6_alloys_and_compounds_extraction.json
[OK]  mech_9_physical_review_b\mech_9_physical_review_b_extraction.json  -->  annotation_mech_9_physical_review_b_extraction.json
[OK]  thermo_12_j_apl_phy\thermo_12_j_apl_phy_extraction.json  -->  annotation_thermo_12_j_apl_phy_extraction.json
[OK]  thermo_2_mat_aci_eng_b\thermo_2_mat_aci_eng_b_extraction.json  -->  annotation_thermo_2_mat_aci_eng_b_extraction.json
[OK]  thermo_4_alloys_and_compounds\thermo_4_alloys_and_compoun