# Create Annotation File from Extraction JSON

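This notebook reads an extraction JSON file and creates a simplified annotation file that keeps, for each entry in `compositions`, only its `composition` and `properties_of_composition` fields. The output is written next to the input file, with `_annotation` inserted before the file extension.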

In [None]:
import json
from pathlib import Path

# Location of the extraction JSON to convert - edit this to point at your own file
input_file_path = r"C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\km2_papers\km2_papers\Hasan\phys_revb_1\phys_revb_1_extraction.json"

# Wrap the raw string in a Path so its name components can be picked apart
input_path = Path(input_file_path)

# The annotation file lives beside the input: same stem with '_annotation'
# appended, same extension (e.g. foo.json -> foo_annotation.json)
annotation_name = f"{input_path.stem}_annotation{input_path.suffix}"
output_path = input_path.with_name(annotation_name)

print(f"Input file: {input_path}")
print(f"Output file: {output_path}")

# Load the whole extraction document into memory. The encoding is passed
# explicitly so decoding does not depend on the platform default.
with open(input_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

print("Successfully loaded extraction file")
# `or []` covers both a missing "compositions" key and an explicit JSON
# null (which json.load returns as None and len() cannot handle).
print(f"Number of compositions: {len(data.get('compositions') or [])}")

# Build the annotation document: one entry per extracted composition that
# keeps only its "composition" and "properties_of_composition" values;
# every other key of the extraction record is dropped.
# `or []` treats an explicit JSON null the same as a missing
# "compositions" key, so the iteration never runs over None.
annotation_data = {
    "compositions": [
        {
            # Absent keys fall back to an empty string / empty list.
            "composition": comp.get("composition", ""),
            "properties_of_composition": comp.get("properties_of_composition", []),
        }
        for comp in (data.get('compositions') or [])
    ]
}

print(f"Created annotation structure with {len(annotation_data['compositions'])} compositions")

# Serialize the annotation structure to text, then write it out in one call.
# indent=2 keeps the file human-readable; ensure_ascii=False stores
# non-ASCII characters as-is rather than as \uXXXX escapes.
annotation_text = json.dumps(annotation_data, indent=2, ensure_ascii=False)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(annotation_text)

print(f"Successfully created annotation file: {output_path}")