# Create Annotation File from Extraction JSON

This notebook reads an extraction JSON file and creates a simplified annotation file containing only compositions and their properties.

In [7]:
import json
from pathlib import Path

# Input file path - modify this to point to your extraction JSON file
input_file_path = r"C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\km2_papers\km2_papers\Hasan\mat_tech_4\mat_tech_4_extraction.json"

# Convert to Path object
input_path = Path(input_file_path)

# Create output file path by adding '_annotation' before the extension
output_path = input_path.parent / f"{input_path.stem}_annotation{input_path.suffix}"

print(f"Input file: {input_path}")
print(f"Output file: {output_path}")

# Read the extraction JSON file
with open(input_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Successfully loaded extraction file")
print(f"Number of compositions: {len(data.get('compositions', []))}")

# Create annotation structure with only composition and properties
annotation_data = {
    "compositions": []
}

for comp in data.get('compositions', []):
    annotation_comp = {
        "composition": comp.get("composition", ""),
        "properties_of_composition": comp.get("properties_of_composition", [])
    }
    annotation_data["compositions"].append(annotation_comp)

print(f"Created annotation structure with {len(annotation_data['compositions'])} compositions")

# Write the annotation JSON file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(annotation_data, f, indent=2, ensure_ascii=False)

print(f"Successfully created annotation file: {output_path}")

Input file: C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\km2_papers\km2_papers\Hasan\mat_tech_4\mat_tech_4_extraction.json
Output file: C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\km2_papers\km2_papers\Hasan\mat_tech_4\mat_tech_4_extraction_annotation.json
Successfully loaded extraction file
Number of compositions: 3
Created annotation structure with 3 compositions
Successfully created annotation file: C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\km2_papers\km2_papers\Hasan\mat_tech_4\mat_tech_4_extraction_annotation.json


In [1]:
import pickle
import pandas as pd

# Path to your .pkl file
pkl_path = "C:\\Users\\hsayeed\\Documents\\GitHub\\KnowMat2\\mpds-properties.pkl"

# Read the pickle file
with open(pkl_path, "rb") as f:
    data = pickle.load(f)

# Convert to a pandas DataFrame
df = pd.DataFrame(data, columns=[
    "id", "formula", "space_group", "paper_id",
    "property_name", "unit", "value"
])

# Save as CSV
csv_path = "output.csv"
df.to_csv(csv_path, index=False)

print(f"Saved CSV to: {csv_path}")
print(df.head())


ValueError: 7 columns passed, passed data had 97341 columns

In [3]:
import pickle
import pandas as pd
from ast import literal_eval
from itertools import zip_longest
from pathlib import Path

pkl_path = "C:\\Users\\hsayeed\\Documents\\GitHub\\KnowMat2\\mpds-properties.pkl"
csv_path = "output.csv"

# 1) Load the pickle
with open(pkl_path, "rb") as f:
    obj = pickle.load(f)

# 2) If it's a string that looks like a Python list, parse it
if isinstance(obj, str):
    try:
        data = literal_eval(obj)  # safer than eval; handles Python repr with single quotes
    except Exception as e:
        raise ValueError(f"Loaded a string from pickle but couldn't parse it as a list: {e}")
else:
    data = obj

# 3) Quick diagnostics: print type and a few samples
print(f"type(data) = {type(data)}")
if isinstance(data, list):
    print(f"len(data) = {len(data)}")
    if len(data) > 0:
        print("First row sample:", data[0])
        try:
            print("Row lengths (first 5):", [len(r) for r in data[:5]])
        except TypeError:
            print("Rows are not sequences; will wrap each item as a single-element row.")
else:
    # If it's not a list (e.g., numpy array, pandas object), try to coerce to list of rows
    try:
        data = list(data)
        print("Coerced data to list via list(...).")
    except Exception as e:
        raise ValueError(f"Don't know how to handle object of type {type(obj)}: {e}")

# 4) Normalize to a list of rows (list of sequences). If elements aren't sequences, wrap them.
rows = []
for item in data:
    if isinstance(item, (list, tuple)):
        rows.append(list(item))
    else:
        # Non-sequence element; treat as a 1-column row
        rows.append([item])

# If rows are ragged (varying lengths), pad them with None so pandas can ingest uniformly.
max_len = max(len(r) for r in rows) if rows else 0
if any(len(r) != max_len for r in rows):
    rows = [list(z) for z in zip_longest(*rows, fillvalue=None)]  # <-- careful: this transposes!
    # The above zip_longest transposed the data. Undo the transpose to keep rows as rows.
    rows = [list(r) for r in zip_longest(*rows, fillvalue=None)]

# 5) Make column names: use your 7 well-known names if it matches; otherwise generic.
known_cols = ["id", "formula", "space_group", "paper_id", "property_name", "unit", "value"]
if max_len == len(known_cols):
    columns = known_cols
else:
    columns = [f"col_{i}" for i in range(max_len)]
    print(f"Detected {max_len} columns. Using generic names: {columns[:10]}{'...' if max_len>10 else ''}")

df = pd.DataFrame(rows, columns=columns)

# Optional: if you know some columns should be numeric, coerce them
for c in ("space_group", "value"):
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# 6) Save CSV
df.to_csv(csv_path, index=False)
print(f"Saved CSV to {Path(csv_path).resolve()}")

# Show a preview
print(df.head(10))


type(data) = <class 'list'>
len(data) = 7
First row sample: [[10, 'CuAl2', 140, 'P1313748-5', 'wavenumber of longitudinal optical phonon', 'mm-1', 26.4], [13, 'Al', 225, 'P1424214-1', 'work function', 'eV', 4.25], [13, 'Al', 225, 'P1424214-2', 'work function', 'eV', 4.07], [26, 'SrS', 225, 'P1923067-2', 'wavelength for luminescence', 'nm', 381.0], [32, 'SbB hyp', 216, 'P1117618-4', 'wavenumber of longitudinal optical phonon', 'mm-1', 62.1], [32, 'SbB hyp', 216, 'P1117618-5', 'wavenumber of transverse optical phonon', 'mm-1', 61.1], [32, 'SbB hyp', 216, 'P1916707-4', 'reflectivity', '', 0.19], [32, 'SbB hyp', 216, 'P1916707-5', 'refractive index', '', 2.52], [32, 'SbB hyp', 216, 'P1916707-6', 'pressure dependence of refractive index', 'GPa-1', -0.000353], [76, 'Ru rt', 194, 'P1309618-10', 'reflectivity', '', 0.99], [76, 'Ru rt', 194, 'P1309618-11', 'reflectivity', '', 0.19], [76, 'Ru rt', 194, 'P1309618-12', 'optical conductivity', 'Omega-1 m-1', 530000.0], [76, 'Ru rt', 194, 'P1309618-

In [4]:
import pickle
import pandas as pd
from ast import literal_eval
from pathlib import Path

# ---- config ----
pkl_path = "C:\\Users\\hsayeed\\Documents\\GitHub\\KnowMat2\\mpds-properties.pkl"
out_dir = Path("csv_by_property")
out_dir.mkdir(parents=True, exist_ok=True)

# Canonical property buckets in order (and filenames)
property_names = [
    "optical properties",
    "phase transitions",
    "electronic and electrical properties",
    "superconductivity",
    "magnetic properties",
    "mechanical properties",
    "thermal and thermodynamic properties",
]

columns = ["id", "formula", "space_group", "paper_id", "property_name", "unit", "value"]

# ---- load pickle ----
with open(pkl_path, "rb") as f:
    obj = pickle.load(f)

# If the pickle contains a string that looks like Python data, parse it
if isinstance(obj, str):
    try:
        obj = literal_eval(obj)
    except Exception as e:
        raise ValueError(f"Pickle contained a string but couldn't parse it: {e}")

def to_rows_list(x):
    """Coerce an object into a list of row-lists (len==7); drop malformed rows."""
    rows = []
    for item in x:
        if isinstance(item, (list, tuple)):
            if len(item) == 7:
                rows.append(list(item))
            else:
                # Skip malformed rows; comment this out if you prefer padding
                continue
        else:
            # Not a row; skip
            continue
    return rows

# ---- handle two common shapes: dict or list[7] ----
buckets = {}

if isinstance(obj, dict):
    # Expect keys to be the property names (any capitalization/whitespace tolerated)
    norm_map = {k.strip().lower(): k for k in obj.keys()}
    for pname in property_names:
        key_norm = pname.lower()
        if key_norm in norm_map:
            orig_key = norm_map[key_norm]
            buckets[pname] = to_rows_list(obj[orig_key])
        else:
            buckets[pname] = []  # missing bucket -> empty CSV
elif isinstance(obj, list) and len(obj) == 7 and all(isinstance(x, (list, tuple)) for x in obj):
    # Assume obj[i] corresponds to property_names[i]
    for pname, bucket in zip(property_names, obj):
        buckets[pname] = to_rows_list(bucket)
else:
    raise ValueError(
        "Unexpected pickle structure. Expected a dict keyed by the 7 property names "
        "or a list of length 7 where each element is a list of rows."
    )

# ---- write CSVs ----
for pname in property_names:
    rows = buckets[pname]
    df = pd.DataFrame(rows, columns=columns)
    # (optional) type coercions
    for c in ("space_group", "value"):
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    # safe filename
    fname = pname.replace(" ", "_").replace("/", "-")
    out_path = out_dir / f"{fname}.csv"
    df.to_csv(out_path, index=False)
    print(f"Wrote {len(df):>6} rows -> {out_path}")

print("Done.")


Wrote  13367 rows -> csv_by_property\optical_properties.csv
Wrote  30263 rows -> csv_by_property\phase_transitions.csv
Wrote  67289 rows -> csv_by_property\electronic_and_electrical_properties.csv
Wrote  12086 rows -> csv_by_property\superconductivity.csv
Wrote  97341 rows -> csv_by_property\magnetic_properties.csv
Wrote  13871 rows -> csv_by_property\mechanical_properties.csv
Wrote  40655 rows -> csv_by_property\thermal_and_thermodynamic_properties.csv
Done.


In [6]:
import json
from pathlib import Path

# Input file path - modify this to point to your extraction JSON file
input_file_path = r"C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\New\Hasan\mech_3_acta_matarialia\mech_3_acta_matarialia_extraction.json"

# Convert to Path object
input_path = Path(input_file_path)

# Create output file path by adding '_annotation' before the extension
output_path = input_path.parent / f"{input_path.stem}_annotation{input_path.suffix}"

print(f"Input file: {input_path}")
print(f"Output file: {output_path}")

# Read the extraction JSON file
with open(input_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Successfully loaded extraction file")
print(f"Number of compositions: {len(data.get('compositions', []))}")

# Create annotation structure with only composition and properties
annotation_data = {"compositions": []}

for comp in data.get("compositions", []):
    # Filter out properties where standard_property_name is None or empty
    filtered_properties = [
        prop for prop in comp.get("properties_of_composition", [])
        if prop.get("standard_property_name")
    ]
    
    annotation_comp = {
        "composition": comp.get("composition", ""),
        "properties_of_composition": filtered_properties
    }
    annotation_data["compositions"].append(annotation_comp)

print(f"Created annotation structure with {len(annotation_data['compositions'])} compositions")

# Write the annotation JSON file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(annotation_data, f, indent=2, ensure_ascii=False)

print(f"Successfully created filtered annotation file: {output_path}")


Input file: C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\New\Hasan\mech_3_acta_matarialia\mech_3_acta_matarialia_extraction.json
Output file: C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\New\Hasan\mech_3_acta_matarialia\mech_3_acta_matarialia_extraction_annotation.json
Successfully loaded extraction file
Number of compositions: 2
Created annotation structure with 2 compositions
Successfully created filtered annotation file: C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\New\Hasan\mech_3_acta_matarialia\mech_3_acta_matarialia_extraction_annotation.json


In [7]:
from pathlib import Path
import json

# === Set this to your top-level folder ===
BASE_DIR = Path(r"C:\Users\hsayeed\Documents\GitHub\KnowMat2\data\raw\New\Hasan")

def filter_properties(record: dict) -> dict:
    """Keep compositions, but filter properties to those with a non-empty standard_property_name."""
    out = {"compositions": []}
    for comp in record.get("compositions", []):
        props = comp.get("properties_of_composition", [])
        filtered = [p for p in props if p.get("standard_property_name")]
        out["compositions"].append({
            "composition": comp.get("composition", ""),
            "properties_of_composition": filtered
        })
    return out

def process_extraction_file(src_path: Path) -> Path | None:
    """Process a single *_extraction.json file and write annotation_*.json next to it."""
    try:
        with src_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"[ERROR] Failed to read {src_path}: {e}")
        return None

    filtered = filter_properties(data)

    # Output name starts with 'annotation_' and stays in the same folder
    out_path = src_path.with_name(f"annotation_{src_path.stem}{src_path.suffix}")

    try:
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(filtered, f, indent=2, ensure_ascii=False)
        return out_path
    except Exception as e:
        print(f"[ERROR] Failed to write {out_path}: {e}")
        return None

def process_folder(base_dir: Path):
    files = list(base_dir.rglob("*_extraction.json"))
    if not files:
        print(f"No *_extraction.json files found under: {base_dir}")
        return

    print(f"Found {len(files)} file(s). Processing...\n")
    success, failed = 0, 0
    for fp in files:
        out = process_extraction_file(fp)
        if out:
            success += 1
            print(f"[OK]  {fp.relative_to(base_dir)}  -->  {out.name}")
        else:
            failed += 1

    print("\nDone.")
    print(f"Successful: {success}")
    print(f"Failed:     {failed}")

# Run it
process_folder(BASE_DIR)


Found 10 file(s). Processing...

[OK]  mech_13_alloys_and_compounds\mech_13_alloys_and_compounds_extraction.json  -->  annotation_mech_13_alloys_and_compounds_extraction.json
[OK]  mech_1_physical_review_b\mech_1_physical_review_b_extraction.json  -->  annotation_mech_1_physical_review_b_extraction.json
[OK]  mech_3_acta_matarialia\mech_3_acta_matarialia_extraction.json  -->  annotation_mech_3_acta_matarialia_extraction.json
[OK]  mech_6_alloys_and_compounds\mech_6_alloys_and_compounds_extraction.json  -->  annotation_mech_6_alloys_and_compounds_extraction.json
[OK]  mech_9_physical_review_b\mech_9_physical_review_b_extraction.json  -->  annotation_mech_9_physical_review_b_extraction.json
[OK]  thermo_12_j_apl_phy\thermo_12_j_apl_phy_extraction.json  -->  annotation_thermo_12_j_apl_phy_extraction.json
[OK]  thermo_2_mat_aci_eng_b\thermo_2_mat_aci_eng_b_extraction.json  -->  annotation_thermo_2_mat_aci_eng_b_extraction.json
[OK]  thermo_4_alloys_and_compounds\thermo_4_alloys_and_compoun