# pubchem.author Namespace Discovery

Match the IRIs found in the pubchem.author dataset to Bioregistry prefixes, and show where each matched (and unmatched) IRI appears in the extracted schema patterns.

In [None]:
# Dataset identifier; it is interpolated into every input and output path in this notebook.
db = "pubchem.author"
# Turtle file for this dataset, located under the schema-extraction data directory
# (relative to the notebook's working directory).
ttl_db_path = f"../../docs/data/schema_extraction/{db}/{db}_generated_void.ttl"

In [None]:
import bioregistry
import pandas as pd

from rdfsolve.api import load_parser_from_file

# Build the parser object from the dataset's Turtle file; later cells call
# `vp.load_instances_compact(...)` on it to read the pre-computed instance data.
vp = load_parser_from_file(ttl_db_path)

In [None]:
# Instance data was pre-computed by the schema-extraction step; the loader takes the
# common path prefix of those files and returns three objects, unpacked below.
instances_prefix = f"../../docs/data/schema_extraction/{db}/{db}_instances"
_compact = vp.load_instances_compact(instances_prefix)
instances_df, subject_index, object_index = _compact

# Pattern coverage table; later cells read its `shape_id` and `shape_pattern` columns.
_coverage_csv = f"../../docs/data/schema_extraction/{db}/{db}_pattern_coverage.csv"
freqs_df = pd.read_csv(_coverage_csv)

_n_instances = len(instances_df)
_n_patterns = len(freqs_df)
print(f"Loaded {_n_instances} instance relationships")
print(f"Loaded {_n_patterns} schema patterns")

## Extract Unique Resources

In [None]:
# Distinct values seen in the subject and object columns of the instance table.
subject_iris = set(instances_df["subject_iri"])
object_iris = set(instances_df["object_iri"])
# An IRI occurring in both roles is counted only once here.
all_iris = subject_iris.union(object_iris)

for _label, _group in (
    ("subjects", subject_iris),
    ("objects", object_iris),
    ("IRIs", all_iris),
):
    print(f"Total unique {_label}: {len(_group)}")

## Match Resources to Bioregistry

In [None]:
from itertools import chain

# matched:   Bioregistry prefix -> list of IRIs that resolved to a CURIE with that prefix.
# unmatched: IRIs for which no CURIE was returned (dict keyed by IRI, values always None).
matched = {}
unmatched = {}

for iri in all_iris:
    # Anything that is not a string starting with "http" is skipped entirely and
    # ends up in neither `matched` nor `unmatched`.
    if isinstance(iri, str) and iri.startswith("http"):
        curie = bioregistry.curie_from_iri(iri)
        if not curie:
            unmatched[iri] = None
        else:
            # The prefix is the text before the first ":" of the CURIE.
            prefix = curie.partition(":")[0]
            matched.setdefault(prefix, []).append(iri)

all_matched_iris = list(chain.from_iterable(matched.values()))
n_prefixes = len(matched)
n_matched = len(all_matched_iris)
print(f"Matched to {n_prefixes} bioregistry prefixes: {n_matched} IRIs")
print(f"Unmatched: {len(unmatched)} IRIs")

## Build IRI-to-Shape Index

In [None]:
# Index every IRI by the schema shapes it takes part in, split by role:
#   iri_shapes[iri] = {"as_subject": {(shape_id, pattern), ...},
#                      "as_object":  {(shape_id, pattern), ...}}
iri_shapes = {}

# Resolve shape_id -> shape_pattern once, up front. Filtering `freqs_df` with a boolean
# mask for every instance row costs O(rows x patterns); a dict lookup is O(1) per row.
# `drop_duplicates` keeps the first row per shape_id, i.e. the same pattern that
# `freqs_df[freqs_df["shape_id"] == shape_id]["shape_pattern"].iloc[0]` would select.
shape_patterns = (
    freqs_df.drop_duplicates(subset="shape_id")
    .set_index("shape_id")["shape_pattern"]
    .to_dict()
)

# Iterate the three needed columns directly instead of `iterrows()`, which builds a
# Series object per row.
for shape_id, subject, obj in zip(
    instances_df["shape_id"],
    instances_df["subject_iri"],
    instances_df["object_iri"],
):
    # A shape_id absent from the coverage table raises KeyError naming that id.
    pattern = shape_patterns[shape_id]

    subject_entry = iri_shapes.setdefault(
        subject, {"as_subject": set(), "as_object": set()}
    )
    subject_entry["as_subject"].add((shape_id, pattern))

    object_entry = iri_shapes.setdefault(
        obj, {"as_subject": set(), "as_object": set()}
    )
    object_entry["as_object"].add((shape_id, pattern))

print(f"Indexed {len(iri_shapes)} unique IRIs")

## Detailed Matched Resources

## Matched Prefix Summary

In [None]:
# One summary row per matched prefix, largest prefixes first.
prefix_summary = []

# Stand-in for IRIs missing from the shape index: both roles empty (never mutated).
_no_roles = {"as_subject": (), "as_object": ()}

# Sorting ascending on the negated size is a stable descending sort by IRI count.
_ranked = sorted(matched.items(), key=lambda item: -len(item[1]))

for prefix, iris in _ranked:
    _roles = [iri_shapes.get(iri, _no_roles) for iri in iris]

    # Number of IRIs (not occurrences) seen at least once in each role.
    as_subject_count = sum(1 for entry in _roles if entry["as_subject"])
    as_object_count = sum(1 for entry in _roles if entry["as_object"])

    prefix_summary.append(
        {
            "prefix": prefix,
            "iri_count": len(iris),
            "as_subject": as_subject_count,
            "as_object": as_object_count,
            "example_iri": iris[0],
        }
    )

prefix_df = pd.DataFrame(prefix_summary)
print(f"Matched {len(matched)} bioregistry prefixes:")
prefix_df

In [None]:
# One row per matched IRI: its prefix, how many distinct shapes it appears in for each
# role, and one example pattern per role (None when the IRI never has that role).
matched_results = []

# Used for IRIs that are absent from the shape index; never mutated.
_no_shapes = {"as_subject": set(), "as_object": set()}

for prefix, iris in matched.items():
    for iri in iris:
        roles = iri_shapes.get(iri, _no_shapes)
        subject_shapes = roles["as_subject"]
        object_shapes = roles["as_object"]

        # Example pattern = pattern of the first (shape_id, pattern) pair the set yields.
        subject_example = next((p for _, p in subject_shapes), None)
        object_example = next((p for _, p in object_shapes), None)

        matched_results.append(
            {
                "prefix": prefix,
                "iri": iri,
                "as_subject": len(subject_shapes),
                "as_object": len(object_shapes),
                "subject_pattern": subject_example,
                "object_pattern": object_example,
            }
        )

matched_df = pd.DataFrame(matched_results)
print(f"Matched resources: {len(matched_results)} IRIs across {len(matched)} prefixes")
matched_df

## Results: Unmatched Resources

In [None]:
# Preview table for IRIs that did not resolve to a CURIE: first 20 only.
unmatched_iris = list(unmatched)

# Used for IRIs that are absent from the shape index; never mutated.
_no_shapes = {"as_subject": set(), "as_object": set()}

unmatched_results = [
    {
        "iri": iri[:80],  # truncated to 80 characters for display
        "as_subject": len(roles["as_subject"]),
        "as_object": len(roles["as_object"]),
        # Example pattern = pattern of the first (shape_id, pattern) pair the set yields.
        "subject_pattern": next((p for _, p in roles["as_subject"]), None),
        "object_pattern": next((p for _, p in roles["as_object"]), None),
    }
    for iri, roles in (
        (candidate, iri_shapes.get(candidate, _no_shapes))
        for candidate in unmatched_iris[:20]
    )
]

unmatched_df = pd.DataFrame(unmatched_results)
print(f"Unmatched resources ({len(unmatched_results)} of {len(unmatched_iris)} shown):")
unmatched_df

## Summary

In [None]:
# Flatten the per-prefix lists once. `sum(matched.values(), [])` re-copies the growing
# list for every prefix (quadratic), and it was evaluated three times.
matched_iris = [iri for iris in matched.values() for iri in iris]

total_matched = len(matched_iris)
# Count matched IRIs that appear at least once in each role.
matched_subjects = sum(
    1 for iri in matched_iris if iri in iri_shapes and iri_shapes[iri]["as_subject"]
)
matched_objects = sum(
    1 for iri in matched_iris if iri in iri_shapes and iri_shapes[iri]["as_object"]
)

# Percentages are relative to all unique IRIs; report 0.0% instead of raising
# ZeroDivisionError when the instance table is empty.
total_iris = len(all_iris)
matched_pct = total_matched / total_iris * 100 if total_iris else 0.0
unmatched_pct = len(unmatched) / total_iris * 100 if total_iris else 0.0

print("NAMESPACE DISCOVERY SUMMARY")
print(f"Total unique IRIs analyzed: {total_iris}")
print("\nMatched to Bioregistry:")
print(f"  Total: {total_matched} IRIs ({len(matched)} prefixes)")
print(f"  As subjects: {matched_subjects} IRIs")
print(f"  As objects: {matched_objects} IRIs")
print(f"  Coverage: {matched_pct:.1f}%")
print("\nUnmatched:")
print(f"  Total: {len(unmatched)} IRIs")
print(f"  Coverage: {unmatched_pct:.1f}%")

## Export Matching Results

In [None]:
import os

# All exports for this dataset go into a dedicated "namespaces" folder,
# created on demand.
namespaces_path = f"../../docs/data/schema_extraction/{db}/namespaces"
os.makedirs(namespaces_path, exist_ok=True)

# Per-prefix summary table.
_prefix_csv = f"{namespaces_path}/prefix_summary.csv"
prefix_df.to_csv(_prefix_csv, index=False)
print(f"Exported prefix summary: {len(prefix_df)} prefixes")

# Per-IRI table of matched resources.
_matched_csv = f"{namespaces_path}/matched_resources.csv"
matched_df.to_csv(_matched_csv, index=False)
print(f"Exported matched resources: {len(matched_df)} IRIs")

# Unmatched resources: every IRI with its full, untruncated value,
# unlike the 20-row preview shown earlier.
_no_shapes = {"as_subject": set(), "as_object": set()}  # never mutated
unmatched_full = [
    {
        "iri": iri,
        "as_subject": len(roles["as_subject"]),
        "as_object": len(roles["as_object"]),
        # Example pattern = pattern of the first (shape_id, pattern) pair the set yields.
        "subject_pattern": next((p for _, p in roles["as_subject"]), None),
        "object_pattern": next((p for _, p in roles["as_object"]), None),
    }
    for iri, roles in (
        (candidate, iri_shapes.get(candidate, _no_shapes)) for candidate in unmatched
    )
]

unmatched_full_df = pd.DataFrame(unmatched_full)
_unmatched_csv = f"{namespaces_path}/unmatched_resources.csv"
unmatched_full_df.to_csv(_unmatched_csv, index=False)
print(f"Exported unmatched resources: {len(unmatched_full_df)} IRIs")

print(f"\nAll results exported to: {namespaces_path}/")