# pubchem.anatomy Schema Extraction

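This notebook demonstrates RDF schema extraction from the pubchem.anatomy SPARQL endpoint by discovering, or querying for, VoID (Vocabulary of Interlinked Datasets) descriptions, and then shows some downstream uses of the extracted schema (coverage analysis, LinkML export, and Pydantic model generation).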

In [None]:
# Dataset Configuration
import os

# Dataset parameters
# SPARQL endpoint that the discovery and coverage queries below are sent to.
endpoint_url = "https://idsm.elixir-czech.cz/sparql/endpoint/idsm"
# Short dataset identifier; also used to name the export folder and output files.
dataset_name = "pubchem.anatomy"
# NOTE(review): void_iri and graph_uri hold the same IRI and are not used by any
# active code in the visible cells (graph_uri only appears in a commented-out
# argument of the discovery call) — confirm whether they are still needed.
void_iri = "http://rdf.ncbi.nlm.nih.gov/pubchem/anatomy"
graph_uri = "http://rdf.ncbi.nlm.nih.gov/pubchem/anatomy"

# Setup paths
# abspath("") resolves to the current working directory, so exports_path depends
# on where the kernel was started.
working_path = os.path.abspath("")
# Exports go to <working_path>/../../docs/notebooks/<dataset_name>.
exports_path = os.path.join(working_path, "..", "..", "docs", "notebooks", dataset_name)
# Create the export directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(exports_path, exist_ok=True)

In [None]:
# Import libraries
import json
from rdfsolve.void_parser import VoidParser
from IPython.display import display, Markdown

# Configure Plotly for HTML output
import plotly.io as pio
import plotly.offline as pyo

# Set renderer to 'notebook' for Jupyter, but ensure HTML export works
# ("+" combines two Plotly renderers so both output representations are emitted).
pio.renderers.default = "notebook+plotly_mimetype"

# Initialize offline mode for Plotly
# NOTE(review): per the Plotly docs, connected=True loads plotly.js from a CDN
# rather than embedding it, so rendered figures need network access — confirm
# this is intended for exported HTML.
pyo.init_notebook_mode(connected=True)

## Discover or get VoID Schema

In [None]:
# Build the parser via VoidParser.from_endpoint_with_discovery, pointing it at the
# configured endpoint, dataset name and export directory. The returned object (`vp`)
# drives every later step of this notebook.
# NOTE(review): the exact effect of counts=True is defined in rdfsolve — presumably
# it asks for instance counts to be collected; verify against the library.
# The two commented-out keyword arguments are optional settings left disabled.
vp = VoidParser.from_endpoint_with_discovery(
    endpoint_url=endpoint_url,
    dataset_name=dataset_name,
    exports_path=exports_path,
    #exclude_graph_patterns=["openlinksw", "well-known"], # Filter out administrative graphs, service descriptions, etc
    counts=True,
    #graph_uri=graph_uri,
)

In [None]:
# Turn the parsed VoID description into a schema table (presumably a pandas
# DataFrame: it is later used with .describe() and .to_csv()).
# The bare name on the last line makes the notebook render the whole table.
discovery_df = vp.to_schema(
    filter_void_admin_nodes=True
)  # to filter out unwanted graphs here (TODO improve logic, add step when querying)
discovery_df

In [None]:
# Summary statistics for the schema table; include='all' asks describe() to cover
# every column rather than only the numeric ones.
discovery_df.describe(include='all')

## Class Partition Coverage Analysis
Query the endpoint again to count how many instances of each "shape" occur in the dataset.

In [None]:
# Destination of the per-class coverage table.
output_path = os.path.join(exports_path, f"{dataset_name}_coverage.csv")

# Ask the parser how the class partitions are actually used on the endpoint,
# then unpack the three results it returns.
partition_usage = vp.analyze_class_partition_usage(
    endpoint_url=endpoint_url,
    sample_limit=None,
)
instance_counts, class_mappings, coverage_stats = partition_usage

# Export the coverage statistics to CSV and keep the resulting table.
coverage_df = vp.export_coverage_analysis(coverage_stats, output_file=output_path)

saved_note = f"**Saved to:** `{output_path}`"
display(Markdown(saved_note))

## Schema Pattern Coverage Analysis
For each subject class type, calculate how many entities participate in each schema pattern divided by the total number of entities of that class type. This gives coverage ratios showing what percentage of entities actually use each relationship pattern.

In [None]:
# Calculate schema pattern coverage ratios
frequencies_df = vp.count_schema_shape_frequencies(
    endpoint_url=endpoint_url,
)

# Preview the first 10 rows of the pattern columns. No sorting is applied here, so
# these are only the "top" patterns if count_schema_shape_frequencies already
# returns rows ordered by coverage — TODO confirm.
# NOTE(review): unlike the later cells, this selection is not guarded by
# `frequencies_df.empty`; verify that an empty result still carries these columns.
frequencies_df[['subject_class', 'property', 'object_class', 'coverage_percent']].head(10)

In [None]:
# Write the pattern-coverage table next to the other exports.
frequencies_output_path = os.path.join(
    exports_path, f"{dataset_name}_pattern_coverage.csv"
)
exported_df = vp.export_schema_shape_frequencies(
    frequencies_df,
    output_file=frequencies_output_path,
)

# Short Markdown summary; skipped entirely when there are no patterns to summarise.
if not frequencies_df.empty:
    coverage_values = frequencies_df['coverage_percent']
    avg_coverage = coverage_values.mean()
    high_coverage = (coverage_values > 50).sum()
    summary_md = f"""
**Pattern Coverage Summary:**
- Average pattern coverage: **{avg_coverage:.1f}%**
- Patterns with >50% coverage: **{high_coverage}/{len(frequencies_df)}**
- Exported to: `{frequencies_output_path}`
"""
    display(Markdown(summary_md))

In [None]:
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd

if frequencies_df.empty:
    # Nothing to plot: say so instead of rendering an empty figure.
    display(Markdown("**No coverage data to visualize**"))
else:
    # Work on a copy so frequencies_df itself is left untouched. Coverage values
    # are coerced to numbers (unparseable entries become 0) and rows are ordered
    # from highest to lowest coverage.
    df = (
        frequencies_df.copy()
        .assign(
            coverage_percent=lambda frame: pd.to_numeric(
                frame["coverage_percent"], errors="coerce"
            ).fillna(0)
        )
        .sort_values("coverage_percent", ascending=False)
        .reset_index(drop=True)
    )

    def make_label(row):
        """Build the HTML axis label for one pattern row.

        The subject and object classes are bold and the property is italic.
        The grey <span> separators between them are empty, so they add no
        visible text of their own.
        """
        separator = "<span style='color:#888;'></span>"
        parts = (
            f"<b>{row['subject_class']}</b>",
            separator,
            f"<i>{row['property']}</i>",
            separator,
            f"<b>{row['object_class']}</b>",
        )
        return " ".join(parts)

    df["styled_label"] = df.apply(make_label, axis=1)

    # Value labels sit outside the bar unless the bar is at least 95% long,
    # in which case they are drawn inside it.
    text_positions = ["outside" if v < 95 else "inside" for v in df["coverage_percent"]]
    bar_text = [f"{v:.1f}%" for v in df["coverage_percent"]]

    # Colour stops used for the bars, keyed by normalised coverage.
    custom_colorscale = [
        [0.0, "#c3d9c0"],  # muted green
        [0.4, "#e8e4cf"],  # soft beige
        [0.7, "#e5cdbd"],  # muted peach
        [1.0, "#d36e61"],  # subdued red
    ]

    # Figure height grows with the number of bars but is capped at 2000 px.
    bar_height = 26
    fig_height = min(2000, bar_height * len(df) + 200)

    coverage_bars = go.Bar(
        x=df["coverage_percent"],
        y=df["styled_label"],
        orientation="h",
        text=bar_text,
        textposition=text_positions,
        marker={
            "color": df["coverage_percent"],
            "colorscale": custom_colorscale,
            "cmin": 0,
            "cmax": 100,
            "line": {"color": "white", "width": 0.6},
        },
        hovertemplate="<b>%{y}</b><br>Coverage: %{x:.1f}%<extra></extra>",
    )
    fig = go.Figure(coverage_bars)

    fig.update_layout(
        title=dict(
            text=f"Schema Pattern Coverage for {dataset_name}",
            x=0.5,
            font=dict(size=18),
        ),
        xaxis={
            "title": "Coverage (%)",
            "range": [0, 100],  # fixed x-axis range
            "ticksuffix": "%",
            "showgrid": True,
            "gridcolor": "rgba(220,220,220,0.3)",
        },
        yaxis={
            "title": "",
            "autorange": "reversed",
            "automargin": True,
            "fixedrange": False,  # allow vertical zoom/pan
        },
        template="plotly_white",
        autosize=True,  # allow figure to scale with container
        height=fig_height,  # base height (will scale)
        margin={"t": 80, "b": 50, "l": 480, "r": 150},  # extra right margin for text
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    # Horizontal zoom/pan stays disabled; only the y-axis can be zoomed.
    fig.update_xaxes(fixedrange=True)

    # Display settings, including the options used by the "download as PNG" button.
    show_config = dict(
        scrollZoom=True,
        responsive=True,
        toImageButtonOptions=dict(
            format="png",
            filename=f"{dataset_name}_schema_coverage",
            height=fig_height,
            width=1200,
            scale=1,
        ),
    )
    fig.show(config=show_config)

    # Alternative kept for reference (currently disabled): embed the figure as an
    # HTML div for better HTML export compatibility.
    #from IPython.display import HTML
    #html_div = pio.to_html(fig, include_plotlyjs='inline')
    #display(HTML(html_div))

## LinkML

In [None]:
from linkml.generators.erdiagramgen import ERDiagramGenerator
from IPython.display import display, Javascript
from linkml_runtime.utils.schemaview import SchemaView
from linkml.generators.pydanticgen import PydanticGenerator

In [None]:
# Name the schema after the dataset and ask the parser for its LinkML YAML text.
# NOTE(review): the keyword here is filter_void_nodes, while vp.to_schema() is
# called with filter_void_admin_nodes — confirm both spellings are intended.
schema_name = f"{dataset_name}_schema"
yaml_text = vp.to_linkml_yaml(
    schema_name=schema_name,
    schema_description=f"LinkML schema for {dataset_name}",
    filter_void_nodes=True)

# Save to LinkML YAML
# (later cells pass this file path to SchemaView and the LinkML generators)
linkml_file = os.path.join(exports_path, f"{dataset_name}_linkml_schema.yaml")
with open(linkml_file, 'w', encoding='utf-8') as f:
    f.write(yaml_text)
display(Markdown(f"**LinkML saved to:** `{linkml_file}`"))

### Mermaid diagram for LinkML Schema

In [None]:
# Load the saved LinkML schema file and report how many classes and slots it has.
sv = SchemaView(linkml_file)
linkml_schema = sv.schema

display(Markdown(f"**Parsed LinkML schema:** Classes = {len(sv.all_classes())}, Slots = {len(sv.all_slots())}"))

# Build a Mermaid ER diagram for the pubchem.anatomy LinkML schema
mermaid_code = ERDiagramGenerator(linkml_file).serialize()

# Per the LinkML docs, ERDiagramGenerator wraps its output in a Markdown
# ```mermaid fence by default. Mermaid cannot parse the fence lines themselves,
# so drop them when present and keep only the diagram source.
mermaid_lines = mermaid_code.strip().splitlines()
if mermaid_lines and mermaid_lines[0].lstrip().startswith("```"):
    mermaid_lines = mermaid_lines[1:]
if mermaid_lines and mermaid_lines[-1].strip() == "```":
    mermaid_lines = mermaid_lines[:-1]
mermaid_source = "\n".join(mermaid_lines)

# Embed the diagram as a JSON string literal (which is also a valid JavaScript
# string literal). Interpolating the raw text into a JS template literal breaks
# the script as soon as the text contains a backtick, "${" or a backslash.
mermaid_js_literal = json.dumps(mermaid_source)

# NOTE(review): this script calls `require`, so it only works in front ends that
# expose RequireJS — confirm for the environments this notebook is rendered in.
display(
    Javascript(
        f"""
require.config({{paths: {{mermaid: "https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min"}}}});
require(["mermaid"], function(mermaid) {{
    mermaid.initialize({{ startOnLoad: true }});
    var container = document.createElement("div");
    container.className = "mermaid";
    container.textContent = {mermaid_js_literal};
    document.body.appendChild(container);
}});
"""
    )
)

### LinkML Pydantic Model Generation

In [None]:
# Generate Pydantic model source code from the LinkML schema file and execute it
# with a dedicated namespace dict, so the generated classes can be collected
# from `ns` afterwards.
# NOTE(review): exec() runs generated code. The schema it is generated from is
# derived from data returned by a remote endpoint, so treat it as untrusted input.
src = PydanticGenerator(linkml_file).serialize()
ns = {}
exec(src, ns)

# Find the Pydantic model classes
def _is_pydantic_model(name, val):
    """Check if this is likely a generated Pydantic model class"""
    # Must be a class (type) and have at least one model field
    if not isinstance(val, type):
        return False
    try:
        has_model_fields = 0 < len(getattr(val, "model_fields", {}))
    except:
        has_model_fields = False

    return has_model_fields

# Keep only the entries of the exec namespace that look like generated model classes.
pydantic_models = {k: v for k, v in ns.items() if _is_pydantic_model(k, v)}

display(Markdown(f"**Found {len(pydantic_models)} Pydantic model classes for schema.**"))

# Save all models to globals
# so later cells can refer to them by class name. This overwrites any existing
# notebook variable that has the same name as a generated class.
for name, cls in pydantic_models.items():
    globals()[name] = cls

In [None]:
# Show all generated Pydantic classes and their fields for pubchem.anatomy
def show_fields(cls):
    """Format the fields of a model class as Markdown bullet lines.

    Returns one "  - `name`: annotation" line per entry of ``cls.model_fields``,
    joined with newlines, or a fixed placeholder line when ``cls`` has no
    ``model_fields`` attribute.
    """
    if not hasattr(cls, 'model_fields'):
        return "  No fields found"
    return "\n".join(
        f"  - `{field_name}`: {field_info.annotation}"
        for field_name, field_info in cls.model_fields.items()
    )

# Render one Markdown section per generated class, in alphabetical order.
# The globals() check keeps this cell usable even if the model-generation cell
# has not been run.
if 'pydantic_models' in globals() and pydantic_models:
    sections = [f"**All {len(pydantic_models)} generated Pydantic classes:**\n\n"]
    for name in sorted(pydantic_models):
        sections.append(f"### {name}\n")
        sections.append(show_fields(pydantic_models[name]) + "\n\n")
    markdown_output = "".join(sections)
    display(Markdown(markdown_output))
else:
    display(Markdown("**No pydantic_models found**"))

## Export Formats

In [None]:
# Output locations for the two schema exports, both inside the export directory.
json_path = os.path.join(exports_path, f"{dataset_name}_schema.json")
csv_path = os.path.join(exports_path, f"{dataset_name}_schema.csv")

# CSV: the schema table produced by vp.to_schema(), written without the index column.
discovery_df.to_csv(csv_path, index=False)
# JSON: whatever vp.to_json(filter_void_nodes=True) returns, pretty-printed with a
# two-space indent. The file is opened (and truncated) before to_json() is called.
with open(json_path, 'w', encoding='utf-8') as fh:
    json.dump(vp.to_json(filter_void_nodes=True), fh, indent=2)