# mesh.heading Schema Extraction

This notebook demonstrates RDF schema extraction from the mesh.heading SPARQL endpoint by discovering, or querying for, VoID (Vocabulary of Interlinked Datasets) descriptions, and shows some downstream uses of the extracted schema.

In [None]:
# Dataset Configuration
import os

# Dataset identity and the SPARQL endpoint it is served from
dataset_name = "mesh.heading"
endpoint_url = "https://idsm.elixir-czech.cz/sparql/endpoint/idsm"

# IRIs associated with the dataset (VoID description and named graph)
void_iri = "http://id.nlm.nih.gov/mesh/heading"
graph_uri = "http://id.nlm.nih.gov/mesh/heading"

# Output location: <cwd>/../../docs/notebooks/<dataset_name>, created on demand
working_path = os.path.abspath("")
exports_path = os.path.join(
    working_path, os.pardir, os.pardir, "docs", "notebooks", dataset_name
)
os.makedirs(exports_path, exist_ok=True)

In [None]:
# Import libraries
import json
from rdfsolve.void_parser import VoidParser
from linkml_runtime.utils.schemaview import SchemaView
from linkml.generators.pydanticgen import PydanticGenerator

## Discover or get VoID Schema

In [None]:
# Build the parser from the endpoint; generated artifacts go under exports_path.
# Optional (currently disabled): exclude administrative graphs, service
# descriptions, etc. with
#   exclude_graph_patterns=["openlinksw", "well-known"]
vp = VoidParser.from_endpoint_with_discovery(
    dataset_name=dataset_name,
    endpoint_url=endpoint_url,
    exports_path=exports_path,
    counts=True,
)

In [None]:
# Unwanted (VoID/admin) graphs are filtered out at this step.
# TODO: improve the filtering logic and apply it when querying instead.
discovery_df = vp.to_schema(filter_void_admin_nodes=True)
discovery_df

In [None]:
# Summary statistics over every column of the extracted schema table
discovery_df.describe(include="all")

## Class Partition Coverage Analysis
Query the endpoint again to count how many instances of each "shape" occur in the dataset.

In [None]:
# CSV destination for the class-partition coverage table
output_path = os.path.join(exports_path, f"{dataset_name}_coverage.csv")

# Query the endpoint for class-partition usage.
# NOTE(review): sample_limit=None presumably disables sampling - confirm.
instance_counts, class_mappings, coverage_stats = vp.analyze_class_partition_usage(
    endpoint_url=endpoint_url,
    sample_limit=None,
)

# Export the coverage statistics to output_path
coverage_df = vp.export_coverage_analysis(coverage_stats, output_file=output_path)

print(f"Saved to: {output_path}")


## Schema Pattern Coverage Analysis
For each subject class type, calculate how many entities participate in each schema pattern divided by the total number of entities of that class type. This gives coverage ratios showing what percentage of entities actually use each relationship pattern.

In [None]:
# Compute per-pattern coverage by querying the endpoint
frequencies_df = vp.count_schema_shape_frequencies(endpoint_url=endpoint_url)

# Preview the first 10 patterns (row order is whatever the parser returned)
frequencies_df[
    ['subject_class', 'property', 'object_class', 'coverage_percent']
].head(10)

In [None]:
# Write the pattern-coverage table to CSV
frequencies_output_path = os.path.join(
    exports_path, f"{dataset_name}_pattern_coverage.csv"
)
exported_df = vp.export_schema_shape_frequencies(
    frequencies_df, output_file=frequencies_output_path
)

# Brief summary, printed only when there is at least one pattern
if not frequencies_df.empty:
    avg_coverage = frequencies_df['coverage_percent'].mean()
    # Number of patterns whose coverage is strictly above 50%
    high_coverage = frequencies_df['coverage_percent'].gt(50).sum()
    print(f"Average pattern coverage: {avg_coverage:.1f}%")
    print(f"Patterns with >50% coverage: {high_coverage}/{len(frequencies_df)}")
    print(f"Exported to: {frequencies_output_path}")

In [None]:
# Visualize pattern coverage
import matplotlib.pyplot as plt

if not frequencies_df.empty:
    # Horizontal bar chart of ALL patterns (no top-N cut), one bar per row of
    # frequencies_df, in the frame's existing row order.
    top_patterns = frequencies_df
    n_patterns = len(top_patterns)
    positions = list(range(n_patterns))
    labels = [
        f"{row['subject_class']} {row['property']} {row['object_class']}"
        for _, row in top_patterns.iterrows()
    ]

    # 0.6 inch of height per bar, with a floor so that a frame with only a
    # handful of patterns still leaves room for the title and axis label.
    fig_height = max(4, 60 * n_patterns / 100)
    fig, ax = plt.subplots(figsize=(40, fig_height))

    bars = ax.barh(positions, top_patterns["coverage_percent"])
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel("Coverage (%)")
    ax.set_title(f"Schema Pattern Coverage in {dataset_name}")
    # First row of the frame at the top of the chart
    ax.invert_yaxis()

    # Add percentage labels just past the end of each bar
    for bar, pct in zip(bars, top_patterns["coverage_percent"]):
        ax.text(
            bar.get_width() + 1,
            bar.get_y() + bar.get_height() / 2,
            f"{pct:.1f}%",
            va="center",
            fontsize=9,
        )

    fig.tight_layout()
    plt.show()
else:
    print("No coverage data to visualize")

## LinkML

In [None]:
# Render the extracted schema as LinkML YAML text
schema_name = f"{dataset_name}_schema"
yaml_text = vp.to_linkml_yaml(
    schema_name=schema_name,
    schema_description=f"LinkML schema for {dataset_name}",
    filter_void_nodes=True,
)

# Persist the YAML next to the other exports
linkml_file = os.path.join(exports_path, f"{dataset_name}_linkml_schema.yaml")
with open(linkml_file, 'w', encoding='utf-8') as yaml_out:
    yaml_out.write(yaml_text)
print('LinkML saved to', linkml_file)

### Mermaid diagram for LinkML Schema

In [None]:
# Imports used only by this cell
from IPython.display import display, Markdown, Latex
from linkml.generators.erdiagramgen import ERDiagramGenerator

# Load the saved LinkML YAML and report its size
sv = SchemaView(linkml_file)
linkml_schema = sv.schema

print("Parsed LinkML schema: Classes =", len(sv.all_classes()), "Slots =", len(sv.all_slots()))

# Build and display a Mermaid diagram for the mesh.heading LinkML schema.
# NOTE(review): serialize() output is rendered as Markdown, so it is presumably
# already wrapped in a fenced mermaid block - confirm against the generator.
mermaid = ERDiagramGenerator(linkml_file).serialize()
display(Markdown(mermaid))

### LinkML Pydantic Model Generation

In [None]:
# Generate Python source for Pydantic models from the LinkML schema file
src = PydanticGenerator(linkml_file).serialize()

# Execute the generated source in a dedicated namespace dict.
# NOTE(review): exec() runs generated code whose schema originates from a
# remote endpoint; only run this against endpoints you trust.
ns = dict()
exec(src, ns)

# Find the Pydantic model classes
def _is_pydantic_model(name, val):
    """Check if this is likely a generated Pydantic model class"""
    # Must be a class (type) and have at least one model field
    if not isinstance(val, type):
        return False
    try:
        has_model_fields = 0 < len(getattr(val, "model_fields", {}))
    except:
        has_model_fields = False

    return has_model_fields

# Keep only the namespace entries that look like generated model classes
pydantic_models = {
    model_name: model_cls
    for model_name, model_cls in ns.items()
    if _is_pydantic_model(model_name, model_cls)
}

print(f"Found {len(pydantic_models)} Pydantic model classes for schema.")

# Expose every model under its own name in the notebook's global namespace
globals().update(pydantic_models)

In [None]:
# Show all generated Pydantic classes and their fields for mesh.heading
def show_fields(cls):
    """Print one indented ``name: annotation`` line per entry of ``cls.model_fields``.

    Does nothing when ``cls`` has no ``model_fields`` attribute.

    Args:
        cls: Object (typically a model class) whose ``model_fields`` mapping
            is listed; each value must expose an ``annotation`` attribute.
    """
    if not hasattr(cls, 'model_fields'):
        return
    for field_name, field_info in list(cls.model_fields.items()):
        print(f"  {field_name}: {field_info.annotation}")

# List every generated class with its fields, in alphabetical order
if 'pydantic_models' not in globals() or not pydantic_models:
    # Nothing was generated (or the generation cell has not been run)
    print("No pydantic_models found")
else:
    print(f"All {len(pydantic_models)} generated Pydantic classes:\n")
    for name in sorted(pydantic_models):
        print(f"=== {name} ===")
        show_fields(pydantic_models[name])
        print()

## Export Formats

In [None]:
# Destinations for the tabular (CSV) and structured (JSON) schema exports
csv_path = os.path.join(exports_path, f"{dataset_name}_schema.csv")
json_path = os.path.join(exports_path, f"{dataset_name}_schema.json")

# CSV: the schema table as extracted, without the index column
discovery_df.to_csv(csv_path, index=False)

# JSON: the parser's JSON representation, pretty-printed
with open(json_path, 'w', encoding='utf-8') as json_out:
    json.dump(vp.to_json(filter_void_nodes=True), json_out, indent=2)