In [None]:
# Dataset Configuration
import os

# Identifiers of the dataset under analysis and the SPARQL endpoint that serves it
dataset_name = "mesh.heading"
endpoint_url = "https://idsm.elixir-czech.cz/sparql/endpoint/idsm"
graph_uri = "http://id.nlm.nih.gov/mesh/heading"
void_iri = "http://id.nlm.nih.gov/mesh/heading"

# Paths: the working path is the current working directory; exports are written to
# docs/notebooks/<dataset_name>, resolved two directory levels above it
working_path = os.path.abspath("")
exports_path = os.path.join(
    working_path, os.pardir, os.pardir, "docs", "notebooks", dataset_name
)

# Create the export directory up front so later cells can write into it
os.makedirs(exports_path, exist_ok=True)

# RDFSolve Schema Extraction

This notebook demonstrates RDF schema extraction from SPARQL endpoints using VoID (Vocabulary of Interlinked Datasets) descriptions.

**Key Features:**
- Automatic VoID discovery from endpoints
- Multiple extraction modes (fast, complete, simple)
- Schema export to CSV, JSON, JSON-LD, and LinkML formats
- Class partition coverage analysis

In [None]:
# Import libraries
import pandas as pd
import json
import time
from rdfsolve.rdfsolve import RDFSolver
from rdfsolve.void_parser import VoidParser
import warnings
warnings.filterwarnings('ignore')

## 1. Initialize RDFSolver

In [None]:
# Build the RDFSolver from the values set in the configuration cell
solver_settings = {
    "endpoint": endpoint_url,
    "path": working_path,
    "void_iri": void_iri,
    "dataset_name": dataset_name,
}
solver = RDFSolver(**solver_settings)

# Echo the dataset name as a minimal confirmation that the cell ran
print(dataset_name)

## 2. Load VoID Schema

In [None]:
# Two ways to obtain the schema: (A) through RDFSolver, (B) directly through VoidParser

# (A) High level: generate a VoID description (counts disabled), then extract the schema
void_output = os.path.join(exports_path, f"{dataset_name}_generated_void.ttl")
void_graph = solver.void_generator(
    graph_uris=graph_uri,
    output_file=void_output,
    counts=False,
)
parser = solver.extract_schema()
schema_df = parser.to_schema(filter_void_nodes=True)

print("Schema triples:", len(schema_df))
display(schema_df.head())

# (B) Manual (optional): let VoidParser run its own VoID discovery against the endpoint
vp = VoidParser.from_endpoint_with_discovery(
    endpoint_url=endpoint_url,
    dataset_name=dataset_name,
    exports_path=exports_path,
)
manual_df = vp.to_schema(filter_void_nodes=True)
print("Manual discovery triples:", len(manual_df))
# display(manual_df.head())  # uncomment if you need to inspect manually

In [None]:
# Remove rows whose subject URI contains "openlinksw" (presumably triple-store
# internal vocabulary rather than dataset content — TODO confirm).
# The filter reads `schema_df` from the "Load VoID Schema" cell: the name
# `existing_schema_df` used here before is not defined anywhere in this notebook,
# so the cell raised NameError on Restart & Run All.
# NOTE(review): assumes `schema_df` has a `subject_uri` column — confirm.
openlink_mask = schema_df["subject_uri"].str.contains("openlinksw", na=False)
filtered_schema_df = schema_df[~openlink_mask]
filtered_schema_df

## 4. Complete VoID Generation (with counts)

In [None]:
# Complete VoID generation: same generator call as before but with counts enabled
# (may take longer)
complete_output = os.path.join(exports_path, f"{dataset_name}_complete_void.ttl")
complete_void = solver.void_generator(
    graph_uris=graph_uri,
    output_file=complete_output,
    counts=True,
)

# Parse the generated VoID and turn it into a schema table
complete_parser = VoidParser(complete_void)
complete_schema_df = complete_parser.to_schema(filter_void_nodes=True)

print("Complete schema triples:", len(complete_schema_df))
display(complete_schema_df.head())

## 5. Simple Extraction (Python post-processing)

In [None]:
# Simple extraction (alternative): build the parser straight from the endpoint
# via VoidParser.from_sparql
simple_output = os.path.join(exports_path, f"{dataset_name}_simple_void.ttl")
simple_parser = VoidParser.from_sparql(
    endpoint_url=endpoint_url,
    output_file=simple_output,
    exclude_other_graphs=True,
)
simple_schema_df = simple_parser.to_schema(filter_void_nodes=True)

print("Simple schema triples:", len(simple_schema_df))
display(simple_schema_df.head())

## 6. Class Partition Coverage Analysis

In [None]:
# Analyze class partition usage and coverage

# Pick the first parser that exists (and is not None) in this session,
# in the preference order listed here
analysis_parser = None
for parser_name in ("existing_parser", "simple_parser", "fast_parser", "complete_parser"):
    found_parser = globals().get(parser_name)
    if found_parser is not None:
        analysis_parser = found_parser
        break

if not analysis_parser:
    print("No parser available for coverage analysis")
    coverage_df = None
else:
    try:
        output_path = os.path.join(exports_path, f"{dataset_name}_coverage.csv")

        # sample_limit=None is passed through unchanged to the parser
        usage = analysis_parser.analyze_class_partition_usage(
            endpoint_url=endpoint_url,
            sample_limit=None,
        )
        instance_counts, class_mappings, coverage_stats = usage

        coverage_df = analysis_parser.export_coverage_analysis(
            coverage_stats,
            output_file=output_path,
        )

        print("Coverage analysis completed")
        print(f"Instances analyzed: {len(instance_counts):,}")
        print(f"Class partitions: {len(class_mappings)}")
        print(f"Saved to: {output_path}")

    except Exception as error:
        # Best effort: report the failure and leave coverage_df unset (None)
        # so the remaining cells can still run
        print(f"Coverage analysis failed: {error}")
        coverage_df = None

## 7. Schema Comparison

In [None]:
# Compare extraction modes available in this session

# (label shown in the comparison, notebook variable holding that schema table)
schema_sources = (
    ('HighLevel', 'schema_df'),
    ('Complete', 'complete_schema_df'),
    ('Simple', 'simple_schema_df'),
    ('Manual', 'manual_df'),
)
# Keep only the schema tables that were actually produced (defined and not None)
schemas = {
    label: globals()[variable]
    for label, variable in schema_sources
    if globals().get(variable) is not None
}

if not schemas:
    print('No schemas to compare')
else:
    # One summary row per extraction mode
    comparison_df = pd.DataFrame([
        {
            'Mode': label,
            'Triples': len(frame),
            'Classes': frame['subject_class'].nunique(),
            'Properties': frame['property'].nunique(),
        }
        for label, frame in schemas.items()
    ])
    display(comparison_df)

    # Top classes and properties in the primary schema (the first one collected)
    main = next(iter(schemas.values()))
    class_counts = main['subject_class'].value_counts().head(5)
    property_counts = main['property'].value_counts().head(5)
    top_classes = class_counts.rename_axis('Class').reset_index(name='Triples')
    top_props = property_counts.rename_axis('Property').reset_index(name='Usage')
    display(top_classes)
    display(top_props)

## 8. LinkML Schema Generation

In [None]:
# Generate LinkML from the first parser available in this session; the tuple
# below gives the preference order
linkml_parser = next(
    (
        globals()[candidate]
        for candidate in ('complete_parser', 'parser', 'simple_parser', 'vp')
        if globals().get(candidate) is not None
    ),
    None,
)

if not linkml_parser:
    print('No parser available for LinkML generation')
else:
    schema_name = f"{dataset_name}_schema"
    yaml_text = linkml_parser.to_linkml_yaml(
        schema_name=schema_name,
        schema_description=f"LinkML schema for {dataset_name}",
        filter_void_nodes=True,
    )

    # Write the YAML next to the other exports
    linkml_file = os.path.join(exports_path, f"{dataset_name}_linkml_schema.yaml")
    with open(linkml_file, 'w', encoding='utf-8') as f:
        f.write(yaml_text)

    print('LinkML saved to', linkml_file)
    print('Sample (first 200 chars):')
    print(yaml_text[:200])

## 9. Export Formats

In [None]:
# Export available schemas: one CSV per schema table, plus a JSON export when
# the parser that produced that table is available in this session.

# Schema label (as used in `schemas`) -> notebook variable holding its parser.
# The lookup used to be f"{name.lower()}_parser" with a fallback to `parser`,
# assigned back to the global `parser`. That clobbered the high-level parser and
# made the 'Manual' JSON come from whichever parser the previous iteration left
# behind. The mapping below pairs every schema with its own parser.
parser_variables = {
    'HighLevel': 'parser',
    'Complete': 'complete_parser',
    'Simple': 'simple_parser',
    'Manual': 'vp',
}

export_count = 0
for name, df in schemas.items():
    # Nothing to write for missing or empty schema tables
    if df is None or len(df) == 0:
        continue
    safe = name.lower().replace(' ', '_')
    csv_path = os.path.join(exports_path, f"{dataset_name}_{safe}_schema.csv")
    df.to_csv(csv_path, index=False)

    # Unknown labels still try the "<label>_parser" convention; if no matching
    # parser exists the JSON export is skipped rather than written from an
    # unrelated parser.
    parser_variable = parser_variables.get(name, f"{name.lower()}_parser")
    schema_parser = globals().get(parser_variable)
    if schema_parser is not None:
        json_path = os.path.join(exports_path, f"{dataset_name}_{safe}_schema.json")
        with open(json_path, 'w', encoding='utf-8') as fh:
            json.dump(schema_parser.to_json(filter_void_nodes=True), fh, indent=2)
    export_count += 1

print('Exported', export_count, 'schemas to', exports_path)

## 10. Results Summary

In [None]:
# Results summary — concise DataFrame and status
summary_rows = []
if 'schemas' in globals() and schemas:
    schema_frames = list(schemas.values())
    # Triples are totalled over all extraction modes
    total_triples = sum(len(df) for df in schema_frames)
    # Distinct classes/properties are counted over the union of all modes.
    # Summing each mode's nunique() counted a class or property once per mode
    # it appears in, which contradicts the "Unique ..." labels.
    total_classes = pd.concat([df['subject_class'] for df in schema_frames]).nunique()
    total_properties = pd.concat([df['property'] for df in schema_frames]).nunique()
    summary_rows = [
        {'Metric': 'Total Schema Triples', 'Count': total_triples},
        {'Metric': 'Unique Classes', 'Count': total_classes},
        {'Metric': 'Unique Properties', 'Count': total_properties},
    ]
summary_df = pd.DataFrame(summary_rows)
if not summary_df.empty:
    display(summary_df)

print('Endpoint:', endpoint_url)
print('Dataset:', dataset_name)

# Status flags: coverage is "available" when the coverage cell produced a result;
# LinkML is "generated" when the YAML file exists in the export directory
print('Coverage analysis available:' , 'coverage_df' in globals() and coverage_df is not None)
print('LinkML generated:' , os.path.exists(os.path.join(exports_path, f"{dataset_name}_linkml_schema.yaml")))