# RDFSolve: PubChem MeSH - Fast Discovery

This notebook demonstrates faster schema discovery:
1. Setting up an endpoint and graph
2. Generating fast VoID descriptions using CONSTRUCT queries **without** COUNT aggregations
3. Extracting schema from the VoID description
4. Analyzing the results as DataFrame and JSON

In [8]:
import pandas as pd
from rdfsolve.rdfsolve import RDFSolver
from rdfsolve.void_parser import VoidParser, generate_void_from_endpoint
import warnings
warnings.filterwarnings('ignore')

## Step 1: Configure Dataset Parameters

We'll configure the PubChem MeSH dataset with its SPARQL endpoint and metadata.

In [9]:
# MeSH configuration
endpoint_url = "https://idsm.elixir-czech.cz/sparql/endpoint/idsm"
dataset_name = "mesh_headers"
void_iri = "http://id.nlm.nih.gov/mesh/heading"
graph_uri = "http://id.nlm.nih.gov/mesh/heading"  # Specify the correct graph URI
working_path = "."

print(f"Dataset: {dataset_name}")
print(f"Endpoint: {endpoint_url}")
print(f"VoID IRI: {void_iri}")
print(f"Graph URI: {graph_uri}")
print(f"Mode: Fast Discovery (no COUNT aggregations)")

Dataset: mesh_headers
Endpoint: https://idsm.elixir-czech.cz/sparql/endpoint/idsm
VoID IRI: http://id.nlm.nih.gov/mesh/heading
Graph URI: http://id.nlm.nih.gov/mesh/heading
Mode: Fast Discovery (no COUNT aggregations)


## Step 2: Initialize RDFSolver

Create an RDFSolver instance with our configuration.

In [10]:
try:
    # Initialize RDFSolver with our configuration
    solver = RDFSolver(
        endpoint=endpoint_url,
        path=working_path,
        void_iri=void_iri,
        dataset_name=dataset_name
    )
    
    print("RDFSolver initialized successfully")
    print(f"Endpoint: {solver.endpoint}")
    print(f"Dataset: {solver.dataset_name}")
    
except Exception as e:
    print(f"Error: {e}")

RDFSolver initialized successfully
Endpoint: https://idsm.elixir-czech.cz/sparql/endpoint/idsm
Dataset: mesh_headers


## Step 3: Generate Fast VoID Description

Generate VoID **without** COUNT aggregations for fast discovery. This is much faster but doesn't provide count statistics.

Three CONSTRUCT queries get the partitions for classes, properties, and datatypes using SELECT DISTINCT instead of COUNT.

In [None]:
try:
    # Generate fast VoID without count aggregations
    
    fast_void_graph = solver.void_generator(
        graph_uri=graph_uri,
        output_file=f"{dataset_name}_void.ttl",
        counts=False  # Fast discovery without counts
    )
    
    print(f"Saved to: {dataset_name}_void.ttl")
    
except Exception as e:
    print(f"Error: {e}")

Generating VoID from endpoint: https://idsm.elixir-czech.cz/sparql/endpoint/idsm
Using graph URI: http://id.nlm.nih.gov/mesh/heading
Fast mode: Skipping COUNT aggregations
Starting query: class_partitions
Finished query: class_partitions (took 0.80s)
Starting query: property_partitions


## Step 4: Extract Schema from Fast VoID

Extract schema structure from the fast-generated VoID description.

In [None]:
try:
    print("Extracting schema from fast VoID...")
    fast_parser = VoidParser(fast_void_graph)
    
    # Get schema as DataFrame
    fast_schema_df = fast_parser.to_schema(filter_void_nodes=True)
    
    print("Fast schema extraction completed")
    print(f"Total schema triples: {len(fast_schema_df)}")
    print(f"Unique classes: {fast_schema_df['subject_class'].nunique()}")
    print(f"Unique properties: {fast_schema_df['property'].nunique()}")
    
except Exception as e:
    print(f"Fast schema extraction failed: {e}")

Extracting schema from fast VoID...
Fast schema extraction failed: name 'fast_void_graph' is not defined


## Step 5: Schema Visualization

Display a sample of the extracted schema from fast discovery.

In [None]:
# Show sample of the fast schema (excluding generic classes)
display(fast_schema_df[~fast_schema_df.object_class.isin(["Class", "Resource"])].head(10))

NameError: name 'fast_schema_df' is not defined

## Step 6: Analyze MeSH Classes (Fast Mode)

Examine MeSH-specific classes using the fast discovery results:

In [None]:
try:
    print(f"PubChem MeSH Fast Discovery Analysis:")
    print(f"Total unique classes: {fast_schema_df['subject_class'].nunique()}")
    
    # Show top classes by frequency
    print("\nTop 10 classes by property count:")
    class_counts = fast_schema_df['subject_class'].value_counts().head(10)
    for cls, count in class_counts.items():
        print(f"  {cls:30} ({count} properties)")
    
    # Look for MeSH-specific classes
    mesh_classes = fast_schema_df[fast_schema_df['subject_class'].str.contains('mesh|MeSH', case=False, na=False)]['subject_class'].unique()
    if len(mesh_classes) > 0:
        print(f"\nMeSH-specific classes found (fast discovery):")
        for cls in mesh_classes[:10]:
            print(f"  - {cls}")
            
        # Analyze first MeSH class in detail
        first_mesh_class = mesh_classes[0]
        mesh_schema = fast_schema_df[fast_schema_df['subject_class'] == first_mesh_class]
        print(f"\n{first_mesh_class} Properties (Fast Discovery):")
        for _, row in mesh_schema.head(10).iterrows():
            print(f"  {row['property']:25} -> {row['object_class']}")
    else:
        print("\nNo MeSH-specific classes found in fast discovery")
        print("Available classes sample:")
        for cls in fast_schema_df['subject_class'].unique()[:15]:
            print(f"  - {cls}")
            
except Exception as e:
    print(f"Fast MeSH analysis failed: {e}")

## Step 7: Export Fast Discovery Results

Export the fast discovery schema as JSON and CSV files.

In [None]:
try:
    # Export as JSON
    print("Generating JSON schema (fast discovery)...")
    fast_schema_json = fast_parser.to_json(filter_void_nodes=True)
    
    print("Fast JSON export completed")
    print(f"Total triples: {fast_schema_json['metadata']['total_triples']}")
    print(f"Classes: {len(fast_schema_json['metadata']['classes'])}")
    print(f"Properties: {len(fast_schema_json['metadata']['properties'])}")
    print(f"Object types: {len(fast_schema_json['metadata']['objects'])}")
    
    # Save JSON to file
    import json
    with open(f"{dataset_name}_schema.json", "w") as f:
        json.dump(fast_schema_json, f, indent=2)
    print(f"\nFast JSON schema saved to: {dataset_name}_schema.json")
    
    # Export as CSV
    fast_schema_df.to_csv(f"{dataset_name}_schema.csv", index=False)
    print(f"Fast CSV schema saved to: {dataset_name}_schema.csv")
    
except Exception as e:
    print(f"Fast export failed: {e}")

## Optional: Sample Limiting for Very Large Datasets

For extremely large datasets like PubChem, you can add a sample limit for even faster discovery:

In [None]:
# Example: Faster discovery with sample limit
# Uncomment to try with a sample of 1000 triples for faster exploration

# try:
#     
#     sampled_void_graph = solver.void_generator(
#         graph_uri=graph_uri,
#         output_file=f"{dataset_name}_sampled_void.ttl",
#         counts=False,
#         sample_limit=1000  # Only sample 1000 triples
#     )
#     
#     print(f"Sampled VoID contains {len(sampled_void_graph)} triples")
#     
# except Exception as e:
#     print(f"Sampled mode error: {e}")