In [None]:
# wikipathways Dataset Configuration
import os
import traceback

try:
    # Connection details for the dataset analysed by this notebook.
    endpoint_url = "https://sparql.wikipathways.org/sparql/"
    dataset_name = "wikipathways"
    void_iri = "http://rdf.wikipathways.org"
    graph_uri = "http://rdf.wikipathways.org"

    # Absolute path of the current working directory (abspath("") resolves to the cwd).
    working_path = os.path.abspath("")

    # All exports go to <working_path>/../../results/<dataset_name>.
    # The path is bound BEFORE the directory is created so that later cells can
    # still reference `exports_path` even if directory creation fails.
    exports_path = os.path.join(working_path, "..", "..", "results", dataset_name)

    # exist_ok=True makes the call idempotent on re-runs and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(exports_path, exist_ok=True)

    print("Dataset configuration completed successfully")
except Exception as e:
    # Report the failure without aborting the notebook run.
    print(f"ERROR in dataset configuration: {e}")
    print("Full traceback:")
    traceback.print_exc()

# RDFSolve: Complete Workflow Demonstration

This notebook demonstrates schema extraction and exports. There are different querying modes for mining the graphs:

1. **Traditional VoID Generation** - Full COUNT aggregations
2. **Fast Discovery Mode** - No COUNT aggregations
3. **Sampled Analysis** - Limited sample size for large datasets
4. **Simple Extraction Mode** - Python post-processing

The exports include:
1. **Class Partition Coverage Analysis** - per-class instance counts and partition coverage, exported as CSV.
2. **Export Formats** - rdf-config (TODO), JSON, JSON-LD schemas


In [None]:
# Import all required libraries
import pandas as pd
import json
import time
from rdfsolve.rdfsolve import RDFSolver
from rdfsolve.void_parser import VoidParser, generate_void_from_endpoint
import warnings
warnings.filterwarnings('ignore')


## 1. Dataset Configuration

Configure the wikipathways dataset parameters for all analysis modes.

In [None]:
try:
    # Echo the configuration from the first cell so the run output is self-describing.
    print("Dataset Configuration:")
    print(f"  Dataset: {dataset_name}")
    print(f"  Endpoint: {endpoint_url}")
    print(f"  Graph URI: {graph_uri}")
    print(f"  VoID IRI: {void_iri}")

    # Collect the constructor arguments first, then build the solver in one call.
    solver_options = {
        "endpoint": endpoint_url,
        "path": working_path,
        "void_iri": void_iri,
        "dataset_name": dataset_name,
    }
    solver = RDFSolver(**solver_options)
    print("RDFSolver initialized successfully")
except Exception as init_error:
    # Keep the notebook running; the failure is reported with its traceback.
    print(f"ERROR initializing RDFSolver: {init_error}")
    print("Full traceback:")
    traceback.print_exc()

## 2. Test with LIMIT

Test whether the schema can be extracted.

In [None]:
try:
    print("=== TEST ENDPOINT ===")
    output_path = os.path.join(exports_path, f"{dataset_name}_fast_void.ttl")

    # Smoke-test settings: COUNT aggregations are disabled for speed and the
    # sample limit is tiny, because the only question here is whether a schema
    # can be extracted from this endpoint/graph at all.
    void_options = {
        "graph_uri": graph_uri,
        "output_file": output_path,
        "counts": False,
        "sample_limit": 10,
    }

    start_time = time.time()
    fast_void_graph = solver.void_generator(**void_options)
    execution_time = time.time() - start_time

    print("Fast VoID generation completed")
    print(f"Execution time: {execution_time:.2f} seconds")
    print(f"VoID triples: {len(fast_void_graph):,}")
    print("Output file:", output_path)

    # Turn the generated VoID graph into a schema table.
    fast_parser = VoidParser(fast_void_graph)
    fast_schema_df = fast_parser.to_schema(filter_void_nodes=True)

    print("\nFast Schema Analysis:")
    print(f"  Schema triples: {len(fast_schema_df):,}")
    print(f"  Unique classes: {fast_schema_df['subject_class'].nunique()}")
    print(f"  Unique properties: {fast_schema_df['property'].nunique()}")
    print(f"  Unique objects: {fast_schema_df['object_class'].nunique()}")

except Exception as fast_error:
    print(f"ERROR in fast VoID generation: {fast_error}")
    print("Full traceback:")
    traceback.print_exc()
    # Reset every result of this cell so later cells can detect the failure.
    fast_void_graph = fast_parser = fast_schema_df = None

## 2. SPARQL-based VoID Generation

Lets the SPARQL endpoint build the partitions and compute their counts via COUNT aggregations. This can (and often will) time out.

In [None]:
try:
    print("=== COMPLETE SPARQL VOID GENERATION (WITH COUNTS) ===")
    output_path = os.path.join(exports_path, f"{dataset_name}_complete_void.ttl")

    # Full run: COUNT aggregations are enabled and no sample limit is passed.
    void_options = {
        "graph_uri": graph_uri,
        "output_file": output_path,
        "counts": True,
    }

    start_time = time.time()
    complete_void_graph = solver.void_generator(**void_options)
    execution_time = time.time() - start_time

    print("Complete VoID generation completed")
    print(f"Execution time: {execution_time:.2f} seconds")
    print(f"VoID triples: {len(complete_void_graph):,}")
    print("Output file:", output_path)

    # Turn the generated VoID graph into a schema table.
    complete_parser = VoidParser(complete_void_graph)
    complete_schema_df = complete_parser.to_schema(filter_void_nodes=True)

    print("\nComplete Schema Analysis:")
    print(f"  Schema triples: {len(complete_schema_df):,}")
    print(f"  Unique classes: {complete_schema_df['subject_class'].nunique()}")
    print(f"  Unique properties: {complete_schema_df['property'].nunique()}")
    print(f"  Unique objects: {complete_schema_df['object_class'].nunique()}")

except Exception as complete_error:
    print(f"ERROR in complete VoID generation: {complete_error}")
    print("Full traceback:")
    traceback.print_exc()
    # Reset every result of this cell so later cells can detect the failure.
    complete_void_graph = complete_parser = complete_schema_df = None

## 3. Fallback Extraction Mode (Python Post-Processing)

Uses simpler SPARQL queries and computes the statistics in Python. This mode preserves the original object values instead of collapsing them into a Resource/Literal classification.

In [None]:
try:
    print("=== SIMPLE EXTRACTION MODE (PYTHON POST-PROCESSING) ===")
    output_path = os.path.join(exports_path, f"{dataset_name}_simple_void.ttl")

    start_time = time.time()

    # Call VoidParser.generate_void_from_sparql directly (not through the solver).
    # Counts are disabled for speed; sample_limit is deliberately not passed.
    void_options = {
        "endpoint_url": endpoint_url,
        "graph_uri": graph_uri,
        "output_file": output_path,
        "counts": False,
    }
    simple_void_graph = VoidParser.generate_void_from_sparql(**void_options)

    # Wrap the generated graph in a parser; this is included in the timing.
    simple_parser = VoidParser(simple_void_graph)

    execution_time = time.time() - start_time

    print("simple extraction completed")
    print(f"Execution time: {execution_time:.2f} seconds")
    print("Processing: Python post-processing enabled")

    # Schema table with the original object values kept.
    simple_schema_df = simple_parser.to_schema(filter_void_nodes=True)

    print("\nsimple Extraction Schema Analysis:")
    print(f"  Schema triples: {len(simple_schema_df):,}")
    print(f"  Unique classes: {simple_schema_df['subject_class'].nunique()}")
    print(f"  Unique properties: {simple_schema_df['property'].nunique()}")
    print(f"  Unique objects: {simple_schema_df['object_class'].nunique()}")
    print("Output file:", output_path)

    # Rows whose object is something other than the generic Resource/Literal labels.
    if not simple_schema_df.empty:
        is_generic_object = simple_schema_df['object_class'].isin(['Resource', 'Literal'])
        preserved_objects = simple_schema_df[~is_generic_object]
        print(f"  Preserved object types: {len(preserved_objects):,} (vs Resource/Literal classification)")

except Exception as simple_error:
    print(f"ERROR in simple extraction: {simple_error}")
    print("Full traceback:")
    traceback.print_exc()
    # Reset every result of this cell so later cells can detect the failure.
    simple_void_graph = simple_parser = simple_schema_df = None

## 4. Class Partition Coverage Analysis

Modular analysis of class partition usage and coverage (counting instances for each).

In [None]:
try:
    print("=== CLASS PARTITION COVERAGE ANALYSIS ===")

    analysis_parser = None
    coverage_df = None

    # Pick the first parser that exists and is not None, in order of preference.
    # Names are looked up via globals() because earlier cells may not have been
    # run (name missing) or may have failed (name bound to None).
    for parser_name in ("simple_parser", "fast_parser", "sampled_parser", "complete_parser"):
        candidate = globals().get(parser_name)
        if candidate is not None:
            analysis_parser = candidate
            print(f"Using {parser_name} for coverage analysis...")
            break
    else:
        print("No parser available for coverage analysis. Please run previous cells first.")

    if analysis_parser is not None:
        # Always write the coverage CSV to its own file. Previously the path was
        # only set when simple_parser was selected, so any fallback parser reused
        # the stale `output_path` of an earlier cell and overwrote a VoID .ttl file.
        output_path = os.path.join(exports_path, f"{dataset_name}_coverage.csv")

        try:
            print("Running class partition coverage analysis...")

            # Complete analysis pipeline (no sample limit is passed).
            instance_counts, class_mappings, coverage_stats = analysis_parser.analyze_class_partition_usage(
                endpoint_url=endpoint_url,
                graph_uri=graph_uri,
            )

            print("\nCoverage Analysis Results:")
            print(f"  Unique instances analyzed: {len(instance_counts):,}")
            print(f"  Class partitions found: {len(class_mappings)}")
            print(f"  Coverage statistics generated: {len(coverage_stats)}")

            # Export coverage analysis
            coverage_df = analysis_parser.export_coverage_analysis(
                coverage_stats, output_file=output_path
            )
            print("Output file:", output_path)

        except Exception as e:
            print(f"Coverage analysis failed: {e}")
            print("Full traceback:")
            traceback.print_exc()
            coverage_df = None

except Exception as e:
    print(f"ERROR in coverage analysis setup: {e}")
    print("Full traceback:")
    traceback.print_exc()
    coverage_df = None

## 5. Schema Analysis and Comparison

Compare results across different extraction modes and analyze schema patterns.

In [None]:
try:
    print("=== SCHEMA ANALYSIS AND COMPARISON ===")

    # Gather every schema table that an earlier cell produced. A variable that
    # is missing (cell not run) or None (cell failed) is skipped. The order
    # matters: the first entry becomes the schema used for the detailed analysis.
    schemas = {}
    for mode_label, variable_name in (
        ('Complete (with counts)', 'complete_schema_df'),
        ('Fast (no counts)', 'fast_schema_df'),
        ('Sampled (limited)', 'sampled_schema_df'),
        ('simple (Python processed)', 'simple_schema_df'),
    ):
        available_schema = globals().get(variable_name)
        if available_schema is not None:
            schemas[mode_label] = available_schema

    print(f"Available schemas for comparison: {len(schemas)}")

    if not schemas:
        print("No schemas available for comparison")
    else:
        # One summary row per extraction mode.
        print("\nMode Comparison:")
        print("-" * 80)
        print(f"{'Mode':<25} {'Triples':<10} {'Classes':<10} {'Properties':<12} {'Objects':<10}")
        print("-" * 80)

        for mode_name, schema_df in schemas.items():
            print(f"{mode_name:<25} {len(schema_df):<10,} {schema_df['subject_class'].nunique():<10} "
                  f"{schema_df['property'].nunique():<12} {schema_df['object_class'].nunique():<10}")

        # The first collected schema is used for the detailed breakdown.
        main_schema = next(iter(schemas.values()))
        print(f"\nDetailed Analysis ({list(schemas.keys())[0]}):")
        print("-" * 50)

        # Ten most frequent subject classes.
        top_classes = main_schema['subject_class'].value_counts().head(10)
        print("\nTop 10 Classes by Triple Count:")
        for class_name, count in top_classes.items():
            print(f"  {class_name:<30}: {count:,} triples")

        # Ten most frequent properties.
        top_properties = main_schema['property'].value_counts().head(10)
        print("\nTop 10 Properties by Usage:")
        for prop_name, count in top_properties.items():
            print(f"  {prop_name:<30}: {count:,} usages")

        # Ten most frequent object classes.
        object_dist = main_schema['object_class'].value_counts().head(10)
        print("\nObject Type Distribution:")
        for obj_type, count in object_dist.items():
            print(f"  {obj_type:<30}: {count:,} instances")

        # KeyEvent breakdown (AOP-Wiki specific); skipped when the class is absent.
        is_key_event = main_schema['subject_class'] == 'KeyEvent'
        if is_key_event.any():
            ke_schema = main_schema[is_key_event]
            print("\nKeyEvent Class Analysis:")
            print(f"  Properties: {len(ke_schema)}")
            print(f"  Unique properties: {ke_schema['property'].nunique()}")

            print("\n  KeyEvent Properties:")
            for _, row in ke_schema.head(10).iterrows():
                print(f"    {row['property']:<25} -> {row['object_class']}")

except Exception as comparison_error:
    print(f"ERROR in schema analysis and comparison: {comparison_error}")
    print("Full traceback:")
    traceback.print_exc()
    # Leave an empty mapping so later cells can still iterate over `schemas`.
    schemas = {}

## 6. Export Formats and Serializations

Generate exports in multiple formats: CSV, JSON, and JSON-LD with automatic prefix extraction (via `bioregistry`'s `curie_from_iri`).

In [None]:
try:
    print("=== EXPORT FORMATS AND SERIALIZATIONS ===")

    # Maps the lower-cased start of a mode name to the notebook variable that
    # holds the parser which produced that schema. Matching is case-insensitive
    # because the mode names are not consistently capitalised
    # (e.g. 'simple (Python processed)'); a case-sensitive 'Simple' check never
    # matched and silently exported the simple schema with fast_parser.
    parser_var_by_prefix = {
        'complete': 'complete_parser',
        'fast': 'fast_parser',
        'sampled': 'sampled_parser',
        'simple': 'simple_parser',
    }

    export_count = 0   # modes whose export finished without an error
    files_written = 0  # files actually written (CSV and, when available, JSON)

    for mode_name, schema_df in schemas.items():
        if schema_df is None or len(schema_df) == 0:
            continue

        try:
            safe_name = mode_name.lower().replace(' ', '_').replace('(', '').replace(')', '')

            # CSV Export - use exports_path
            csv_file = os.path.join(exports_path, f"{dataset_name}_{safe_name}_schema.csv")
            schema_df.to_csv(csv_file, index=False)
            files_written += 1
            print(f"CSV exported: {csv_file} ({len(schema_df):,} rows)")

            # JSON Export: use the parser belonging to this mode. Unrecognised
            # mode names fall back to fast_parser, as before. globals().get()
            # yields None when the parser variable is missing or was reset.
            mode_key = mode_name.lower()
            parser_var = next(
                (var for prefix, var in parser_var_by_prefix.items() if mode_key.startswith(prefix)),
                'fast_parser',
            )
            parser = globals().get(parser_var)

            if parser is not None:
                schema_json = parser.to_json(filter_void_nodes=True)
                json_file = os.path.join(exports_path, f"{dataset_name}_{safe_name}_schema.json")

                with open(json_file, 'w', encoding='utf-8') as f:
                    json.dump(schema_json, f, indent=2)
                files_written += 1
                print(f"JSON exported: {json_file} ({len(schema_json['triples']):,} triples)")

            export_count += 1

        except Exception as e:
            print(f"ERROR exporting {mode_name}: {e}")
            print("Full traceback:")
            traceback.print_exc()

    # Report the real number of files instead of assuming two per mode.
    print(f"\nTotal exports generated: {files_written} files")

    # JSON-LD Export with automatic prefix extraction
    try:
        print("\nJSON-LD Export with Automatic Prefix Extraction:")

        # Export VoID as JSON-LD - use exports_path
        void_jsonld_file = os.path.join(exports_path, f"{dataset_name}_void.jsonld")
        void_jsonld = solver.export_void_jsonld(
            output_file=void_jsonld_file,
            indent=2
        )

        # Export schema as JSON-LD - use exports_path
        schema_jsonld_file = os.path.join(exports_path, f"{dataset_name}_schema.jsonld")
        schema_jsonld = solver.export_schema_jsonld(
            output_file=schema_jsonld_file,
            indent=2,
            filter_void_nodes=True
        )

        print(f"  VoID JSON-LD: {void_jsonld_file} ({len(void_jsonld):,} characters)")
        print(f"  Schema JSON-LD: {schema_jsonld_file} ({len(schema_jsonld):,} characters)")

        # Show automatically extracted prefixes
        # NOTE(review): this calls a private solver method; confirm there is no public equivalent.
        prefixes = solver._extract_prefixes_from_void()
        print(f"  Auto-extracted prefixes: {len(prefixes)}")

        prefix_list = list(prefixes.keys())[:10]
        print(f"  Sample prefixes: {', '.join(prefix_list)}")

    except Exception as e:
        print(f"ERROR in JSON-LD export: {e}")
        print("Full traceback:")
        traceback.print_exc()

    # Coverage Analysis Export
    try:
        if 'coverage_df' in globals() and coverage_df is not None:
            coverage_file = os.path.join(exports_path, f"{dataset_name}_coverage_analysis.csv")
            # Write the file that is reported below; previously the path was
            # printed although nothing had been written to it.
            coverage_df.to_csv(coverage_file, index=False)
            print("\nClass Partition Coverage Analysis:")
            print(f"  Coverage CSV: {coverage_file} ({len(coverage_df)} class partitions)")
    except Exception as e:
        print(f"ERROR in coverage export: {e}")
        print("Full traceback:")
        traceback.print_exc()

    print(f"\nAll exports completed for dataset: {dataset_name}")
    print(f"All files saved to: {exports_path}")

except Exception as e:
    print(f"ERROR in export formats and serializations: {e}")
    print("Full traceback:")
    traceback.print_exc()

## 7. Data Visualization and Final Analysis

Display schema samples and provide summary of all analysis modes.

In [None]:
def count_unique_across(schema_frames, column):
    """Count the distinct values of ``column`` over all schema tables.

    Args:
        schema_frames: Mapping of mode name to schema DataFrame.
        column: Name of the column to inspect in every frame.

    Returns:
        Size of the union of the column's non-null values, so a value that
        occurs in several modes is counted once.
    """
    distinct_values = set()
    for frame in schema_frames.values():
        distinct_values.update(frame[column].dropna().unique())
    return len(distinct_values)


def has_result(namespace, name):
    """Tell whether ``namespace[name]`` holds a usable result.

    Args:
        namespace: Mapping of variable names to values (e.g. ``globals()``).
        name: Variable name to look up.

    Returns:
        False when the name is missing, bound to None, or bound to a
        non-positive integer (e.g. an export counter of 0); True otherwise.
    """
    value = namespace.get(name)
    if value is None:
        return False
    if isinstance(value, int):
        return value > 0
    return True


def print_schema_sample(schema):
    """Print the subject/property/object columns of ``schema`` as an aligned table.

    Cell values are converted with str() before truncation so that
    non-string values (e.g. NaN) do not raise.
    """
    print(f"{'Subject Class':<25} | {'Property':<20} | Object Class")
    print("-" * 75)
    for _, row in schema.iterrows():
        subj = str(row['subject_class'])[:25].ljust(25)
        prop = str(row['property'])[:20].ljust(20)
        obj = str(row['object_class'])[:25]
        print(f"{subj} | {prop} | {obj}")


try:
    print("=== RESULTS ===")

    # Display sample schemas
    if schemas:
        try:
            # The first collected schema is the representative one.
            main_schema_name, main_schema = next(iter(schemas.items()))
            print(f"\nSchema Sample ({main_schema_name}):")

            # Prefer rows with specific object classes; fall back to the
            # first rows of the full schema when the filter leaves nothing.
            sample_schema = main_schema[
                ~main_schema['object_class'].isin(['Class', 'Resource', 'Literal'])
            ].head(15)
            if len(sample_schema) == 0:
                sample_schema = main_schema.head(10)

            print_schema_sample(sample_schema)
        except Exception as e:
            print(f"ERROR displaying schema samples: {e}")
            print("Full traceback:")
            traceback.print_exc()

    # Display coverage analysis if available
    try:
        if 'coverage_df' in globals() and coverage_df is not None:
            print("\nClass Partition Coverage Analysis:")
            # Header and rows share the same column widths so they line up.
            print(f"{'Class Name':<25} | {'Instances':>9} | {'Coverage%':>9} | {'Avg/Instance':>12}")
            print("-" * 70)

            for _, row in coverage_df.head(5).iterrows():
                name = str(row['class_name'])[:25].ljust(25)
                instances = f"{row['total_instances']:,}".rjust(9)
                coverage = f"{row['occurrence_coverage_percent']:.1f}%".rjust(9)
                avg = f"{row['avg_occurrences_per_instance']:.1f}".rjust(12)
                print(f"{name} | {instances} | {coverage} | {avg}")
    except Exception as e:
        print(f"ERROR displaying coverage analysis: {e}")
        print("Full traceback:")
        traceback.print_exc()

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    try:
        workflow_summary = {
            "Traditional VoID": "Complete COUNT aggregations - comprehensive but slower",
            "Fast Discovery": "No COUNT aggregations - rapid schema exploration",
            "Sampled Analysis": "LIMIT clause - ultra-fast for large datasets",
            "simple Extraction": "Python post-processing - preserves original values",
            "Coverage Analysis": "Instance counting and class partition percentages",
            "Export Formats": "CSV, JSON, JSON-LD with automatic prefixes"
        }

        # Notebook variable whose presence shows that a workflow actually
        # produced something. The previous check searched the global names for
        # strings such as 'traditional_void', which never exist, so every
        # workflow was always reported as not run.
        workflow_evidence = {
            "Traditional VoID": "complete_schema_df",
            "Fast Discovery": "fast_schema_df",
            "Sampled Analysis": "sampled_schema_df",
            "simple Extraction": "simple_schema_df",
            "Coverage Analysis": "coverage_df",
            "Export Formats": "export_count",
        }

        notebook_namespace = globals()
        for workflow, description in workflow_summary.items():
            status = "✓" if has_result(notebook_namespace, workflow_evidence[workflow]) else "○"
            print(f"{status} {workflow:<20}: {description}")

        print(f"\nDataset: {dataset_name}")
        print(f"Endpoint: {endpoint_url}")
        # Count the schemas that were really produced, not every global whose
        # name happens to contain 'schema_df' (loop variables, None values).
        print(f"Total analysis modes demonstrated: {len(schemas)}")

        if schemas:
            total_triples = sum(len(df) for df in schemas.values())
            # Union across modes: summing per-mode nunique() counted a class
            # or property once per mode in which it appears.
            total_classes = count_unique_across(schemas, 'subject_class')
            total_properties = count_unique_across(schemas, 'property')

            print("Combined schema statistics:")
            print(f"  Total schema triples across all modes: {total_triples:,}")
            print(f"  Unique classes identified: {total_classes:,}")
            print(f"  Unique properties identified: {total_properties:,}")
    except Exception as e:
        print(f"ERROR in summary generation: {e}")
        print("Full traceback:")
        traceback.print_exc()

except Exception as e:
    print(f"ERROR in results section: {e}")
    print("Full traceback:")
    traceback.print_exc()