# drugbank.drugs Schema Extraction

This notebook demonstrates RDF schema extraction from the drugbank.drugs SPARQL endpoint using a unified, JSON-LD-first approach. It discovers VoID (Vocabulary of Interlinked Datasets) descriptions and generates standards-compliant JSON-LD, which serves as the source for all downstream outputs, including frequency analysis and LinkML schemas.

## Exports

- [JSON-LD Schema](https://github.com/jmillanacosta/rdfsolve/blob/main/docs/notebooks/schema_extraction/drugbank.drugs_schema/drugbank.drugs_schema.jsonld) (primary output)
- [N-Quads RDF](https://github.com/jmillanacosta/rdfsolve/blob/main/docs/notebooks/schema_extraction/drugbank.drugs_schema/drugbank.drugs_schema.nq)
- [VoID Graph](https://github.com/jmillanacosta/rdfsolve/blob/main/docs/notebooks/schema_extraction/drugbank.drugs_schema/drugbank.drugs_generated_void.ttl) for the dataset in its original source
- [Coverage report](https://github.com/jmillanacosta/rdfsolve/blob/main/docs/notebooks/schema_extraction/drugbank.drugs_schema/drugbank.drugs_pattern_coverage.csv)
- [LinkML Schema](https://github.com/jmillanacosta/rdfsolve/blob/main/docs/notebooks/schema_extraction/drugbank.drugs_schema/drugbank.drugs_linkml_schema.yaml)
- [Full parquet entity dataframe](https://github.com/jmillanacosta/rdfsolve/blob/main/docs/notebooks/schema_extraction/drugbank.drugs_schema/drugbank.drugs_instances.parquet)

In [None]:
# Dataset Configuration
import os

# Identity of the dataset and the SPARQL endpoint that serves it
dataset_name = "drugbank.drugs"
endpoint_url = "https://idsm.elixir-czech.cz/sparql/endpoint/idsm"
void_iri = "http://wifo5-04.informatik.uni-mannheim.de/drugbank/resource/drugs/"
graph_uri = "http://wifo5-04.informatik.uni-mannheim.de/drugbank/resource/drugs/"

# Exports go to <cwd>/../../docs/notebooks/<dataset_name>; abspath("") resolves
# to the current working directory, and the folder is created if missing.
working_path = os.path.abspath("")
exports_path = os.path.join(
    working_path, os.pardir, os.pardir, "docs", "notebooks", dataset_name
)
os.makedirs(exports_path, exist_ok=True)

In [None]:
import logging
import sys

# Notebook logger named after the dataset (falls back to "notebook").
# DEBUG level so that SPARQL queries show up in the output.
logger = logging.getLogger(dataset_name or "notebook")
logger.setLevel(logging.DEBUG)

# The rdfsolve.parser logger is set to DEBUG as well to see query details
parser_logger = logging.getLogger('rdfsolve.parser')
parser_logger.setLevel(logging.DEBUG)

# Handlers are attached only once: re-running the cell finds the notebook
# logger already has a handler and skips this branch.
if not logger.handlers:
    fmt = logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s", "%Y-%m-%d %H:%M:%S")

    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(logging.DEBUG)
    sh.setFormatter(fmt)

    # One shared stdout handler serves both loggers
    for _target_logger in (logger, parser_logger):
        _target_logger.addHandler(sh)

logger.info(f"Logging configured for {dataset_name}")

In [None]:
# Import libraries
import json
from rdfsolve.parser import VoidParser
from IPython.display import display, Markdown

# Configure Plotly for HTML output
import plotly.io as pio
import plotly.offline as pyo

# Set the default renderer to the "notebook" and "plotly_mimetype" renderers
# combined (the "+" joins renderer names), so figures render in Jupyter and
# HTML export still works.
pio.renderers.default = "notebook+plotly_mimetype"

# Initialize offline mode for Plotly.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather
# than embedding it in the notebook - confirm against the Plotly docs.
pyo.init_notebook_mode(connected=True)

In [None]:
# Pickle caching utilities
import pickle
import os

def save_cache(data, filename, cache_dir=None):
    """Pickle *data* into ``<cache_dir>/<filename>.pkl`` and return that path.

    If *cache_dir* is not given, the ``cache`` folder under ``exports_path``
    is used. The target directory is created when it does not exist yet.
    """
    target_dir = os.path.join(exports_path, "cache") if cache_dir is None else cache_dir
    os.makedirs(target_dir, exist_ok=True)

    pickle_file = os.path.join(target_dir, "{}.pkl".format(filename))
    with open(pickle_file, 'wb') as handle:
        pickle.dump(data, handle)

    print(f"Cached data to: {pickle_file}")
    return pickle_file

def load_cache(filename, cache_dir=None):
    """Return the object pickled at ``<cache_dir>/<filename>.pkl``, or None.

    If *cache_dir* is not given, the ``cache`` folder under ``exports_path``
    is used. None is returned when the cache file does not exist.
    """
    source_dir = os.path.join(exports_path, "cache") if cache_dir is None else cache_dir
    pickle_file = os.path.join(source_dir, "{}.pkl".format(filename))

    if not os.path.exists(pickle_file):
        return None

    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that this notebook wrote itself.
    with open(pickle_file, 'rb') as handle:
        restored = pickle.load(handle)
    print(f"Loaded cached data from: {pickle_file}")
    return restored

def cache_exists(filename, cache_dir=None):
    """Return True when ``<cache_dir>/<filename>.pkl`` exists.

    If *cache_dir* is not given, the ``cache`` folder under ``exports_path``
    is used.
    """
    base_dir = os.path.join(exports_path, "cache") if cache_dir is None else cache_dir
    return os.path.exists(os.path.join(base_dir, "{}.pkl".format(filename)))

In [None]:
# Cache management utilities
def list_cache_files(cache_dir=None):
    """Print and return the ``.pkl`` file names found in the cache directory.

    If *cache_dir* is not given, the ``cache`` folder under ``exports_path``
    is used. Returns an empty list (after printing a notice) when the
    directory does not exist. Each file is printed with its size in MB.
    """
    if cache_dir is None:
        cache_dir = os.path.join(exports_path, "cache")

    if not os.path.exists(cache_dir):
        print("No cache directory found")
        return []

    pickles = []
    for entry in os.listdir(cache_dir):
        if entry.endswith('.pkl'):
            pickles.append(entry)

    print(f"Cache directory: {cache_dir}")
    bytes_per_mb = 1024 * 1024
    for name in pickles:
        size_mb = os.path.getsize(os.path.join(cache_dir, name)) / bytes_per_mb
        print(f"  {name} ({size_mb:.2f} MB)")
    return pickles

def clear_cache(filename=None, cache_dir=None):
    """Remove one cache file, or the whole cache directory.

    With a truthy *filename*, only ``<cache_dir>/<filename>.pkl`` is removed.
    Otherwise the entire cache directory is deleted. If *cache_dir* is not
    given, the ``cache`` folder under ``exports_path`` is used. A message is
    printed in every case, including when there is nothing to remove.
    """
    if cache_dir is None:
        cache_dir = os.path.join(exports_path, "cache")

    if not filename:
        # No specific file requested: drop the whole cache directory
        if not os.path.exists(cache_dir):
            print("No cache directory to clear")
            return
        import shutil
        shutil.rmtree(cache_dir)
        print("Cleared all cache files")
        return

    pickle_file = os.path.join(cache_dir, "{}.pkl".format(filename))
    if not os.path.exists(pickle_file):
        print(f"Cache not found: {filename}")
        return
    os.remove(pickle_file)
    print(f"Removed cache: {filename}")

# Show current cache status: prints each cached .pkl file with its size in MB
# (or a notice when the cache directory does not exist yet)
list_cache_files()

### Cache Control

Use these cells to manage cached data. When testing new code changes, you may want to clear relevant cache files to force re-computation.

In [None]:
# Clear specific cache files (comment/uncomment lines as needed for testing)

# When testing new parser changes:
clear_cache(f"{dataset_name}_voidparser")

# When testing JSON-LD generation (primary output):
clear_cache(f"{dataset_name}_jsonld_schema")

# When testing frequency calculations:
# clear_cache(f"{dataset_name}_frequencies_basic")
# clear_cache(f"{dataset_name}_frequencies_with_instances")

# Clear everything. This deletes the whole cache directory, including the
# frequency caches above, so it is left disabled by default: enabling it
# forces every later cell to recompute from the endpoint.
# clear_cache()

print("Cleared parser and JSON-LD caches to test schema pattern fix")
print("Note: JSON-LD cache is now the primary cache - clear it when testing schema changes.")

## Discover or get VoID Schema

In [None]:
# Obtain the VoidParser: reuse the pickled instance when one is cached,
# otherwise build it from the endpoint and cache the result.
cache_key = f"{dataset_name}_voidparser"
vp = load_cache(cache_key)

if vp is not None:
    print("Loaded VoidParser from cache")
else:
    print("VoidParser not found in cache, generating...")
    vp = VoidParser.from_endpoint_with_discovery(
        endpoint_url=endpoint_url,
        dataset_name=dataset_name,
        exports_path=exports_path,
        # The remaining arguments apply when no suitable VoID is found
        graph_uris=[graph_uri] if graph_uri else None,
        # Skip service-description and administrative graphs
        exclude_graph_patterns=["openlinksw", "well-known", "void", "service"],
        counts=True,
        offset_limit_steps=300,  # pagination
    )
    # Store the freshly built parser for later runs
    save_cache(vp, cache_key)

## Schema Discovery and Exports Workflow

### Workflow Steps:

1. **VoID Discovery**: Extract schema patterns from SPARQL endpoint VoID descriptions
2. **JSON-LD Generation**: Convert to standards-compliant JSON-LD using established vocabularies:
   - VoID (Vocabulary of Interlinked Datasets) for dataset descriptions
   - SHACL for property shape patterns  
   - Dublin Core Terms for metadata
   - PROV-O for provenance information
   - RDF/RDFS/OWL for core semantic types

3. **Derived Outputs**: All other formats are generated from the JSON-LD structure:
   - **Frequencies**: Schema pattern coverage analysis
   - **LinkML**: LinkML YAML used elsewhere for other features.
   - **CSV/JSON**: Tabular and structured data exports
   - **RDF**: N-Quads serialization for triplestore import

In [None]:
# JSON-LD schema: the source of truth from which other formats are derived.
# Reuse the cached document when available, otherwise generate and cache it.
cache_key = f"{dataset_name}_jsonld_schema"
jsonld_schema = load_cache(cache_key)

if jsonld_schema is not None:
    print("Loaded JSON-LD schema from cache")
else:
    print("Generating standards-compliant JSON-LD schema...")
    jsonld_schema = vp.to_jsonld(filter_void_admin_nodes=True)
    save_cache(jsonld_schema, cache_key)

# Write the schema to <exports_path>/<dataset_name>_schema.jsonld
jsonld_file = os.path.join(exports_path, f"{dataset_name}_schema.jsonld")
with open(jsonld_file, "w", encoding="utf-8") as f:
    json.dump(jsonld_schema, f, indent=2, ensure_ascii=False)

print(f"JSON-LD Schema saved to: {jsonld_file}")

# Summarize the document when it carries an @graph
if "@graph" in jsonld_schema:
    print(f"Standards-compliant JSON-LD with {len(jsonld_schema['@context'])} prefixes and {len(jsonld_schema['@graph'])} resources")

    # The first @graph node is inspected as the dataset description
    if jsonld_schema["@graph"]:
        dataset_info = jsonld_schema["@graph"][0]
    else:
        dataset_info = {}

    if dataset_info.get("@type") == "void:Dataset":
        print(f"Dataset: {dataset_info.get('dcterms:title', 'Unknown')}")
        for _label, _key in (
            ("Classes", "void:classes"),
            ("Properties", "void:properties"),
            ("Triples", "void:triples"),
        ):
            print(f"{_label}: {dataset_info.get(_key, 0)}")

# Tabular view of the schema, previewed with its first rows
schema_df = vp.to_schema(filter_void_admin_nodes=True)
display(schema_df.head())

In [None]:
# Optional check of the JSON-LD document with pyld. A missing pyld install is
# reported and skipped; any other failure is reported without stopping the
# notebook.
try:
    from pyld import jsonld as pyld_lib

    print("Validating JSON-LD compliance...")

    # Expand the document (serves as a structural check)
    expanded = pyld_lib.expand(jsonld_schema)
    print(f"JSON-LD expansion successful: {len(expanded)} top-level items")

    # Serialize to N-Quads and count the lines of the stripped output
    nquads = pyld_lib.to_rdf(jsonld_schema, {'format': 'application/n-quads'})
    _nquads_text = nquads.strip()
    triples_count = len(_nquads_text.split('\n')) if _nquads_text else 0
    print(f"RDF conversion successful: {triples_count} triples generated")

    # Keep the N-Quads next to the other exports
    nquads_file = os.path.join(exports_path, f"{dataset_name}_schema.nq")
    with open(nquads_file, "w", encoding="utf-8") as f:
        f.write(nquads)
    print(f"N-Quads saved to: {nquads_file}")

except ImportError:
    print("pyld not available - skipping JSON-LD validation (install with: pip install pyld)")
except Exception as e:
    print(f"JSON-LD validation failed: {e}")

## Schema Pattern Coverage Analysis (derived from JSON-LD)
For each subject class type, calculate how many entities participate in each schema pattern divided by the total number of entities of that class type. This gives coverage ratios showing what percentage of entities actually use each relationship pattern.

In [None]:
# Calculate schema pattern coverage ratios with caching
cache_key = f"{dataset_name}_frequencies_basic"
cached_data = load_cache(cache_key)

if cached_data is None:
    print("Calculating schema pattern frequencies...")
    # Only the frequencies DataFrame is needed here; the second return value
    # is discarded.
    frequencies_df, _ = vp.count_schema_shape_frequencies(
        endpoint_url=endpoint_url,
        offset_limit_steps=300,
    )
    save_cache(frequencies_df, cache_key)
else:
    print("Loaded frequencies DataFrame from cache")
    frequencies_df = cached_data

# Preview up to four random patterns. The sample size is capped at the number
# of rows because DataFrame.sample raises ValueError when asked for more rows
# than exist (without replacement).
frequencies_df[['subject_class', 'property', 'object_class', 'coverage_percent']].sample(
    n=min(4, len(frequencies_df))
)

In [None]:
# Write the coverage table to <exports_path>/<dataset_name>_pattern_coverage.csv
frequencies_output_path = os.path.join(exports_path, f"{dataset_name}_pattern_coverage.csv")
exported_df = vp.export_schema_shape_frequencies(frequencies_df, output_file=frequencies_output_path)

# Short Markdown summary: mean coverage and the number of patterns above 50%
if not frequencies_df.empty:
    avg_coverage = frequencies_df['coverage_percent'].mean()
    high_coverage = (frequencies_df['coverage_percent'] > 50).sum()
    _summary_lines = [
        "",
        "**Pattern Coverage Summary:**",
        f"- Average pattern coverage: **{avg_coverage:.1f}%**",
        f"- Patterns with >50% coverage: **{high_coverage}/{len(frequencies_df)}**",
        f"- Exported to: `{frequencies_output_path}`",
        "- Derived from JSON-LD schema for consistency",
        "",
    ]
    display(Markdown("\n".join(_summary_lines)))

In [None]:
# Summary statistics (pandas `describe`) of the frequencies DataFrame
frequencies_df.describe()

## Schema Pattern Instance Collection
Collect actual subject and object IRI instances for each schema pattern. This provides detailed access to the specific entities participating in each relationship pattern.

In [None]:
# Frequencies plus the actual instances per pattern, cached as one tuple
cache_key = f"{dataset_name}_frequencies_with_instances"
cached_data = load_cache(cache_key)

if cached_data is not None:
    print("Loaded frequencies and instances DataFrames from cache")
    frequencies_with_instances_df, instances_df = cached_data
else:
    print("Collecting frequency data and instances...")
    frequencies_with_instances_df, instances_df = vp.count_schema_shape_frequencies(
        endpoint_url=endpoint_url,
        #sample_limit=100,  # Limited sample for demonstration
        collect_instances=True,
        offset_limit_steps=300
    )
    # Both DataFrames are stored together so they are restored as a pair
    save_cache((frequencies_with_instances_df, instances_df), cache_key)

# Report row counts and, when instances were collected, memory footprints
print(f"Frequencies DataFrame: {len(frequencies_with_instances_df)} shapes")
if instances_df is None:
    print("No instances collected")
else:
    print(f"Instances DataFrame: {len(instances_df)} subject-object pairs")
    for _label, _frame in (
        ("Frequencies", frequencies_with_instances_df),
        ("Instances", instances_df),
    ):
        print(f"Memory usage - {_label}: {_frame.memory_usage(deep=True).sum() / 1024:.1f} KB")

In [None]:
# Inspect the two linked DataFrames: column names, dtypes and a few rows each
if instances_df is not None:
    print("Frequencies DataFrame columns:")
    print(list(frequencies_with_instances_df.columns))
    print("\nInstances DataFrame columns:")
    print(list(instances_df.columns))
    print("\nInstances DataFrame dtypes (note categorical optimization):")
    print(instances_df.dtypes)

    # First rows of each frame; the frequencies frame includes the shape_id
    # used for linking
    for _heading, _frame in (
        ("\nSample frequencies data:", frequencies_with_instances_df),
        ("\nSample instances data:", instances_df),
    ):
        print(_heading)
        display(_frame.head(3))

In [None]:
# The instances DataFrame may be None when no instances were collected (the
# other cells check for this), so both examples run only when it is present.
if instances_df is not None:
    # Example 1: Get the instances for the shapes (optionally filtered by a
    # coverage threshold via min_coverage)
    high_coverage_data = vp.get_shape_instances(
        frequencies_with_instances_df,
        instances_df,
        #min_coverage=0.3
    )
    print(f"{len(high_coverage_data)} instance records")

    # Example 2: Direct DataFrame operations for specific analysis
    if not instances_df.empty:
        # Count unique subjects per shape
        subjects_per_shape = instances_df.groupby('shape_id')['subject_iri'].nunique()
        print(f"Shape with most unique subjects: {subjects_per_shape.max()} subjects")

        # Find the most frequent object IRIs
        common_objects = instances_df['object_iri'].value_counts().head(3)
        print(f"Most frequent object IRIs:")
        for iri, count in common_objects.items():
            print(f"  {iri}: {count} occurrences")
else:
    print("No instances collected")

In [None]:
# Distribution and memory summary reported by the parser for the two frames
if instances_df is not None:
    distribution_analysis = vp.analyze_shape_distribution(frequencies_with_instances_df, instances_df)

    print("Distribution Analysis:")
    for _label, _key in (
        ("Total shapes", 'total_shapes'),
        ("Shapes with instances", 'shapes_with_instances'),
        ("Total instance records", 'total_instance_records'),
    ):
        print(f"  {_label}: {distribution_analysis.get(_key, 0)}")
    print(f"  Average instances per shape: {distribution_analysis.get('avg_instances_per_shape', 0):.1f}")
    print(f"  Maximum instances per shape: {distribution_analysis.get('max_instances_per_shape', 0)}")

    memory_info = distribution_analysis.get('memory_usage_mb', {})
    print("\nMemory Usage:")
    print(f"  Frequencies DataFrame: {memory_info.get('frequencies_df', 0):.2f} MB")
    print(f"  Instances DataFrame: {memory_info.get('instances_df', 0):.2f} MB")

    # First three entries of the "top shapes" mapping, if any
    top_shapes = distribution_analysis.get('top_shapes_by_instances', {})
    if top_shapes:
        print("\nTop shapes by instance count:")
        for shape_id, info in list(top_shapes.items())[:3]:
            pattern = info.get('shape_pattern', 'Unknown')
            count = info.get('actual_instances', 0)
            coverage = info.get('coverage_ratio', 0)
            # Patterns are truncated to 50 characters for display
            print(f"  {pattern[:50]}... ({count} instances, {coverage:.1%} coverage)")

In [None]:
# Example of accessing specific shape instances
if instances_df is not None and not frequencies_with_instances_df.empty:
    # Use the fifth shape as an example, falling back to the last available
    # shape when fewer than five exist (a fixed .iloc[4] would raise
    # IndexError in that case).
    sample_position = min(4, len(frequencies_with_instances_df) - 1)
    sample_row = frequencies_with_instances_df.iloc[sample_position]
    sample_shape_id = sample_row['shape_id']
    sample_shape_pattern = sample_row['shape_pattern']

    # Filter instances for this specific shape
    shape_instances = instances_df[instances_df['shape_id'] == sample_shape_id]

    print(f"Sample shape: {sample_shape_pattern}")
    print(f"Shape ID: {sample_shape_id}")
    print(f"Instance count: {len(shape_instances)}")

    if not shape_instances.empty:
        print("\nSample instances:")
        display(shape_instances.head(3))

        print(f"\nUnique subjects: {shape_instances['subject_iri'].nunique()}")
        print(f"Unique objects: {shape_instances['object_iri'].nunique()}")

In [None]:
# Self-reference analysis: rows whose subject IRI equals their object IRI
if instances_df is not None and not instances_df.empty:
    print("Analyzing self-referential instances:")

    # Rows where subject and object are the same IRI
    _same_iri = instances_df['subject_iri'] == instances_df['object_iri']
    self_refs = instances_df[_same_iri]

    print(f"Total instances: {len(instances_df)}")
    print(f"Self-referential instances: {len(self_refs)}")
    print(f"Percentage self-referential: {len(self_refs)/len(instances_df)*100:.1f}%")

    if not self_refs.empty:
        print("\nProperties involved in self-references:")
        self_ref_props = self_refs['property'].value_counts()
        display(self_ref_props.head(10))

        print("\nSample self-referential cases:")
        _preview_cols = ['subject_iri', 'object_iri', 'property', 'subject_class', 'object_class']
        display(self_refs[_preview_cols].head())

        # Properties for which a self-reference is treated as meaningful;
        # everything else is reported as potentially problematic
        meaningful_props = {'owl:sameAs', 'rdfs:seeAlso', 'skos:exactMatch', 'dc:identifier', 'foaf:primaryTopic'}
        _is_meaningful = self_refs['property'].isin(meaningful_props)
        problematic_self_refs = self_refs[~_is_meaningful]

        print(f"\nPotentially problematic self-references: {len(problematic_self_refs)}")
        if not problematic_self_refs.empty:
            print("Properties that shouldn't be self-referential:")
            display(problematic_self_refs['property'].value_counts().head())

    # Cardinality overview of subjects, objects and properties
    print("\nData quality check:")
    for _label, _column in (
        ("subjects", 'subject_iri'),
        ("objects", 'object_iri'),
        ("properties", 'property'),
    ):
        print(f"Unique {_label}: {instances_df[_column].nunique()}")

    # Closer look at dc:identifier rows and their self-references
    identifier_cases = instances_df[instances_df['property'] == 'dc:identifier']
    if not identifier_cases.empty:
        print("\nAnalyzing dc:identifier cases:")
        print(f"Total dc:identifier instances: {len(identifier_cases)}")
        id_self_refs = identifier_cases[identifier_cases['subject_iri'] == identifier_cases['object_iri']]
        print(f"dc:identifier self-references: {len(id_self_refs)}")
        if not id_self_refs.empty:
            print("Sample dc:identifier self-references:")
            display(id_self_refs[['subject_iri', 'object_iri', 'subject_class', 'object_class']].head(3))

In [None]:
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd

# Horizontal bar chart of pattern coverage, one bar per schema pattern
if frequencies_df.empty:
    display(Markdown("**No coverage data to visualize**"))
else:
    # Work on a copy with a numeric coverage column (non-numeric values
    # become 0), sorted from highest to lowest coverage
    df = frequencies_df.copy()
    df["coverage_percent"] = pd.to_numeric(
        df["coverage_percent"], errors="coerce"
    ).fillna(0)
    df = df.sort_values("coverage_percent", ascending=False).reset_index(drop=True)

    def make_label(row):
        """Return the HTML axis label (subject, property, object) for a row."""
        spacer = "<span style='color:#888;'></span>"
        return " ".join(
            [
                f"<b>{row['subject_class']}</b>",
                spacer,
                f"<i>{row['property']}</i>",
                spacer,
                f"<b>{row['object_class']}</b>",
            ]
        )

    df["styled_label"] = df.apply(make_label, axis=1)

    # Value labels go outside the bar unless the bar reaches 95% or more
    text_positions = [
        "outside" if v < 95 else "inside" for v in df["coverage_percent"].tolist()
    ]
    custom_colorscale = [
        [0.0, "#d36e61"],
        [0.4, "#e5cdbd"],
        [0.7, "#e8e4cf"],
        [1.0, "#c3d9c0"],
    ]

    # Figure height grows with the number of bars, capped at 2000 px
    bar_height = 26
    fig_height = min(2000, bar_height * len(df) + 200)

    fig = go.Figure(
        go.Bar(
            x=df["coverage_percent"],
            y=df["styled_label"],
            orientation="h",
            text=[f"{v:.1f}%" for v in df["coverage_percent"]],
            textposition=text_positions,
            marker={
                "color": df["coverage_percent"],
                "colorscale": custom_colorscale,
                "cmin": 0,
                "cmax": 100,
                "line": {"color": "white", "width": 0.6},
            },
            hovertemplate="<b>%{y}</b><br>Coverage: %{x:.1f}%<extra></extra>",
        )
    )

    fig.update_layout(
        title={
            "text": f"Schema Pattern Coverage for {dataset_name}",
            "x": 0.5,
            "font": {"size": 18},
        },
        xaxis={
            "title": "Coverage (%)",
            "range": [0, 100],  # fixed x-axis range
            "ticksuffix": "%",
            "showgrid": True,
            "gridcolor": "rgba(220,220,220,0.3)",
        },
        yaxis={
            "title": "",
            "autorange": "reversed",
            "automargin": True,
            "fixedrange": False,  # allow vertical zoom/pan
        },
        template="plotly_white",
        autosize=True,  # allow figure to scale with container
        height=fig_height,  # base height (will scale)
        margin={"t": 80, "b": 50, "l": 480, "r": 150},  # extra right margin for text
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    # Disable horizontal zoom/pan
    fig.update_xaxes(fixedrange=True)

    # Show figure with config for HTML export compatibility
    fig.show(config={
        "scrollZoom": True,
        "responsive": True,
        "toImageButtonOptions": {
            "format": "png",
            "filename": f"{dataset_name}_schema_coverage",
            "height": fig_height,
            "width": 600,
            "scale": 1
        }
    })

## LinkML (derived from JSON-LD)

In [None]:
# Build the LinkML YAML with a custom schema base URI and write it to
# <exports_path>/<dataset_name>_linkml_schema.yaml
print("Regenerating LinkML schema from JSON-LD with custom schema URI...")

# User-definable base URI for the generated schema
custom_schema_uri = f"http://jmillanacosta.github.io/rdfsolve/{dataset_name}/linkml"
schema_name = f"{dataset_name}_schema"

yaml_text = vp.to_linkml_yaml(
    schema_name=schema_name,
    schema_description=f"LinkML schema for {dataset_name} generated from JSON-LD",
    schema_base_uri=custom_schema_uri,
    filter_void_nodes=True,
)

linkml_file = os.path.join(exports_path, f"{dataset_name}_linkml_schema.yaml")
with open(linkml_file, "w", encoding="utf-8") as f:
    f.write(yaml_text)

print(f"LinkML YAML saved to: {linkml_file}")

### Mermaid diagram for LinkML Schema

In [None]:
# Generate Mermaid diagram
from linkml.generators.erdiagramgen import ERDiagramGenerator
from linkml_runtime.utils.schemaview import SchemaView
    
print("Generating Mermaid class diagram...")

# Load the LinkML YAML written to disk so the view reflects the saved file
sv = SchemaView(linkml_file)
linkml_schema = sv.schema

# Report how many classes and slots the parsed schema contains
_class_count = len(sv.all_classes())
_slot_count = len(sv.all_slots())
display(Markdown(f"**Parsed LinkML schema:** Classes = {_class_count}, Slots = {_slot_count}"))

# Serialize the ER diagram and render it as Markdown
mermaid_code = ERDiagramGenerator(linkml_file).serialize()
display(Markdown(mermaid_code))

In [None]:
# Target files for the tabular (CSV) and structured (JSON) schema exports
csv_path = os.path.join(exports_path, f"{dataset_name}_schema.csv")
json_path = os.path.join(exports_path, f"{dataset_name}_schema.json")

# CSV: the frequencies DataFrame, written without the index column
frequencies_df.to_csv(csv_path, index=False)

# JSON: the parser's JSON representation with VoID nodes filtered out
with open(json_path, 'w', encoding='utf-8') as fh:
    json.dump(vp.to_json(filter_void_nodes=True), fh, indent=2)

for _kind, _path in (("CSV", csv_path), ("JSON", json_path)):
    print(f"{_kind} exported to: {_path}")

In [None]:
# Write the instances DataFrame to <exports_path>/<dataset_name>_instances.parquet
if instances_df is None or instances_df.empty:
    print("No instances DataFrame available to export")
else:
    # Export a copy whose categorical columns are converted to plain strings
    # for better parquet compatibility
    instances_export = instances_df.copy()
    _categorical_cols = instances_export.select_dtypes(include=['category']).columns
    for col in _categorical_cols:
        instances_export[col] = instances_export[col].astype(str)

    parquet_path = os.path.join(exports_path, f"{dataset_name}_instances.parquet")
    instances_export.to_parquet(parquet_path, index=False)

    print(f"Exported instances DataFrame to: {parquet_path}")
    print(f"Shape: {instances_export.shape}")
    print(f"File size: {os.path.getsize(parquet_path) / (1024*1024):.2f} MB")
    print(f"Columns: {list(instances_export.columns)}")
    print("Linked to frequencies via 'shape_id' column")