In [1]:
# Neo4j Schema Documentation Generator
import pandas as pd
from neo4j import GraphDatabase
import logging
from typing import List, Dict, Any
import json
from collections import defaultdict

# Configuration
NEO4J_URI = "neo4j://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "kuxFc8HN"  # Update as needed

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

In [2]:
class Neo4jSchemaAnalyzer:
    def __init__(self, uri: str, user: str, password: str):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        
    def close(self):
        self.driver.close()
        
    def get_node_labels(self) -> List[str]:
        """Get all node labels in the database"""
        query = "CALL db.labels()"
        with self.driver.session() as session:
            result = session.run(query)
            return [record["label"] for record in result]
    
    def get_relationship_types(self) -> List[str]:
        """Get all relationship types in the database"""
        query = "CALL db.relationshipTypes()"
        with self.driver.session() as session:
            result = session.run(query)
            return [record["relationshipType"] for record in result]
    
    def get_node_properties(self, label: str) -> Dict[str, str]:
        """Get property names and types for a node label"""
        query = f"""
        MATCH (n:{label})
        WITH n LIMIT 1
        RETURN properties(n) as props
        """
        with self.driver.session() as session:
            result = session.run(query).single()
            if result:
                props = result["props"]
                return {k: type(v).__name__ for k, v in props.items()}
            return {}
    
    def get_constraints(self) -> List[Dict]:
        """Get all constraints in the database"""
        query = "SHOW CONSTRAINTS"
        with self.driver.session() as session:
            result = session.run(query)
            return [dict(record) for record in result]
    
    def get_indexes(self) -> List[Dict]:
        """Get all indexes in the database"""
        query = "SHOW INDEXES"
        with self.driver.session() as session:
            result = session.run(query)
            return [dict(record) for record in result]
    
    def get_relationship_patterns(self) -> List[Dict]:
        """Get patterns of how nodes are connected"""
        query = """
        MATCH (a)-[r]->(b)
        RETURN DISTINCT
            labels(a) as source_labels,
            type(r) as relationship,
            labels(b) as target_labels,
            count(*) as frequency
        ORDER BY frequency DESC
        """
        with self.driver.session() as session:
            result = session.run(query)
            return [dict(record) for record in result]
    
    def get_node_counts(self) -> Dict[str, int]:
        """Get counts of nodes by label"""
        counts = {}
        for label in self.get_node_labels():
            query = f"MATCH (n:{label}) RETURN count(n) as count"
            with self.driver.session() as session:
                result = session.run(query).single()
                counts[label] = result["count"]
        return counts
    
    def get_relationship_counts(self) -> Dict[str, int]:
        """Get counts of relationships by type"""
        counts = {}
        for rel_type in self.get_relationship_types():
            query = f"MATCH ()-[r:{rel_type}]->() RETURN count(r) as count"
            with self.driver.session() as session:
                result = session.run(query).single()
                counts[rel_type] = result["count"]
        return counts

In [3]:
class SchemaDocumentGenerator:
    def __init__(self, analyzer: Neo4jSchemaAnalyzer):
        self.analyzer = analyzer
        
    def generate_markdown(self) -> str:
        """Generate complete schema documentation in markdown format"""
        sections = [
            self._generate_overview(),
            self._generate_node_labels_section(),
            self._generate_relationships_section(),
            self._generate_constraints_section(),
            self._generate_indexes_section(),
            self._generate_patterns_section()
        ]
        return "\n\n".join(sections)
    
    def _generate_overview(self) -> str:
        node_counts = self.analyzer.get_node_counts()
        rel_counts = self.analyzer.get_relationship_counts()
        
        overview = [
            "# Neo4j Database Schema Documentation",
            
            "## Overview",
            "### Node Summary",
            "| Label | Count |",
            "|-------|-------|",
        ]
        
        for label, count in node_counts.items():
            overview.append(f"| {label} | {count:,} |")
            
        overview.extend([
            "\n### Relationship Summary",
            "| Type | Count |",
            "|------|-------|",
        ])
        
        for rel_type, count in rel_counts.items():
            overview.append(f"| {rel_type} | {count:,} |")
            
        return "\n".join(overview)
    
    def _generate_node_labels_section(self) -> str:
        sections = ["\n## Node Labels and Properties"]
        
        for label in self.analyzer.get_node_labels():
            properties = self.analyzer.get_node_properties(label)
            sections.extend([
                f"\n### {label}",
                "| Property | Type |",
                "|----------|------|"
            ])
            for prop, prop_type in properties.items():
                sections.append(f"| {prop} | {prop_type} |")
                
        return "\n".join(sections)
    
    def _generate_relationships_section(self) -> str:
        sections = ["\n## Relationship Types"]
        patterns = self.analyzer.get_relationship_patterns()
        
        rel_by_type = defaultdict(list)
        for pattern in patterns:
            rel_type = pattern["relationship"]
            source = ", ".join(pattern["source_labels"])
            target = ", ".join(pattern["target_labels"])
            freq = pattern["frequency"]
            rel_by_type[rel_type].append(f"({source}) -> ({target}) [{freq:,} instances]")
            
        for rel_type, patterns in rel_by_type.items():
            sections.extend([
                f"\n### {rel_type}",
                "Patterns:"
            ])
            for pattern in patterns:
                sections.append(f"- {pattern}")
                
        return "\n".join(sections)
    
    def _generate_constraints_section(self) -> str:
        constraints = self.analyzer.get_constraints()
        sections = [
            "\n## Constraints",
            "| Name | Type | For | Properties |",
            "|------|------|-----|------------|"
        ]
        
        for constraint in constraints:
            name = constraint.get("name", "")
            type_ = constraint.get("type", "")
            for_ = constraint.get("for", "")
            properties = ", ".join(constraint.get("properties", []))
            sections.append(f"| {name} | {type_} | {for_} | {properties} |")
            
        return "\n".join(sections)
    
    def _generate_indexes_section(self) -> str:
        indexes = self.analyzer.get_indexes()
        sections = [
            "\n## Indexes",
            "| Name | Type | For | Properties |",
            "|------|------|-----|------------|"
        ]
        
        for index in indexes:
            name = index.get("name", "")
            type_ = index.get("type", "")
            for_ = index.get("for", "")
            # Handle case where properties might not be a list
            props = index.get("properties", [])
            if not isinstance(props, list):
                props = [str(props)] if props else []
            properties = ", ".join(props)
            sections.append(f"| {name} | {type_} | {for_} | {properties} |")
            
        return "\n".join(sections)
    
    def _generate_patterns_section(self) -> str:
        patterns = self.analyzer.get_relationship_patterns()
        sections = [
            "\n## Common Patterns",
            "| Source | Relationship | Target | Frequency |",
            "|--------|--------------|--------|------------|"
        ]
        
        for pattern in patterns:
            source = ", ".join(pattern["source_labels"])
            rel = pattern["relationship"]
            target = ", ".join(pattern["target_labels"])
            freq = pattern["frequency"]
            sections.append(f"| {source} | {rel} | {target} | {freq:,} |")
            
        return "\n".join(sections)

In [4]:
# Generate and display schema documentation
analyzer = Neo4jSchemaAnalyzer(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
generator = SchemaDocumentGenerator(analyzer)

def datetime_handler(obj):
    """Handle Neo4j DateTime serialization"""
    if hasattr(obj, 'isoformat'):
        return obj.isoformat()
    raise TypeError(f'Object of type {type(obj)} is not JSON serializable')

try:
    # Generate markdown documentation
    documentation = generator.generate_markdown()
    
    # Display in notebook
    from IPython.display import display, Markdown
    display(Markdown(documentation))
    
    # Save to file
    with open('neo4j_schema_documentation.md', 'w', encoding='utf-8') as f:
        f.write(documentation)
    print("\nDocumentation has been saved to 'neo4j_schema_documentation.md'")
    
    # Generate JSON schema
    schema = {
        "nodes": {},
        "relationships": {},
        "constraints": analyzer.get_constraints(),
        "indexes": analyzer.get_indexes(),
        "patterns": analyzer.get_relationship_patterns()
    }
    
    # Add node information
    for label in analyzer.get_node_labels():
        schema["nodes"][label] = {
            "properties": analyzer.get_node_properties(label),
            "count": analyzer.get_node_counts()[label]
        }
    
    # Add relationship information
    for rel_type in analyzer.get_relationship_types():
        schema["relationships"][rel_type] = {
            "count": analyzer.get_relationship_counts()[rel_type]
        }
    
    # Save JSON schema with custom serializer
    with open('neo4j_schema.json', 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2, default=datetime_handler)
    print("Schema has been saved to 'neo4j_schema.json'")
    
finally:
    analyzer.close()

# Neo4j Database Schema Documentation
## Overview
### Node Summary
| Label | Count |
|-------|-------|
| AICategory | 14 |
| Keyword | 237 |
| Capability | 178 |
| Zone | 4 |
| IntegrationPattern | 70 |
| Version | 1 |
| Metadata | 1 |
| UseCase | 2,052 |
| Agency | 42 |
| Bureau | 325 |
| Output | 1,709 |
| PurposeBenefit | 2,094 |
| EvaluationRun | 0 |
| System | 322 |

### Relationship Summary
| Type | Count |
|------|-------|
| BELONGS_TO | 14 |
| TAGGED_WITH | 242 |
| HAS_CAPABILITY | 192 |
| DEPENDS_ON | 24 |
| INTEGRATES_VIA | 84 |
| CURRENT_VERSION | 1 |
| IMPLEMENTS | 0 |
| HAS_BUREAU | 336 |
| HAS_USE_CASE | 2,052 |
| USES_SYSTEM | 768 |
| HAS_PURPOSE | 2,213 |
| PRODUCES | 2,074 |


## Node Labels and Properties

### AICategory
| Property | Type |
|----------|------|
| id | str |
| status | str |
| maturity_level | str |
| name | str |
| created_at | DateTime |
| last_updated | DateTime |
| category_definition | str |
| zone | str |

### Keyword
| Property | Type |
|----------|------|
| name | str |

### Capability
| Property | Type |
|----------|------|
| name | str |

### Zone
| Property | Type |
|----------|------|
| name | str |

### IntegrationPattern
| Property | Type |
|----------|------|
| name | str |

### Version
| Property | Type |
|----------|------|
| author | str |
| created_at | DateTime |
| number | str |
| changes | str |

### Metadata
| Property | Type |
|----------|------|
| description | str |
| last_updated | DateTime |
| schema_version | str |
| version | str |

### UseCase
| Property | Type |
|----------|------|
| has_ato | bool |
| topic_area | str |
| updated_at | DateTime |
| contains_pii | bool |
| date_initiated | str |
| name | str |
| agency | str |
| purpose_benefits | str |
| outputs | str |
| dev_stage | str |
| infrastructure | str |

### Agency
| Property | Type |
|----------|------|
| name | str |
| abbreviation | str |

### Bureau
| Property | Type |
|----------|------|
| name | str |

### Output
| Property | Type |
|----------|------|
| description | str |

### PurposeBenefit
| Property | Type |
|----------|------|
| description | str |

### EvaluationRun
| Property | Type |
|----------|------|

### System
| Property | Type |
|----------|------|
| name | str |


## Relationship Types

### HAS_PURPOSE
Patterns:
- (UseCase) -> (PurposeBenefit) [2,213 instances]

### PRODUCES
Patterns:
- (UseCase) -> (Output) [2,074 instances]

### HAS_USE_CASE
Patterns:
- (Agency) -> (UseCase) [2,052 instances]

### USES_SYSTEM
Patterns:
- (UseCase) -> (System) [768 instances]

### HAS_BUREAU
Patterns:
- (Agency) -> (Bureau) [336 instances]

### TAGGED_WITH
Patterns:
- (AICategory) -> (Keyword) [242 instances]

### HAS_CAPABILITY
Patterns:
- (AICategory) -> (Capability) [192 instances]

### INTEGRATES_VIA
Patterns:
- (AICategory) -> (IntegrationPattern) [84 instances]

### DEPENDS_ON
Patterns:
- (AICategory) -> (AICategory) [24 instances]

### BELONGS_TO
Patterns:
- (AICategory) -> (Zone) [14 instances]

### CURRENT_VERSION
Patterns:
- (Metadata) -> (Version) [1 instances]


## Constraints
| Name | Type | For | Properties |
|------|------|-----|------------|
| agency_name | UNIQUENESS |  | name |
| ai_category_id | UNIQUENESS |  | id |
| bureau_name | UNIQUENESS |  | name |
| capability_name | UNIQUENESS |  | name |
| evaluation_run_id | UNIQUENESS |  | run_id |
| keyword_name | UNIQUENESS |  | name |
| output_composite | UNIQUENESS |  | description, agency |
| purpose_benefit_composite | UNIQUENESS |  | description, agency |
| system_name | UNIQUENESS |  | name |
| use_case_composite | UNIQUENESS |  | name, agency |
| zone_name | UNIQUENESS |  | name |


## Indexes
| Name | Type | For | Properties |
|------|------|-----|------------|
| agency_name | RANGE |  | name |
| ai_category_id | RANGE |  | id |
| ai_category_name | RANGE |  | name |
| ai_category_zone | RANGE |  | zone |
| bureau_name | RANGE |  | name |
| capability_name | RANGE |  | name |
| evaluation_run_id | RANGE |  | run_id |
| evaluation_timestamp | RANGE |  | timestamp |
| index_343aff4e | LOOKUP |  |  |
| index_f7700477 | LOOKUP |  |  |
| keyword_name | RANGE |  | name |
| output_composite | RANGE |  | description, agency |
| purpose_benefit_composite | RANGE |  | description, agency |
| system_name | RANGE |  | name |
| use_case_agency | RANGE |  | agency |
| use_case_composite | RANGE |  | name, agency |
| use_case_confidence | RANGE |  | confidence |
| use_case_match_method | RANGE |  | match_method |
| use_case_match_score | RANGE |  | final_score |
| use_case_topic | RANGE |  | topic_area |
| zone_name | RANGE |  | name |


## Common Patterns
| Source | Relationship | Target | Frequency |
|--------|--------------|--------|------------|
| UseCase | HAS_PURPOSE | PurposeBenefit | 2,213 |
| UseCase | PRODUCES | Output | 2,074 |
| Agency | HAS_USE_CASE | UseCase | 2,052 |
| UseCase | USES_SYSTEM | System | 768 |
| Agency | HAS_BUREAU | Bureau | 336 |
| AICategory | TAGGED_WITH | Keyword | 242 |
| AICategory | HAS_CAPABILITY | Capability | 192 |
| AICategory | INTEGRATES_VIA | IntegrationPattern | 84 |
| AICategory | DEPENDS_ON | AICategory | 24 |
| AICategory | BELONGS_TO | Zone | 14 |
| Metadata | CURRENT_VERSION | Version | 1 |


Documentation has been saved to 'neo4j_schema_documentation.md'
Schema has been saved to 'neo4j_schema.json'
