In [None]:
import re
import os
import google.generativeai as genai
from typing import Dict, List, Set, Tuple, Optional
from getpass import getpass
from rdflib import Graph, Namespace

# =================== CONFIGURATION ===================
class Config:
    """Static settings for the pipeline: SPARQL prefixes, mock patterns, API key lookup."""

    # Example CIDOC-CRM prefix declarations (leading and trailing newline are
    # part of the value and are relied on when the block is embedded in text).
    PREFIXES = (
        "\n"
        "PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\n"
        "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"
        "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
    )

    # Mocked, precomputed path patterns keyed by A (1 to 4), taken from the
    # paper's examples. Each pattern is a "->"-separated chain of names.
    MOCK_PATH_PATTERNS = {
        1: [
            "E22_Man-Made_Object->P108i_was_produced_by->E12_Production",
            "E39_Actor->P92i_was_brought_into_existence_by->E63_Beginning_of_Existence",
        ],
        2: [
            "E22_Man-Made_Object->P108i_was_produced_by->E12_Production->P14_carried_out_by->E39_Actor",
            "E12_Production->P14_carried_out_by->E39_Actor->P92i_was_brought_into_existence_by->E63_Beginning_of_Existence",
        ],
        3: [
            "E22_Man-Made_Object->P108i_was_produced_by->E12_Production->P14_carried_out_by->E39_Actor->P92i_was_brought_into_existence_by->E63_Beginning_of_Existence",
        ],
        4: [
            "E22_Man-Made_Object->P108i_was_produced_by->E12_Production->P14_carried_out_by->E39_Actor->P92i_was_brought_into_existence_by->E63_Beginning_of_Existence->P7_took_place_at->E53_Place",
        ],
    }

    @staticmethod
    def get_api_key():
        """Return the Gemini API key.

        The GEMINI_API_KEY environment variable wins when it is set to a
        non-empty value; otherwise the user is prompted without echo.
        """
        env_key = os.getenv("GEMINI_API_KEY")
        if env_key:
            return env_key
        return getpass("Enter Gemini API key: ")

# Configure Gemini.
# NOTE: this runs at module import/cell execution time. Config.get_api_key()
# falls back to an interactive getpass() prompt when GEMINI_API_KEY is not
# set, so executing this module may block waiting for user input.
genai.configure(api_key=Config.get_api_key())
# Module-level model handle; CIDOCQuerySystem calls gemini.generate_content()
# for both the prediction stage and the SPARQL generation stage.
gemini = genai.GenerativeModel('gemini-2.0-flash')

# =================== CORE IMPLEMENTATION ===================
class CIDOCQuerySystem:
    """Two-stage SPARQL generator driven by ontology path patterns.

    Stage 1 asks the LLM which classes and properties are relevant to a
    question. Stage 2 filters the known path patterns with those predictions
    and asks the LLM to write a SPARQL query from the surviving patterns.
    """

    def __init__(self):
        self.path_patterns = Config.MOCK_PATH_PATTERNS  # Replace with real KG path queries

    def _format_patterns(self, patterns: List[str]) -> str:
        """Format path patterns as a bulleted list for prompts."""
        return "\n".join([f"• {p}" for p in patterns])

    @staticmethod
    def _normalize_element(name: str) -> str:
        """Reduce a predicted class/property name to its bare local name.

        The LLM commonly answers with prefixed names ("crm:E39_Actor") or
        full IRIs, while the path patterns use bare names ("E39_Actor").
        Surrounding whitespace, quotes, backticks and angle brackets are
        removed, then everything up to the last ':', '/' or '#' is dropped.
        """
        cleaned = name.strip().strip("`'\"<>").strip()
        return re.split(r"[:/#]", cleaned)[-1].strip()

    @classmethod
    def _parse_element_list(cls, text: str, label: str) -> Set[str]:
        """Extract the ``List of <label>: {a, b, ...}`` entries from LLM output.

        Args:
            text: Raw LLM response text.
            label: "Classes" or "Properties".

        Returns:
            The normalized, non-empty names; an empty set when the list is
            missing or written as ``{}``.
        """
        pattern = r"List of " + re.escape(label) + r":\s*\{([^}]*)\}"
        match = re.search(pattern, text)
        if not match:
            return set()
        # Split on bare commas: the model does not reliably emit ", ".
        names = (cls._normalize_element(part) for part in match.group(1).split(","))
        # Drop empties: an empty string would otherwise match every pattern.
        return {n for n in names if n}

    @staticmethod
    def _path_has(tokens: List[str], element: str) -> bool:
        """Tell whether a path (already split on "->") contains *element*.

        A token matches when it equals the element, or when the element is
        the identifier part before the first underscore ("E39" matches
        "E39_Actor"). Plain substring tests are avoided because "E1" would
        otherwise match "E12_Production".
        """
        return any(tok == element or tok.startswith(element + "_") for tok in tokens)

    @staticmethod
    def _extract_sparql(text: str) -> Optional[str]:
        """Pull the SPARQL query out of a fenced code block in *text*.

        A block tagged ``sparql`` is preferred; otherwise the first fenced
        block is used, since the prompt only asks for "triple backticks".
        Returns None when no non-empty fenced block is found.
        """
        flags = re.DOTALL | re.IGNORECASE
        match = re.search(r"```[ \t]*sparql\b(.*?)```", text, flags)
        if match is None:
            # Optional language tag on the fence line is skipped.
            match = re.search(r"```(?:[A-Za-z]+[ \t]*\n)?(.*?)```", text, flags)
        if match is None:
            return None
        query = match.group(1).strip()
        return query or None

    @staticmethod
    def _prepend_missing_prefixes(prefix_block: str, sparql: str) -> str:
        """Prepend only those PREFIX lines that *sparql* does not declare.

        When the query declares no prefixes at all, the whole block is
        prepended unchanged. When it already declares some, re-declaring
        them would only produce duplicate PREFIX lines, so just the missing
        ones are added.
        """
        prefix_re = r"^\s*PREFIX\s+([^\s:]*)\s*:"
        declared = set(re.findall(prefix_re, sparql, re.IGNORECASE | re.MULTILINE))
        if not declared:
            return prefix_block + "\n" + sparql

        missing = []
        for line in prefix_block.splitlines():
            found = re.match(prefix_re, line, re.IGNORECASE)
            if found and found.group(1) not in declared:
                missing.append(line.strip())
        if not missing:
            return sparql
        return "\n".join(missing) + "\n" + sparql

    def get_prediction_prompt(self, question: str) -> str:
        """Stage 1: build the prompt that asks for relevant classes/properties."""
        triple_patterns = self._format_patterns(self.path_patterns.get(1, []))

        return f"""{Config.PREFIXES}

# TASK: Identify relevant CIDOC-CRM classes and properties
# From these TRIPLE PATTERNS:
{triple_patterns}

# INSTRUCTIONS:
1. List CLASSES and PROPERTIES needed to answer:
Question: {question}

Output format:
List of Classes: {{class1, class2, ...}}
List of Properties: {{prop1, prop2, ...}}
"""

    def predict_classes_properties(self, question: str) -> Tuple[Set[str], Set[str]]:
        """Ask the LLM for the classes and properties relevant to *question*.

        Returns:
            ``(classes, properties)`` as sets of bare local names (prefixes
            such as ``crm:`` removed). Both sets are empty when the LLM call
            fails; the error is printed rather than raised.
        """
        prompt = self.get_prediction_prompt(question)

        try:
            response = gemini.generate_content(prompt)
            text = response.text
        except Exception as e:
            # Best-effort: a failed prediction degrades to "no predictions".
            print(f"Prediction error: {e}")
            return set(), set()

        classes = self._parse_element_list(text, "Classes")
        properties = self._parse_element_list(text, "Properties")
        return classes, properties

    def filter_path_patterns(self,
                             classes: Set[str],
                             properties: Set[str],
                             max_radius: int = 4) -> List[str]:
        """Apply filtering rules A → B → C and return the first non-empty result.

        Args:
            classes: Predicted class names (prefixed or bare).
            properties: Predicted property names (prefixed or bare).
            max_radius: Highest key of ``self.path_patterns`` to consider.

        Returns:
            Patterns, in stored order, selected by the first rule that
            matches anything:
              A. pattern contains all predicted classes and properties;
              B. pattern contains all classes, or all properties;
              C. pattern contains at least one predicted element.
            With no predictions at all, every pattern is returned. An empty
            list means no rule matched.
        """
        wanted_classes = {n for n in map(self._normalize_element, classes) if n}
        wanted_props = {n for n in map(self._normalize_element, properties) if n}

        # Pair each pattern with its tokens once, instead of per rule.
        candidates = []
        for r in range(1, max_radius + 1):
            for pattern in self.path_patterns.get(r, []):
                tokens = [tok.strip() for tok in pattern.split("->")]
                candidates.append((pattern, tokens))

        if not wanted_classes and not wanted_props:
            # Nothing to filter on: hand every pattern to stage 2.
            return [pattern for pattern, _ in candidates]

        def has_all(tokens, wanted):
            return all(self._path_has(tokens, w) for w in wanted)

        def has_any(tokens, wanted):
            return any(self._path_has(tokens, w) for w in wanted)

        # Rule A: All predicted elements
        rule_a = [pattern for pattern, tokens in candidates
                  if has_all(tokens, wanted_classes) and has_all(tokens, wanted_props)]
        if rule_a:
            return rule_a

        # Rule B: All classes OR all properties. An empty set must not count
        # as "all present", otherwise every pattern would pass vacuously.
        rule_b = [pattern for pattern, tokens in candidates
                  if (bool(wanted_classes) and has_all(tokens, wanted_classes))
                  or (bool(wanted_props) and has_all(tokens, wanted_props))]
        if rule_b:
            return rule_b

        # Rule C: At least one predicted element
        return [pattern for pattern, tokens in candidates
                if has_any(tokens, wanted_classes) or has_any(tokens, wanted_props)]

    def get_generation_prompt(self,
                            question: str,
                            patterns: List[str]) -> str:
        """Stage 2: build the prompt that asks for SPARQL from filtered patterns."""
        patterns_text = self._format_patterns(patterns)

        return f"""{Config.PREFIXES}

# TASK: Generate SPARQL using these patterns:
{patterns_text}

# IMPORTANT RULES:
1. Never use a class as a property (or vice versa)
2. Maintain proper path structure from patterns
3. Return ONLY the SPARQL query in triple backticks

Question: {question}
"""

    def generate_sparql(self, question: str) -> str:
        """Run the full two-stage process for *question*.

        Returns:
            The generated query with any missing PREFIX declarations
            prepended; or the string "No valid SPARQL generated" when the
            response has no fenced query; or "Generation error: ..." when
            the LLM call fails. Errors are returned, not raised.
        """
        # Stage 1: Predict relevant elements
        classes, properties = self.predict_classes_properties(question)
        print(f"Predicted classes: {classes}, Properties: {properties}")

        # Stage 2: Filter patterns and generate query
        filtered_patterns = self.filter_path_patterns(classes, properties)
        prompt = self.get_generation_prompt(question, filtered_patterns)

        try:
            response = gemini.generate_content(prompt)
            sparql = self._extract_sparql(response.text)
        except Exception as e:
            return f"Generation error: {str(e)}"

        if sparql is None:
            return "No valid SPARQL generated"
        return self._prepend_missing_prefixes(Config.PREFIXES, sparql)

# =================== BENCHMARK TESTING ===================
class BenchmarkTester:
    """Feeds the paper's benchmark questions through a CIDOCQuerySystem."""

    # Sample questions; the comments keep the A value each one is listed under.
    BENCHMARK_QUESTIONS = [
        "Who is the holder of an artwork?",  # A=1
        "Which is the birthplace of an artist?",  # A=2
        "For which artworks did production start after 1934-01-01?",  # A=3
        "Give me the number of artworks per place of birth of artist",  # A=4
        # Mixed A
        "Give me the production starting date of an artwork and the birth date of its artist",
    ]

    def __init__(self):
        self.system = CIDOCQuerySystem()

    def run_benchmark(self):
        """Generate SPARQL for every benchmark question, printing as it goes.

        Returns:
            A list of ``(question, generated_sparql)`` tuples in the order of
            BENCHMARK_QUESTIONS.
        """
        separator = "=" * 50
        outcomes = []
        for question in self.BENCHMARK_QUESTIONS:
            print(f"\n{separator}\nQuestion: {question}")
            generated = self.system.generate_sparql(question)
            print("Generated SPARQL:")
            print(generated)
            outcomes.append((question, generated))
        return outcomes

# =================== MAIN EXECUTION ===================
if __name__ == "__main__":
    # Simple text menu: run the benchmark, answer ad-hoc questions, or exit.
    print("🧠 CIDOC-CRM SPARQL Generation System")
    print("Based on: 'Generating SPARQL Queries over CIDOC-CRM Using a Two-Stage Ontology Path Patterns Method'")

    tester = BenchmarkTester()

    while True:
        try:
            # Strip so that "1 " or " 2" are accepted like "1" / "2".
            choice = input("\nChoose mode:\n1. Run Benchmark Tests\n2. Interactive Mode\n3. Exit\n> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the menu: leave without a traceback.
            print("\n👋 Goodbye!")
            break

        if choice == "1":
            print("\n🧪 Running benchmark tests...")
            tester.run_benchmark()

        elif choice == "2":
            system = CIDOCQuerySystem()
            print("\n🔁 Interactive Mode (type 'exit' to quit)")
            while True:
                try:
                    question = input("\n❓ Question: ").strip()
                except (EOFError, KeyboardInterrupt):
                    # Return to the menu instead of crashing.
                    print()
                    break
                if not question:
                    # A blank line would cost two LLM calls for nothing.
                    continue
                if question.lower() in ['exit', 'quit']:
                    break
                sparql = system.generate_sparql(question)
                print("\n📄 Generated SPARQL:")
                print(sparql)

        elif choice == "3":
            print("👋 Goodbye!")
            break

        else:
            print("❌ Invalid choice. Please try again.")

🧠 CIDOC-CRM SPARQL Generation System
Based on: 'Generating SPARQL Queries over CIDOC-CRM Using a Two-Stage Ontology Path Patterns Method'

🔁 Interactive Mode (type 'exit' to quit)
Predicted classes: {'crm:E22_Man-Made_Object', 'crm:E39_Actor', 'crm:E63_Beginning_of_Existence', 'crm:E12_Production'}, Properties: {'crm:P108i_was_produced_by', 'crm:P92i_was_brought_into_existence_by'}

📄 Generated SPARQL:

PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?model
WHERE {
  ?model rdf:type crm:E39_Model .
}
Predicted classes: {'crm:E22_Man-Made_Object', 'crm:E39_Actor', 'crm:E63_Beginning_of_Existence', 'crm:E12_Production'}, Properties: {'crm:P108i_was_produced_by', 'crm:P92i_was_brought_into_existence_by'}

📄 Gen