In [3]:
import logging, sys

# Xoá toàn bộ handler hiện có
for h in logging.root.handlers[:]:
    logging.root.removeHandler(h)

# Tạo handler mới ghi ra stdout
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s:%(name)s: %(message)s"))

# Gắn vào root logger, và bật mức DEBUG
root = logging.getLogger()
root.setLevel(logging.DEBUG)
root.addHandler(handler)

import streamlit as st
from neo4j import GraphDatabase
import ollama
import numpy as np
from functools import lru_cache
import json
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
# Neo4j Connection
class Neo4jConnection:
    def __init__(self, uri, user, password):
        self._driver = GraphDatabase.driver(
            uri, 
            auth=(user, password),
            max_connection_pool_size=50,
            connection_acquisition_timeout=30
        )
        
    def run_query(self, query, parameters=None):
        try:
            with self._driver.session(
                database="neo4j",
                default_access_mode=GraphDatabase.WRITE_ACCESS
            ) as session:
                result = session.run(query, parameters or {})
                return [dict(r.items()) for r in result]
        except Exception as e:
            logging.error(f"Neo4j query failed: {str(e)}")
            logging.debug(f"Failed query: {query}")
            logging.debug(f"Parameters: {parameters}")
            return []
    def close(self):
        self.driver.close()

    # def run_query(self, query, parameters=None):
    #     with self.driver.session() as session:
    #         result = session.run(query, parameters or {})
    #         records = [r.data() for r in result]


    #     cleaned = []
    #     for record in records:
    #         clean = {}
    #         for k, v in record.items():
    #             new_key = k.strip().strip('"').strip()
    #             clean[new_key] = v
    #         cleaned.append(clean)

    #     return cleaned
    # def run_query(self, query, parameters=None):
    #     try:
    #         with self.driver.session() as session:
    #             return session.execute_write(
    #                 lambda tx: list(tx.run(query, parameters or {})))
    #     except Exception as e:
    #         logging.error(f"Query failed: {str(e)}")
    #         return []

    def store_embedding(self, node_id, embedding):
        query = """
        MATCH (c:Course {url: $node_id})
        SET c.embedding = $embedding
        """
        self.run_query(query, {"node_id": node_id, "embedding": embedding.tolist()})

# LSTM Embedding Model
class LSTMEmbeddingModel:
    def __init__(self, vocab_size=10000, embedding_dim=128, max_length=200):
        self.tokenizer = Tokenizer(num_words=vocab_size)
        self.max_length = max_length
        self.model = Sequential([
            Embedding(vocab_size, embedding_dim, input_length=max_length),
            LSTM(64, return_sequences=False),
            Dense(embedding_dim, activation='tanh')
        ])

    def fit(self, texts):
        self.tokenizer.fit_on_texts(texts)
        sequences = self.tokenizer.texts_to_sequences(texts)
        padded = pad_sequences(sequences, maxlen=self.max_length, padding='post')
        self.model.compile(optimizer='adam', loss='mse')
        dummy_labels = np.zeros((len(texts), 128))
        self.model.fit(padded, dummy_labels, epochs=1, verbose=0)

    def get_embedding(self, text):
        sequence = self.tokenizer.texts_to_sequences([text])
        padded = pad_sequences(sequence, maxlen=self.max_length, padding='post')
        return self.model.predict(padded, verbose=0)[0]

# Query Processor with Qwen/deepseek
class QueryProcessor:
    def __init__(self, qwen_model):
        self.qwen_model = qwen_model
        # Improved regex pattern to better capture JSON array
        self.json_pattern = re.compile(r'\[\s*{.*}\s*\]', re.DOTALL)

    def extract_json_from_text(self, text):
        """Extract JSON array from text handling multiple formats"""
        # Try to find JSON part with more flexible parsing
        json_pattern = re.compile(
            r'```json\s*(\[.*?\])\s*```', 
            re.DOTALL | re.IGNORECASE
        )
        
        # Attempt 1: Look for ```json ``` blocks
        match = json_pattern.search(text)
        if match:
            try:
                return json.loads(match.group(1))
            except Exception as e:
                logging.debug(f"JSON extraction attempt 1 failed: {e}")

        # Attempt 2: Look after </think> tag
        parts = text.split('</think>')
        if len(parts) > 1:
            json_part = parts[-1].strip()
            try:
                # Remove any non-JSON characters before/after
                json_part = json_part[json_part.find('['):json_part.rfind(']')+1]
                return json.loads(json_part)
            except Exception as e:
                logging.debug(f"JSON extraction attempt 2 failed: {e}")

        # Attempt 3: Find first valid JSON array
        try:
            json_str = re.search(r'(\[.*?\])', text, re.DOTALL).group(1)
            return json.loads(json_str)
        except Exception as e:
            logging.debug(f"JSON extraction attempt 3 failed: {e}")
            raise ValueError("Failed to parse JSON from LLM response")
    def analyze_query(self, user_query):
        prompt = """
        You are an expert in creating Cypher queries for Neo4j to answer complex, multihop questions about course data. The knowledge graph contains the following nodes and relationships:

        - Nodes:
        - Course: Represents a course in the dataset, with properties like url (unique identifier), name (course title), duration (length in hours), rating (user rating), description (course summary), embedding (vector representation).
        - Skill: Represents a skill taught by a course, with properties like name (e.g., 'AWS Lambda', 'Python').
        - Level: Represents the difficulty level of a course, with properties like name (e.g., 'Beginner', 'Intermediate', 'Advanced').
        - Organization: Represents the organization offering the course, with properties like name (e.g., 'Coursera', 'Udemy').
        - Instructor: Represents the instructor teaching the course, with properties like name (e.g., 'John Doe').
        - Career: Represents a career path, with properties like name (e.g., 'Cloud Computing', 'Data Science').

        - Relationships:
        - TEACHES: Connects Course to Skill (Course -> Skill), indicating the course teaches the skill.
        - HAS_LEVEL: Connects Course to Level (Course -> Level), indicating the course's difficulty level.
        - OFFERED_BY: Connects Course to Organization (Course -> Organization), indicating the course is offered by the organization.
        - TAUGHT_BY: Connects Course to Instructor (Course -> Instructor), indicating the course is taught by the instructor.
        - REQUIRES: Connects Career to Skill (Career -> Skill), indicating the career requires the skill.

        Your task is to analyze the natural language query below and **decompose** it into a **minimal**, **sequential** chain of Cypher sub-queries.  

        1. First output a '<think> ... </think>' block with your step-by-step reasoning in plain English, describing:
        - Which entity or relationship you'll target.
        - Which filters you'll apply.
        - How intermediate results will flow into the next step.

        2. Then, immediately after '</think>', output **only** a JSON array. Each element must be an object with exactly these keys:
        - "description": one-sentence summary of this step.
        - "cypher": the full Cypher query.
        - "parameters": a dict of named parameters (use $skill_name, $level, etc.) or refer to prior binds via "$<bind_name>".
        - "return": either "intermediate" or "final candidates".
        - "bind": (only for "intermediate" steps) the name you'll use to pass results forward (e.g. "skill_node", "courses").

        3. **Do not** output any other text outside the <think> block and the JSON.

        4. Escape all literal {{ and }} in the JSON by doubling them ({{/}}) if you plan to call Python's '.format()'.  

        Example for query “Find beginner courses about AWS SageMaker”:

        <think>
        First locate the Skill node “AWS SageMaker”, then collect all Courses teaching that skill, then filter those by Level = “Beginner”. Remember to put those sub-process in list: [{{...}},..., {{...}}] for example.
        </think>
        '''json
        [
        {{
            "description": "Locate the AWS SageMaker skill node",
            "cypher": "MATCH (sk:Skill {{name: $skill_name}}) RETURN sk AS skill_node",
            "parameters": {{"skill_name": "AWS SageMaker"}},
            "return": "intermediate",
            "bind": "skill_node"
        }},
        {{
            "description": "Find all courses that teach that skill",
            "cypher": "MATCH (c:Course)-[:TEACHES]->(sk) RETURN collect(c) AS courses",
            "parameters": {{"sk": "$skill_node"}},
            "return": "intermediate",
            "bind": "courses"
        }},
        {{
            "description": "Filter those courses to Beginner level",
            "cypher": "UNWIND $courses AS c MATCH (c)-[:HAS_LEVEL]->(l:Level {{name: $level}}) RETURN c.url AS course_url, c.name AS course_title",
            "parameters": {{"courses": "$courses", "level": "Beginner"}},
            "return": "final candidates"
        }}
        ]
        **Important Formatting Rules:**
        - Wrap JSON output between ```json and ```
        - Do NOT add any text/comments after JSON
        - Use ONLY double quotes for JSON
        - Escape special characters properly

        Query: {user_query}
        """
        
        # Generate response from LLM
        resp = ollama.generate(
            model=self.qwen_model,
            prompt=prompt.format(user_query=user_query),
            options={'stop': ['```']}  # Stop generation after JSON block
        )

        # Log the entire raw response
        raw = resp["response"]
        logging.debug("=== Raw Qwen response start ===")
        logging.debug(raw)
        logging.debug("=== Raw Qwen response end ===")

        # Extract JSON using improved method
        try:
            steps = self.extract_json_from_text(raw)
            return steps
        except Exception as e:
            logging.error(f"Error extracting JSON: {e}")
            logging.error(f"Raw response causing error: {raw}")
            raise RuntimeError(f"Failed to parse query steps from LLM response: {e}")

# Knowledge Base QA System
class KnowledgeBaseQA:
    def __init__(self, neo4j_uri, neo4j_user, neo4j_password, qwen_model, embedding_model):
        self.neo4j_conn = Neo4jConnection(neo4j_uri, neo4j_user, neo4j_password)
        self.query_processor = QueryProcessor(qwen_model)
        self.embedding_model = embedding_model

    def precompute_embeddings(self, courses):
        descriptions = [course.get('description', '') for course in courses]
        self.embedding_model.fit(descriptions)
        for course in courses:
            embedding = self.embedding_model.get_embedding(course.get('description', ''))
            self.neo4j_conn.store_embedding(course.get('url', ''), embedding)

    @lru_cache(maxsize=100)
    # def process_query(self, user_query):
    #     try:
    #         # Analyze query with Qwen
    #         steps = self.query_processor.analyze_query(user_query)
            
    #         if not steps or not isinstance(steps, list):
    #             logging.error(f"Invalid steps format returned: {steps}")
    #             return []
                
    #         intermediate_results = []
    #         candidates = []

    #         # Execute each subquery
    #         for step_idx, step in enumerate(steps):
    #             logging.debug(f"Processing step {step_idx+1}: {step}")
                
    #             # Validate step format
    #             if not isinstance(step, dict) or "cypher" not in step:
    #                 logging.error(f"Invalid step format at index {step_idx}: {step}")
    #                 continue
                    
    #             cypher = step["cypher"]
    #             params = step.get("parameters", {})
    #             step_type = step.get("return", "").lower()
                
    #             logging.debug(f"Executing cypher: {cypher} with params: {params}")
                
    #             if step_type == "intermediate":
    #                 results = self.neo4j_conn.run_query(cypher, params)
    #                 for record in results:
    #                     # Extract various possible return values
    #                     for key in ["skill", "description", "name"]:
    #                         if key in record and record[key]:
    #                             intermediate_results.append(record[key])
    #                             break
                                
    #                 logging.debug(f"Intermediate results: {intermediate_results}")
                    
    #             elif step_type == "candidates":
    #                 # If we have intermediate results, use them in the query
    #                 if intermediate_results and params:
    #                     # Find a parameter that might accept a list of values
    #                     list_param = None
    #                     for param_name, param_value in params.items():
    #                         if isinstance(param_value, list):
    #                             list_param = param_name
    #                             break
                        
    #                     if list_param:
    #                         # Use intermediate results in the list parameter
    #                         params[list_param] = intermediate_results
    #                         results = self.neo4j_conn.run_query(cypher, params)
    #                         for record in results:
    #                             if "course_name" in record and record["course_name"]:
    #                                 candidates.append(record["course_name"])
    #                     else:
    #                         # Process each intermediate result individually
    #                         for item in intermediate_results:
    #                             for param_name in params:
    #                                 params_copy = params.copy()
    #                                 params_copy[param_name] = item
    #                                 results = self.neo4j_conn.run_query(cypher, params_copy)
    #                                 for record in results:
    #                                     if "course_name" in record and record["course_name"]:
    #                                         candidates.append(record["course_name"])
    #                 else:
    #                     # No intermediate results or params, just run the query directly
    #                     results = self.neo4j_conn.run_query(cypher, params)
    #                     for record in results:
    #                         if "course_name" in record and record["course_name"]:
    #                             candidates.append(record["course_name"])
                                
    #                 logging.debug(f"Candidates: {candidates}")

    #         # Rank candidates using embeddings
    #         if candidates:
    #             query_embedding = self.embedding_model.get_embedding(user_query)
    #             similarities = []
    #             unique_candidates = list(set(candidates))
    #             logging.debug(f"Unique candidates to rank: {unique_candidates}")
                
    #             for course_name in unique_candidates:
    #                 description = self.get_course_description(course_name)
    #                 if description:
    #                     course_embedding = self.embedding_model.get_embedding(description)
    #                     sim = np.dot(query_embedding, course_embedding) / (
    #                         np.linalg.norm(query_embedding) * np.linalg.norm(course_embedding) + 1e-8  # Add small epsilon to avoid division by zero
    #                     )
    #                     similarities.append((course_name, sim))
    #                 else:
    #                     # If no description, use a default low similarity
    #                     similarities.append((course_name, 0.0))

    #             similarities.sort(key=lambda x: x[1], reverse=True)
    #             return [x[0] for x in similarities[:5]]
    #         return []
            
    #     except Exception as e:
    #         logging.error(f"Error in process_query: {e}", exc_info=True)
    #         raise

    # 
    
    def process_query(self, user_query):
        try:
            steps = self.query_processor.analyze_query(user_query)
            if not isinstance(steps, list):
                return []

            intermediate = {}
            candidates = []
            
            for step_idx, step in enumerate(steps):
                # Validate step structure
                if not self._validate_step(step, step_idx):
                    continue

                # Process parameters with type checking
                processed_params = self._process_parameters(step, intermediate)
                
                # Execute query with error handling
                records = self._execute_cypher(
                    step["cypher"], 
                    processed_params, 
                    step_idx
                )
                if records is None:  # Execution failed
                    return []

                # Handle results
                if step["return"].lower() == "intermediate":
                    self._handle_intermediate(step, records, intermediate)
                else:
                    self._collect_candidates(records, candidates)

            return self._rank_results(candidates, user_query)

        except Exception as e:
            logging.error(f"Query processing failed: {str(e)}", exc_info=True)
            return ["An error occurred while processing your query"]

    # Helper methods
    def _validate_step(self, step, step_idx):
        required_keys = ["description", "cypher", "return"]
        if not all(k in step for k in required_keys):
            logging.error(f"Missing keys in step {step_idx+1}: {step}")
            return False
        if step["return"].lower() == "intermediate" and "bind" not in step:
            logging.error(f"Missing bind key in intermediate step {step_idx+1}")
            return False
        return True

    def _process_parameters(self, step, intermediate):
        processed = {}
        for k, v in step.get("parameters", {}).items():
            if isinstance(v, str) and v.startswith("$"):
                key = v[1:]
                processed[k] = intermediate.get(key)
                
                # Add type conversion for Neo4j
                if isinstance(processed[k], str) and 'node' in key:
                    processed[k] = GraphDatabase.types.Node(
                        element_id='dummy_id',
                        labels=['Skill' if 'skill' in key else 'Course'],
                        properties={'name': processed[k]}
                    )
            else:
                processed[k] = v
        return processed

    def _execute_cypher(self, cypher, params, step_idx):
        try:
            return self.neo4j_conn.run_query(cypher, params)
        except Exception as e:
            logging.error(f"Query failed at step {step_idx+1}: {e}")
            logging.debug(f"Failed query: {cypher}")
            logging.debug(f"Parameters: {params}")
            return None

    def _handle_intermediate(self, step, records, intermediate):
        bind_key = step["bind"]
        if records and bind_key in records[0]:
            node = records[0][bind_key]
            
            # Handle different return types
            if isinstance(node, GraphDatabase.types.Node):
                if 'Skill' in node.labels:
                    intermediate[bind_key] = node.get('name')
                elif 'Course' in node.labels:
                    intermediate[bind_key] = node.get('url')
            elif isinstance(node, dict):
                intermediate[bind_key] = node.get('name') or node.get('url')
            else:
                intermediate[bind_key] = node

    def _collect_candidates(self, records, candidates):
        for record in records:
            if 'course_title' in record and 'course_url' in record:
                candidates.append((
                    record['course_title'],
                    record['course_url']
                ))

    def _rank_results(self, candidates, query):
        if not candidates:
            return []

        unique_courses = list(set(candidates))
        
        # Batch processing for efficiency
        descriptions = {
            title: self.get_course_description(title) 
            for title, _ in unique_courses
        }
        
        query_embedding = self.embedding_model.get_embedding(query)
        course_embeddings = {
            title: self.embedding_model.get_embedding(desc) if desc 
            else np.zeros(self.embedding_model.model.output_shape[1])
            for title, desc in descriptions.items()
        }
        
        # Calculate similarities
        similarities = []
        for title, url in unique_courses:
            emb = course_embeddings[title]
            norm = np.linalg.norm(query_embedding) * np.linalg.norm(emb) + 1e-8
            sim = np.dot(query_embedding, emb) / norm
            similarities.append((title, url, sim))
        
        # Return formatted results
        return [
            f"{title} ({url})" 
            for title, url, _ in sorted(
                similarities, 
                key=lambda x: x[2], 
                reverse=True
            )[:5]
        ]

    def get_course_description(self, course_name):
        query = "MATCH (c:Course {name: $name}) RETURN c.description AS description"
        result = self.neo4j_conn.run_query(query, {"name": course_name})
        if result and "description" in result[0]:
            return result[0]["description"]
        return ""

    def close(self):
        self.neo4j_conn.close()

if __name__ == "__main__":
    qa = KnowledgeBaseQA(
        neo4j_uri="bolt://localhost:7687",
        neo4j_user="neo4j",
        neo4j_password="12345678",
        qwen_model="deepseek-r1:7b",
        embedding_model=LSTMEmbeddingModel()
    )
    try:
        while True:
            # Show prompt and read user input
            user_query = input("Enter your query (e.g., Find intermediate courses about AWS Lambda): ")
            if not user_query.strip():
                print("Exiting.")
                break
            logging.info(f"User query: {user_query}")
            try:
                results = qa.process_query(user_query)
                if results:
                    print("Top matching courses:")
                    for idx, course in enumerate(results, 1):
                        print(f"{idx}. {course}")
                else:
                    print("No matching courses found.")
            except Exception as e:
                logging.error(f"Error processing query: {e}", exc_info=True)
                print(f"An error occurred: {e}")
    finally:
        qa.close()

2025-05-10 16:29:49,259 DEBUG:neo4j.pool: [#0000]  _: <POOL> created, direct address IPv4Address(('localhost', 7687))
2025-05-10 16:30:02,240 INFO:root: User query: Find intermediate courses about AWS Lambda
2025-05-10 16:30:02,293 DEBUG:httpcore.connection: close.started
2025-05-10 16:30:02,295 DEBUG:httpcore.connection: close.complete
2025-05-10 16:30:02,296 DEBUG:httpcore.connection: connect_tcp.started host='127.0.0.1' port=11434 local_address=None timeout=None socket_options=None
2025-05-10 16:30:02,296 DEBUG:httpcore.connection: connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001B52AAA8B30>
2025-05-10 16:30:02,300 DEBUG:httpcore.http11: send_request_headers.started request=<Request [b'POST']>
2025-05-10 16:30:02,300 DEBUG:httpcore.http11: send_request_headers.complete
2025-05-10 16:30:02,306 DEBUG:httpcore.http11: send_request_body.started request=<Request [b'POST']>
2025-05-10 16:30:02,307 DEBUG:httpcore.http11: send_request_body.complete
202

AttributeError: 'Neo4jConnection' object has no attribute 'driver'