In [1]:
import os
import json
import numpy as np
import torch
from typing import List, Dict, Any, Optional

# Data Processing
import pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings and Vector Store
from sentence_transformers import SentenceTransformer
import chromadb

# Graph Database
from neo4j import GraphDatabase

# Language Model
from transformers import AutoTokenizer, AutoModelForCausalLM



In [2]:
# Hugging Face access token.
# Read it from the environment instead of committing it to the notebook; any
# token that was previously saved in this cell is exposed and should be revoked.
# (`!export ...` was dropped: it runs in a throw-away subshell and never changes
# the kernel's environment.)
HF_TOKEN = os.environ.get("HF_TOKEN") or None
if HF_TOKEN is None:
    print("HF_TOKEN is not set; only public Hugging Face models can be downloaded.")

In [3]:
#!/usr/bin/env python
# coding: utf-8

# Neo4j Aura connection settings.
# Everything is read from the environment so no credentials live in the notebook;
# the password that used to be hard-coded here is exposed and should be rotated.
# The stray `!export HF_TOKEN=...` line was removed: it ran in a subshell (no
# effect on the kernel) and leaked a second token.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")
AURA_INSTANCEID = os.environ.get("AURA_INSTANCEID")
AURA_INSTANCENAME = os.environ.get("AURA_INSTANCENAME", "Free instance")

# Surface missing mandatory settings here rather than as an opaque driver error later.
_missing_settings = [
    name
    for name, value in (("NEO4J_URI", NEO4J_URI), ("NEO4J_PASSWORD", NEO4J_PASSWORD))
    if not value
]
if _missing_settings:
    print(f"✗ Missing environment variables: {', '.join(_missing_settings)}")




class InsuranceRAGSystem:
    """
    Retrieval-Augmented Generation (RAG) pipeline for insurance policy analysis.

    PDF text is split into chunks, embedded with a SentenceTransformer model and
    stored in both a persistent ChromaDB collection and a Neo4j graph (``Chunk``
    nodes linked by ``NEXT`` relationships). Queries retrieve similar chunks from
    either or both stores and hand them to a causal language model that is
    prompted to answer with a JSON decision.
    """

    # Marker the prompt asks the model to emit right after its JSON answer.
    _END_MARKER = "<<END_OF_JSON>>"
    # Number of rows sent per request when writing to ChromaDB / Neo4j.
    _WRITE_BATCH_SIZE = 500

    def __init__(self,
                 neo4j_uri: str,
                 neo4j_username: str,
                 neo4j_password: str,
                 neo4j_database: str = "neo4j",
                 hf_token: Optional[str] = None,
                 chromadb_path: str = "doc_db",
                 embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
                 llm_model: str = "microsoft/Phi-3-mini-128k-instruct",
                 embedding_device: str = "cpu"):
        """
        Initialize the Insurance RAG System.

        Args:
            neo4j_uri: Neo4j database URI
            neo4j_username: Neo4j username
            neo4j_password: Neo4j password
            neo4j_database: Neo4j database name
            hf_token: Hugging Face token; when given it is exported as ``HF_TOKEN``.
                No token is embedded in the code any more.
            chromadb_path: Path for ChromaDB persistence
            embedding_model: Name of the embedding model
            llm_model: Name of the language model. The default is the model the
                previous implementation always loaded regardless of this argument.
            embedding_device: Device for the embedding model. Defaults to "cpu",
                which is what the previous implementation always used.
        """
        # Configuration: honour the constructor arguments (they used to be
        # ignored in favour of notebook globals and hard-coded literals).
        self.neo4j_uri = neo4j_uri
        self.neo4j_username = neo4j_username
        self.neo4j_password = neo4j_password
        self.neo4j_database = neo4j_database
        self.chromadb_path = chromadb_path
        self.embedding_model_name = embedding_model
        self.llm_model_name = llm_model
        self.embedding_device = embedding_device

        # Model configurations
        # Fallback dimension; replaced by the loaded model's own value when it reports one.
        self.embedding_dimension = 1024
        self.vector_index_name = "chunkEmbeddings"
        self.collection_name = "document_chunks_collection"

        # Components are created exactly once, in the _initialize_* helpers below.
        self.embedding_model = None
        self.tokenizer = None
        self.llm_model = None
        self.chromadb_client = None
        self.chromadb_collection = None
        self.neo4j_driver = None
        self.chunks = []
        self.embeddings = []

        # Set HF token if provided
        if hf_token:
            os.environ["HF_TOKEN"] = hf_token

        # Initialize device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Initialize all components
        self._initialize_models()
        self._initialize_databases()

    def _initialize_models(self):
        """Load the embedding model, the tokenizer and the language model."""
        print("Initializing models...")

        # Initialize embedding model
        self.embedding_model = SentenceTransformer(
            self.embedding_model_name, device=self.embedding_device
        )
        # Keep the Neo4j vector index dimension in sync with the model actually loaded.
        detected_dimension = self.embedding_model.get_sentence_embedding_dimension()
        if detected_dimension:
            self.embedding_dimension = detected_dimension
        print(f"✓ Embedding model loaded: {self.embedding_model_name}")

        # Initialize tokenizer and LLM
        self.tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name)
        self.llm_model = AutoModelForCausalLM.from_pretrained(
            self.llm_model_name, torch_dtype=torch.float32, device_map="auto"
        )
        print(f"✓ Language model loaded: {self.llm_model_name}")

    def _initialize_databases(self):
        """Open the ChromaDB collection and the Neo4j driver.

        A failed Neo4j connection is reported and leaves ``self.neo4j_driver``
        as ``None``; the Neo4j-backed methods then become no-ops.
        """
        print("Initializing databases...")

        # Initialize ChromaDB
        self.chromadb_client = chromadb.PersistentClient(path=self.chromadb_path)
        self.chromadb_collection = self.chromadb_client.get_or_create_collection(
            name=self.collection_name
        )
        print(f"✓ ChromaDB initialized: {self.collection_name}")

        # Initialize Neo4j
        try:
            self.neo4j_driver = GraphDatabase.driver(
                self.neo4j_uri,
                auth=(self.neo4j_username, self.neo4j_password)
            )
            self.neo4j_driver.verify_connectivity()
            print("✓ Neo4j connection established")
        except Exception as e:
            print(f"✗ Neo4j connection failed: {e}")
            self.neo4j_driver = None

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from a PDF file.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Extracted text as string, or "" if the file could not be read.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = pypdf.PdfReader(file)
                # `or ""` guards against pages that yield no text.
                page_texts = [page.extract_text() or "" for page in reader.pages]
        except Exception as e:
            print(f"✗ Error extracting PDF: {e}")
            return ""

        print(f"✓ Extracted text from PDF: {pdf_path}")
        # Separate pages with a newline so the last word of one page is not
        # fused with the first word of the next.
        return "\n".join(page_texts)

    def split_text_into_chunks(self, text: str, chunk_size: int = 256, chunk_overlap: int = 32) -> List[str]:
        """
        Split text into chunks using RecursiveCharacterTextSplitter.

        The result is also stored on ``self.chunks``.

        Args:
            text: Input text to split
            chunk_size: Size of each chunk
            chunk_overlap: Overlap between chunks

        Returns:
            List of text chunks
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            is_separator_regex=False
        )

        documents = text_splitter.create_documents([text])
        self.chunks = [doc.page_content for doc in documents]
        print(f"✓ Split text into {len(self.chunks)} chunks")
        return self.chunks

    def generate_embeddings(self, chunks: Optional[List[str]] = None) -> np.ndarray:
        """
        Generate embeddings for text chunks.

        The result is also stored on ``self.embeddings``.

        Args:
            chunks: List of text chunks (uses self.chunks if None)

        Returns:
            NumPy array of embeddings (empty array when there are no chunks)
        """
        if chunks is None:
            chunks = self.chunks

        if len(chunks) == 0:
            print("✗ No chunks available for embedding generation")
            return np.array([])

        # Encoding can take a long time for large documents; show progress.
        self.embeddings = self.embedding_model.encode(
            chunks, convert_to_tensor=False, show_progress_bar=True
        )
        print(f"✓ Generated {len(self.embeddings)} embeddings")
        print(f"✓ Embedding shape: {self.embeddings[0].shape}")
        return self.embeddings

    def _resolve_payload(self, chunks, embeddings, target: str):
        """Return ``(chunks, vectors)`` ready for storage, or ``None`` if there is nothing to store.

        Falls back to ``self.chunks`` / ``self.embeddings`` for missing arguments
        and converts the embeddings (array or list) to plain nested lists.

        Raises:
            ValueError: if the number of chunks and embeddings differ.
        """
        if chunks is None:
            chunks = self.chunks
        if embeddings is None:
            embeddings = self.embeddings

        if len(chunks) == 0 or len(embeddings) == 0:
            print(f"✗ No data to store in {target}")
            return None

        if len(chunks) != len(embeddings):
            raise ValueError(
                f"Got {len(chunks)} chunks but {len(embeddings)} embeddings"
            )

        return list(chunks), np.asarray(embeddings).tolist()

    @classmethod
    def _batched(cls, items: list):
        """Yield consecutive slices of ``items`` of at most ``_WRITE_BATCH_SIZE`` elements."""
        for start in range(0, len(items), cls._WRITE_BATCH_SIZE):
            yield items[start:start + cls._WRITE_BATCH_SIZE]

    def store_in_chromadb(self, chunks: Optional[List[str]] = None, embeddings: Optional[np.ndarray] = None):
        """
        Store chunks and embeddings in ChromaDB.

        Rows are upserted under the ids ``chunk_0 .. chunk_{n-1}``, so running
        the pipeline again replaces the stored content instead of colliding
        with existing ids.

        Args:
            chunks: List of text chunks (uses self.chunks if None)
            embeddings: Array of embeddings (uses self.embeddings if None)

        Raises:
            ValueError: if the number of chunks and embeddings differ.
        """
        payload = self._resolve_payload(chunks, embeddings, "ChromaDB")
        if payload is None:
            return
        texts, vectors = payload

        # Generate unique IDs
        ids = [f"chunk_{i}" for i in range(len(texts))]

        # Write in batches to stay below ChromaDB's per-request size limit.
        for start in range(0, len(ids), self._WRITE_BATCH_SIZE):
            stop = start + self._WRITE_BATCH_SIZE
            self.chromadb_collection.upsert(
                ids=ids[start:stop],
                documents=texts[start:stop],
                embeddings=vectors[start:stop]
            )

        print(f"✓ Stored {self.chromadb_collection.count()} items in ChromaDB")

    def store_in_neo4j(self, chunks: Optional[List[str]] = None, embeddings: Optional[np.ndarray] = None):
        """
        Store chunks and embeddings in Neo4j graph database.

        Creates/updates one ``Chunk`` node per chunk, links consecutive chunks
        with ``NEXT`` and makes sure the vector index exists.

        Args:
            chunks: List of text chunks (uses self.chunks if None)
            embeddings: Array of embeddings (uses self.embeddings if None)

        Raises:
            ValueError: if the number of chunks and embeddings differ.
        """
        if not self.neo4j_driver:
            print("✗ Neo4j driver not available")
            return

        payload = self._resolve_payload(chunks, embeddings, "Neo4j")
        if payload is None:
            return
        texts, vectors = payload

        node_rows = [
            {'id': f"chunk_{i}", 'text': text, 'embedding': vector, 'chunk_index': i}
            for i, (text, vector) in enumerate(zip(texts, vectors))
        ]
        link_rows = [
            {'current_id': f"chunk_{i}", 'next_id': f"chunk_{i+1}"}
            for i in range(len(node_rows) - 1)
        ]

        with self.neo4j_driver.session(database=self.neo4j_database) as session:
            # Create constraint
            session.run(
                "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Chunk) REQUIRE c.id IS UNIQUE"
            ).consume()

            # Create/update chunk nodes, one request per batch instead of one per
            # chunk. Plain SET (not ON CREATE SET) so a re-run refreshes old nodes.
            for batch in self._batched(node_rows):
                session.run("""
                    UNWIND $rows AS row
                    MERGE (c:Chunk {id: row.id})
                    SET c.text = row.text, c.embedding = row.embedding, c.chunkIndex = row.chunk_index
                """, rows=batch).consume()

            # Create relationships between consecutive chunks
            for batch in self._batched(link_rows):
                session.run("""
                    UNWIND $rows AS row
                    MATCH (current:Chunk {id: row.current_id})
                    MATCH (next:Chunk {id: row.next_id})
                    MERGE (current)-[:NEXT]->(next)
                """, rows=batch).consume()

        print(f"✓ Stored {len(texts)} chunks in Neo4j")
        self._create_neo4j_vector_index()

    def _create_neo4j_vector_index(self):
        """Create the cosine vector index on ``Chunk.embedding`` unless it already exists."""
        if not self.neo4j_driver:
            return

        with self.neo4j_driver.session(database=self.neo4j_database) as session:
            # Check if index exists
            result = session.run(
                "SHOW INDEXES YIELD name WHERE name = $index_name",
                index_name=self.vector_index_name
            )

            if result.single():
                print(f"✓ Vector index '{self.vector_index_name}' already exists")
                return

            # Create vector index (name and dimension are internal settings,
            # not user input, so they are interpolated into the statement).
            query = f"""
            CREATE VECTOR INDEX `{self.vector_index_name}` IF NOT EXISTS
            FOR (c:Chunk) ON (c.embedding)
            OPTIONS {{ indexConfig: {{
                `vector.dimensions`: {self.embedding_dimension},
                `vector.similarity_function`: 'cosine'
            }}
            }}
            """

            try:
                session.run(query)
                print(f"✓ Created vector index '{self.vector_index_name}'")
            except Exception as e:
                print(f"✗ Error creating vector index: {e}")

    def retrieve_from_chromadb(self, query_text: str, k: int = 3) -> List[Dict[str, Any]]:
        """
        Retrieve top-k similar chunks from ChromaDB.

        Args:
            query_text: Query text
            k: Number of results to retrieve

        Returns:
            List of dicts with ``clause_id``, ``text`` and ``source`` keys
            (empty when ``k`` is not positive or the collection is empty).
        """
        if k <= 0:
            return []

        stored_count = self.chromadb_collection.count()
        if stored_count == 0:
            print("✗ ChromaDB collection is empty")
            return []

        query_embedding = self.embedding_model.encode([query_text], convert_to_tensor=False)

        results = self.chromadb_collection.query(
            query_embeddings=np.asarray(query_embedding).tolist(),
            # Never ask for more neighbours than there are stored items.
            n_results=min(k, stored_count)
        )

        documents = (results.get('documents') or [[]])[0]
        retrieved_chunks = [
            {
                "clause_id": f"C{i+1}",
                "text": chunk.strip(),
                "source": "chromadb"
            }
            for i, chunk in enumerate(documents)
        ]

        print(f"✓ Retrieved {len(retrieved_chunks)} chunks from ChromaDB")
        return retrieved_chunks

    def retrieve_from_neo4j(self, query_text: str, k: int = 3) -> List[Dict[str, Any]]:
        """
        Retrieve top-k similar chunks from Neo4j using vector similarity.

        Args:
            query_text: Query text
            k: Number of results to retrieve

        Returns:
            List of dicts with ``clause_id``, ``text``, ``score`` and ``source``
            keys; empty when the driver is unavailable or the query fails.
        """
        if not self.neo4j_driver:
            print("✗ Neo4j driver not available")
            return []

        query_embedding = self.embedding_model.encode(query_text).tolist()

        # The index name is passed as a parameter rather than spliced into the query.
        cypher_query = """
        CALL db.index.vector.queryNodes($index_name, $k, $embedding)
        YIELD node, score
        RETURN node.text AS text, score
        """

        with self.neo4j_driver.session(database=self.neo4j_database) as session:
            try:
                result = session.run(
                    cypher_query,
                    index_name=self.vector_index_name,
                    k=k,
                    embedding=query_embedding
                )
                chunks = []
                for i, record in enumerate(result):
                    chunks.append({
                        "clause_id": f"C{i+1}",
                        "text": (record["text"] or "").strip(),
                        "score": record["score"],
                        "source": "neo4j"
                    })

                print(f"✓ Retrieved {len(chunks)} chunks from Neo4j")
                return chunks

            except Exception as e:
                print(f"✗ Error querying Neo4j: {e}")
                return []

    def hybrid_retrieve(self, query_text: str, k: int = 3) -> List[Dict[str, Any]]:
        """
        Retrieve chunks using both ChromaDB and Neo4j, then merge results.

        Both stores hold the same chunks, so results with identical text are
        collapsed; the first occurrence (ChromaDB first) is kept.

        Args:
            query_text: Query text
            k: Number of results to retrieve from each source

        Returns:
            Merged, de-duplicated list of retrieved chunks with clause IDs
            renumbered ``C1``, ``C2``, ...
        """
        chromadb_results = self.retrieve_from_chromadb(query_text, k)
        neo4j_results = self.retrieve_from_neo4j(query_text, k)

        merged_results = []
        seen_texts = set()
        for chunk in chromadb_results + neo4j_results:
            if chunk["text"] in seen_texts:
                continue
            seen_texts.add(chunk["text"])
            merged_results.append(chunk)

        # Re-index clause IDs
        for position, chunk in enumerate(merged_results, start=1):
            chunk["clause_id"] = f"C{position}"

        return merged_results

    def _build_prompt(self, query_text: str, retrieved_chunks: List[Dict[str, Any]]) -> str:
        """Render the instruction prompt containing the query and the retrieved clauses as JSON."""
        retrieved_clauses_json_str = json.dumps(retrieved_chunks, indent=2)

        return f"""
You are an expert insurance-policy decision assistant with deep domain knowledge in maternity and routine-care covers. You process natural-language queries by strictly analyzing **only** the user's input and the provided policy clauses—no invented rules.

Your task:
1. Parse the user's query and extract structured information.
2. Evaluate eligibility **using only** the retrieved clauses—do not assume or invent any additional rules.
3. Return a final decision and justification by referencing the exact clause IDs.

——

USER QUERY:
"{query_text}"

——

RETRIEVED CLAUSES (from internal document search):
{retrieved_clauses_json_str}

——

Step 1: Extract these fields from the query:
- age (number)
- gender
- procedure
- city
- policy_duration_months
- selected_cover_option (i, ii, or iii)

Step 2: Using **only** the clauses above:
- Determine if the requested procedure/cover is eligible under the selected option.
- Identify any exclusions.
- Compute payout amount (if applicable).

Step 3: Output exactly and only this JSON (no extra text), then immediately stop:
{{
  "decision": "<approved|rejected>",
  "amount": <number|null>,
  "justification": [
    {{
      "clause_id": "C1",
      "reason": "…"
    }}
  ]
}}
<<END_OF_JSON>>
"""

    @staticmethod
    def _extract_json_answer(generated_text: str) -> str:
        """Return the JSON object contained in the model's continuation.

        Everything from the end marker onwards is discarded. If the remaining
        text contains a parseable ``{...}`` object, exactly that object is
        returned; otherwise the stripped text is returned unchanged.
        """
        answer = generated_text.split(InsuranceRAGSystem._END_MARKER)[0].strip()

        start = answer.find("{")
        end = answer.rfind("}")
        if start != -1 and end > start:
            candidate = answer[start:end + 1]
            try:
                json.loads(candidate)
            except json.JSONDecodeError:
                pass
            else:
                return candidate

        return answer

    def generate_response(self, query_text: str, retrieved_chunks: List[Dict[str, Any]]) -> str:
        """
        Generate response using the configured language model with retrieved context.

        Args:
            query_text: User query
            retrieved_chunks: Retrieved relevant chunks

        Returns:
            The model's answer; the bare JSON object when one can be parsed,
            otherwise the generated text up to the end marker.
        """
        prompt = self._build_prompt(query_text, retrieved_chunks)

        # Generate response
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        outputs = self.llm_model.generate(
            **inputs,
            max_new_tokens=200,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=False,
            num_beams=3,
        )

        # The output sequence starts with the prompt tokens. Decode only the
        # continuation: the prompt itself contains the end marker, so splitting
        # the full decoded text on it (as before) could never isolate the answer.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_text = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        return self._extract_json_answer(generated_text)

    def process_documents(self, pdf_paths: List[str], chunk_size: int = 256, chunk_overlap: int = 32):
        """
        Complete pipeline to process PDF documents and store in both databases.

        Args:
            pdf_paths: List of PDF file paths
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        print("Starting document processing pipeline...")

        # Extract text from all PDFs
        all_text = ""
        for pdf_path in pdf_paths:
            text = self.extract_text_from_pdf(pdf_path)
            all_text += text + "\n\n"

        if not all_text.strip():
            print("✗ No text extracted from PDFs")
            return

        # Split into chunks
        chunks = self.split_text_into_chunks(all_text, chunk_size, chunk_overlap)

        # Generate embeddings
        embeddings = self.generate_embeddings(chunks)

        # Store in both databases
        self.store_in_chromadb(chunks, embeddings)
        self.store_in_neo4j(chunks, embeddings)

        print("✓ Document processing pipeline completed")

    def query(self, query_text: str, retrieval_method: str = "hybrid", k: int = 3) -> str:
        """
        End-to-end query processing: retrieve relevant chunks and generate response.

        Args:
            query_text: User query
            retrieval_method: "chromadb", "neo4j", or "hybrid"
            k: Number of chunks to retrieve (hybrid asks each store for k//2 + 1)

        Returns:
            Generated response

        Raises:
            ValueError: if ``retrieval_method`` is not one of the supported names.
        """
        # Map each method to its retriever and the per-store result count.
        retrievers = {
            "chromadb": (self.retrieve_from_chromadb, k),
            "neo4j": (self.retrieve_from_neo4j, k),
            "hybrid": (self.hybrid_retrieve, k//2 + 1),
        }
        if retrieval_method not in retrievers:
            raise ValueError("retrieval_method must be 'chromadb', 'neo4j', or 'hybrid'")

        print(f"Processing query: '{query_text}'")

        # Retrieve relevant chunks
        retrieve, per_store_k = retrievers[retrieval_method]
        retrieved_chunks = retrieve(query_text, per_store_k)

        # Generate response
        response = self.generate_response(query_text, retrieved_chunks)

        print("✓ Query processing completed")
        return response

    def close_connections(self):
        """Close database connections. Safe to call more than once."""
        if self.neo4j_driver:
            self.neo4j_driver.close()
            # Drop the reference so later calls see the driver as unavailable.
            self.neo4j_driver = None
            print("✓ Neo4j connection closed")


# Example usage
# if __name__ == "__main__":
#     # Configuration (read from the environment; never commit credentials)
#     NEO4J_URI = os.environ["NEO4J_URI"]
#     NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
#     NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
#     HF_TOKEN = os.environ.get("HF_TOKEN")
    
# # Initialize the RAG system
# rag_system = InsuranceRAGSystem(
#     neo4j_uri=NEO4J_URI,
#     neo4j_username=NEO4J_USERNAME,
#     neo4j_password=NEO4J_PASSWORD,
#     hf_token=HF_TOKEN
# )

# # Process documents
# pdf_files = ['./CHOTGDP23004V012223.pdf']  # Add your PDF paths here
# rag_system.process_documents(pdf_files)

# # Query examples
# queries = [
#     "What is the definition of Burglary in the insurance policy?",
#     "What is covered under accidental death in this policy?",
#     "Who qualifies as a family member under the family travel policy?"
# ]

# for query in queries:
#     print(f"\n{'='*60}")
#     print(f"Query: {query}")
#     print(f"{'='*60}")
#     response = rag_system.query(query)
#     print("Response:", response)

# # Clean up
# rag_system.close_connections()

In [4]:
# Build the RAG system from the notebook-level connection settings.
# This loads the embedding model and the LLM and connects to ChromaDB and Neo4j,
# so it can take several minutes on first run while model weights download.
rag_system = InsuranceRAGSystem(
    neo4j_uri=NEO4J_URI,
    neo4j_username=NEO4J_USERNAME,
    neo4j_password=NEO4J_PASSWORD,
    # Forward the configured database name instead of silently relying on a default.
    neo4j_database=NEO4J_DATABASE,
    hf_token=HF_TOKEN
)


Using device: cuda
Initializing models...
✓ Embedding model loaded: Qwen/Qwen3-Embedding-0.6B


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


✓ Language model loaded: microsoft/Phi-3-mini-128k-instruct
Initializing databases...
✓ ChromaDB initialized: document_chunks_collection
✓ Neo4j connection established


In [5]:
# Ingest the policy PDF(s): extract the text, chunk it, embed the chunks and
# store them in both ChromaDB and Neo4j. Add further PDF paths to the list.
pdf_files = ['./CHOTGDP23004V012223.pdf']
rag_system.process_documents(pdf_paths=pdf_files)

Starting document processing pipeline...
✓ Extracted text from PDF: ./CHOTGDP23004V012223.pdf
✓ Split text into 1699 chunks


KeyboardInterrupt: 

In [None]:
# Query examples
queries = [
    "What is the definition of Burglary in the insurance policy?",
    "What is covered under accidental death in this policy?",
    "Who qualifies as a family member under the family travel policy?",
    "What are the exclusions under trip cancellation benefits?",
    "Does the policy cover hospitalization due to COVID-19?",
    "What is meant by deductible in this policy?",
    "Explain the conditions under which repatriation of remains is covered.",
]

try:
    for query in queries:
        print(f"\n{'='*60}")
        print(f"Query: {query}")
        print(f"{'='*60}")
        response = rag_system.query(query)
        print("Response:", response)
finally:
    # Clean up: release the Neo4j driver even if a query fails or is interrupted.
    rag_system.close_connections()