In [None]:
import os
import re
from typing import List, Optional, Tuple

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain.vectorstores import FAISS

class DocumentRetrievalReranker:
    """Retrieve documents by vector similarity and rerank them with an LLM.

    Document texts are embedded into a FAISS vector store. A query first
    pulls the top-k most similar documents from that store; a chat model is
    then asked to order those candidates by relevance to the question.
    """

    # Matches each run of digits in the LLM's ranking reply.
    _NUMBER_PATTERN = re.compile(r"\d+")

    def __init__(self, openai_api_key: Optional[str] = None):
        """
        Initialize the document retrieval and reranking system.

        Args:
            openai_api_key: OpenAI API key (if not provided, will use env variable).
                When given, it is written to ``os.environ["OPENAI_API_KEY"]``,
                so it is visible to the whole process.
        """
        if openai_api_key:
            os.environ["OPENAI_API_KEY"] = openai_api_key

        # Initialize embeddings and chat model
        self.embeddings = OpenAIEmbeddings()
        self.llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

        # Vector store (stays None until load_documents() is called)
        self.vectorstore = None

        # Reranking prompt template
        self.rerank_prompt = ChatPromptTemplate.from_template("""
You are an expert document ranker. Given a user question and a list of documents, 
rerank the documents based on their relevance to the question.

Question: {question}

Documents:
{documents}

Please rank these documents from most relevant to least relevant by returning only 
the document numbers separated by commas (e.g., "3,1,5,2,4").

Consider:
1. Direct relevance to the question
2. Completeness of information
3. Quality and specificity of content
4. Practical applicability

Return only the ranking numbers, no explanation:
""")

        # Create the reranking chain
        self.rerank_chain = LLMChain(
            llm=self.llm,
            prompt=self.rerank_prompt
        )

    def load_documents(self, documents: List[str]) -> None:
        """
        Load documents into the vector store.

        Any previously loaded vector store is replaced.

        Args:
            documents: List of document texts

        Raises:
            ValueError: If ``documents`` is empty.
        """
        # Fail early with a clear message instead of letting the vector store
        # constructor choke on an empty batch.
        if not documents:
            raise ValueError("No documents provided. Pass at least one document text.")

        # Convert strings to Document objects
        doc_objects = [Document(page_content=doc) for doc in documents]

        # Create FAISS vector store
        self.vectorstore = FAISS.from_documents(doc_objects, self.embeddings)
        print(f"✅ Loaded {len(documents)} documents into vector store")

    def retrieve_documents(self, query: str, k: int = 6) -> List[Document]:
        """
        Retrieve top-k documents using vector similarity search.

        Args:
            query: Search query
            k: Number of documents to retrieve

        Returns:
            List of retrieved documents

        Raises:
            ValueError: If no documents have been loaded yet.
        """
        if self.vectorstore is None:
            raise ValueError("No documents loaded. Call load_documents() first.")

        retrieved_docs = self.vectorstore.similarity_search(query, k=k)
        print(f"🔍 Retrieved {len(retrieved_docs)} documents for query: '{query}'")
        return retrieved_docs

    def format_documents_for_ranking(self, documents: List[Document]) -> str:
        """
        Format documents for the reranking prompt.

        Args:
            documents: List of Document objects

        Returns:
            Formatted string with documents numbered from 1 and separated by
            blank lines
        """
        return "\n\n".join(
            f"{i}. {doc.page_content}" for i, doc in enumerate(documents, 1)
        )

    def rerank_documents(self, query: str, documents: List[Document]) -> List[Document]:
        """
        Rerank documents using LLM-based scoring.

        Args:
            query: Original query
            documents: List of retrieved documents

        Returns:
            New list containing every input document, in the order chosen by
            the LLM. Documents the LLM did not mention keep their original
            relative order at the end.
        """
        if len(documents) <= 1:
            # Zero or one candidate has only one possible order, so skip the
            # LLM round-trip entirely.
            reranked_docs = list(documents)
        else:
            # Format documents for the prompt
            formatted_docs = self.format_documents_for_ranking(documents)

            # Get reranking from LLM
            response = self.rerank_chain.invoke({
                "question": query,
                "documents": formatted_docs
            })

            # The parser only returns validated, in-range indices, so they can
            # be used directly.
            indices = self._parse_ranking_response(response["text"], len(documents))
            reranked_docs = [documents[i] for i in indices]

        print(f"🎯 Reranked {len(reranked_docs)} documents")
        return reranked_docs

    def _parse_ranking_response(self, response: str, num_docs: int) -> List[int]:
        """
        Parse the LLM ranking response to extract document indices.

        Every run of digits in the response is read as a 1-based document
        number. Out-of-range numbers and repeats are ignored; only the first
        occurrence of each number counts.

        Args:
            response: Raw response from LLM
            num_docs: Number of documents to validate against

        Returns:
            List of document indices (0-based) containing each index in
            ``range(num_docs)`` exactly once
        """
        indices: List[int] = []
        seen = set()  # O(1) duplicate checks alongside the ordered list

        for num_str in self._NUMBER_PATTERN.findall(response):
            idx = int(num_str) - 1  # Convert to 0-based
            if 0 <= idx < num_docs and idx not in seen:
                seen.add(idx)
                indices.append(idx)

        # Add any missing indices to ensure all documents are included
        indices.extend(i for i in range(num_docs) if i not in seen)

        return indices

    def search_and_rerank(self, query: str, k: int = 6) -> List[Document]:
        """
        Complete pipeline: retrieve and rerank documents.

        Args:
            query: Search query
            k: Number of documents to retrieve initially

        Returns:
            List of reranked documents

        Raises:
            ValueError: If no documents have been loaded yet.
        """
        # Step 1: Retrieve documents
        retrieved_docs = self.retrieve_documents(query, k)

        # Step 2: Rerank documents
        return self.rerank_documents(query, retrieved_docs)

    def display_results(self, documents: List[Document], title: str = "Results") -> None:
        """
        Display the results in a formatted way.

        Args:
            documents: List of documents to display
            title: Title for the results section
        """
        print(f"\n📊 {title}:")
        print("=" * 50)
        for i, doc in enumerate(documents, 1):
            print(f"\nRank {i}:")
            print("-" * 20)
            print(doc.page_content)


# Example usage and demonstration
def main():
    """Print a dry-run walkthrough of the retrieval and reranking demo.

    Nothing is embedded and no API is called: the function only prints a
    preview of the sample corpus, some example queries, and usage steps.
    """
    # Demo corpus: short passages about LangChain and retrieval techniques.
    corpus = [
        'LangChain supports tool integration including web search, calculators, and APIs, allowing LLMs to interact with external systems and respond more accurately to dynamic queries. Memory in LangChain enables context retention across multiple steps in a conversation or task, making the application more coherent and stateful.',

        'LangChain is a flexible framework designed for developing applications powered by large language models (LLMs). It provides tools and abstractions to work with LLMs more effectively and includes components for prompt management, chains, memory, and agents.',

        'LangChain integrates with many third-party services such as OpenAI, Hugging Face, and Cohere. This enables developers to experiment with different models and optimize performance for specific use cases like summarization, question answering, or translation.',

        'FAISS is a popular library used for fast approximate nearest neighbor search in high-dimensional spaces. It supports both flat and compressed indexes, which makes it scalable for large document stores. Agents in LangChain are chains that use LLMs to decide which tools to use and in what order. This makes them suitable for multi-step tasks like question answering with search and code execution.',

        'Retrieval-Augmented Generation (RAG) is a powerful technique where external knowledge is retrieved and passed into the prompt to ground LLM responses. LangChain makes it easy to implement RAG using vector databases like FAISS, Chroma, and Pinecone. BM25 is a traditional sparse retrieval method that scores documents based on keyword matching. Although fast, it often struggles with synonyms and semantic similarity.',

        'Dense retrieval uses embeddings to match query and documents in a vector space. This allows capturing semantic meaning, making it useful for fuzzy or natural language queries. LangChain supports hybrid retrieval by combining BM25 and dense similarity scores. This approach improves both precision and recall in document search.',
    ]

    # Queries a reader can try once a real retriever has been built.
    example_queries = (
        "How does RAG work with LangChain?",
        "What are agents in LangChain?",
        "Tell me about vector databases and FAISS",
        "How does LangChain integrate with external services?",
    )

    # Step-by-step instructions, printed verbatim at the end.
    usage_steps = (
        "1. Set your OpenAI API key",
        "2. Initialize: retriever = DocumentRetrievalReranker()",
        "3. Load documents: retriever.load_documents(your_documents)",
        "4. Search and rerank: results = retriever.search_and_rerank(your_query)",
        "5. Display results: retriever.display_results(results)",
    )

    print("🚀 Initializing Document Retrieval and Reranking System...")

    # A real run needs an OpenAI API key, e.g.
    # DocumentRetrievalReranker(openai_api_key="your-api-key-here").
    # The demo deliberately stops short of that and only shows the inputs.

    print("\n📚 Sample Documents Loaded:")
    for position, text in enumerate(corpus, start=1):
        preview = text[:100]  # first 100 characters only
        print(f"{position}. {preview}...")

    print("\n🔍 Example Queries to Test:")
    for position, question in enumerate(example_queries, start=1):
        print(f"{position}. {question}")

    print("\n💡 To use this system:")
    for step in usage_steps:
        print(step)

# Run the demo when this code is executed as the main module
# (i.e. not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

In [None]:
# Call sql_to_documents on the company database file.
# NOTE(review): sql_to_documents is not defined in the code shown here —
# presumably it comes from another cell; confirm it has been run first.
# Judging by the recorded output below, it appears to return Document
# objects describing the database's tables — verify against its definition.
sql_to_documents("data/databases/company.db")

[Document(metadata={'source': 'data/databases/company.db', 'table_name': 'employees', 'num_records': 4, 'data_type': 'sql_table'}, page_content="Table: employees\nColumns: id, name, role, department, salary\nTotal Records: 4\n\nSample Records:\n{'id': 1, 'name': 'John Doe', 'role': 'Senior Developer', 'department': 'Engineering', 'salary': 95000.0}\n{'id': 2, 'name': 'Jane Smith', 'role': 'Data Scientist', 'department': 'Analytics', 'salary': 105000.0}\n{'id': 3, 'name': 'Mike Johnson', 'role': 'Product Manager', 'department': 'Product', 'salary': 110000.0}\n{'id': 4, 'name': 'Sarah Williams', 'role': 'DevOps Engineer', 'department': 'Engineering', 'salary': 98000.0}\n"),
 Document(metadata={'source': 'data/databases/company.db', 'table_name': 'projects', 'num_records': 4, 'data_type': 'sql_table'}, page_content="Table: projects\nColumns: id, name, status, budget, lead_id\nTotal Records: 4\n\nSample Records:\n{'id': 1, 'name': 'RAG Implementation', 'status': 'Active', 'budget': 150000.