In [1]:
#pip install langchain langchain-community pymilvus

In [2]:
#!pip install langchain-community

In [1]:
#pip install langchain_huggingface

In [2]:
#pip install langchain_openai

# Check ups

In [17]:
# check there is readable file inside my folder
#!ls "./FINAL DATASET/"

# show whether PDF was embedded and stored in milvus
#print(rag_system.vector_store_manager.collection.num_entities)

# Set up

In [3]:
import os
import logging
from typing import List, Dict, Any, Optional, Tuple

#  Document Processing Imports
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings 

# Vector Store
from langchain_community.vectorstores import Milvus

# LLM and prompts
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Agent components
from langchain.agents import AgentExecutor, create_react_agent
from langchain.tools import Tool

# Feedback and evaluation
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Open AI Key
'''
from dotenv import load_dotenv
load_dotenv()
# print(os.getenv("OPENAI_API_KEY")) # print API KEY 
'''
MODEL_NAME = 'gpt-4o'

# Check Milvus Connection
def check_milvus_connection(host="localhost", port="19530"):
    try:
        from pymilvus import connections, utility
        connections.connect(host=host, port=port)
        collections = utility.list_collections()
        logger.info(f"Successfully connected to Milvus. Collections: {collections}")
        return True
    except Exception as e:
        logger.error(f"Failed to connect to Milvus: {e}")
        logger.warning("Make sure Milvus is installed and running!")
        return False


In [19]:
'''from dotenv import load_dotenv
load_dotenv(dotenv_path="/Users/brunamedeiros/Documents/University of Chicago/Spring 2025 - Capstone I/Assignment Research 1 - Agents/.env")
!ls
'''

'from dotenv import load_dotenv\nload_dotenv(dotenv_path="/Users/brunamedeiros/Documents/University of Chicago/Spring 2025 - Capstone I/Assignment Research 1 - Agents/.env")\n!ls\n'

In [20]:
from pymilvus import connections, utility
connections.connect(host="localhost", port="19530")
print(utility.list_collections())

[]


# Set-Up

## Processing PDFs

In [21]:

class DocumentProcessor:
    """Handles document loading, chunking, and embedding"""
    
    # Initialize document processor
    def __init__(self, 
                 embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200):
        """
        Args:
            embedding_model_name: HuggingFace model name for embeddings
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """

        # Store constructor inputs as instance variables (so they can be used throughout the object)
        self.embedding_model_name = embedding_model_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        
        # Initialize embedding model
        self.embedder = HuggingFaceEmbeddings( # HuggingFaceEmbeddings from LangChain
            model_name=embedding_model_name,
            model_kwargs={'device': 'cuda' 
                          if os.environ.get('USE_GPU', 'false').lower() == 'true' 
                          else 'cpu'}  
                          # automatically switches between GPU (cuda) or CPU depending on environment variable USE_GPU
        )
        
        # Initialize text splitter
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        
        # Prints model used to initialize document embedding
        logger.info(f"Document processor initialized with {embedding_model_name}")

    # Function: Process 1 PDF
    def process_file(self, file_path: str) -> List[Any]: 
            """Process a single file into chunks with metadata"""
            logger.info(f"Processing file: {file_path}")
            
            # Select appropriate loader based on file type
            if file_path.endswith('.pdf'):
                loader = PyPDFLoader(file_path)
            else:
                raise ValueError(f"Unsupported file type: {file_path}")
                
            # Load and split the document
            documents = loader.load()
            chunks = self.splitter.split_documents(documents)
            
            logger.info(f"Generated {len(chunks)} chunks from {file_path}")
            return chunks

    # Function: Bulk-process PDFs 
    def process_directory(self, directory_path: str) -> List[Any]:
            """Process all supported files in a directory"""
            
            # Path of folder being processed
            logger.info(f"Processing directory: {directory_path}")
            
            # Creates folder for all PDFs files inside directory
            loader = DirectoryLoader(  # DirectoryLoader: walks through folder recursively
                directory_path,
                glob="**/*.pdf",       # Grabs every .pdf file
                loader_cls=PyPDFLoader # Same as process_file
            )
            
            # Loads PDFs and returns list of document objects (1 per page of each file), each with metadata
            documents = loader.load()

            # Splits documents in smaller chunks
            chunks = self.splitter.split_documents(documents)
            '''
            So now you've got: 
            - chunk1 from abc.pdf, page 5
            - chunk2 from def.pdf, page 2
            '''

            logger.info(f"Generated {len(chunks)} chunks from {directory_path}")
            return chunks

## Centralized Vector Store

In [22]:
class VectorStoreManager:
    """Manages interactions with the Milvus vector store"""
    
    def __init__(self, 
                 embedder: Any,
                 host: str = "localhost",
                 port: str = "19530", # default
                 collection_name: str = "knowledge_base"):
        """
        Initialize the Milvus vector store manager
        
        Args:
            embedder: The embedding model to use
            host: Milvus host address
            port: Milvus port
            collection_name: Name of the Milvus collection
        """

        # Save inputs as instance variables (so other methods in the class can use them later)
        self.embedder = embedder
        self.host = host
        self.port = port
        self.collection_name = collection_name
        self.connection_args = {"host": host, "port": port} # bundles host + port so it’s easier to reuse
        self.vector_store = None                            # initialized as `none`, but will later store the actual Milvus vector store object once it's created
        
        logger.info(f"Vector store manager initialized for collection: {collection_name}")
    
    def initialize_from_documents(self, documents: List[Any]) -> None:
            """Create or update the vector store with documents"""
            logger.info(f"Initializing vector store with {len(documents)} documents")
            
            # Check if collection exists
            try:
                # Try to load existing collection
                self.vector_store = Milvus(
                    embedding_function=self.embedder,
                    collection_name=self.collection_name,
                    connection_args=self.connection_args
                )
                logger.info(f"Connected to existing collection: {self.collection_name}")
                
                # Add new documents to existing collection
                self.vector_store.add_documents(documents)
                logger.info(f"Added {len(documents)} documents to existing collection")
                
            except Exception as e:
                logger.info(f"Creating new collection: {self.collection_name}")
                # Create new collection
                self.vector_store = Milvus.from_documents(
                    documents=documents,
                    embedding=self.embedder,
                    collection_name=self.collection_name,
                    connection_args=self.connection_args
                )
                logger.info(f"Created new collection with {len(documents)} documents")
        
    def get_retriever(self, search_kwargs: Optional[Dict[str, Any]] = None) -> Any:
        """Get a retriever from the vector store with specified parameters"""
        if search_kwargs is None:
            search_kwargs = {"k": 5}
            
        if self.vector_store is None:
            raise ValueError("Vector store not initialized")
            
        return self.vector_store.as_retriever(search_kwargs=search_kwargs)


## RAG

In [23]:
class RAGSystem:
    """Retrieval-Augmented Generation system"""
    
    def __init__(self, 
                 vector_store_manager: VectorStoreManager,
                 model_name: str = "gpt-4o",
                 temperature: float = 0.1):
        """
        Initialize the RAG system
        
        Args:
            vector_store_manager: Vector store manager
            model_name: LLM model name
            temperature: LLM temperature
        """
        self.vector_store_manager = vector_store_manager
        self.model_name = model_name
        self.temperature = temperature
        
        # Initialize LLM
        self.llm = ChatOpenAI(model=model_name, temperature=temperature)
        
        # Default retriever parameters
        self.retrieval_params = {"k": 5}
        
        # Initialize retriever
        self.retriever = self.vector_store_manager.get_retriever(self.retrieval_params)
        
        # Define RAG prompt
        self.prompt = ChatPromptTemplate.from_template("""
        You are a helpful assistant that answers questions based on the provided context.
        
        Context information:
        {context}
        
        Question: {question}
        
        Answer the question based on the context provided. If the context doesn't contain 
        the information needed to answer the question, admit that you don't know rather
        than making up information.
        """)
        
        # Build the RAG chain
        self.chain = (
            {"context": self.retriever, "question": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        )
        
        logger.info(f"RAG system initialized with {model_name}")
    
    def answer_question(self, question: str) -> Tuple[str, List[Any]]:
        """
        Answer a question using RAG
        
        Args:
            question: The question to answer
        
        Returns:
            A tuple of (answer, retrieved_documents)
        """
        # Get retrieved documents (for feedback and evaluation)
        retrieved_docs = self.retriever.invoke(question)
        
        # Generate answer
        answer = self.chain.invoke(question)
        
        return answer, retrieved_docs
    
    def update_retrieval_params(self, params: Dict[str, Any]) -> None:
        """Update retrieval parameters and refresh the retriever"""
        self.retrieval_params.update(params)
        self.retriever = self.vector_store_manager.get_retriever(self.retrieval_params)
        
        # Rebuild the chain with the new retriever
        self.chain = (
            {"context": self.retriever, "question": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        )
        
        logger.info(f"Updated retrieval parameters: {self.retrieval_params}")


## RL

In [24]:
class RLFeedbackSystem:
    """Reinforcement Learning system for optimizing retrieval parameters"""
    
    def __init__(self, rag_system: RAGSystem):
        """
        Initialize the RL feedback system
        
        Args:
            rag_system: The RAG system to optimize
        """
        self.rag_system = rag_system
        self.feedback_history = []
        
        # Parameter exploration settings
        self.exploration_rate = 0.2  # Probability of trying new parameters
        self.min_k = 3
        self.max_k = 10
        
        # Learning rate for parameter updates
        self.learning_rate = 0.1
        
        logger.info("RL feedback system initialized")
    
    def record_feedback(self, 
                       query: str, 
                       retrieved_docs: List[Any], 
                       answer: str,
                       feedback_score: float) -> None:
        """
        Record feedback for a query
        
        Args:
            query: The user query
            retrieved_docs: The documents retrieved for the query
            answer: The answer generated
            feedback_score: User satisfaction score (0-1)
        """
        self.feedback_history.append({
            "query": query,
            "docs": retrieved_docs,
            "answer": answer,
            "score": feedback_score,
            "params": self.rag_system.retrieval_params.copy()
        })
        
        logger.info(f"Recorded feedback with score {feedback_score}")
        
        # Update parameters periodically
        if len(self.feedback_history) % 10 == 0:
            self._update_parameters()
    
    def _update_parameters(self) -> None:
        """Update retrieval parameters based on feedback history"""
        if len(self.feedback_history) < 10:
            return
            
        recent_feedback = self.feedback_history[-10:]
        avg_score = sum(item["score"] for item in recent_feedback) / 10
        
        # Decide whether to explore or exploit
        if np.random.random() < self.exploration_rate:
            # Exploration: try a random k value
            new_k = np.random.randint(self.min_k, self.max_k + 1)
            logger.info(f"Exploration: trying new k={new_k}")
        else:
            # Exploitation: adjust k based on feedback
            current_k = self.rag_system.retrieval_params.get("k", 5)
            
            if avg_score < 0.6:
                # If satisfaction is low, retrieve more documents
                new_k = min(current_k + 1, self.max_k)
                logger.info(f"Increasing k to {new_k} due to low satisfaction")
            elif avg_score > 0.8:
                # If satisfaction is high, retrieve fewer documents for efficiency
                new_k = max(current_k - 1, self.min_k)
                logger.info(f"Decreasing k to {new_k} due to high satisfaction")
            else:
                # Keep current k
                new_k = current_k
                logger.info(f"Maintaining k at {new_k}")
        
        # Update the retrieval parameters
        self.rag_system.update_retrieval_params({"k": new_k})


## Agent

In [25]:
class AgentSystem:
    """Agent system for complex query handling"""
    
    def __init__(self, rag_system: RAGSystem):
        """
        Initialize the agent system
        
        Args:
            rag_system: The RAG system to use for information retrieval
        """
        self.rag_system = rag_system
        
        # Define tools
        self.tools = [
            Tool(
                name="KnowledgeBase",
                description="Search the knowledge base for specific information. Use this for factual questions.",
                func=self._search_knowledge_base
            ),
            Tool(
                name="GenerateAnswer",
                description="Generate a comprehensive answer based on retrieved information. Use this for complex questions requiring synthesis.",
                func=self._generate_comprehensive_answer
            )
        ]
        
        # Define prompt for the agent
        prompt_template = """You are an intelligent agent that helps users find information and answer questions.
        You have access to the following tools:
        
        {tools}
        
        Use the following format:
        
        Question: the input question you must answer
        Thought: you should always think about what to do
        Action: the action to take, should be one of [{tool_names}]
        Action Input: the input to the action
        Observation: the result of the action
        ... (this Thought/Action/Action Input/Observation can repeat N times)
        Thought: I now know the final answer
        Final Answer: the final answer to the original input question
        
        Begin!
        
        Question: {input}
        Thought: """
        
        # Create agent
        self.agent = create_react_agent(
            llm=self.rag_system.llm,
            tools=self.tools,
            prompt=prompt_template
        )
        
        # Create agent executor
        self.agent_executor = AgentExecutor(
            agent=self.agent,
            tools=self.tools,
            verbose=True,
            max_iterations=5
        )
        
        logger.info("Agent system initialized")
    
    def _search_knowledge_base(self, query: str) -> str:
        """Tool to search the knowledge base"""
        retrieved_docs = self.rag_system.retriever.invoke(query)
        
        # Format the retrieved documents as a string
        result = "\n\n".join([f"Document {i+1}:\n{doc.page_content}" 
                             for i, doc in enumerate(retrieved_docs)])
        
        return result
    
    def _generate_comprehensive_answer(self, query: str) -> str:
        """Tool to generate a comprehensive answer"""
        answer, _ = self.rag_system.answer_question(query)
        return answer
    
    def process_query(self, query: str) -> Dict[str, Any]:
        """
        Process a query using the agent system
        
        Args:
            query: User query
            
        Returns:
            Dictionary containing the agent's response and intermediate steps
        """
        logger.info(f"Processing query with agent: {query}")
        return self.agent_executor.invoke({"input": query})

## Combined

In [26]:
class ComprehensiveRAGSystem:
    """Main class that integrates all components"""
    
    def __init__(self,
                embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
                llm_model: str = "gpt-4o",
                milvus_host: str = "localhost",
                milvus_port: str = "19530",
                collection_name: str = "knowledge_base"):
        """
        Initialize the comprehensive RAG system
        
        Args:
            embedding_model: HuggingFace model for embeddings
            llm_model: LLM model name
            milvus_host: Milvus host address
            milvus_port: Milvus port
            collection_name: Milvus collection name
        """
        # Initialize document processor
        self.doc_processor = DocumentProcessor(embedding_model_name=embedding_model)
        
        # Initialize vector store manager
        self.vector_store_manager = VectorStoreManager(
            embedder=self.doc_processor.embedder,
            host=milvus_host,
            port=milvus_port,
            collection_name=collection_name
        )
        
        # Initialize RAG system
        self.rag_system = RAGSystem(
            vector_store_manager=self.vector_store_manager,
            model_name=llm_model
        )
        
        # Initialize RL feedback system
        self.rl_system = RLFeedbackSystem(rag_system=self.rag_system)
        
        # Initialize agent system
        self.agent_system = AgentSystem(rag_system=self.rag_system)
        
        logger.info("Comprehensive RAG system initialized")
    
    def ingest_documents(self, file_path: str) -> None:
        """
        Ingest documents from a file or directory
        
        Args:
            file_path: Path to file or directory
        """
        if os.path.isdir(file_path):
            chunks = self.doc_processor.process_directory(file_path)
        else:
            chunks = self.doc_processor.process_file(file_path)
            
        self.vector_store_manager.initialize_from_documents(chunks)
        logger.info(f"Ingested documents from {file_path}")
    
    def answer_question(self, question: str, use_agent: bool = False) -> str:
        """
        Answer a question using either direct RAG or the agent system
        
        Args:
            question: The question to answer
            use_agent: Whether to use the agent system
            
        Returns:
            The answer to the question
        """
        if use_agent:
            result = self.agent_system.process_query(question)
            return result["output"]
        else:
            answer, retrieved_docs = self.rag_system.answer_question(question)
            return answer
    
    def provide_feedback(self, 
                        query: str, 
                        answer: str, 
                        feedback_score: float,
                        retrieved_docs: Optional[List[Any]] = None) -> None:
        """
        Provide feedback on a response
        
        Args:
            query: The user query
            answer: The generated answer
            feedback_score: User satisfaction score (0-1)
            retrieved_docs: Retrieved documents (if available)
        """
        if retrieved_docs is None:
            # Retrieve documents again if not provided
            retrieved_docs = self.rag_system.retriever.invoke(query)
            
        self.rl_system.record_feedback(query, retrieved_docs, answer, feedback_score)
        logger.info(f"Recorded feedback with score {feedback_score}")


## Implementation

In [27]:
# Initialize the system
rag_system = ComprehensiveRAGSystem(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    llm_model="gpt-4o",
    milvus_host="localhost",
    milvus_port="19530",
    collection_name="medical_knowledge_base" # name inside milvus
)

# Ingest documents
rag_system.ingest_documents("./FINAL DATASET/")

# Now you can check the number of entities
print(rag_system.vector_store_manager.collection.num_entities)

# Answer questions
query = "What are the symptoms of pneumonia?"

# Simple RAG approach
answer = rag_system.answer_question(query, use_agent=False)
print(f"RAG Answer:\n{answer}\n")

# Agent approach
agent_answer = rag_system.answer_question(query, use_agent=True)
print(f"Agent Answer:\n{agent_answer}\n")

# Provide feedback
rag_system.provide_feedback(query, answer, feedback_score=0.8)

2025-05-03 15:56:12,275 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-05-03 15:56:13,108 - INFO - Document processor initialized with sentence-transformers/all-MiniLM-L6-v2
2025-05-03 15:56:13,110 - INFO - Vector store manager initialized for collection: medical_knowledge_base


ValueError: Vector store not initialized