In [None]:
#!pip install chromadb openai
#!pip install -U langchain-openai

In [None]:
# Optimize imports
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI
from dotenv import load_dotenv
import os
import psutil

# Load environment variables once at startup
load_dotenv()
# os.environ.get returns None when OPENAI_API_KEY is unset; the value is later
# handed unchanged to the Chroma embedding function and the OpenAI client.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
def process_document(pdf_path, chunk_size=512, chunk_overlap=32):
    """Load a PDF and return its text as a list of chunk strings.

    Args:
        pdf_path: Location of the PDF, handed to ``PyPDFLoader``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Amount of text (measured with ``len``) shared between
            neighbouring chunks.

    Returns:
        list[str]: The ``page_content`` of every chunk, in the order produced
        by the splitter.
    """
    # Use load() rather than load_and_split(): the latter already runs the
    # loader's default text splitter, so the text would be split twice (once
    # with default settings, then again with the settings requested here).
    documents = PyPDFLoader(pdf_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    return [chunk.page_content for chunk in splitter.split_documents(documents)]

# Initialize ChromaDB with OpenAI embedding function
def initialize_chroma(collection_name="nestle_hr", persist_dir="./chroma_db"):
    """Return a persistent Chroma collection wired to an OpenAI embedding function.

    Args:
        collection_name: Name of the collection to fetch, or to create when it
            does not exist yet.
        persist_dir: Path given to ``chromadb.PersistentClient``.

    Returns:
        The collection object returned by ``get_or_create_collection``.
    """
    # The client is built first, exactly as before, so any on-disk effects of
    # opening the store happen before the embedding function is constructed.
    store = chromadb.PersistentClient(path=persist_dir)

    embedder = embedding_functions.OpenAIEmbeddingFunction(
        api_key=OPENAI_API_KEY,
        model_name="text-embedding-ada-002",
    )

    collection = store.get_or_create_collection(
        name=collection_name,
        embedding_function=embedder,
    )
    return collection

In [None]:
class QASystem:
    """Retrieval-augmented question answering over a Chroma collection.

    Each question is used to query the collection for the closest chunks; the
    chunk texts are numbered, placed in the prompt as context, and sent to an
    OpenAI chat model together with the question.
    """

    def __init__(self, collection, openai_client=None, model="gpt-3.5-turbo", temperature=0.7):
        """Store the collaborators and generation settings.

        Args:
            collection: Object exposing ``query(query_texts=..., n_results=...)``.
            openai_client: Chat client to use; when falsy a new ``OpenAI``
                client is created from ``OPENAI_API_KEY``.
            model: Chat model name sent with every completion request.
            temperature: Sampling temperature sent with every completion request.
        """
        self.collection = collection
        self.openai_client = openai_client or OpenAI(api_key=OPENAI_API_KEY)
        self.model = model
        self.temperature = temperature

        # System prompt reused for every request.
        self.system_message = """You are a helpful assistant that answers questions based on the provided text chunks. 
        Answer accurately and concisely using only the information in the chunks. Cite chunk numbers when providing information."""

    def get_relevant_chunks(self, query, top_k=5):
        """Return the text of the ``top_k`` chunks closest to ``query``.

        Args:
            query: Question text used as the single query.
            top_k: Maximum number of chunks requested from the collection.

        Returns:
            list: Chunk texts for the query, or an empty list when the result
            carries no documents.
        """
        results = self.collection.query(
            query_texts=[query],
            n_results=top_k
        )
        # The chunk text lives under 'documents'; 'metadatas' only holds the
        # per-chunk metadata (dicts or None), which is not what the prompt needs.
        documents = results["documents"]
        # Results are grouped per query text and exactly one query was sent.
        return documents[0] if documents else []

    def format_context(self, chunks):
        """Number the chunks from 1 and join them with blank lines."""
        return "\n\n".join(
            f"[Chunk {position}]: {chunk}"
            for position, chunk in enumerate(chunks, start=1)
        )

    def get_answer(self, query, top_k=5):
        """Answer ``query`` using the retrieved chunks as context.

        Args:
            query: The user's question.
            top_k: Number of chunks to retrieve for the context.

        Returns:
            The ``content`` of the first choice in the chat completion response.
        """
        chunks = self.get_relevant_chunks(query, top_k)
        context = self.format_context(chunks)

        response = self.openai_client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": f"CONTEXT:\n{context}\n\nQUESTION: {query}"}
            ],
            temperature=self.temperature
        )

        return response.choices[0].message.content

# Create Gradio interface with caching
def create_gradio_interface(qa_system):
    """Build the Gradio interface that forwards questions to ``qa_system``.

    Args:
        qa_system: Object exposing ``get_answer(query)``.

    Returns:
        The ``gr.Interface`` instance, not yet launched.
    """
    import gradio as gr

    # No decorator here: example caching is requested through the
    # ``cache_examples=True`` argument of gr.Interface below. Gradio has no
    # ``gr.cache_examples`` decorator, so using one raised AttributeError.
    def answer_question(query):
        return qa_system.get_answer(query)

    return gr.Interface(
        fn=answer_question,
        inputs=gr.Textbox(lines=2, placeholder="Enter your question here..."),
        outputs=gr.Textbox(lines=30),
        title="Nestlé HR Policy Q&A Chatbot",
        description="Ask questions about Nestlé's HR policies",
        examples=[
            ["What is the Total Rewards program?"],
            ["How does Nestlé handle employee training?"],
        ],
        cache_examples=True
    )

In [None]:
def main():
    """Wire up the collection, the QA system and the UI, then launch the app."""
    # Initialize components
    collection = initialize_chroma()
    qa_system = QASystem(collection)

    # Create and launch Gradio interface
    iface = create_gradio_interface(qa_system)
    # Example caching is an option of gr.Interface, not of launch(); launch()
    # has no ``cache_examples`` parameter, so passing it raised TypeError.
    iface.launch(
        inline=True,
        share=False,
        max_threads=4
    )

# Run the app only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()