In [1]:
# Import required libraries
import os
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

# # Set your OpenAI API key (replace with your actual key)
# os.environ["OPENAI_API_KEY"] = "your-openai-api-key-here"

# Import libraries
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

import warnings
warnings.filterwarnings('ignore')


In [2]:
def load_and_process_pdfs(pdf_directory):
    """
    Load and process PDF files from the specified directory
    """
    pdf_dir = Path(pdf_directory)

    # Check if directory exists
    if not pdf_dir.exists():
        print(f"Directory {pdf_directory} does not exist!")
        return []

    # Get all PDF files
    pdf_files = list(pdf_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return []

    print(f"Found {len(pdf_files)} PDF files:")
    for pdf_file in pdf_files:
        print(f"  - {pdf_file.name}")

    # Load all PDFs
    all_documents = []

    for pdf_file in pdf_files:
        try:
            print(f"Loading {pdf_file.name}...")
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to each document
            for doc in documents:
                doc.metadata["source"] = pdf_file.name

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
        except Exception as e:
            print(f"  ✗ Error loading {pdf_file.name}: {e}")

    return all_documents


In [3]:
def split_documents(documents):
    """
    Split documents into smaller chunks for embedding
    """
    # Create text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )

    # Split documents
    print("Splitting documents into chunks...")
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} document chunks")

    return chunks


In [4]:
def create_vector_store(chunks, persist_directory="./chroma_db"):
    """
    Create and persist vector store using Chroma
    """
    print("Creating embeddings and vector store...")

    # Initialize embedding model
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

    # Create Chroma vector store
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory
    )

    # Persist the database
    vector_store.persist()
    print(f"Vector store created and persisted to {persist_directory}")

    return vector_store


In [5]:
def create_qa_chain(vector_store):
    """
    Create a Retrieval QA chain
    """
    print("Creating QA chain...")

    # Create retriever
    retriever = vector_store.as_retriever(
        search_kwargs={"k": 5}  # Retrieve top 5 most similar documents
    )

    # Initialize LLM
    llm = OpenAI(
        model="gpt-3.5-turbo-instruct",
        temperature=0.1
    )

    # Create QA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        verbose=False
    )

    print("QA chain created successfully!")
    return qa_chain



In [6]:
def answer_question(qa_chain, question):
    """
    Answer a question using the QA chain
    """
    print(f"\nQuestion: {question}")
    print("-" * 50)

    # Get answer
    result = qa_chain({"query": question})

    # Print answer
    print("Answer:", result["result"])

    # Print sources
    print("\nSources:")
    for i, doc in enumerate(result["source_documents"][:3]):  # Show top 3 sources
        source = doc.metadata.get("source", "Unknown")
        content_preview = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
        print(f"{i+1}. {source}")
        print(f"   Preview: {content_preview}\n")

    return result


In [7]:
def main():
    """
    Main function to run the RAG pipeline
    """
    # Configuration
    PDF_DIRECTORY = "docs/"
    PERSIST_DIRECTORY = "./chroma_db"

    print("=" * 60)
    print("RAG System - PDF Document Question Answering")
    print("=" * 60)

    # Step 1: Load and process PDFs
    print("\nStep 1: Loading PDF documents...")
    documents = load_and_process_pdfs(PDF_DIRECTORY)

    if not documents:
        print("No documents loaded. Exiting...")
        return

    # Step 2: Split documents into chunks
    print("\nStep 2: Processing documents...")
    chunks = split_documents(documents)

    # Step 3: Create vector store
    print("\nStep 3: Creating vector database...")
    vector_store = create_vector_store(chunks, PERSIST_DIRECTORY)

    # Step 4: Create QA chain
    print("\nStep 4: Setting up QA system...")
    qa_chain = create_qa_chain(vector_store)

    print("\n" + "=" * 60)
    print("RAG System Ready!")
    print("=" * 60)

    # Interactive Q&A loop
    while True:
        print("\n" + "-" * 40)
        question = input("\nEnter your question (or 'quit' to exit): ").strip()

        if question.lower() in ['quit', 'exit', 'q']:
            print("Exiting...")
            break

        if question:
            try:
                answer_question(qa_chain, question)
            except Exception as e:
                print(f"Error answering question: {e}")


In [8]:
# Alternative: Function to load existing vector store
def load_existing_vector_store(persist_directory="./chroma_db"):
    """
    Load an existing vector store (useful for re-running without re-processing)
    """
    if not os.path.exists(persist_directory):
        print(f"No existing vector store found at {persist_directory}")
        return None

    print("Loading existing vector store...")
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings
    )
    print("Vector store loaded successfully!")
    return vector_store


In [9]:
# If you want to use LlamaIndex instead/as well:
def setup_llama_index(vector_store):
    """
    Optional: Setup with LlamaIndex
    Note: This requires additional installation: pip install llama-index
    """
    try:
        from llama_index import VectorStoreIndex, ServiceContext
        from llama_index.vector_stores import ChromaVectorStore
        from llama_index.embeddings import OpenAIEmbedding
        from llama_index.llms import OpenAI as LlamaOpenAI

        # Convert LangChain vector store to LlamaIndex format
        chroma_client = vector_store._client
        chroma_collection = chroma_client.get_collection(vector_store._collection.name)

        # Create LlamaIndex vector store
        llama_vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

        # Create service context
        embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
        llm = LlamaOpenAI(model="gpt-3.5-turbo", temperature=0.1)
        service_context = ServiceContext.from_defaults(
            llm=llm,
            embed_model=embed_model
        )

        # Create index
        index = VectorStoreIndex.from_vector_store(
            llama_vector_store,
            service_context=service_context
        )

        print("LlamaIndex setup completed!")
        return index

    except ImportError:
        print("LlamaIndex not installed. Install with: pip install llama-index")
        return None


In [10]:
# Run the main function
if __name__ == "__main__":
    main()

RAG System - PDF Document Question Answering

Step 1: Loading PDF documents...
Found 10 PDF files:
  - Towards_multi-level-simulation_using_dynamic_cloud_environments.pdf
  - Reflection_LLM_AM_Patterson.pdf
  - Towards_multi-level-simulation_using_dynamic_cloud_environments (1).pdf
  - cv_gk_july25.pdf
  - Towards_multi-level-simulation_using_dynamic_cloud_environments (2).pdf
  - 2024_01_PhD_MaDLab (1).pdf
  - MSc. completetion.pdf
  - Tschorsch_WIMI_w25-206.pdf
  - Db114952.pdf
  - 2504.19394v2.pdf
Loading Towards_multi-level-simulation_using_dynamic_cloud_environments.pdf...
  ✓ Loaded 7 pages
Loading Reflection_LLM_AM_Patterson.pdf...
  ✓ Loaded 7 pages
Loading Towards_multi-level-simulation_using_dynamic_cloud_environments (1).pdf...
  ✓ Loaded 7 pages
Loading cv_gk_july25.pdf...
  ✓ Loaded 58 pages
Loading Towards_multi-level-simulation_using_dynamic_cloud_environments (2).pdf...
  ✓ Loaded 7 pages
Loading 2024_01_PhD_MaDLab (1).pdf...
  ✓ Loaded 1 pages
Loading MSc. completetion

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors..


RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.