In [1]:
# !pip install llama-index==0.8.9

In [2]:
import os

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
from llama_index import Document
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, LLMPredictor, ServiceContext
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter


In [4]:
# Show the kernel's working directory; the relative "docs/" and "./chroma_db"
# paths used in later cells resolve against it.
!pwd

/Users/imbilalbutt/PycharmProjects/FastAPIProject


In [5]:
# ---------------------------
# 1. Load PDF documents
# ---------------------------
# Directory scanned for PDFs. It is relative, so it resolves against the
# kernel's current working directory.
docs_dir = "docs/"
# Upper bound on how many PDFs are ingested in one run.
max_pdfs = 10

# os.listdir() returns names in arbitrary order, so sort before applying the
# cap; otherwise the selected subset can differ between runs or machines.
pdf_files = [
    os.path.join(docs_dir, name)
    for name in sorted(os.listdir(docs_dir))
    if name.endswith(".pdf")
][:max_pdfs]

# Accumulates the loader output of every selected PDF, in file order.
documents = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    # NOTE(review): load_and_split() presumably returns text-splitter chunks
    # rather than whole pages, so the "pages" count printed below may really
    # be a chunk count — confirm against the loader documentation.
    pages = loader.load_and_split()
    documents.extend(pages)

print(f"Loaded {len(documents)} pages from {len(pdf_files)} PDFs.")


Loaded 297 pages from 10 PDFs.


In [None]:
# ---------------------------
# 2. Create embeddings
# ---------------------------
# Embedding client that the Chroma cell uses to vectorise the loaded documents.
embedding_model_name = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(model=embedding_model_name)



In [None]:
# ---------------------------
# 3. Store embeddings in Chroma
# ---------------------------
# On-disk location of the persisted vector store (relative to the working
# directory).
vector_db_dir = "./chroma_db"
# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists() check and the gap between checking and creating.
os.makedirs(vector_db_dir, exist_ok=True)

# NOTE(review): re-running this cell against an already-populated directory
# presumably embeds and stores the same documents a second time — confirm,
# and clear or reuse the existing store if that is the case.
db = Chroma.from_documents(documents, embeddings, persist_directory=vector_db_dir)
db.persist()
print("Embeddings stored in Chroma vector database.")


In [None]:
# ---------------------------
# 4. Create LangChain retriever
# ---------------------------
# Retriever view over the Chroma store, using the store's default search
# settings.
retriever = db.as_retriever()

# ---------------------------
# 5. Create RetrievalQA chain with LLM
# ---------------------------
# gpt-3.5-turbo is a chat model, so it is driven through ChatOpenAI rather
# than the completion-style OpenAI wrapper.
# chain_type="stuff" places all retrieved documents into a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-3.5-turbo"),
    chain_type="stuff",
    retriever=retriever,
)


In [None]:
# ---------------------------
# 6. (Optional) Create Llama Index
# ---------------------------
# Using LlamaIndex to index the documents for another retrieval mechanism
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)

# Split every loaded document into token-bounded chunks and flatten the
# result into a single list of strings.
flat_docs = [
    chunk
    for doc in documents
    for chunk in text_splitter.split_text(doc.page_content)
]

# Build the predictor first and hand it to the service context so the index
# actually uses this LLM. gpt-3.5-turbo is a chat model, hence ChatOpenAI.
llm_predictor = LLMPredictor(llm=ChatOpenAI(model="gpt-3.5-turbo"))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

# Index the chunks produced above. An empty SimpleDirectoryReader cannot serve
# as a placeholder (it requires a directory or a file list), so the chunks are
# wrapped in Document objects directly.
index = GPTVectorStoreIndex.from_documents(
    [Document(text=chunk) for chunk in flat_docs],
    service_context=service_context,
)


In [None]:
# ---------------------------
# 7. Ask questions
# ---------------------------
# Interactive loop: read a question, answer it with the RetrievalQA chain,
# repeat until the user types 'exit'.
while True:
    # Strip surrounding whitespace so " exit " still quits and blank input
    # can be detected.
    query = input("\nEnter your question (or 'exit' to quit): ").strip()
    if query.lower() == "exit":
        break
    if not query:
        # Nothing to ask; re-prompt instead of sending an empty question to
        # the LLM.
        continue

    # Retrieve answer using LangChain RAG
    answer = qa_chain.run(query)
    print("\nAnswer (LangChain RAG):")
    print(answer)

    # Optionally, you could also query LlamaIndex
    # response = index.as_query_engine().query(query)
    # print("\nAnswer (LlamaIndex):")
    # print(response)