In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

In [None]:
# ---------------------------
# 1. Load PDF documents
# ---------------------------
docs_dir = "/src/docs"
max_pdfs = 10  # upper bound on how many PDF files are ingested

# os.listdir() returns entries in arbitrary order, so sort before slicing;
# otherwise the [:max_pdfs] subset can differ between runs and machines.
# The extension check is case-insensitive so files such as "REPORT.PDF" are
# picked up as well.
pdf_files = [
    os.path.join(docs_dir, name)
    for name in sorted(os.listdir(docs_dir))
    if name.lower().endswith(".pdf")
][:max_pdfs]

# Fail here with a clear message instead of carrying an empty document list
# into the embedding / indexing cells.
if not pdf_files:
    raise FileNotFoundError(f"No PDF files found in {docs_dir!r}.")

documents = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    # load_and_split() returns a list of Document objects for this PDF.
    documents.extend(loader.load_and_split())

print(f"Loaded {len(documents)} pages from {len(pdf_files)} PDFs.")



In [None]:
# ---------------------------
# 2. Create embeddings (Hugging Face)
# ---------------------------
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# ---------------------------
# 3. Store embeddings in Chroma
# ---------------------------
vector_db_dir = "./chroma_db"

# Re-running Chroma.from_documents() against an already populated
# persist_directory appends a second copy of every chunk (fresh ids are
# generated on each call), which skews retrieval towards duplicates and
# re-embeds everything. Reuse the persisted store when one is present.
# To rebuild the index (new PDFs, different embedding model), delete
# `vector_db_dir` and re-run this cell.
index_exists = os.path.isdir(vector_db_dir) and bool(os.listdir(vector_db_dir))

if index_exists:
    db = Chroma(persist_directory=vector_db_dir, embedding_function=embeddings)
    print("Loaded existing Chroma vector database.")
else:
    if not documents:
        raise ValueError("No documents to index; load at least one PDF first.")

    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(vector_db_dir, exist_ok=True)

    db = Chroma.from_documents(documents, embeddings, persist_directory=vector_db_dir)
    db.persist()
    print("Embeddings stored in Chroma vector database.")



In [None]:
# ---------------------------
# 4. Create LangChain retriever
# ---------------------------
retriever = db.as_retriever()

# ---------------------------
# 5. Create Hugging Face LLM
# ---------------------------
# Text-generation pipeline. The model must be a generative (causal LM)
# checkpoint; embedding models cannot be used here.
hf_pipeline = pipeline(
    "text-generation",
    model="h2oai/h2ogpt-oasst1-512-12b",  # replace with your HF model
    device=0,  # set to -1 for CPU
    # Limit only the newly generated tokens. `max_length` also counts the
    # prompt, and a retrieval prompt (question + retrieved document text) can
    # use up a 512-token budget before any answer is produced.
    max_new_tokens=512,
)
llm = HuggingFacePipeline(pipeline=hf_pipeline)



In [None]:
# ---------------------------
# 6. Create RetrievalQA chain
# ---------------------------
# Combine the retriever and the LLM into one question-answering chain using
# the "stuff" chain type.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")


In [None]:
# ---------------------------
# 7. Ask questions interactively
# ---------------------------
# NOTE: input() blocks, so this cell waits for the user until "exit" is typed.
while True:
    try:
        # Strip surrounding whitespace so "exit " or " exit" still quits and
        # the chain never receives padded questions.
        query = input("\nEnter your question (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted: end the session
        # without a traceback.
        break

    if not query:
        # Nothing was typed; prompt again instead of querying the LLM.
        continue
    if query.lower() == "exit":
        break

    answer = qa_chain.run(query)
    print("\nAnswer:")
    print(answer)
