# RAG + QA Bot Pipeline
This notebook demonstrates Tasks 1–6 using LangChain, Chroma, Hugging Face embeddings, and the Falcon-7B-Instruct LLM.

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA

# Task 1: Load PDF
# The loader returns a list of documents; its length is reported as the page count.
pdf_path = 'heat_bath_paper.pdf'
loader = PyPDFLoader(pdf_path)
pages = loader.load()
print(f'Pages loaded: {len(pages)}')

# Task 2: Split text
# Chunks overlap (chunk_overlap=200 with chunk_size=1000) so that text near a
# chunk boundary is also present in the neighbouring chunk.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_documents(pages)
print(f'Number of chunks: {len(docs)}')

# Fail fast with an actionable message: without any chunks there is nothing to
# index or retrieve, and the later steps would fail far from the real cause.
if not docs:
    raise ValueError(
        f'No text chunks were produced from {pdf_path!r}; '
        'the PDF may be empty or contain only scanned images.'
    )

# Task 3: Embeddings
# The same embedding object is handed to the vector store below, so chunks and
# queries are embedded with one model.
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Task 4: Vector DB
# NOTE(review): no persist directory is passed, so the index is presumably
# rebuilt on every run — confirm if persistence is wanted.
chroma_db = Chroma.from_documents(docs, embedding)

# Task 5: Retriever
# NOTE(review): this step was unlabeled; the task number is inferred from the
# surrounding Task 4 / Task 6 headings — confirm against the assignment.
# 'k': 3 asks the retriever for three chunks per query.
retriever = chroma_db.as_retriever(search_kwargs={'k': 3})

# Task 6: QA Bot
# NOTE(review): HuggingFaceHub presumably needs a Hugging Face API token in the
# environment (HUGGINGFACEHUB_API_TOKEN) — confirm for the installed version.
llm = HuggingFaceHub(repo_id='tiiuae/falcon-7b-instruct', model_kwargs={'temperature': 0.5, 'max_new_tokens': 256})
qa_bot = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# The chain is called with a dict keyed by 'query'; the answer is read back
# from the 'result' key of the returned dict.
query = 'What this paper is talking about?'
result = qa_bot({'query': query})
print(result['result'])