<a href="https://colab.research.google.com/github/karthik19-cloud/GenAI-Training/blob/main/01-l4-assingmnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# rag_pdf_summarizer.py

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain.chains import RetrievalQA
import os


ModuleNotFoundError: No module named 'langchain_community'

In [None]:
# ----- CONFIG -----
# Module-level settings consumed by the pipeline functions below.

# PDF handed to load_and_split_pdf() by main().
PDF_PATH = "l3-assigments.pdf"      # change to your PDF path
# Directory passed to create_or_load_vectorstore() as the Chroma persist location.
CHROMA_DB_DIR = "chroma_pdf_db"

# Model name given to HuggingFaceEmbeddings in create_or_load_vectorstore().
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Model name passed to load_llm() by main().
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # or any other open-source LLM
# -------------------


In [None]:
def load_and_split_pdf(pdf_path: str, *, chunk_size: int = 1000, chunk_overlap: int = 200):
    """Load a PDF and split its contents into overlapping text chunks.

    Args:
        pdf_path: Location of the PDF handed to ``PyPDFLoader``.
        chunk_size: Upper bound on chunk length, measured with ``len``
            (i.e. in characters). Defaults to 1000.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 200.

    Returns:
        The chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    pages = PyPDFLoader(pdf_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_documents(pages)


In [None]:
def create_or_load_vectorstore(docs, persist_directory: str):
    """Return a Chroma vector store, reusing a persisted index when one exists.

    If ``persist_directory`` is a non-empty directory it is opened as an
    existing index and ``docs`` is ignored; delete the folder beforehand to
    force a rebuild. Otherwise a new index is built from ``docs`` and
    persisted to ``persist_directory``.

    Args:
        docs: Document chunks to index when no persisted index is found.
        persist_directory: Directory where the Chroma index is stored.

    Returns:
        The loaded or newly created ``Chroma`` vector store.

    Raises:
        ValueError: If there is no persisted index and ``docs`` is empty.
    """
    # isdir (rather than exists) keeps os.listdir from raising when the path
    # happens to point at a regular file.
    has_persisted_index = os.path.isdir(persist_directory) and bool(
        os.listdir(persist_directory)
    )

    # Fail fast, before loading the embedding model or creating any files,
    # instead of handing an empty document list to Chroma.
    if not has_persisted_index and not docs:
        raise ValueError(
            "No documents to index and no existing vector store found in "
            f"{persist_directory!r}."
        )

    # Embeddings using SentenceTransformers
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME
    )

    if has_persisted_index:
        return Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings,
        )

    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    # NOTE(review): newer Chroma releases reportedly persist automatically and
    # deprecate this call; confirm against the installed version.
    vectorstore.persist()
    return vectorstore


In [None]:
def load_llm(model_name: str):
    """Build a LangChain-compatible LLM from a Hugging Face causal model.

    Loads the tokenizer and model for ``model_name``, wraps them in a
    ``text-generation`` pipeline with fixed sampling settings, and returns
    that pipeline wrapped in ``HuggingFacePipeline``.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        A ``HuggingFacePipeline`` wrapping the generation pipeline.
    """
    tok = AutoTokenizer.from_pretrained(model_name)

    # device_map accepts "cuda" / "cpu" / "auto"; "auto" is used for both
    # device placement and dtype selection here.
    causal_model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="auto", torch_dtype="auto"
    )

    # Sampling configuration forwarded verbatim to the pipeline.
    generation_settings = {
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.9,
    }
    text_generator = pipeline(
        "text-generation",
        model=causal_model,
        tokenizer=tok,
        **generation_settings,
    )

    return HuggingFacePipeline(pipeline=text_generator)


In [None]:
def build_rag_chain(vectorstore, llm):
    """Assemble a ``RetrievalQA`` chain over the given vector store.

    Args:
        vectorstore: Store whose ``as_retriever`` supplies the context chunks.
        llm: Language model used by the chain to generate the answer.

    Returns:
        A ``RetrievalQA`` chain configured to also return its source documents.
    """
    # Similarity search returning the 5 closest chunks per query.
    top_chunks_retriever = vectorstore.as_retriever(
        search_type="similarity", search_kwargs={"k": 5}
    )

    # "stuff" is the simplest chain type: all retrieved chunks are placed
    # into a single prompt.
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=top_chunks_retriever,
        chain_type="stuff",
        return_source_documents=True,
    )


In [None]:
def summarize_document(qa_chain):
    """Ask the QA chain for a document summary and print it with its sources.

    The summary reflects whatever chunks the chain retrieved for the fixed
    summary query; those chunks are printed afterwards for transparency.

    Args:
        qa_chain: Chain exposing ``invoke(inputs)`` (preferred) or, failing
            that, an object callable with the inputs dict. The mapping it
            returns must contain ``"result"`` and ``"source_documents"``;
            each source document needs ``page_content`` and ``metadata``.

    Returns:
        None. Output is written to stdout.
    """
    query = (
        "Provide a detailed, well-structured summary of the entire document. "
        "Highlight key sections, main arguments, important data points, and conclusions. "
        "Keep it concise but comprehensive."
    )
    inputs = {"query": query}

    # Calling a chain object directly is deprecated in LangChain in favour of
    # invoke(); keep the direct call only as a fallback for plain callables.
    invoke = getattr(qa_chain, "invoke", None)
    result = invoke(inputs) if callable(invoke) else qa_chain(inputs)

    summary = result["result"]
    sources = result["source_documents"]

    print("\n=== SUMMARY ===\n")
    print(summary)

    print("\n=== TOP RETRIEVED CHUNKS (for transparency) ===\n")
    preview_limit = 500
    for i, doc in enumerate(sources, start=1):
        print(f"\n--- Chunk {i} (page {doc.metadata.get('page', 'N/A')}) ---\n")
        text = doc.page_content
        # Only show the ellipsis when the preview actually cut something off.
        if len(text) > preview_limit:
            print(text[:preview_limit], "...")
        else:
            print(text)


In [None]:
def main():
    """Run the full pipeline: split the PDF, index it, load the LLM, summarize.

    Reads ``PDF_PATH``, ``CHROMA_DB_DIR`` and ``LLM_MODEL_NAME`` from the
    module-level configuration and prints a progress line before each stage.
    """
    # Stage 1: turn the PDF into chunks.
    print("Loading and splitting PDF...")
    chunks = load_and_split_pdf(PDF_PATH)

    # Stage 2: index the chunks (or reuse a persisted index).
    print("Creating/loading Chroma vectorstore...")
    store = create_or_load_vectorstore(chunks, CHROMA_DB_DIR)

    # Stage 3: bring up the open-source LLM.
    print("Loading LLM (this may take some time the first run)...")
    language_model = load_llm(LLM_MODEL_NAME)

    # Stage 4: wire retriever and LLM together.
    print("Building RAG chain...")
    rag_chain = build_rag_chain(store, language_model)

    # Stage 5: produce and print the summary.
    print("Generating summary...")
    summarize_document(rag_chain)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
