<a href="https://colab.research.google.com/github/karthik19-cloud/GenAI-Training/blob/main/GenAI-L4/01-pdf-summarization/rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install -q langchain langchain-community chromadb sentence-transformers transformers pypdf


In [10]:
!pip install -q langchain-text-splitters

In [14]:
!pip install -q "langchain==0.2.16" "langchain-community==0.2.16"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.1/397.1 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.8/311.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dep

In [37]:
import os

# Default location of the sample PDF inside a Google Colab runtime.
DEFAULT_PDF_PATH = "/content/sample_data/l3-assigments.pdf"

# The PDF_PATH environment variable (e.g. `%env PDF_PATH=/path/to/file.pdf`)
# overrides the default so the notebook can be pointed at another document
# without editing code. When it is unset, the default above is used.
pdf_path = os.environ.get("PDF_PATH", DEFAULT_PDF_PATH)

print("Using PDF:", pdf_path)

# Surface a missing file here, next to the path definition, instead of only
# when the loader is constructed in a later cell.
if not os.path.isfile(pdf_path):
    print(f"WARNING: {pdf_path} was not found - upload the PDF or set PDF_PATH before continuing.")



Using PDF: /content/sample_data/l3-assigments.pdf


In [48]:
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
#from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# 1) Load the PDF and split it into overlapping chunks
loader = PyPDFLoader(pdf_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,       # measured in characters, because length_function=len
    chunk_overlap=40,     # characters shared between neighbouring chunks
    length_function=len
)
docs = text_splitter.split_documents(documents)

print(f"Loaded {len(documents)} pages, split into {len(docs)} chunks.")

# An image-only or empty PDF yields no chunks; stop here with a readable
# message instead of failing inside the vector store.
if not docs:
    raise ValueError(f"No text could be extracted from {pdf_path!r}; nothing to index.")

# 2) Create embeddings with a Sentence-Transformers model (open source)
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# 3) Create / load Chroma vector store
persist_directory = "chroma_pdf_index"

# Deterministic ids keep this cell idempotent: without explicit ids every
# re-run would append the same chunks again to the persisted collection and
# the retriever would start returning duplicates. With stable ids a re-run
# overwrites the existing entries instead.
# NOTE: entries written by earlier runs under auto-generated ids are not
# removed by this; delete `persist_directory` once to start from a clean index.
chunk_ids = [f"{pdf_path}::chunk-{i}" for i in range(len(docs))]

vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=persist_directory
)
# NOTE(review): recent chromadb versions persist automatically and
# langchain-community marks persist() as deprecated - confirm against the
# installed versions before removing this call.
vectorstore.persist()
print("Vector store created and persisted at:", persist_directory)

# Retrieve the 5 most similar chunks for each query
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# 4) Load an open-source text generation model for summarization via HuggingFacePipeline
#    You can choose another seq2seq model if you like.
llm_model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

# transformers pipelines take a GPU index, or -1 for CPU
device = 0 if torch.cuda.is_available() else -1

summarization_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    truncation=True,
    device=device
)

# LangChain wrapper around the same pipeline (rag_answer below calls
# `summarization_pipeline` directly).
llm = HuggingFacePipeline(pipeline=summarization_pipeline)

# Tokens held back when sizing the context: the prompt pieces are tokenized
# separately below, so the assembled prompt can come out a few tokens longer.
_PROMPT_TOKEN_MARGIN = 8
# Some tokenizers report a huge sentinel instead of a real input limit; above
# this value the limit is treated as unbounded and no trimming is attempted.
_MAX_SANE_TOKEN_LIMIT = 100_000


def _trim_context(context: str, fixed_prompt: str) -> str:
    """Shorten `context` so it fits the tokenizer's input limit together with `fixed_prompt`."""
    limit = getattr(tokenizer, "model_max_length", None)
    if not limit or limit > _MAX_SANE_TOKEN_LIMIT:
        return context

    budget = limit - len(tokenizer.encode(fixed_prompt)) - _PROMPT_TOKEN_MARGIN
    if budget <= 0:
        # The instructions and question alone fill the input; send no context.
        return ""

    context_ids = tokenizer.encode(
        context, add_special_tokens=False, truncation=True, max_length=budget
    )
    if len(context_ids) < budget:
        # Nothing was cut, so keep the original text untouched.
        return context
    return tokenizer.decode(context_ids, skip_special_tokens=True)


def rag_answer(query: str):
    """Answer `query` using chunks retrieved from the vector store.

    The retrieved chunks are joined into a context section of the prompt. The
    pipeline is called with ``truncation=True``, which cuts the *end* of an
    over-long input - and the end of this prompt is the question itself. To keep
    the question intact, the context is trimmed to the tokenizer's input limit
    before the prompt is assembled.

    Args:
        query: The question or instruction to answer.

    Returns:
        A tuple ``(answer, docs)``: the generated text and the list of
        documents returned by the retriever (all of them, even if the context
        had to be trimmed).
    """
    # 1) Retrieve relevant chunks from Chroma
    docs = retriever.invoke(query)

    # 2) Fixed parts of the prompt (everything except the context)
    prompt_head = (
        "You are a helpful assistant. Use the context below to answer the question.\n\n"
        "Context:\n"
    )
    prompt_tail = (
        "\n\n"
        "Question:\n"
        f"{query}\n\n"
        "Answer:"
    )

    # 3) Build the context text, trimmed so the question survives truncation
    context = "\n\n".join(d.page_content for d in docs)
    context = _trim_context(context, prompt_head + prompt_tail)

    prompt = prompt_head + context + prompt_tail

    # 4) Call the Hugging Face text2text model
    out = summarization_pipeline(prompt, max_length=512, truncation=True)[0]["generated_text"]
    return out, docs





# 5) (Disabled) RetrievalQA chain variant of the RAG step.
#    Kept for reference only: the RetrievalQA import at the top of this cell is
#    commented out as well, and the summary cell calls rag_answer() instead.
# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     retriever=retriever,
#     chain_type="stuff",  # Stuff retrieved chunks into the prompt
#     return_source_documents=True
# )

# Signals that the setup statements of this cell all completed.
print("RAG pipeline ready.")


Loaded 2 pages, split into 6 chunks.
Vector store created and persisted at: chroma_pdf_index


Device set to use cpu


RAG pipeline ready.


In [49]:
def summarize_document_with_rag(summary_prompt: "str | None" = None):
    """Summarize the indexed document through the RAG pipeline.

    Args:
        summary_prompt: Instruction sent to ``rag_answer``. When ``None`` (the
            default), a generic "summarize this document" instruction is used,
            so existing calls without arguments behave as before.

    Returns:
        Whatever ``rag_answer`` returns for the prompt: a tuple of the
        generated summary text and the retrieved source documents.
    """
    if summary_prompt is None:
        summary_prompt = (
            "Provide a comprehensive but concise summary of this document. "
            "Focus on the main topics, key points, and important conclusions. "
            "Write the summary in clear, well-structured paragraphs."
        )
    return rag_answer(summary_prompt)

# Run the RAG summarization once. `sources` receives the second element of the
# returned tuple (the retrieved documents) and is kept for later inspection.
summary, sources = summarize_document_with_rag()

# Banner, a blank line, then the generated summary.
print("===== DOCUMENT SUMMARY (RAG) =====\n", summary, sep="\n")


===== DOCUMENT SUMMARY (RAG) =====

A comprehensive list of GenAI bronze assignments.
