<a href="https://colab.research.google.com/github/happymondaynkanta/Multi-Format-Document-Retrieval-Augmented-Generation-Pipeline/blob/main/LLM_RAG_Projct_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>




## Install required libraries: LangChain, ChromaDB, HuggingFace, PDF/DOCX/PPTX loaders





In [1]:
!pip -q install -U \
  langchain-community \
  langchain-text-splitters \
  langchain-chroma \
  sentence-transformers \
  chromadb \
  pypdf \
  docx2txt \
  python-pptx \
  beautifulsoup4


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.6/486.6 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.5/322.5 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m28.8 MB/s[0m eta [36m0:


## Mount Google Drive in Colab and define base directory for documents ---




In [2]:
from google.colab import drive
drive.mount('/content/drive')

# 👇 change only if your Drive path differs
DATA_DIR = "/content/drive/MyDrive/project_two"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import standard libraries and format-specific loaders (PDF, DOCX, PPTX)


In [3]:
from pathlib import Path
from typing import List
import re

from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from pptx import Presentation  # python-pptx

## Split loaded documents into manageable chunks with overlap for embeddings ---


In [4]:

def load_pdf(path: Path) -> List[Document]:
    # PyPDFLoader returns one Document per page (nice metadata: "page")
    loader = PyPDFLoader(str(path))
    docs = loader.load()
    for d in docs:
        d.metadata.update({
            "source": str(path),
            "filename": path.name,
            "ext": path.suffix.lower(),
            "week": path.parent.name  # e.g., "Week 1"
        })
    return docs

def load_docx(path: Path) -> List[Document]:
    loader = Docx2txtLoader(str(path))
    docs = loader.load()
    # Docx2txtLoader returns a single Document
    for d in docs:
        d.metadata.update({
            "source": str(path),
            "filename": path.name,
            "ext": path.suffix.lower(),
            "week": path.parent.name
        })
    return docs

def load_pptx(path: Path) -> List[Document]:
    prs = Presentation(str(path))
    docs = []
    for i, slide in enumerate(prs.slides, start=1):
        # collect all text on the slide
        chunks = []
        for shape in slide.shapes:
            if hasattr(shape, "has_text_frame") and shape.has_text_frame:
                txt = "\n".join(p.text for p in shape.text_frame.paragraphs if p.text)
                if txt.strip():
                    chunks.append(txt.strip())
        slide_text = "\n".join(chunks).strip()
        if slide_text:
            docs.append(
                Document(
                    page_content=slide_text,
                    metadata={
                        "source": str(path),
                        "filename": path.name,
                        "ext": path.suffix.lower(),
                        "slide": i,
                        "week": path.parent.name
                    }
                )
            )
    return docs

def load_all(base_dir: str) -> List[Document]:
    base = Path(base_dir)
    all_docs: List[Document] = []
    for p in base.rglob("*"):
        if not p.is_file():
            continue
        ext = p.suffix.lower()
        try:
            if ext == ".pdf":
                all_docs += load_pdf(p)
            elif ext == ".docx":
                all_docs += load_docx(p)
            elif ext == ".pptx":
                all_docs += load_pptx(p)
            # ignore other file types
        except Exception as e:
            print(f"[skip] {p.name}: {e}")
    return all_docs

docs = load_all(DATA_DIR)

# optional: filter empty/tiny pages
docs = [d for d in docs if len(d.page_content.split()) > 10]

print(f"Loaded {len(docs)} Documents")
# quick peek
for d in docs[:5]:
    print(d.metadata, "→", d.page_content[:120].replace("\n"," "), "…")


Loaded 135 Documents
{'source': '/content/drive/MyDrive/project_two/Week 2/Lecture 2.pptx', 'filename': 'Lecture 2.pptx', 'ext': '.pptx', 'slide': 1, 'week': 'Week 2'} → Machine Vision CHC6781 Lecture 2: Fundamental of Image Classification: A core task in computer vision Module Leader: Dr  …
{'source': '/content/drive/MyDrive/project_two/Week 2/Lecture 2.pptx', 'filename': 'Lecture 2.pptx', 'ext': '.pptx', 'slide': 2, 'week': 'Week 2'} → Class Rules Be on time Bring your stuff (Pen, papers, folder, brain) No pressing of phone, or gaming in the class.  Habi …
{'source': '/content/drive/MyDrive/project_two/Week 2/Lecture 2.pptx', 'filename': 'Lecture 2.pptx', 'ext': '.pptx', 'slide': 3, 'week': 'Week 2'} → Learning Outcomes Understand the  basic building block for more complex computer vision tasks By the end of this topic,  …
{'source': '/content/drive/MyDrive/project_two/Week 2/Lecture 2.pptx', 'filename': 'Lecture 2.pptx', 'ext': '.pptx', 'slide': 4, 'week': 'Week 2'} → References 1. 

## Split Documents into Chunks
## Encode chunks into semantic embeddings and store them in ChromaDB ---


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_documents(docs)
print(f"Chunks: {len(chunks)}")

# local, free embeddings (fast on Pro GPU)
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

device = "cuda" if torch.cuda.is_available() else "cpu"
emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device}
)

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=emb,
    collection_name="project_two",
    persist_directory="chroma_db"
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
print("Chroma index ready.")


Chunks: 153


  emb = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Chroma index ready.


## --- Define function to retrieve context and generate grounded answers with LLM ---


In [9]:
from google.colab import ai

def ask(question, k=5, max_chars=1200):
    ctx_docs = retriever.get_relevant_documents(question)
    context = "\n\n".join(
        f"[{i+1}] {d.metadata.get('filename')} ({d.metadata.get('week')})\n{d.page_content[:max_chars]}"
        for i, d in enumerate(ctx_docs)
    )
    prompt = (
        "You are a helpful assistant. Answer ONLY from the context.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
    )
    return ai.generate_text(prompt), ctx_docs

answer, sources = ask("Then who is Happy?")
print(answer)
print("\nSources:")
for s in sources:
    print("-", s.metadata.get("filename"), "|", s.metadata.get("week"))


Happy N. Monday is the Module Leader.


Sources:
- Lecture 2.pptx | Week 2
- Lecture 1.pptx | Week 1
- week2.pptx.pdf | Week 2
- Lecture 2.pptx | Week 2
- Lecture 1.pptx | Week 1
