### Setup

In [2]:
# %pip installs into the environment of the running kernel (a bare !pip may target another Python).
# The version is pinned to the one this notebook was last run with.
%pip install pymupdf==1.27.1


Collecting pymupdf
  Downloading pymupdf-1.27.1-cp310-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.27.1-cp310-abi3-win_amd64.whl (19.2 MB)
   ---------------------------------------- 0.0/19.2 MB ? eta -:--:--
    --------------------------------------- 0.3/19.2 MB ? eta -:--:--
    --------------------------------------- 0.3/19.2 MB ? eta -:--:--
   - -------------------------------------- 0.8/19.2 MB 1.2 MB/s eta 0:00:16
   - -------------------------------------- 0.8/19.2 MB 1.2 MB/s eta 0:00:16
   -- ------------------------------------- 1.3/19.2 MB 1.4 MB/s eta 0:00:13
   --- ------------------------------------ 1.8/19.2 MB 1.6 MB/s eta 0:00:11
   ---- ----------------------------------- 2.1/19.2 MB 1.7 MB/s eta 0:00:11
   ---- ----------------------------------- 2.4/19.2 MB 1.6 MB/s eta 0:00:11
   ----- ---------------------------------- 2.9/19.2 MB 1.6 MB/s eta 0:00:11
   ------- -------------------------------- 3.4/19.2 MB 1.7 MB/s eta 0:00:10
   ------- ----------

In [1]:
import os
import fitz  # PyMuPDF
from tqdm import tqdm

from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

from langchain_groq import ChatGroq
from langchain import hub
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain


  from .autonotebook import tqdm as notebook_tqdm


### 1) Extract text + images from a PDF

This produces:

- page text as `Document` objects
- images saved to disk, plus “image documents” (for now each one stores a placeholder description; you can upgrade to OCR later)

In [2]:
def extract_text_and_images(pdf_path, image_dir="../extracted_images"):
    """Extract per-page text and embedded images from a PDF.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        image_dir: Directory the extracted images are written to; it is
            created if it does not exist.

    Returns:
        A ``(text_docs, image_docs)`` tuple of lists of ``Document``:
        ``text_docs`` holds one entry per page whose stripped text is
        non-empty; ``image_docs`` holds one placeholder entry per image
        written to disk, with the file location under
        ``metadata["image_path"]``. Page numbers in the metadata are the
        zero-based page indices.
    """
    os.makedirs(image_dir, exist_ok=True)

    # Drop only a trailing ".pdf" extension (in any letter case). A plain
    # str.replace would also cut ".pdf" out of the middle of a name and would
    # leave an upper-case ".PDF" in place.
    stem, suffix = os.path.splitext(os.path.basename(pdf_path))
    pdf_stem = stem if suffix.lower() == ".pdf" else stem + suffix

    text_docs = []
    image_docs = []

    # The with-block closes the PDF handle even if extraction fails part-way.
    with fitz.open(pdf_path) as doc:
        for page_idx in range(len(doc)):
            page = doc[page_idx]

            # ---- Text ----
            text = page.get_text("text").strip()
            if text:
                text_docs.append(Document(
                    page_content=text,
                    metadata={"source": pdf_path, "page": page_idx, "type": "text"}
                ))

            # ---- Images ----
            for img_i, img in enumerate(page.get_images(full=True)):
                # The first field of each image entry is the xref handed to extract_image.
                base = doc.extract_image(img[0])
                ext = base.get("ext", "png")

                img_filename = f"{pdf_stem}_p{page_idx}_img{img_i}.{ext}"
                img_path = os.path.join(image_dir, img_filename)

                with open(img_path, "wb") as f:
                    f.write(base["image"])

                # For now: store a retrievable "image doc" as text (metadata points to the file)
                image_docs.append(Document(
                    page_content=f"[IMAGE] File: {img_filename}. Page: {page_idx}. (No caption/OCR yet.)",
                    metadata={"source": pdf_path, "page": page_idx, "type": "image", "image_path": img_path}
                ))

    return text_docs, image_docs


### 2) Chunk the text docs (images don’t need chunking)

In [3]:
def chunk_text_docs(text_docs, chunk_size=1000, chunk_overlap=150):
    """Split the given documents with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter; the result of its ``split_documents`` call is returned.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(text_docs)


### 3) Embed + store in FAISS

In [4]:
def build_vectorstore(docs, db_path="../vector_databases/vector_db_multimodal"):
    """Embed ``docs`` into a FAISS store, save it under ``db_path`` and return it.

    The embeddings come from the ``sentence-transformers/all-mpnet-base-v2``
    model with ``normalize_embeddings`` enabled, and the store is created with
    the ``DistanceStrategy.COSINE`` distance strategy.
    """
    encoder = HuggingFaceEmbeddings(
        encode_kwargs={"normalize_embeddings": True},
        model_name="sentence-transformers/all-mpnet-base-v2",
    )
    index = FAISS.from_documents(docs, encoder, distance_strategy=DistanceStrategy.COSINE)
    # Write the index to disk before handing it back to the caller.
    index.save_local(db_path)
    return index


### 4) Retrieval + generation chain (Groq LLM)

In [5]:
def build_rag_chain(vectorstore, llm):
    """Assemble a retrieval chain from ``vectorstore`` and ``llm``.

    The retriever is created from the vector store with ``k=5`` in its search
    kwargs. The document-combining chain uses ``llm`` together with the
    ``langchain-ai/retrieval-qa-chat`` prompt pulled from the hub.
    """
    top_k_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    combine_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
    return create_retrieval_chain(retriever=top_k_retriever, combine_docs_chain=combine_chain)


### 5) Run it end to end

In [9]:
import requests

import os

url = "https://www.congress.gov/crs_external_products/R/PDF/R43419/R43419.98.pdf"
local_path = "../documents/crs_external_products.pdf"

# Create the target folder first so open() below cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(local_path), exist_ok=True)

# A timeout keeps a stalled connection from blocking the kernel indefinitely.
r = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page under a .pdf name.
r.raise_for_status()

with open(local_path, "wb") as f:
    f.write(r.content)
print("Saved:", local_path)

Saved: ../documents/crs_external_products.pdf


In [10]:
# Run the whole pipeline on the downloaded PDF: extract -> chunk -> index -> ask.
pdf_path = "../documents/crs_external_products.pdf"
# text_docs: one Document per page with non-empty text.
# image_docs: one placeholder Document per image written to disk.
text_docs, image_docs = extract_text_and_images(pdf_path)

# Only the page text is split; the image placeholders are indexed as they are.
text_chunks = chunk_text_docs(text_docs)
all_docs = text_chunks + image_docs

# build_vectorstore also saves the index under the given db_path.
vectorstore = build_vectorstore(all_docs, db_path="../vector_databases/vector_db_multimodal_v1")

llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

rag_chain = build_rag_chain(vectorstore, llm)

# The generated text is read from the "answer" key of the chain result.
out = rag_chain.invoke({"input": "Summarize the document and mention any figures/images you saw."})
print(out["answer"])

The document appears to be a table or chart comparing NASA's appropriations for various years, specifically for the Construction and EC&R (Engineering, Construction, and Research) and Inspector General categories. The table shows the appropriations for FY2023, FY2024 (requested), and FY2024 (House Introduced and Senate Committee versions).

I saw the following figures/images:

- A series of numbers (3,369, 3,136, 3,100) at the top of the table, but their context is unclear.
- A table with the following categories:
  - Construction and EC&R
  - Inspector General
  - Total
- The table shows the appropriations for FY2023, FY2024 (requested), and FY2024 (House Introduced and Senate Committee versions).
- There are also references to images (crs_external_products_p6_img0.png, crs_external_products_p0_img1.png, crs_external_products_p0_img0.png) but no captions or OCR (Optical Character Recognition) text is available for these images.

The document also mentions sources for the data, includi

In [11]:
# Same chain as above, this time asking for the answer in Hebrew.
out = rag_chain.invoke({"input": "what is this document about? answer in hebrew"})
print(out["answer"])

אני לא בטוח שאני יכול לספק תשובה מדויקת, אך נראה שהדוקומנט הזה עוסק בתקציב של נאס"א (NASA) לשנים שונות, ומפרט את התקציבים השונים לתחומים שונים, כגון: 

* אסטרופיזיקה
* תחנת החלל הבינלאומית
* חקר החלל
* פיתוח וייצור
* תקציבי חירום

בסך הכל, הדוקומנט נראה שהוא תיעוד של התקציבים של נאס"א לשנים שונות, ומספק פרטים על התקציבים השונים לתחומים שונים.

תרגום: 
"אני לא בטוח" - "I'm not sure"
"נאס"א" - "NASA"
"תקציב" - "budget"
"תחומים" - "fields"
"חירום" - "emergency"


In [12]:
# %pip installs into the environment of the running kernel (a bare !pip may target another Python).
# pytesseract is pinned to the version this notebook was last run with.
%pip install pytesseract==0.3.13 pillow


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [13]:
import pytesseract
from PIL import Image
import os

# Prefer a tesseract binary found on PATH so the notebook also runs on other
# machines; fall back to the default Windows install location otherwise.
import shutil

pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)


In [14]:
import os
# os.listdir returns names in arbitrary order; sort so the preview is stable across runs.
sorted(os.listdir("../extracted_images"))[:30]

['crs_external_products_p0_img0.png',
 'crs_external_products_p0_img1.png',
 'crs_external_products_p6_img0.png',
 'pilates_mat_basic_p0_img0.png',
 'pilates_mat_basic_p0_img1.png',
 'pilates_mat_basic_p0_img10.png',
 'pilates_mat_basic_p0_img100.png',
 'pilates_mat_basic_p0_img101.png',
 'pilates_mat_basic_p0_img11.png',
 'pilates_mat_basic_p0_img12.png',
 'pilates_mat_basic_p0_img13.png',
 'pilates_mat_basic_p0_img14.png',
 'pilates_mat_basic_p0_img15.png',
 'pilates_mat_basic_p0_img16.png',
 'pilates_mat_basic_p0_img17.png',
 'pilates_mat_basic_p0_img18.png',
 'pilates_mat_basic_p0_img19.png',
 'pilates_mat_basic_p0_img2.png',
 'pilates_mat_basic_p0_img20.png',
 'pilates_mat_basic_p0_img21.png',
 'pilates_mat_basic_p0_img22.png',
 'pilates_mat_basic_p0_img23.png',
 'pilates_mat_basic_p0_img24.png',
 'pilates_mat_basic_p0_img25.png',
 'pilates_mat_basic_p0_img26.png',
 'pilates_mat_basic_p0_img27.png',
 'pilates_mat_basic_p0_img28.png',
 'pilates_mat_basic_p0_img29.png',
 'pilates_ma

In [15]:
import pytesseract
from PIL import Image

# Prefer a tesseract binary found on PATH so the notebook also runs on other
# machines; fall back to the default Windows install location otherwise.
import shutil

pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

img_path = "../extracted_images/crs_external_products_p6_img0.png"

# Open the image in a with-block so its file handle is released right after OCR.
with Image.open(img_path) as img:
    text = pytesseract.image_to_string(img)
# Show only the first 2500 characters of the OCR result.
print(text[:2500])


2023 Dollars

Current Dollars

40

35

30

ra) ° i)
N a a

(suoying $) Asouiny y88png

10

raara
07207
8107
9T0Z
vtoz
casera
OTOz
8007
9007
007
7007
0007
866T
966T
v66T
766T
O66T
886T
986T
v86T
786T
O86T
8Z46T
9Z6T
vet
tL6T
OL6T
896T
996T
v96T
796T
O96T
8S6T




### 1) Extraction (text + images + OCR)

In [31]:
import os
import fitz
import pytesseract
from PIL import Image
from langchain.schema import Document

# Prefer a tesseract binary found on PATH so the notebook also runs on other
# machines; fall back to the default Windows install location otherwise.
import shutil

pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

def extract_text_and_ocr_images(pdf_path, image_dir="../extracted_images_fairytale"):
    """Extract per-page text from a PDF and OCR every embedded image.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        image_dir: Directory the extracted images are written to; it is
            created if it does not exist.

    Returns:
        A ``(text_docs, ocr_image_docs)`` tuple of lists of ``Document``:
        ``text_docs`` holds one entry per page whose stripped text is
        non-empty; ``ocr_image_docs`` holds one entry per image written to
        disk, whose content is ``[IMAGE_OCR]`` followed by the stripped OCR
        text, or by a fixed fallback sentence when OCR yields nothing. Page
        numbers in the metadata are the zero-based page indices.
    """
    os.makedirs(image_dir, exist_ok=True)

    # Drop only a trailing ".pdf" extension (in any letter case). A plain
    # str.replace would also cut ".pdf" out of the middle of a name and would
    # leave an upper-case ".PDF" in place.
    stem, suffix = os.path.splitext(os.path.basename(pdf_path))
    pdf_stem = stem if suffix.lower() == ".pdf" else stem + suffix

    text_docs = []
    ocr_image_docs = []

    # The with-block closes the PDF handle even if extraction fails part-way.
    with fitz.open(pdf_path) as pdf:
        for page_idx in range(len(pdf)):
            page = pdf[page_idx]

            # ---- Text ----
            text = page.get_text("text").strip()
            if text:
                text_docs.append(Document(
                    page_content=text,
                    metadata={"source": pdf_path, "page": page_idx, "type": "text"}
                ))

            # ---- Images + OCR ----
            for img_i, img in enumerate(page.get_images(full=True)):
                # The first field of each image entry is the xref handed to extract_image.
                base = pdf.extract_image(img[0])
                ext = base.get("ext", "png")

                img_filename = f"{pdf_stem}_p{page_idx}_img{img_i}.{ext}"
                img_path = os.path.join(image_dir, img_filename)

                with open(img_path, "wb") as f:
                    f.write(base["image"])

                # Close the image handle as soon as OCR is done; a PDF with
                # many images would otherwise keep one open file per image.
                with Image.open(img_path) as image:
                    ocr_text = pytesseract.image_to_string(image).strip()
                content = ocr_text if ocr_text else "Illustration detected (no readable text via OCR)."

                ocr_image_docs.append(Document(
                    page_content=f"[IMAGE_OCR]\n{content}",
                    metadata={"source": pdf_path, "page": page_idx, "type": "image_ocr", "image_path": img_path}
                ))

    return text_docs, ocr_image_docs


### 2) Chunking + Vectorstore

In [32]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

def chunk_docs(docs, chunk_size=1000, chunk_overlap=150):
    """Split ``docs`` with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter; the result of its ``split_documents`` call is returned.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

def build_vectorstore(docs, db_path="../vector_databases/vector_db_fairytale_mmocr"):
    """Embed ``docs`` into a FAISS store, save it under ``db_path`` and return it.

    The embeddings come from the ``sentence-transformers/all-mpnet-base-v2``
    model with ``normalize_embeddings`` enabled, and the store is created with
    the ``DistanceStrategy.COSINE`` distance strategy.
    """
    encoder = HuggingFaceEmbeddings(
        encode_kwargs={"normalize_embeddings": True},
        model_name="sentence-transformers/all-mpnet-base-v2",
    )
    index = FAISS.from_documents(
        documents=docs,
        embedding=encoder,
        distance_strategy=DistanceStrategy.COSINE,
    )
    # Write the index to disk before handing it back to the caller.
    index.save_local(db_path)
    return index


### 3) RAG chain (retrieval + Groq generation)

In [33]:
from langchain_groq import ChatGroq
from langchain import hub
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

def build_rag_chain(vectorstore, llm=None):
    """Assemble a retrieval chain on top of ``vectorstore``.

    Args:
        vectorstore: Store whose ``as_retriever`` is called with ``k=5`` in
            its search kwargs.
        llm: Optional chat model. When omitted, a ``ChatGroq`` model
            (``llama-3.1-8b-instant``, ``temperature=0``) is created, which is
            what the one-argument form has always done. Accepting it here
            keeps two-argument calls such as ``build_rag_chain(vectorstore, llm)``
            working after this definition replaces an earlier one in the kernel.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    if llm is None:
        llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

    stuff_chain = create_stuff_documents_chain(llm=llm, prompt=hub.pull("langchain-ai/retrieval-qa-chat"))
    return create_retrieval_chain(retriever=retriever, combine_docs_chain=stuff_chain)

### 4) Run it

In [34]:
# Run the OCR-enabled pipeline: extract -> chunk -> index -> ask.
pdf_path = "../documents/The_Frog-King.pdf"  # replace with the path to your own PDF
text_docs, ocr_docs = extract_text_and_ocr_images(pdf_path)

# Only the page text is split; the image OCR documents are indexed as they are.
text_chunks = chunk_docs(text_docs)
all_docs = text_chunks + ocr_docs

# No db_path is passed, so the default of the build_vectorstore definition
# that was executed most recently in the kernel applies.
vs = build_vectorstore(all_docs)
rag_chain = build_rag_chain(vs)

# The generated text is read from the "answer" key of the chain result.
out = rag_chain.invoke({"input": "Summarize the story and describe what the illustration shows."})
print(out["answer"])

**Summary:**

The story is about a king's daughter who loses her golden ball into a well in the forest. A frog helps her retrieve the ball, and in return, she promises to marry him if he helps her. However, the frog is actually a prince who has been cursed by a witch. The princess's kindness and love break the curse, and the frog transforms back into a prince. They get married, and the prince's servant, Faithful Henry, is overjoyed to see his master happy and free.

**Illustration:**

Unfortunately, there is no specific illustration provided in the given context. However, based on the story, a possible illustration could show the princess and the prince (formerly a frog) sitting together, smiling, and holding hands. The illustration might also depict Faithful Henry in the background, looking happy and relieved. Alternatively, it could show the princess and the prince in their carriage, with Faithful Henry standing behind them, as described in the story.


In [35]:
# Re-run the extraction on its own to check how many documents it yields.
pdf_path = "../documents/The_Frog-King.pdf"
text_docs, ocr_docs = extract_text_and_ocr_images(pdf_path)
# Displayed as (number of page-text documents, number of image OCR documents).
len(text_docs), len(ocr_docs)

(3, 2)

In [36]:
# Ask about the illustrations, then show which documents the chain returned as context.
out = rag_chain.invoke(
    {"input": "What illustrations are included and on which pages? Use ONLY the provided context."}
)

print(out["answer"])

# For each context document: separator line, metadata, then the first 300 characters.
for ctx_doc in out["context"]:
    print("\n--- CONTEXT DOC ---", ctx_doc.metadata, ctx_doc.page_content[:300], sep="\n")

There are two illustrations detected, but no readable text via OCR.

--- CONTEXT DOC ---
{'source': '../documents/The_Frog-King.pdf', 'page': 3, 'type': 'text'}
But when she was in bed he crept to her and said, "I am tired, I want to sleep as well as thou, lift me
up or I will tell thy father." Then she was terribly angry, and took him up and threw him with all her
might against the wall. "Now, thou wilt be quiet, odious frog," said she. But when he fell do

--- CONTEXT DOC ---
{'source': '../documents/The_Frog-King.pdf', 'page': 0, 'type': 'image_ocr', 'image_path': '../extracted_images_fairytale\\The_Frog-King_p0_img0.jpeg'}
[IMAGE_OCR]
Illustration detected (no readable text via OCR).

--- CONTEXT DOC ---
{'source': '../documents/The_Frog-King.pdf', 'page': 2, 'type': 'image_ocr', 'image_path': '../extracted_images_fairytale\\The_Frog-King_p2_img0.jpeg'}
[IMAGE_OCR]
Illustration detected (no readable text via OCR).

--- CONTEXT DOC ---
{'source': '../documents/The_Frog-King.pdf', 'p

In [26]:
import fitz

pdf_path = "../documents/The_Frog-King.pdf"

# Open the PDF in a with-block so the file handle is closed once counting is
# done, instead of staying open for the rest of the kernel session.
with fitz.open(pdf_path) as doc:
    for p in range(len(doc)):
        imgs = doc[p].get_images(full=True)
        print(f"Page {p}: {len(imgs)} embedded images")



Page 0: 1 embedded images
Page 1: 0 embedded images
Page 2: 1 embedded images
Page 3: 0 embedded images
