# IEEE PDF → LLM-ready chunks (text, tables, figures)

Run these cells top-to-bottom. They will:
- Install lightweight deps
- Extract two-column text in correct reading order
- Extract tables to CSV + Markdown
- Extract figures and link nearby captions
- Build structured chunks with rich metadata for RAG



In [2]:
! pip install --quiet pymupdf pdfplumber unstructured[all-docs] camelot-py ghostscript opencv-python-headless pandas numpy pillow rapidfuzz unidecode python-pptx tqdm


In [3]:
import os
from pathlib import Path

PROJECT_DIR = Path(r"C:\Users\SANJANA\OneDrive\Desktop\random_project")
PDF_PATH = PROJECT_DIR / "ieee_paper.pdf"
OUTPUT_DIR = PROJECT_DIR / "out"
IMG_DIR = OUTPUT_DIR / "figures"
TABLE_DIR = OUTPUT_DIR / "tables"
CHUNKS_PATH = OUTPUT_DIR / "chunks.jsonl"

for p in [OUTPUT_DIR, IMG_DIR, TABLE_DIR]:
    p.mkdir(parents=True, exist_ok=True)

print(f"PDF exists: {PDF_PATH.exists()} -> {PDF_PATH}")
print(f"Outputs -> {OUTPUT_DIR}")


PDF exists: True -> C:\Users\SANJANA\OneDrive\Desktop\random_project\ieee_paper.pdf
Outputs -> C:\Users\SANJANA\OneDrive\Desktop\random_project\out


In [4]:
import fitz  # PyMuPDF
import pdfplumber
import re
import json
from dataclasses import dataclass, asdict
from typing import List, Dict, Tuple, Optional
from tqdm import tqdm
from unidecode import unidecode

@dataclass
class Block:
    block_id: str
    page: int
    bbox: Tuple[float, float, float, float]
    type: str  # paragraph | header | caption | table | figure | equation
    text: str
    meta: Dict


def dehyphenate(text: str) -> str:
    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
    text = text.replace("\n", " ")
    return re.sub(r"\s+", " ", text).strip()


def guess_block_type(text: str) -> str:
    t = text.strip()
    if len(t) < 3:
        return "paragraph"
    if re.match(r"^(abstract|introduction|related work|methods?|experiments?|results?|discussion|conclusion)s?$", t.lower()):
        return "header"
    if re.match(r"^(fig\.|figure)\s*\d+", t.lower()):
        return "caption"
    if re.match(r"^(table)\s*[ivx\d]+", t.lower()):
        return "caption"
    return "paragraph"


def page_columns_median_x(words: List[Dict]) -> Optional[float]:
    if not words:
        return None
    xs = sorted([(w["x0"], w["x1"]) for w in words])
    # heuristic: find gap between columns by largest horizontal gap in word starts
    starts = sorted([x0 for x0, _ in xs])
    gaps = [(b - a, a, b) for a, b in zip(starts, starts[1:])]
    if not gaps:
        return None
    largest = max(gaps, key=lambda g: g[0])
    _, left_start, right_start = largest
    return (left_start + right_start) / 2


def extract_reading_order_blocks(pdf_path: Path) -> List[Block]:
    blocks: List[Block] = []
    with pdfplumber.open(str(pdf_path)) as pdf:
        for page_idx, page in enumerate(pdf.pages, start=1):
            words = page.extract_words(x_tolerance=2, y_tolerance=2, keep_blank_chars=False, use_text_flow=True)
            if not words:
                continue
            mid_x = page_columns_median_x(words)
            if mid_x:
                left_words = [w for w in words if (w["x1"] + w["x0"]) / 2 <= mid_x]
                right_words = [w for w in words if (w["x1"] + w["x0"]) / 2 > mid_x]
                # sort each column by y then x
                left_words.sort(key=lambda w: (w["top"], w["x0"]))
                right_words.sort(key=lambda w: (w["top"], w["x0"]))
                col_groups = [left_words, right_words]
            else:
                words.sort(key=lambda w: (w["top"], w["x0"]))
                col_groups = [words]

            block_counter = 0
            for col in col_groups:
                if not col:
                    continue
                # group contiguous words into lines by proximity
                lines = []
                current = [col[0]]
                for w in col[1:]:
                    prev = current[-1]
                    if abs(w["top"] - prev["top"]) < 3 and w["x0"] - prev["x1"] < 20:
                        current.append(w)
                    else:
                        lines.append(current)
                        current = [w]
                lines.append(current)

                # group lines into paragraphs by vertical gaps
                paragraphs = []
                cur_lines = [lines[0]]
                for ln in lines[1:]:
                    prev_ln = cur_lines[-1]
                    if ln[0]["top"] - prev_ln[0]["top"] < 14:  # heuristic line spacing
                        cur_lines.append(ln)
                    else:
                        paragraphs.append(cur_lines)
                        cur_lines = [ln]
                paragraphs.append(cur_lines)

                for para in paragraphs:
                    text = "\n".join([" ".join([w["text"] for w in ln]) for ln in para])
                    text = unidecode(text)
                    text = dehyphenate(text)
                    x0 = min(w["x0"] for ln in para for w in ln)
                    y0 = min(w["top"] for ln in para for w in ln)
                    x1 = max(w["x1"] for ln in para for w in ln)
                    y1 = max(w["bottom"] for ln in para for w in ln)
                    btype = guess_block_type(text)
                    block_id = f"p{page_idx}_b{block_counter}"
                    blocks.append(Block(
                        block_id=block_id,
                        page=page_idx,
                        bbox=(x0, y0, x1, y1),
                        type=btype,
                        text=text,
                        meta={}
                    ))
                    block_counter += 1
    return blocks


def link_captions(blocks: List[Block]) -> None:
    # Link captions to nearest preceding or following non-caption block on same page
    by_page: Dict[int, List[Block]] = {}
    for b in blocks:
        by_page.setdefault(b.page, []).append(b)
    for page, items in by_page.items():
        items.sort(key=lambda b: (b.bbox[1], b.bbox[0]))  # top then left
        last_content: Optional[Block] = None
        for b in items:
            if b.type != "caption":
                last_content = b
            else:
                # caption: attach link to last_content if any
                if last_content:
                    b.meta["linked_block_id"] = last_content.block_id
                    last_content.meta.setdefault("captions", []).append(b.block_id)


def save_jsonl(blocks: List[Block], path: Path):
    with path.open("w", encoding="utf-8") as f:
        for b in blocks:
            rec = asdict(b)
            json.dump(rec, f, ensure_ascii=False)
            f.write("\n")

print("Helpers loaded.")


Helpers loaded.


In [5]:
import camelot
import pandas as pd


def extract_tables(pdf_path: Path, out_dir: Path) -> pd.DataFrame:
    records = []
    try:
        tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='lattice')
    except Exception:
        tables = camelot.read_pdf(str(pdf_path), pages='all', flavor='stream')
    for i, t in enumerate(tables):
        df = t.df
        csv_path = out_dir / f"table_{t.page}_{i}.csv"
        md_path = out_dir / f"table_{t.page}_{i}.md"
        df.to_csv(csv_path, index=False)
        # simple markdown rendering
        md = "| " + " | ".join(df.columns.astype(str)) + " |\n"
        md += "| " + " | ".join(["---"] * len(df.columns)) + " |\n"
        for _, row in df.iterrows():
            md += "| " + " | ".join(row.astype(str).tolist()) + " |\n"
        md_path.write_text(md, encoding='utf-8')
        records.append({
            "page": t.page,
            "index": i,
            "csv": str(csv_path),
            "markdown": str(md_path),
            "shape": df.shape,
            "preview": df.head(3).to_dict(orient='list')
        })
    return pd.DataFrame(records)

print("Table extractor ready.")


Table extractor ready.


In [6]:
from PIL import Image
import io


def extract_figures_with_captions(pdf_path: Path, img_dir: Path) -> List[Dict]:
    results = []
    doc = fitz.open(str(pdf_path))
    for pno in range(len(doc)):
        page = doc[pno]
        image_list = page.get_images(full=True)
        # naive caption guess: find blocks containing 'Fig' near image bbox using pdfplumber text boxes
        with pdfplumber.open(str(pdf_path)) as pdf:
            p = pdf.pages[pno]
            words = p.extract_words()
            text_blocks = p.extract_text(layout=True) or ""
        for idx, img in enumerate(image_list):
            xref = img[0]
            base = doc.extract_image(xref)
            ext = base.get("ext", "png")
            image_bytes = base["image"]
            img_path = img_dir / f"figure_p{pno+1}_{idx}.{ext}"
            with open(img_path, "wb") as f:
                f.write(image_bytes)
            caption = None
            # very simple caption heuristic
            if "fig" in text_blocks.lower():
                lines = [ln for ln in text_blocks.split("\n") if re.match(r"^(fig\.|figure)\s*\d+", ln.strip().lower())]
                caption = lines[0] if lines else None
            results.append({
                "page": pno + 1,
                "index": idx,
                "path": str(img_path),
                "caption": caption
            })
    return results

print("Figure extractor ready.")


Figure extractor ready.


In [7]:
# Run the pipeline

assert PDF_PATH.exists(), "Place ieee_paper.pdf in the project folder."

print("1) Extracting text blocks with reading order...")
blocks = extract_reading_order_blocks(PDF_PATH)
link_captions(blocks)
print(f"   -> {len(blocks)} blocks")

print("2) Extracting tables...")
tables_df = extract_tables(PDF_PATH, TABLE_DIR)
print(f"   -> {len(tables_df)} tables saved to {TABLE_DIR}")

print("3) Extracting figures + captions...")
figures = extract_figures_with_captions(PDF_PATH, IMG_DIR)
print(f"   -> {len(figures)} figures saved to {IMG_DIR}")

print("4) Saving chunks with metadata...")
for b in blocks:
    if b.type == "paragraph" and "captions" in b.meta:
        # attach captions text into meta for better context
        b.meta["captions_text"] = [c for c in b.meta["captions"]]

save_jsonl(blocks, CHUNKS_PATH)

print("Done. Artifacts:")
print(f" - Blocks JSONL: {CHUNKS_PATH}")
print(f" - Tables: {TABLE_DIR}")
print(f" - Figures: {IMG_DIR}")


1) Extracting text blocks with reading order...


Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa3' is an invalid float value
Cannot set gray non-stroke color because /'Pa4' is an invalid float value
Cannot set gray non-stroke color because /'Pa5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value


   -> 260 blocks
2) Extracting tables...


Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa3' is an invalid float value
Cannot set gray non-stroke color because /'Pa4' is an invalid float value
Cannot set gray non-stroke color because /'Pa5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value


   -> 11 tables saved to C:\Users\SANJANA\OneDrive\Desktop\random_project\out\tables
3) Extracting figures + captions...


Cannot set gray non-stroke color because /'Pa1' is an invalid float value
Cannot set gray non-stroke color because /'Pa2' is an invalid float value
Cannot set gray non-stroke color because /'Pa3' is an invalid float value
Cannot set gray non-stroke color because /'Pa4' is an invalid float value
Cannot set gray non-stroke color because /'Pa5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value


   -> 10 figures saved to C:\Users\SANJANA\OneDrive\Desktop\random_project\out\figures
4) Saving chunks with metadata...
Done. Artifacts:
 - Blocks JSONL: C:\Users\SANJANA\OneDrive\Desktop\random_project\out\chunks.jsonl
 - Tables: C:\Users\SANJANA\OneDrive\Desktop\random_project\out\tables
 - Figures: C:\Users\SANJANA\OneDrive\Desktop\random_project\out\figures


In [8]:
# Quick preview
import itertools
import json

print("Preview: first 5 blocks")
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for i, line in zip(range(5), f):
        rec = json.loads(line)
        print({k: rec[k] for k in ["block_id", "page", "type"]}, rec["text"][:120], "...")

print("\nAvailable tables:")
print(sorted([p.name for p in TABLE_DIR.glob("*.csv")])[:5])

print("\nAvailable figures:")
print(sorted([p.name for p in IMG_DIR.glob("*")])[:5])


Preview: first 5 blocks
{'block_id': 'p1_b0', 'page': 1, 'type': 'paragraph'} of ...
{'block_id': 'p1_b1', 'page': 1, 'type': 'paragraph'} (cid:129) ...
{'block_id': 'p1_b2', 'page': 1, 'type': 'paragraph'} (cid:129) ...
{'block_id': 'p1_b3', 'page': 1, 'type': 'paragraph'} (cid:129) ...
{'block_id': 'p1_b4', 'page': 1, 'type': 'paragraph'} (cid:129) ...

Available tables:
['table_10_4.csv', 'table_10_5.csv', 'table_14_6.csv', 'table_15_7.csv', 'table_18_8.csv']

Available figures:
['figure_p11_0.jpeg', 'figure_p13_0.jpeg', 'figure_p16_0.jpeg', 'figure_p17_0.jpeg', 'figure_p1_0.png']


In [11]:
# Print text of the first page
import pdfplumber

assert PDF_PATH.exists(), "Place ieee_paper.pdf in the project folder."

with pdfplumber.open(str(PDF_PATH)) as pdf:
    page = pdf.pages[8]
    text = page.extract_text(layout=True) or ""

print(text)


                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
                                                             
        