In [9]:
import os
import fitz
import re
from ncert_constants import NCERT_FILE, PAGE_DELIMITER

In [3]:
def clean_text(text: str) -> str:
    """Strip boilerplate from one page of extracted text and flatten it to a single line.

    Steps, in order:
      1. Remove running headers/footers ("Reprint 20xx-yy", "Chapter N", "Page N").
      2. Remove figure captions, lines starting at "Types of ...", and the rest of
         any line from the word "diagram" onwards.
      3. Remove single-letter parenthesised labels such as "(a)" or "(B)".
      4. Re-join words that were hyphenated across a line break.
      5. Normalise en/em dashes to a plain hyphen.
      6. Collapse newlines and runs of whitespace into single spaces.

    Args:
        text: Raw text of a single page.

    Returns:
        The cleaned text with leading/trailing whitespace removed; an empty
        string if nothing is left.
    """
    # --- Remove headers/footers ---
    text = re.sub(r"Reprint\s*20\d{2}-\d{2}", "", text)
    text = re.sub(r"CHAPTER\s*\d+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Page\s*\d+", "", text, flags=re.IGNORECASE)

    # --- Remove figure captions and labels (line-based, so run before newlines are collapsed) ---
    text = re.sub(r"Figure\s*\d+(\.\d+)*[^\n]*", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Types of [^\n]*\n?", "", text)  # e.g. "Types of aestivation..."
    text = re.sub(r"\bdiagram\b[^\n]*", "", text, flags=re.IGNORECASE)

    # --- Remove loose labels like (a), (b), (C), etc. ---
    text = re.sub(r"\([A-Za-z]\)", "", text)

    # --- Join words hyphenated across a line break ---
    # Only a hyphen that directly follows a letter, ends its line, and is continued
    # by a lowercase letter is treated as a line-break hyphen. Hyphens and dashes
    # inside a line ("6 - 7", "A - B") are left alone; stripping every "-<space>"
    # would delete them and can fuse neighbouring numbers.
    text = re.sub(r"(?<=[A-Za-z])-[ \t]*\n\s*(?=[a-z])", "", text)

    # --- Normalise dashes ---
    # Real Unicode en/em dashes first, then the mis-decoded sequences the
    # original code looked for (kept for backward compatibility).
    for dash in ("\u2013", "\u2014", "â€“", "â€”"):
        text = text.replace(dash, "-")

    # --- Collapse newlines and repeated whitespace ---
    text = re.sub(r"\n+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text.strip()

In [4]:
pdf_folder = "ncert_book"

# Every entry in the folder whose name ends in ".pdf" (case-insensitive),
# as a path joined onto pdf_folder, in sorted order.
pdf_files = sorted(
    os.path.join(pdf_folder, entry)
    for entry in os.listdir(pdf_folder)
    if entry.lower().endswith(".pdf")
)

print(f"ðŸ“‚ Found {len(pdf_files)} PDF files in '{pdf_folder}'")

# Cleaned text of each page, across all PDFs, in extraction order.
# Pages that clean down to nothing are not stored.
all_pages = []

for pdf_path in pdf_files:
    print(f"ðŸ“˜ Extracting {pdf_path} with PyMuPDF...")

    with fitz.open(pdf_path) as doc:
        for page in doc:
            # NOTE(review): flags=1 is presumably PyMuPDF's "preserve ligatures"
            # bit only — confirm against the installed PyMuPDF version.
            cleaned = clean_text(page.get_text("text", flags=1))
            if not cleaned.strip():
                continue
            all_pages.append(cleaned)

print(f"âœ… Extraction complete â€” total pages processed: {len(all_pages)}")


ðŸ“‚ Found 32 PDF files in 'ncert_book'
ðŸ“˜ Extracting ncert_book\kebo101.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo102.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo103.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo104.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo105.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo106.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo107.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo108.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo109.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo110.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo111.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo112.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo113.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo114.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo115.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo116.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_book\kebo117.pdf with PyMuPDF...
ðŸ“˜ Extracting ncert_boo

In [10]:
# Write all cleaned pages to NCERT_FILE as UTF-8, with PAGE_DELIMITER between
# consecutive pages. Any existing file at that path is overwritten.
corpus = PAGE_DELIMITER.join(all_pages)
with open(NCERT_FILE, "w", encoding="utf-8") as f:
    f.write(corpus)