In [None]:
# Smoke test: confirm that every PDF/OCR dependency can be imported in this kernel.
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
print("OK ")  # reached only if all of the imports above succeeded

In [None]:
from pathlib import Path
import sys

# Make sure src/ is on the Python path.
# The notebook may be started from the project root or from notebooks/, so the
# root is resolved relative to the current working directory.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(PROJECT_ROOT / "src") not in sys.path:
    sys.path.append(str(PROJECT_ROOT / "src"))
from pdf_extraction import extract_pdf_text_smart

# Test sample PDF

In [None]:
from pathlib import Path

# Resolve the project root: step up one level when running from notebooks/.
if Path.cwd().name == "notebooks":
    PROJECT_ROOT = Path.cwd().parent
else:
    PROJECT_ROOT = Path.cwd()

# Digital (text-based) test file: sample2
pdf_path = PROJECT_ROOT.joinpath("data", "sample2.pdf")
(pdf_path, pdf_path.exists())

# Minimal PyMuPDF text extraction

In [None]:
import fitz

# Read every page as plain text, prefixing each with a page-break marker.
# The context manager closes the document even if extraction raises, and the
# pieces are joined once instead of growing a string with repeated "+=".
with fitz.open(pdf_path) as doc:
    all_text = "".join(
        "\n\n--- PAGE BREAK ---\n\n" + page.get_text("text")  # plain text mode
        for page in doc
    )
print(all_text[:1000])

# Page-wise structured extraction

In [None]:
import fitz

# Collect one record per page: 1-based page number, character count, raw text.
pages = []
# The context manager closes the document even if extraction raises.
with fitz.open(pdf_path) as doc:
    for page_number, page in enumerate(doc, start=1):
        text = page.get_text("text")
        pages.append({
            "page_number": page_number,
            "char_count": len(text),
            "text": text
        })
len(pages), pages[0]["page_number"], pages[0]["char_count"]

In [None]:
# Preview the first 500 characters of every extracted page

print(f"Total pages: {len(pages)}\n")
for p in pages:
    # One print call per page: header line, text preview, then a separator rule.
    print(
        f"=== PAGE {p['page_number']} | chars: {p['char_count']} ===",
        p["text"][:500],
        "\n" + "=" * 60 + "\n",
        sep="\n",
    )


In [None]:
# Heuristic: detect “probably scanned” pages

import numpy as np

# Pages whose directly extracted text is shorter than this are treated as
# likely scans. Named once so the filter and the printed label cannot drift apart.
SCANNED_CHAR_THRESHOLD = 50

char_counts = [p["char_count"] for p in pages]
print("Char counts per page:", char_counts)

# Fall back to 0 rather than calling np.mean on an empty list.
avg_chars = np.mean(char_counts) if char_counts else 0
print("Average chars per page:", avg_chars)

# Simple rule: pages with very few chars are suspicious (likely scanned)
scanned_like_pages = [p for p in pages if p["char_count"] < SCANNED_CHAR_THRESHOLD]
print(f"Pages that look scanned (char_count < {SCANNED_CHAR_THRESHOLD}):")
[(p["page_number"], p["char_count"]) for p in scanned_like_pages]

# OCR helper (not fully integrated yet)

In [None]:
from pdf2image import convert_from_path
import pytesseract

def ocr_single_page(pdf_path, page_number, dpi=300, lang="eng"):
    """
    Run Tesseract OCR on a single page of a PDF.

    page_number is 1-based. The page is rasterised at `dpi` with pdf2image and
    the recognised text is returned as a string; an empty string is returned
    when no image could be rendered.
    """
    # Restrict rendering to exactly the requested page.
    rendered = convert_from_path(
        pdf_path,
        dpi=dpi,
        first_page=page_number,
        last_page=page_number,
    )
    return pytesseract.image_to_string(rendered[0], lang=lang) if rendered else ""

# Example: OCR the first page and preview the first 500 characters
ocr_text_page_1 = ocr_single_page(pdf_path, 1)
print(ocr_text_page_1[:500])

# Unified extractor function

Returns a structured dict we can reuse later.

In [None]:
import fitz
from pdf2image import convert_from_path
import pytesseract
import re

def extract_pdf_text_smart(pdf_path, force_ocr=False, scanned_threshold=50, dpi=300, lang="eng"):
    """
    Extract text from a PDF page by page, falling back to OCR where needed.

    Each page is first read with PyMuPDF's plain-text extraction. If that yields
    fewer than ``scanned_threshold`` characters, or if ``force_ocr`` is set, the
    page is rasterised with pdf2image and read with Tesseract instead. The
    chosen text is then normalised with ``clean_text_basic``.

    Args:
        pdf_path: Path to the PDF (``str`` or ``pathlib.Path``).
        force_ocr: OCR every page regardless of how much text it contains.
        scanned_threshold: Minimum number of directly extracted characters
            (whitespace included) for a page to be treated as digital.
        dpi: Rasterisation resolution passed to pdf2image for OCR pages.
        lang: Tesseract language code passed to pytesseract.

    Returns:
        dict with keys:
          - file_name: base name of the PDF
          - total_pages: number of pages in the document
          - pages: list of {page_number, mode, char_count, text}, where
            page_number is 1-based, mode is "direct" or "ocr", and char_count
            is the length of the cleaned text
          - full_text: all cleaned page texts, each preceded by a
            "--- PAGE n (mode) ---" marker
    """
    from pathlib import Path  # function-scope import keeps this definition self-contained

    pages_data = []
    full_text_parts = []

    # The context manager closes the document even if OCR raises part-way through.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)

        for page_index, page in enumerate(doc, start=1):
            # 1) Try direct extraction
            direct_text = page.get_text("text") or ""

            # Very little embedded text is taken as a sign of a scanned page.
            use_ocr = force_ocr or (len(direct_text) < scanned_threshold)

            if use_ocr:
                # 2) OCR fallback: rasterise only this page, then run Tesseract on it
                images = convert_from_path(
                    pdf_path,
                    first_page=page_index,
                    last_page=page_index,
                    dpi=dpi
                )
                raw_text = pytesseract.image_to_string(images[0], lang=lang) if images else ""
                mode = "ocr"
            else:
                raw_text = direct_text
                mode = "direct"

            # 3) Clean text a bit
            cleaned_text = clean_text_basic(raw_text)

            pages_data.append({
                "page_number": page_index,
                "mode": mode,               # "direct" or "ocr"
                "char_count": len(cleaned_text),
                "text": cleaned_text
            })

            full_text_parts.append(f"\n\n--- PAGE {page_index} ({mode}) ---\n\n")
            full_text_parts.append(cleaned_text)

    # Report the base name for plain-string paths too; previously a str input
    # came back as the entire path under the "file_name" key.
    if hasattr(pdf_path, "name"):
        file_name = pdf_path.name
    else:
        file_name = Path(str(pdf_path)).name

    return {
        "file_name": file_name,
        "total_pages": total_pages,
        "pages": pages_data,
        "full_text": "".join(full_text_parts)
    }

def clean_text_basic(text: str) -> str:
    """
    Lightly normalise extracted text.

    - Converts CRLF and lone CR line endings to a single LF.
    - Collapses runs of three or more line breaks (optionally separated by
      whitespace) into one blank line.
    - Collapses runs of spaces/tabs into a single space.
    - Strips leading and trailing whitespace.

    Returns an empty string for empty or ``None`` input.
    """
    if not text:
        return ""
    # Treat "\r\n" as ONE line break; replacing only "\r" would turn every
    # Windows line ending into "\n\n" and fabricate paragraph breaks.
    text = re.sub(r'\r\n?', '\n', text)
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()

In [None]:
# Run the unified extractor on the sample PDF and summarise the result
result = extract_pdf_text_smart(pdf_path)

print("File:", result["file_name"])
print("Total pages:", result["total_pages"])
for p in result["pages"]:
    print(f"Page {p['page_number']}: mode={p['mode']}, chars={p['char_count']}")

# Heading and the first 800 characters of the combined text, one per line.
print("\nPreview:", result["full_text"][:800], sep="\n")

# Testing on scanned image

In [None]:
# Build the path from PROJECT_ROOT instead of a machine-specific absolute path,
# so the cell runs on any checkout of the project.
extract_pdf_text_smart(PROJECT_ROOT / "data" / "sample_scanned_image.pdf")

# Testing both PDFs

In [None]:
pdf_digital = PROJECT_ROOT.joinpath("data", "sample2.pdf")
pdf_scanned = PROJECT_ROOT.joinpath("data", "sample_scanned_image.pdf")

res_digital = extract_pdf_text_smart(pdf_digital)
res_scanned = extract_pdf_text_smart(pdf_scanned)

# Same per-page report for both documents; the second heading carries the
# leading blank line that separates the two reports.
for heading, res in (("DIGITAL:", res_digital), ("\nSCANNED:", res_scanned)):
    print(heading)
    print(res["file_name"], res["total_pages"])
    for p in res["pages"]:
        print(f"  Page {p['page_number']}: mode={p['mode']}, chars={p['char_count']}")

# Code is working well

In [None]:
# Build the path from PROJECT_ROOT rather than a machine-specific absolute path,
# so the notebook is reproducible on other machines.
pdf_scanned = PROJECT_ROOT / "data" / "scanned_medical_report.pdf"
res_scanned = extract_pdf_text_smart(pdf_scanned)
print("\nSCANNED:")
print(res_scanned["file_name"], res_scanned["total_pages"])
for p in res_scanned["pages"]:
    print(f"  Page {p['page_number']}: mode={p['mode']}, chars={p['char_count']}")