In [None]:
from PyPDF2 import PdfReader

def detect_pdf_type(pdf_path, min_chars=30):
    """Classify a PDF as text-based or image-based from its first page.

    The first page's text layer is extracted with ``PdfReader``; the PDF is
    treated as a text PDF only when that text, stripped of surrounding
    whitespace, is longer than ``min_chars`` characters.

    Args:
        pdf_path: Path (or file-like object) handed straight to ``PdfReader``.
        min_chars: Stripped text must be strictly longer than this many
            characters to count as a text PDF. Defaults to 30.

    Returns:
        ``"text_pdf"`` when enough extractable text was found, otherwise
        ``"image_pdf"``. The decision is also printed.
    """
    reader = PdfReader(pdf_path)
    pages = reader.pages

    # A PDF without pages has no text layer at all; indexing pages[0] here
    # used to raise IndexError instead of reporting an image PDF.
    first_page_text = pages[0].extract_text() if len(pages) > 0 else None

    if first_page_text and len(first_page_text.strip()) > min_chars:
        print("Detected: TEXT PDF")
        return "text_pdf"

    print("Detected: IMAGE PDF (needs OCR)")
    return "image_pdf"

In [None]:
# Pick the PDF to inspect: keep exactly one `pdf_path` assignment uncommented.
# NOTE(review): these are absolute, machine-specific Windows paths, so this cell
# only runs on a machine where the selected file exists at that location.
# pdf_path = r'C:\Users\kapil\OneDrive\Desktop\Multimodal Medical Report Analyzer\data\medical_report_table.pdf'
# pdf_path = r'C:\Users\kapil\OneDrive\Desktop\Multimodal Medical Report Analyzer\data\sample_medical_table2.pdf'
pdf_path = r'C:\Users\kapil\OneDrive\Desktop\Multimodal Medical Report Analyzer\data\nyka_table.pdf'
# Classify the selected PDF; as the cell's last expression, the returned label
# ("text_pdf" or "image_pdf") is shown as the cell output.
detect_pdf_type(pdf_path)

In [None]:
import camelot

def extract_with_camelot(pdf_path):
    """Extract the first table of a PDF with Camelot.

    The ``lattice`` flavor is tried first and ``stream`` second; the first
    flavor that yields at least one table wins.

    Args:
        pdf_path: Path of the PDF passed to ``camelot.read_pdf``.

    Returns:
        The ``.df`` of the first table found, or ``None`` when neither flavor
        produced a table.
    """
    for flavor in ("lattice", "stream"):
        try:
            tables = camelot.read_pdf(pdf_path, flavor=flavor)
            if tables is not None and len(tables) > 0:
                print(f"Camelot {flavor.upper()} worked!")
                return tables[0].df
        except Exception as exc:
            # Still best-effort (fall through to the next flavor), but report
            # the failure instead of hiding it, and let KeyboardInterrupt /
            # SystemExit propagate, which a bare `except:` would swallow.
            print(f"Camelot {flavor.upper()} raised {type(exc).__name__}: {exc}")

    print("Camelot failed on this PDF.")
    return None

In [None]:
import re
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

def ocr_extract(pdf_path):
    # convert first page to image
    images = convert_from_path(pdf_path, dpi=300)
    img = images[0]

    text = pytesseract.image_to_string(img, config="--psm 6 -c preserve_interword_spaces=1")
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]

    results = []
    buffer_test = None

    p_full = re.compile(
        r"(?P<test>[A-Za-z \(\)/]+)\s+"
        r"(?P<result>[\d\.]+)\s*"
        r"(?P<status>Normal|High|Low|Abnormal)?\s*"
        r"(?P<ref>\d+\s*[-â€“]\s*\d+(\.\d+)?)?\s*"
        r"(?P<unit>[A-Za-z/%]+)?"
    )

    p_result = re.compile(r"(?P<result>[\d\.]+)")

    for line in lines:

        m = p_full.match(line)
        if m and m.group("test") and m.group("result"):
            results.append(m.groupdict())
            buffer_test = None
            continue

        if re.match(r"^[A-Za-z \(\)/]+$", line):
            buffer_test = line
            continue

        if buffer_test:
            m2 = p_result.match(line)
            if m2:
                results.append({
                    "test": buffer_test,
                    "result": m2.group("result"),
                    "status": None,
                    "ref": None,
                    "unit": None
                })
            buffer_test = None
            continue

    df = pd.DataFrame(results)
    df = df[df["test"].str.len() > 2].reset_index(drop=True)
    return df

In [None]:
def extract_pdf(pdf_path):
    """Extract a table from a PDF, preferring Camelot and falling back to OCR.

    Camelot is attempted only when ``detect_pdf_type`` reports ``"text_pdf"``.
    If Camelot is skipped or returns ``None``, the result of ``ocr_extract``
    is returned instead.
    """
    is_text_pdf = detect_pdf_type(pdf_path) == "text_pdf"
    camelot_table = extract_with_camelot(pdf_path) if is_text_pdf else None

    # OCR covers both image PDFs and text PDFs Camelot could not handle.
    return camelot_table if camelot_table is not None else ocr_extract(pdf_path)

In [None]:
# Pick the PDF to process: keep exactly one `pdf_path` assignment uncommented.
# NOTE(review): these are absolute, machine-specific Windows paths, so this cell
# only runs on a machine where the selected file exists at that location.
# pdf_path = r'C:\Users\kapil\OneDrive\Desktop\Multimodal Medical Report Analyzer\data\medical_report_table.pdf'
# pdf_path = r'C:\Users\kapil\OneDrive\Desktop\Multimodal Medical Report Analyzer\data\sample_medical_table2.pdf'
pdf_path = r'C:\Users\kapil\OneDrive\Desktop\Multimodal Medical Report Analyzer\data\nyka_table.pdf'
# Run the full pipeline: Camelot for text PDFs, OCR as the fallback.
df = extract_pdf(pdf_path)
# Bare last expression so the notebook renders the extracted table.
df

# This code worked very well