## Installations & imports

In [2]:
!pip install PyPDF2



In [3]:
import PyPDF2

In [4]:
!pip install -U spacy PyPDF2 scikit-learn
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Downloading spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.8.6
    Uninstalling spacy-3.8.6:
      Successfully uninstalled spacy-3.8.6
Successfully installed spacy-3.8.7
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to relo

In [1]:
!pip install fitz

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Collecting traits>=6.2 (from nipype->fitz)
  Downloading traits-7.0.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Collecting acres (from nipype->fitz)
  Downloading acres-0.4.1-py3-none-any.whl.metadata (5.9 kB)
Collecting etelemetry>=0.3.1

In [2]:
!pip install --upgrade pymupdf



## Read pdf data

### pypdf2

In [7]:
reader = PyPDF2.PdfReader(r"professional_guide_to_diseases.pdf",'rb')

pdf_data =[]

for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        pdf_data.append((i, text))

### spacy

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text.lower())
    return ' '.join([
        token.lemma_ for token in doc
        if token.is_alpha and not token.is_stop and token.pos_ != 'ADJ'
    ])

# Preprocess all pages
processed_text_data = [(page, preprocess(text)) for page, text in pdf_data]


In [10]:
processed_text_data[0]

(0,
 'p title guide disease edition copyright lippincott w illiams w ilkin table content disorder disorder introduction system begin activity fetus barely month body system cease activity end life system activity define presence life life give transport system heart artery vein lymphatic form network serve body transport system bring life support oxygen nutrient cell remove waste product carry hormone body call system divide branch circulation blood pick oxygen liberate waste product carbon dioxide circulation include circulation blood carry oxygen nutrient cell transport waste product kidney liver skin excretion circulation require functioning heart propel blood system contraction locate sternum heart organ size man fist layer endocardium layer myocardium layer contract beat epicardium membrane surface heart cover heart membrane call pericardium layer layer contact heart layer prevent irritation heart move layer contraction fluid lubricate pericardium heart chamber chamber call atria 

### PyMuPDF

In [9]:
import fitz  # PyMuPDF

In [6]:
doc = fitz.open("professional_guide_to_diseases.pdf")
content = []

for page in doc:
    blocks = page.get_text("dict")["blocks"]
    for block in blocks:
        if "lines" in block:
            for line in block["lines"]:
                text = " ".join([span["text"] for span in line["spans"]])
                font_size = line["spans"][0]["size"]
                font_flags = line["spans"][0]["flags"]
                content.append((font_size, font_flags, text))

# Now sort/group by font_size to infer heading levels


In [24]:
import fitz  # PyMuPDF
from collections import defaultdict

doc = fitz.open("professional_guide_to_diseases.pdf")  # update with your actual file
lines = []

# Extract text lines with font size
for page in doc:
    blocks = page.get_text("dict")["blocks"]
    for block in blocks:
        if "lines" in block:
            for line in block["lines"]:
                spans = line["spans"]
                if spans:
                    text = " ".join(span["text"].strip() for span in spans).strip()
                    if text:
                        font_size = round(spans[0]["size"], 1)
                        lines.append({
                            "text": text,
                            "font_size": font_size,
                            "flags": spans[0]["flags"],
                            "bbox": spans[0]["bbox"],
                        })

# Determine header levels based on font size
font_sizes = sorted(set(line["font_size"] for line in lines), reverse=True)
font_size_levels = {size: f"Header_{i}" for i, size in enumerate(font_sizes)}

# Group into nested structure
structured = {}
current_h1 = None
current_h2 = None

for line in lines:
    level = font_size_levels[line["font_size"]]
    text = line["text"]

    if level == "Header_0":
        current_h1 = text
        structured[current_h1] = {}
        current_h2 = None  # reset subsection
    elif level == "Header_1":
        current_h2 = text
        if current_h1:
            structured[current_h1][current_h2] = ""
    else:
        if current_h1:
            # If no subheading yet, create a default one
            if not current_h2:
                current_h2 = "__intro__"
                structured[current_h1][current_h2] = ""
            structured[current_h1][current_h2] += " " + text

# Display structured output
import json
print(json.dumps(structured, indent=2))


{
  "3": {},
  "Neurologic disorders": {
  },
  "4": {},
  "Gastrointestinal disorders": {
    "__intro__": " Introduction The GI tract, also known as the alimentary canal, is a long, hollow, musculomembranous tube consisting of glands and accessory organs (salivary glands, liver, gallbladder, and pancreas). (See Reviewing GI anatomy and physiology. See also Histology of the GI tract, page 236.) The GI tract breaks down food\u2014carbohydrates, fats, and proteins\u2014into molecules small enough to permeate cell membranes, thus providing cells with the necessary energy to function properly; it prepares food for cellular absorption by altering its physical and chemical composition. (See Primary source of digestive hormones , page 237.)\nConsequently, a malfunction along the GI tract can produce far-reaching\nmetabolic effects, eventually threatening life itself. The GI tract is an unsterile system filled with bacteria and other flora; these organisms can cause superinfection from antibi

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [31]:
structured["Metabolic and nutritional disorders"]

{'__intro__': " Introduction Metabolism is the physiologic process that allows cells to transform food into energy and continually rebuild body cells. Metabolism has two phases: catabolism and anabolism. In catabolism, the energyproducing phase of metabolism, the body breaks down large food molecules into smaller ones; in anabolism, the tissue-building phase, the body converts small molecules into larger ones (such as antibodies to keep the body capable of fighting infection). Both phases are accomplished by means of a chemical process using energy. A wide range of nutrients is metabolized to meet the body's needs. (See Essential nutrients and their functions .) ELDER TIP A person's protein, vitamin, and mineral requirements usually remain the same as he ages, although calorie needs decline. Diminished activity may lower energy requirements\nby almost 200 calories per day for men and women ages 51 to 75, 400 calories per day for women older than age 75, and 500 calories per day for men

In [15]:
import fitz  # PyMuPDF
import pandas as pd
import re

# Define heading keywords
HEADING_3_KEYWORDS = ['Causes and incidence', 'Complications', 'Signs and symptoms',
                      'Diagnosis', 'Treatment', 'Special considerations']
HEADING_3_KEYWORDS = [kw.lower() for kw in HEADING_3_KEYWORDS]
INTRO_KEYWORD = "introduction"

# Load PDF
doc = fitz.open("professional_guide_to_diseases.pdf")  # Replace with your file path

# State trackers
data = []
buffer = []
current_section = ""
current_subsection = ""
current_topic = ""

def flush_buffer(page_num):
    global buffer, current_section, current_subsection, current_topic
    if buffer:
        data.append({
            "section": current_section,
            "subsection": current_subsection,
            "topic": current_topic,
            "content": "\n".join(buffer),
            "page": page_num + 1
        })
        buffer.clear()

# Loop through pages
for page_num, page in enumerate(doc):
    blocks = page.get_text("dict")["blocks"]
    lines = []
    for b in blocks:
        for l in b.get("lines", []):
            line_text = ""
            max_size = 0
            is_italic = False
            for span in l.get("spans", []):
                text = span["text"].strip()
                if not text:
                    continue
                line_text += text + " "
                size = span["size"]
                max_size = max(max_size, size)
                if "Italic" in span["font"] or "Oblique" in span["font"]:
                    is_italic = True
            line_text = line_text.strip()
            if line_text:
                lines.append((line_text, round(max_size, 1), is_italic))

    for text, size, italic in lines:
        lowered = text.lower()

        # Heading-1 (Main Sections)
        if re.match(r"^\d+\s+[A-Za-z ]+$", text) and size >= 13:
            flush_buffer(page_num)
            current_section = re.sub(r"^\d+\s+", "", text).strip().title()
            current_subsection = ""
            current_topic = ""
            continue

        # Heading-2 (Subsections)
        if not italic and size < 13:
            if lowered == INTRO_KEYWORD:
                flush_buffer(page_num)
                current_subsection = text
                current_topic = text
                continue
            elif lowered not in HEADING_3_KEYWORDS:
                flush_buffer(page_num)
                current_subsection = text
                current_topic = ""
                continue

        # Heading-3 (Diseases and Subtopics)
        if italic:
            if lowered in HEADING_3_KEYWORDS:
                flush_buffer(page_num)
                current_topic = text
                continue
            elif lowered not in HEADING_3_KEYWORDS:
                flush_buffer(page_num)
                current_topic = text
                continue

        # Append content
        buffer.append(text)

# Final flush
flush_buffer(page_num)

# Create DataFrame
df = pd.DataFrame(data)
df.head()


Unnamed: 0,section,subsection,topic,content,page
0,,> Table of Contents > 1 - Cardiovascular disor...,,1\nCardiovascular disorders\nIntroduction,1
1,,cardiac rehabilitation programs.,,CONGENITAL ACYANOTIC DEFECTS,8
2,,unrelated medical problem.,,CONGENITAL CYANOTIC DEFECTS,18
3,,systemic venous return to the mitral valve (3).,,ACQUIRED INFLAMMATORY HEART DISEASE,24
4,,rheumatic fever are inadequate for preventing ...,,VALVE DISORDERS,36


In [19]:
df.head()

Unnamed: 0,section,subsection,topic,content,page
0,,> Table of Contents > 1 - Cardiovascular disor...,,1\nCardiovascular disorders\nIntroduction,1
1,,cardiac rehabilitation programs.,,CONGENITAL ACYANOTIC DEFECTS,8
2,,unrelated medical problem.,,CONGENITAL CYANOTIC DEFECTS,18
3,,systemic venous return to the mitral valve (3).,,ACQUIRED INFLAMMATORY HEART DISEASE,24
4,,rheumatic fever are inadequate for preventing ...,,VALVE DISORDERS,36


In [20]:
import fitz  # PyMuPDF
import pandas as pd
import re

# === User Inputs ===
PDF_PATH = "professional_guide_to_diseases.pdf"  # Replace with your file path

# === Configuration ===
INTRO_KEYWORD = "introduction"
HEADING_3_KEYWORDS = [
    "Causes and incidence", "Complications", "Signs and symptoms",
    "Diagnosis", "Treatment", "Special considerations"
]
HEADING_3_KEYWORDS = [kw.lower() for kw in HEADING_3_KEYWORDS]

# === Load PDF ===
doc = fitz.open(PDF_PATH)

# === Initialize ===
data = []
buffer = []
current_section = ""
current_subsection = ""
current_topic = ""

# === Flush buffer into data ===
def flush_buffer(page_num):
    global buffer, current_section, current_subsection, current_topic
    if buffer:
        data.append({
            "section": current_section,
            "subsection": current_subsection,
            "topic": current_topic,
            "content": "\n".join(buffer),
            "page": page_num + 1
        })
        buffer.clear()

# === Determine font size thresholds dynamically ===
font_sizes = []
for page in doc:
    for block in page.get_text("dict")["blocks"]:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                font_sizes.append(round(span["size"], 1))
from collections import Counter
common_sizes = Counter(font_sizes).most_common()
base_font = common_sizes[-1][0]  # Smallest common size (body text)
heading2_font = common_sizes[-2][0]
heading1_font = common_sizes[0][0]

# === Process each page ===
for page_num, page in enumerate(doc):
    blocks = page.get_text("dict")["blocks"]

    lines = []
    for b in blocks:
        for l in b.get("lines", []):
            for span in l.get("spans", []):
                text = span["text"].strip()
                if not text:
                    continue
                size = round(span["size"], 1)
                is_italic = "Italic" in span["font"] or "Oblique" in span["font"]
                lines.append((text, size, is_italic))

    for text, size, italic in lines:
        lowered = text.lower()

        # === Heading-1 (Main Sections) ===
        if re.match(r"^\d+\s+[A-Za-z ]+$", text) and size >= heading1_font:
            flush_buffer(page_num)
            current_section = re.sub(r"^\d+\s+", "", text).strip().title()
            current_subsection = ""
            current_topic = ""
            continue

        # === Heading-2 ===
        if not italic and lowered == INTRO_KEYWORD:
            flush_buffer(page_num)
            current_subsection = text.strip()
            current_topic = text.strip()
            continue
        elif not italic and lowered not in HEADING_3_KEYWORDS and size == heading2_font:
            flush_buffer(page_num)
            current_subsection = text.strip()
            current_topic = ""
            continue

        # === Heading-3 ===
        if italic:
            if lowered in HEADING_3_KEYWORDS:
                flush_buffer(page_num)
                current_topic = text.strip()
                continue
            else:
                flush_buffer(page_num)
                current_topic = text.strip()
                continue

        # === Content ===
        buffer.append(text.strip())

# Final flush
flush_buffer(page_num)

# === Create DataFrame ===
df = pd.DataFrame(data)
df = df[df["section"].notnull()]  # Remove any initial garbage
df.reset_index(drop=True, inplace=True)
df.head()


Unnamed: 0,section,subsection,topic,content,page
0,,,,P\n:\nTitle:,1
1,,,"Professional Guide to Diseases, 9th Edition",Copyright ©2009 Lippincott Williams & Wilkins\...,1
2,,Introduction,Introduction,The cardiovascular system begins its activity ...,1
3,,Introduction,Life-giving transport system,"The heart, arteries, veins, and lymphatics for...",1
4,,Introduction,"system,",it may be divided into two branches:,1


In [22]:
df.head(50)

Unnamed: 0,section,subsection,topic,content,page
0,,,,P\n:\nTitle:,1
1,,,"Professional Guide to Diseases, 9th Edition",Copyright ©2009 Lippincott Williams & Wilkins\...,1
2,,Introduction,Introduction,The cardiovascular system begins its activity ...,1
3,,Introduction,Life-giving transport system,"The heart, arteries, veins, and lymphatics for...",1
4,,Introduction,"system,",it may be divided into two branches:,1
5,,Introduction,"pulmonary circulation,",in which blood picks up new\noxygen and libera...,1
6,,Introduction,systemic circulation,"(including coronary\ncirculation), in which bl...",1
7,,Introduction,endocardium —,the smooth inner layer; the,1
8,,Introduction,myocardium —,"the\nthick, muscular middle layer that contrac...",1
9,,Introduction,epicardium—,"the thin, serous\nmembrane, or outer surface o...",1


In [29]:
import fitz  # PyMuPDF

def is_heading_1(font_size, font_flags, text):
    return font_size >= 15 and (font_flags & 2)  # bold & large font

def is_heading_2(font_size, font_flags, text):
    return 12 <= font_size < 15 and (font_flags & 2 or font_flags & 1)  # bold or italic medium font

def extract_pdf_to_dict_paragraphs(pdf_path):
    doc = fitz.open(pdf_path)
    content_dict = {}
    current_section = None
    current_subsection = None
    paragraph_buffer = []

    def flush_paragraph():
        nonlocal paragraph_buffer
        if paragraph_buffer:
            paragraph_text = " ".join(paragraph_buffer).strip()
            if paragraph_text:
                # Ensure section dict exists
                if current_section is None:
                    cs_current = "Unknown Section"
                else:
                    cs_current = current_section

                if cs_current not in content_dict:
                    content_dict[cs_current] = {}

                # Ensure subsection list exists
                if current_subsection is None:
                    csub = "Introduction"
                else:
                    csub = current_subsection

                if csub not in content_dict[cs_current]:
                    content_dict[cs_current][csub] = []

                content_dict[cs_current][csub].append(paragraph_text)
            paragraph_buffer = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:
            if b['type'] != 0:
                continue  # skip non-text blocks

            for line in b["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue

                    font_size = span["size"]
                    font_flags = span["flags"]

                    if is_heading_1(font_size, font_flags, text):
                        flush_paragraph()
                        current_section = text
                        current_subsection = None
                    elif is_heading_2(font_size, font_flags, text):
                        flush_paragraph()
                        current_subsection = text
                        if current_section is None:
                            current_section = "Unknown Section"
                            if current_section not in content_dict:
                                content_dict[current_section] = {}
                    else:
                        paragraph_buffer.append(text)

    flush_paragraph()
    return content_dict

# Example usage:
pdf_path = "professional_guide_to_diseases.pdf"
content_dict = extract_pdf_to_dict_paragraphs(pdf_path)

# import json
# print(json.dumps(content_dict, indent=2, ensure_ascii=False))


In [30]:
len(content_dict)

547

In [31]:
# prompt: print first key-value of the content_dict

first_key = list(content_dict.keys())[0]
first_value = content_dict[first_key]
print(f"First key: {first_key}")
print(f"First value: {first_value}")

First key: Unknown Section
First value: {'Introduction': ['P : Title: Professional Guide to Diseases, 9th Edition Copyright ©2009 Lippincott Williams & Wilkins > Table of Contents > 1 - Cardiovascular disorders 1 Cardiovascular disorders Introduction The cardiovascular system begins its activity when the fetus is barely a month old and is the last body system to cease activity at the end of life. This system is so vital that its activity defines the presence of life.'], 'Life-giving transport system': ["The heart, arteries, veins, and lymphatics form the cardiovascular network that serves as the body's transport system, bringing life-supporting oxygen and nutrients to cells, removing metabolic waste products, and carrying hormones from one part of the body to another. Often called the circulatory system, it may be divided into two branches: pulmonary circulation, in which blood picks up new oxygen and liberates the waste product carbon dioxide; and systemic circulation (including coron

In [34]:
content_dict['Unknown Section'].keys()

dict_keys(['Introduction', 'Life-giving transport system', 'Heart valves', 'The cardiac cycle', 'Cardiac conduction', 'Cardiac output', 'Circulation and pulses', 'Cardiovascular assessment', 'Special cardiovascular tests', 'Blood tests', 'Managing cardiovascular disease', 'Ventricular septal defect', 'Causes and incidence', 'Complications', 'Signs and symptoms', 'Diagnosis', 'Treatment', 'Special considerations', 'Atrial septal defect', 'Coarctation of the aorta', 'Patent ductus arteriosus', 'Tetralogy of Fallot', 'Transposition of the great arteries', 'Myocarditis', 'Endocarditis', 'Pericarditis', 'Rheumatic fever and rheumatic heart disease', 'Valvular heart disease', 'Hypertension', 'Coronary artery disease', 'Myocardial infarction', 'Heart failure', 'Dilated cardiomyopathy', 'Hypertrophic cardiomyopathy', 'Hypovolemic shock', 'Cardiogenic shock', 'Ventricular aneurysm', 'Cardiac tamponade', 'Cardiac arrhythmias', 'Thoracic aortic aneurysm', 'Abdominal aneurysm', 'Femoral and poplit

In [35]:
import fitz

def analyze_font_sizes(pdf_path, page_num=0):
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_num)
    blocks = page.get_text("dict")["blocks"]

    font_stats = {}

    for b in blocks:
        if b['type'] != 0:
            continue  # skip non-text blocks
        for line in b["lines"]:
            for span in line["spans"]:
                size = span["size"]
                flags = span["flags"]
                text = span["text"].strip()
                if not text:
                    continue

                # Simple style detection
                is_bold = (flags & 2) != 0
                is_italic = (flags & 1) != 0
                style = []
                if is_bold:
                    style.append("bold")
                if is_italic:
                    style.append("italic")
                if not style:
                    style.append("normal")
                style_str = "+".join(style)

                key = (size, style_str)
                font_stats[key] = font_stats.get(key, 0) + 1

    # Sort by font size descending
    sorted_stats = sorted(font_stats.items(), key=lambda x: x[0][0], reverse=True)

    print("Font sizes and styles on page", page_num+1)
    for (size, style), count in sorted_stats:
        print(f"Font size: {size:.2f}, Style: {style}, Count: {count}")

# Example usage
pdf_path = "professional_guide_to_diseases.pdf"
analyze_font_sizes(pdf_path)


Font sizes and styles on page 1
Font size: 16.66, Style: normal, Count: 2
Font size: 14.16, Style: normal, Count: 1
Font size: 14.16, Style: bold, Count: 2
Font size: 11.25, Style: normal, Count: 1
Font size: 11.25, Style: bold, Count: 2
Font size: 10.00, Style: normal, Count: 47
Font size: 10.00, Style: bold, Count: 19
Font size: 7.50, Style: normal, Count: 1


In [36]:
import fitz  # PyMuPDF

# Keywords for heading-3 (sub-subsection)
HEADING_3_KEYWORDS = {
    "causes", "symptoms", "diagnosis", "treatment", "complications",
    "management", "prognosis", "special considerations"
}

def normalize_heading(text):
    return text.strip().lower()

def is_bold(flags):
    # In your PDF, bold flag == italic meaning for you
    return (flags & 2) != 0

def is_heading_1(font_size, bold):
    # Heading-1: font size ~16.66 and NOT bold (normal)
    return abs(font_size - 16.66) < 1 and not bold

def is_heading_2(font_size, bold):
    # Heading-2: font size ~14.16 and either bold or normal
    return abs(font_size - 14.16) < 1

def is_heading_3(text, font_size, bold):
    # Heading-3: font size between 11 and 14 AND bold (your "italic")
    # OR text is a known keyword ignoring case
    normalized = normalize_heading(text)
    if normalized in HEADING_3_KEYWORDS:
        return True
    return 11 <= font_size <= 14 and bold

def extract_pdf_structured(pdf_path):
    doc = fitz.open(pdf_path)

    content_dict = {}
    current_section = None
    current_subsection = None
    current_subsubsection = None

    subsection_buffer = []
    subsubsection_buffer = []

    def flush_subsubsection():
        nonlocal subsubsection_buffer, current_section, current_subsection, current_subsubsection, content_dict
        if subsubsection_buffer:
            paragraph_text = " ".join(subsubsection_buffer).strip()
            if paragraph_text:
                sec = current_section or "Unknown Section"
                subsec = current_subsection or "Introduction"
                content_dict.setdefault(sec, {})
                content_dict[sec].setdefault(subsec, {})
                if current_subsubsection is None:
                    # No heading-3, add to subsection content
                    content_dict[sec][subsec].setdefault("content", []).append(paragraph_text)
                else:
                    # Add to heading-3 key
                    content_dict[sec][subsec].setdefault(current_subsubsection, []).append(paragraph_text)
            subsubsection_buffer.clear()

    def flush_subsection():
        nonlocal subsection_buffer, current_section, current_subsection, content_dict
        if subsection_buffer:
            paragraph_text = " ".join(subsection_buffer).strip()
            if paragraph_text:
                sec = current_section or "Unknown Section"
                subsec = current_subsection or "Introduction"
                content_dict.setdefault(sec, {})
                content_dict[sec].setdefault(subsec, {})
                # Add to 'content' key under subsection
                content_dict[sec][subsec].setdefault("content", []).append(paragraph_text)
            subsection_buffer.clear()

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:
            if b['type'] != 0:
                continue

            for line in b["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue

                    font_size = span["size"]
                    flags = span["flags"]
                    bold = is_bold(flags)

                    if is_heading_1(font_size, bold):
                        # Flush all buffers, new section
                        flush_subsubsection()
                        flush_subsection()
                        current_section = text
                        current_subsection = None
                        current_subsubsection = None
                    elif is_heading_2(font_size, bold):
                        # Flush subsubsection and subsection buffers, new subsection
                        flush_subsubsection()
                        flush_subsection()
                        current_subsection = text
                        current_subsubsection = None
                        if current_section is None:
                            current_section = "Unknown Section"
                    elif is_heading_3(text, font_size, bold):
                        # Flush previous subsubsection buffer, new subsubsection
                        flush_subsubsection()
                        current_subsubsection = text
                    else:
                        # Add text to appropriate buffer
                        if current_subsubsection is not None:
                            subsubsection_buffer.append(text)
                        elif current_subsection is not None:
                            subsection_buffer.append(text)
                        else:
                            # No headings found yet; accumulate in subsection buffer with default values
                            if current_section is None:
                                current_section = "Unknown Section"
                            if current_subsection is None:
                                current_subsection = "Introduction"
                            subsection_buffer.append(text)

    # Flush leftovers
    flush_subsubsection()
    flush_subsection()

    return content_dict

# Usage example:
pdf_path = "professional_guide_to_diseases.pdf"
content = extract_pdf_structured(pdf_path)

# import json
# print(json.dumps(content, indent=2, ensure_ascii=False))


In [40]:
content['Cardiovascular disorders'].keys()

dict_keys(['Introduction', 'Life-giving transport system', 'Heart valves', 'The cardiac cycle', 'Cardiac conduction', 'Cardiac output', 'Circulation and pulses', 'Cardiovascular assessment', 'Special cardiovascular tests', 'Blood tests', 'Managing cardiovascular disease', 'Ventricular septal defect', 'Causes and incidence', 'Complications', 'Signs and symptoms', 'Diagnosis', 'Treatment', 'Special considerations', 'Atrial septal defect', 'Coarctation of the aorta', 'Patent ductus arteriosus', 'Tetralogy of Fallot', 'Transposition of the great arteries', 'Myocarditis', 'Endocarditis', 'Pericarditis', 'Rheumatic fever and rheumatic heart disease', 'Valvular heart disease', 'DEGENERATIVE CARDIOVASCULAR DISORDERS', 'Hypertension', 'Coronary artery disease', 'PREVENTION', 'Myocardial infarction', 'Heart failure', 'Dilated cardiomyopathy', 'Hypertrophic cardiomyopathy', 'Hypovolemic shock', 'Cardiogenic shock', 'Ventricular aneurysm', 'Cardiac tamponade', 'Cardiac arrhythmias', 'Thoracic aort

## working - pymupdf

In [3]:
import fitz  # PyMuPDF

In [80]:
TARGET_SECTIONS = {
    "cardiovascular disorders",
    "respiratory disorders",
    "neurologic disorders",
    "gastrointestinal disorders",
    "hepatobiliary disorders",
    "musculoskeletal disorders",
    "renal and urologic disorders",
    "immune disorders",
    "hematologic disorders",
    "metabolic and nutritional disorders",
    "infectious disorders",
    "dermatologic disorders",
    "endocrine disorders",
    "psychiatric disorders",
    "environmental and occupational disorders",
    "neoplastic disorders",
    "congenital and genetic disorders",
    "trauma and burns",
    "critical care medicine",
    "pediatric disorders"
}


In [56]:
def extract_sectionwise_content(pdf_path):
    doc = fitz.open(pdf_path)
    content_dict = {}

    current_section = None
    buffer = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            if block["type"] != 0:
                continue

            for line in block["lines"]:
                spans = line["spans"]
                if not spans:
                    continue

                line_text = " ".join(span["text"].strip() for span in spans if span["text"].strip())
                if not line_text:
                    continue

                font_size = spans[0]["size"]
                norm_text = line_text.lower().strip()

                # Detect section heading directly (font size > 15 and text match)
                if font_size > 15 and norm_text in TARGET_SECTIONS:
                    if current_section and buffer:
                        content_dict[current_section] = " ".join(buffer).strip()
                        buffer.clear()
                    current_section = line_text.strip()  # Preserve original casing
                    continue

                # Regular content lines
                if current_section:
                    buffer.append(line_text)

    # Save last section
    if current_section and buffer:
        content_dict[current_section] = " ".join(buffer).strip()

    return content_dict


In [57]:
content_dict=extract_sectionwise_content("diseases_data.pdf")

In [59]:
len(content_dict.keys())

10

# preprocess content dict

In [5]:
import fitz

def analyze_font_sizes(pdf_path, page_num=0):
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_num)
    blocks = page.get_text("dict")["blocks"]

    font_stats = {}

    for b in blocks:
        if b['type'] != 0:
            continue  # skip non-text blocks
        for line in b["lines"]:
            for span in line["spans"]:
                size = span["size"]
                flags = span["flags"]
                text = span["text"].strip()
                if not text:
                    continue

                # Simple style detection
                is_bold = (flags & 2) != 0
                is_italic = (flags & 1) != 0
                style = []
                if is_bold:
                    style.append("bold")
                if is_italic:
                    style.append("italic")
                if not style:
                    style.append("normal")
                style_str = "+".join(style)

                key = (size, style_str)
                font_stats[key] = font_stats.get(key, 0) + 1

    # Sort by font size descending
    sorted_stats = sorted(font_stats.items(), key=lambda x: x[0][0], reverse=True)

    print("Font sizes and styles on page", page_num+1)
    for (size, style), count in sorted_stats:
        print(f"Font size: {size:.2f}, Style: {style}, Count: {count}")

# Example usage
pdf_path = "2.Respiratory.pdf"
analyze_font_sizes(pdf_path)


Font sizes and styles on page 1
Font size: 11.04, Style: normal, Count: 39


In [6]:
import fitz

def extract_headings_and_content(pdf_path):
    doc = fitz.open(pdf_path)
    content_dict = {}

    current_heading = None
    buffer = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            if block["type"] != 0:
                continue

            for line in block["lines"]:
                spans = line["spans"]
                if not spans:
                    continue

                line_text = " ".join(span["text"].strip() for span in spans if span["text"].strip())
                norm_text = line_text.strip()

                if not norm_text:
                    continue

                # Heuristic: detect a heading (title-like)
                if (
                    len(norm_text.split()) < 10 and
                    norm_text[0].isupper() and
                    not norm_text.endswith(".")
                ):
                    # Save previous heading's buffer
                    if current_heading and buffer:
                        content_dict[current_heading] = " ".join(buffer).strip()
                        buffer.clear()
                    current_heading = norm_text
                    continue

                # Accumulate paragraph content
                if current_heading:
                    buffer.append(norm_text)

    # Save last section
    if current_heading and buffer:
        content_dict[current_heading] = " ".join(buffer).strip()

    return content_dict


In [7]:
content_2 = extract_headings_and_content(pdf_path)

In [10]:
list(content_2.keys())

['Respiratory distress syndrome',
 'Sudden infant death syndrome',
 'Croup',
 'Epiglottiditis',
 'Acute respiratory distress syndrome',
 'Acute respiratory failure in COPD',
 'Pulmonary edema',
 'Cor pulmonale',
 "Legionnaires' disease",
 'Atelectasis',
 'Respiratory acidosis',
 'Respiratory alkalosis',
 'Pneumothorax',
 'Pneumonia',
 'Idiopathic bronchiolitis obliterans with organizing pneumonia',
 'Pulmonary embolism',
 'Sarcoidosis',
 'Respiratory—breathlessness, cough (usually nonproductive), substernal pain; complications in',
 'Hepatic—granulomatous hepatitis, usually asymptomatic Genitourinary—hypercalciuria',
 'Severe acute respiratory syndrome',
 'Lung abscess',
 'Hemothorax',
 'Pulmonary hypertension',
 'Pleural effusion and empyema',
 'Pleurisy',
 'Chronic obstructive pulmonary disease',
 'Bronchiectasis',
 'Idiopathic pulmonary fibrosis',
 'Tuberculosis',
 'Silicosis',
 'Asbestosis',
 "Coal worker's pneumoconiosis"]

In [19]:
TARGET_SECTIONS = {
    "cardiovascular disorders",
    "respiratory disorders",
    "neurologic disorders",
    "gastrointestinal disorders",
    "hepatobiliary disorders",
    "musculoskeletal disorders",
    "renal and urologic disorders",
    "immune disorders",
    "hematologic disorders",
    "metabolic and nutritional disorders",
    "endocrine disorders",
    "eye disorders",
    "ear, nose, and throat disorders",
    "skin disorders",
    "malignant neoplasms",
    "infectious disorders",
    "trauma and burns",
    "genetic disorders",
    "obstetric and gynecologic disorders",
    "sexually transmitted infections",
    "psychiatric disorders"
}


def extract_sectionwise_content(pdf_path):
    doc = fitz.open(pdf_path)
    content_dict = {}

    current_section = None
    buffer = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            if block["type"] != 0:
                continue

            for line in block["lines"]:
                spans = line["spans"]
                if not spans:
                    continue

                line_text = " ".join(span["text"].strip() for span in spans if span["text"].strip())
                if not line_text:
                    continue

                font_size = spans[0]["size"]
                norm_text = line_text.lower().strip()

                # Detect section heading directly (font size > 15 and text match)
                if font_size > 15 and norm_text in TARGET_SECTIONS:
                    if current_section and buffer:
                        content_dict[current_section] = " ".join(buffer).strip()
                        buffer.clear()
                    current_section = line_text.strip()  # Preserve original casing
                    continue

                # Regular content lines
                if current_section:
                    buffer.append(line_text)

    # Save last section
    if current_section and buffer:
        content_dict[current_section] = " ".join(buffer).strip()

    return content_dict

guide_content = extract_sectionwise_content("12_diseases_guide.pdf")


In [21]:
len(guide_content.keys())

12

In [22]:
import fitz

def extract_headings_and_content(pdf_path):
    doc = fitz.open(pdf_path)
    content_dict = {}

    current_heading = None
    buffer = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            if block["type"] != 0:
                continue

            for line in block["lines"]:
                spans = line["spans"]
                if not spans:
                    continue

                line_text = " ".join(span["text"].strip() for span in spans if span["text"].strip())
                norm_text = line_text.strip()

                if not norm_text:
                    continue

                # Heuristic: detect a heading (title-like)
                if (
                    len(norm_text.split()) < 10 and
                    norm_text[0].isupper() and
                    not norm_text.endswith(".")
                ):
                    # Save previous heading's buffer
                    if current_heading and buffer:
                        content_dict[current_heading] = " ".join(buffer).strip()
                        buffer.clear()
                    current_heading = norm_text
                    continue

                # Accumulate paragraph content
                if current_heading:
                    buffer.append(norm_text)

    # Save last section
    if current_heading and buffer:
        content_dict[current_heading] = " ".join(buffer).strip()

    return content_dict

symptoms_content = extract_headings_and_content("12_diseases_symptoms.pdf")


In [24]:
list(symptoms_content.keys())

['Respiratory distress syndrome',
 'Sudden infant death syndrome',
 'Croup',
 'Epiglottiditis',
 'Acute respiratory distress syndrome',
 'Acute respiratory failure in COPD',
 'Pulmonary edema',
 'Cor pulmonale',
 "Legionnaires' disease",
 'Atelectasis',
 'Respiratory acidosis',
 'Respiratory alkalosis',
 'Pneumothorax',
 'Pneumonia',
 'Idiopathic bronchiolitis obliterans with organizing pneumonia',
 'Pulmonary embolism',
 'Sarcoidosis',
 'Respiratory—breathlessness, cough (usually nonproductive), substernal pain; complications in',
 'Hepatic—granulomatous hepatitis, usually asymptomatic Genitourinary—hypercalciuria',
 'Severe acute respiratory syndrome',
 'Lung abscess',
 'Hemothorax',
 'Pulmonary hypertension',
 'Pleural effusion and empyema',
 'Pleurisy',
 'Chronic obstructive pulmonary disease',
 'Bronchiectasis',
 'Idiopathic pulmonary fibrosis',
 'Tuberculosis',
 'Silicosis',
 'Asbestosis',
 "Coal worker's pneumoconiosis",
 'Cerebral palsy',
 'Hydrocephalus',
 'Cerebral aneurysm',

In [26]:
symptoms_content

{'Respiratory distress syndrome': 'Although a neonate with RDS may breathe normally at first, he usually develops rapid, shallow respirations within minutes or hours of birth, with intercostal, subcostal, or sternal retractions; nasal flaring; and audible expiratory grunting. This grunting is a natural compensatory mechanism designed to produce positive end-expiratory pressure (PEEP) and prevent further alveolar collapse. Severe disease is marked by apnea, bradycardia, and cyanosis (from hypoxemia, left-to-right shunting through the foramen ovale, or right-to \x02left intrapulmonary shunting through atelectatic regions of the lung). Other clinical features include pallor, frothy sputum, and low body temperature as a result of an immature nervous system and the absence of subcutaneous fat.',
 'Sudden infant death syndrome': "Although parents find some victims wedged in crib corners or with blankets wrapped around their heads, autopsies rule out suffocation as the cause of death. Autopsy