In [None]:
import pdfplumber
import re
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from bert_score import score
import textstat
import torch

2025-09-26 11:29:50.104860: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-26 11:29:50.588862: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-26 11:29:53.247752: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
device = 0 if torch.cuda.is_available() else -1  # 0 = first GPU, -1 = CPU fallback

In [None]:
# 3️⃣ Load model and tokenizer
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
if device != -1:
    model = model.half()  # FP16 for GPU speed

summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=device
)

Device set to use cuda:0


In [4]:
# 4️⃣ Extract only 1–2 paragraphs (from any page, e.g., page 4)
def extract_paragraphs(pdf_path, page_index=4, num_paragraphs=2):
    with pdfplumber.open(pdf_path) as pdf:
        page_text = pdf.pages[page_index].extract_text()
    paragraphs = page_text.split("\n\n")  # split by paragraph breaks
    return " ".join(paragraphs[:num_paragraphs])

In [None]:
pdf_text = extract_paragraphs("../data/peds_2022060641_25-9-2025.pdf", page_index=4, num_paragraphs=2)
pdf_text

'sadnessandanger.Acknowledging activityandsedentarytimebehaviors, nonalcoholicfattyliverdisease\nandvalidatingtheseresponses,while unhealthyweightcontrolpractices, (NAFLD),andhypertension(KAS3,\nkeepingthefocusonthechild’s sleeppatterns,socialhistory 3.1,5,6,7,8).Appendicesprovide\nhealth,canhelptostrengthenthe (includingSDoHs),andmental/ additionalinformationontreatment\nrelationshipbetweenthepediatrician behavioralhealth(KAS2).Specific ofthesecommoncomorbidities.\norotherPHCPandpatienttosupport assessmenttoolsexistforprimary\nongoingcare. care.Thepurposeoftheevaluationis TheCPGalsodescribesadditional\ntodeterminethechild’sindividual comorbiditiespotentiallyassociated\nAllservicesandsupportsfor healthstatus,includingthepresence withpediatricobesity,including\nchildrenandyouthwithobesity andextentofobesity-related obstructivesleepapnea,polycystic\nandtheirfamiliesshouldbe comorbidities,theextentofobesity ovariansyndrome,depression,\nimplementedanddeliveredina riskfactorspresentinthechi

In [None]:
def clean_text(text):
    # Remove multiple spaces & newlines
    text = re.sub(r'\s+', ' ', text)

    # Remove page numbers (if they appear as standalone numbers)
    text = re.sub(r'\n?\d+\n?', ' ', text)

    # Strip extra whitespace
    return text.strip()

In [7]:
cleaned_text = clean_text(pdf_text)
cleaned_text

'sadnessandanger.Acknowledging activityandsedentarytimebehaviors, nonalcoholicfattyliverdisease andvalidatingtheseresponses,while unhealthyweightcontrolpractices, (NAFLD),andhypertension(KAS , keepingthefocusonthechild’s sleeppatterns,socialhistory  . , , , , ).Appendicesprovide health,canhelptostrengthenthe (includingSDoHs),andmental/ additionalinformationontreatment relationshipbetweenthepediatrician behavioralhealth(KAS ).Specific ofthesecommoncomorbidities. orotherPHCPandpatienttosupport assessmenttoolsexistforprimary ongoingcare. care.Thepurposeoftheevaluationis TheCPGalsodescribesadditional todeterminethechild’sindividual comorbiditiespotentiallyassociated Allservicesandsupportsfor healthstatus,includingthepresence withpediatricobesity,including childrenandyouthwithobesity andextentofobesity-related obstructivesleepapnea,polycystic andtheirfamiliesshouldbe comorbidities,theextentofobesity ovariansyndrome,depression, implementedanddeliveredina riskfactorspresentinthechild’s slippe

# CHunk Size
- For summarizers like DistilBART or T5-small:
- Max input ≈ 512–1024 tokens (~400–800 words).
- Chunk size = 500 words (~750–1000 tokens)
- Still within model’s safe input length.

# Overlap
- Medical text often has critical context spanning across boundaries (e.g., “Do not exceed 4000mg/day. In children under 12…”).
- If overlap is too small, you lose continuity.

In [8]:
# 5️⃣ Ensure text is short enough for model
# Approximate tokens for DistilBART ≤ 1024
tokens = tokenizer.encode(cleaned_text, add_special_tokens=False)
if len(tokens) > 512:
    tokens = tokens[:512]  # truncate for demo

chunk_text = tokenizer.decode(tokens, skip_special_tokens=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (1267 > 1024). Running this sequence through the model will result in indexing errors


# Why not make the summary too long?
- Small summarization models like DistilBART or BART CNN usually have a maximum context length (like 1024 tokens input, 200–250 tokens output).
- If you set max_length too high (e.g., 500), the model may:
- Ignore the instruction and still output ~200 tokens (hard cap).
- Hallucinate filler (repeating or generic text).


# Why 200 tokens for max_length?
- For medical guidelines, you want something informative but not overwhelming.
- 200 tokens ≈ 2–3 paragraphs (enough for nuance).
- It avoids summaries that are too short (missing context) or too long (hard to merge later).

# Why 60 tokens for min_length?
- Ensures the summary isn’t just a one-liner like “This document describes medical recommendations.”
- Forces the model to give at least a few sentences with details.

# If you’re using BART-large (heavier, better): you can push max_length=250–300.
- For DistilBART (lighter): 150–200 is usually stable.

In [10]:
# 6️⃣ Summarize (single chunk)
summary = summarizer(
    chunk_text,
    max_length=150,
    min_length=40,
    do_sample=False
)[0]['summary_text']

print("=== Final Summary ===")
print(summary)

=== Final Summary ===
 The CPGalsodescribesadditional todeterminethechild’s individual comorbiditiespotentiallyassociated All servicesandsupportsfor health status . The CPCalsodesocopes providesupportiveandnonbiased instrumentalintailoringand providedforaddressing these behavior .


# First pass (per chunk)
- Each chunk is only a few hundred words.
- If we allow max_length=300, the model may just copy the chunk or make the summary too long.
- That’s why we kept it tight (150–200 max, 40–60 min) → force conciseness.

# Second pass (final refinement)
- Now, we’re summarizing summaries of all chunks combined.
- Input is already shorter (not raw PDF text), but still covers the whole document.
- We want a longer, more polished overview → so we relax the limits.
- max_length=300 → allows ~4–5 paragraphs.
- min_length=100 → ensures enough detail is preserved.

# Why not even bigger (e.g., 500)?
- Most summarization models (like BART, DistilBART, Pegasus) have a hard cap around 250–300 tokens output.
- If you request 500, the model usually stops earlier anyway.
- Bigger doesn’t mean better — it risks hallucinations or rambling.

In [11]:
reference_summary = """  Obesityinchildrenandadolescentsis
 a chronic, complex, multifactorial, and
 treatable disease. This CPG
 recommends early evaluation and
 treatment at the highest intensity
 level that is appropriate and
 available. In addition, understanding
 the wider determinants of obesity
 should enable pediatricians and other
 PHCPs to “raise awareness of the
 relevance of social and environmental
 determinants of childhood obesity in
 their communities.”4 The
 subcommittee urges pediatricians,
 other PHCPs, health systems,
 community partners, payers, and
 policy makers to work together to
 advance the equitable and universal
 provision of evaluation and treatment
 of children and adolescents with the
 chronic disease of obesity.
 """

# BERTScore uses contextual embeddings to compare your summary with a reference summary. It’s better than ROUGE for semantic similarity (especially for medical text).
- Precision → how much of the predicted summary is relevant
- Recall → how much of the reference summary is captured
- F1 → balanced measure
- Range: 0 → 1
Closer to 1 → more similar in meaning.
Closer to 0 → less similar.
Your score: 0.7837 → pretty good!

It means your summary captures ~78% of the semantic content of the reference.

# Readability Score
- Good for medical guidelines, since complex sentences are common. Use Flesch Reading Ease and Gunning Fog Index.

# Flesch Reading Ease → higher score = easier to read
- 60–70 → standard, 30–50 → difficult
- Negative → extremely complex / very long sentences and words

# Gunning Fog Index → higher score = more complex
- 12 → high school level, 16+ → college/professional
- 28.2 → extremely difficult to read

In [12]:
try:
    P, R, F1 = score([summary], [reference_summary], lang="en")
    print(f"BERTScore F1: {F1.mean().item():.4f}")
except Exception:
    print("BERTScore skipped: reference summary not provided.")

# Readability metrics
print("Flesch Reading Ease:", textstat.flesch_reading_ease(summary))
print("Gunning Fog Index:", textstat.gunning_fog(summary))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.7837
Flesch Reading Ease: -160.83499999999995
Gunning Fog Index: 28.200000000000003
