In [1]:
import re, json, statistics

In [2]:
import fitz  # PyMuPDF

In [3]:
from typing import List, Dict, Any, Optional

In [4]:
HEADING_SIZES = {  # maps a (snapped) font size to a heading level
    25.5: 1,   # H1
    21.0: 2,   # H2
    16.5: 3,   # H3
}
# Font size that is treated as body text (see is_body_size).
BODY_SIZE = 12.0
# Font size of the side numbering (see is_side_number_size).
SIDE_NUMBER_SIZE = 10.0
# Maximum absolute difference for a measured size to be snapped onto one of the sizes above.
SIZE_TOL = 0.5

In [5]:
# Recognises the text of a 10pt side number: dot-separated digit groups with an
# optional trailing dot, e.g. "1", "1.2" or "1.2.3."; the whole string must match.
SIDE_NUMBER_RE = re.compile(r"^\d+(?:\.\d+)*\.?$")

In [6]:
# Bold detection (used for reference annotation and year extraction)
def is_bold_font(fontname: str) -> bool:
    """Return True when the font name contains "bold", "semibold" or "demi" (case-insensitive)."""
    hit = re.search(r"(bold|semibold|demi)", fontname, re.I)
    return hit is not None

In [7]:
# Bracketed "[ ... ]" fragments inside a paragraph; non-greedy, at least one
# character between the brackets, group 1 is the text inside the brackets.
BRACKETED_RE = re.compile(r"\[(.+?)\]")
# Four-digit years 1900–2099. Group 1 captures only the "19"/"20" prefix, so use
# group(0) to obtain the full year.
YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")

In [8]:
def snap_size(sz: float) -> float:
    """
    Snap a measured font size onto the nearest known size.

    The known sizes are checked in order: the heading sizes, then the body size,
    then the side-number size. The first one within SIZE_TOL of `sz` is returned;
    if none is close enough, `sz` is returned unchanged.
    """
    known_sizes = (*HEADING_SIZES, BODY_SIZE, SIDE_NUMBER_SIZE)
    return next(
        (known for known in known_sizes if abs(sz - known) <= SIZE_TOL),
        sz,
    )

def heading_level_from_size(sz: float) -> Optional[int]:
    """Return the heading level for a font size, or None if the snapped size is not a heading size."""
    return HEADING_SIZES.get(snap_size(sz))

def is_body_size(sz: float) -> bool:
    """Return True when the snapped size lies within SIZE_TOL of BODY_SIZE."""
    offset = snap_size(sz) - BODY_SIZE
    return -SIZE_TOL <= offset <= SIZE_TOL

def is_side_number_size(sz: float) -> bool:
    """Return True when the snapped size lies within SIZE_TOL of SIDE_NUMBER_SIZE."""
    snapped = snap_size(sz)
    distance = abs(snapped - SIDE_NUMBER_SIZE)
    return distance <= SIZE_TOL

def normalize_whitespace(s: str) -> str:
    """Collapse every run of whitespace into a single space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

In [9]:
def join_lines_smart(lines: List[str]) -> str:
    """
    Merge text fragments into one whitespace-normalised string.

    Each fragment is right-stripped. If the text accumulated so far ends with
    "-", that hyphen is dropped and the next fragment is glued on directly
    (de-hyphenation); otherwise the fragments are joined with a single space.
    Returns "" for an empty input.
    """
    merged = None
    for fragment in lines:
        fragment = fragment.rstrip()
        if merged is None:
            # First fragment: keep as is (leading whitespace is removed by the final normalisation).
            merged = fragment
        elif merged.endswith("-"):
            merged = merged[:-1] + fragment.lstrip()
        else:
            merged = merged + " " + fragment.lstrip()
    if merged is None:
        return ""
    return normalize_whitespace(merged)

In [19]:
def annotate_refs_in_span_text(text: str, fontname: str):
    """
    Reduce bracketed fragments in a bold span to the years they contain.

    For a span whose font is bold:
      - every "[ ... ]" fragment is located;
      - all four-digit years inside it are extracted;
      - the fragment is replaced by the bracketed, comma-separated years:
          one year      -> "[2008]"
          several years (e.g. "[2018,  amended 2019]") -> "[2018, 2019]"
      - a fragment without any year is left exactly as it was.

    Returns a tuple (annotated_text, years) where `years` is the list of all
    extracted years as ints, in order of appearance (duplicates kept).
    For a non-bold font the text is returned unchanged together with an empty list.
    """
    collected = []
    if not is_bold_font(fontname):
        return text, collected

    def _years_only(match):
        found = [hit.group(0) for hit in YEAR_RE.finditer(match.group(1))]
        if not found:
            # No year inside the brackets: keep the fragment untouched.
            return match.group(0)
        collected.extend(int(year) for year in found)
        return "[" + ", ".join(found) + "]"

    rewritten = BRACKETED_RE.sub(_years_only, text)
    return rewritten, collected



In [20]:
# Header/footer filtering: configuration and helper
HEADER_MARGIN = 50   # adjust as needed
FOOTER_MARGIN = 50   # adjust as needed

def is_header_or_footer(bbox, page_height, header_margin=HEADER_MARGIN, footer_margin=FOOTER_MARGIN) -> bool:
    """
    Decide from the y coordinates of a bounding box whether it lies in the
    header or footer band of a page.

      - header: y0 (bbox[1]) is smaller than header_margin
      - footer: y1 (bbox[3]) is larger than page_height - footer_margin

    A missing/empty bbox, or one with fewer than 4 entries, is never treated
    as header or footer.
    """
    if not bbox:
        return False
    if len(bbox) < 4:
        return False
    top, bottom = bbox[1], bbox[3]
    if top < header_margin:
        return True
    return bottom > (page_height - footer_margin)


In [21]:
def extract_pdf_to_jsonl(pdf_path: str, out_path: str,
                         header_margin: int = HEADER_MARGIN,
                         footer_margin: int = FOOTER_MARGIN) -> None:
    """
    Extract headings and paragraphs from a PDF and write them as JSON Lines.

    Every text span of every page is classified by its snapped font size:
      - spans in the header/footer bands (see is_header_or_footer) are ignored;
      - a side-number-sized span whose text matches SIDE_NUMBER_RE closes the
        paragraph collected so far and becomes the "side_label" of the next one;
      - heading-sized spans are collected into a heading (consecutive spans of
        the same level within one block form one title);
      - body-sized spans are appended to the current paragraph, after bold
        bracketed references have been reduced to their years;
      - any other size acts as a paragraph boundary.
    A paragraph is also closed at the end of every page, so a paragraph record
    never spans pages.

    Records written (one JSON object per line, UTF-8, non-ASCII kept as is):
      heading:   {"type", "level", "title", "page", "section_path"}
      paragraph: {"type", "text", "page", "section_path"} plus optional
                 "side_label" and "refs" (de-duplicated years, first-seen order)

    Parameters:
      pdf_path:      path of the PDF to read.
      out_path:      path of the JSONL file to write (overwritten).
      header_margin: height of the header band passed to is_header_or_footer.
      footer_margin: height of the footer band passed to is_header_or_footer.

    Errors raised while opening or reading the PDF, or while writing the output
    file, are not caught here; the PDF document is closed in every case.
    """
    section_stack: List[str] = ["", "", ""]  # current H1/H2/H3 titles
    current_para_lines: List[str] = []
    current_para_refs: List[int] = []
    current_para_pending_label: Optional[str] = None

    def emit_heading(level: int, title: str, page_no: int, f):
        """Record `title` at `level` in the section stack and write a heading record."""
        title = normalize_whitespace(title)
        idx = level - 1
        section_stack[idx] = title
        # A new heading invalidates all deeper levels.
        for j in range(idx + 1, len(section_stack)):
            section_stack[j] = ""
        rec = {
            "type": "heading",
            "level": level,
            "title": title,
            "page": page_no,
            "section_path": [s for s in section_stack if s],
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    def flush_paragraph(page_no: int, f):
        """Write the collected paragraph (if any) and reset all paragraph state."""
        nonlocal current_para_lines, current_para_refs, current_para_pending_label
        if current_para_lines:
            rec = {
                "type": "paragraph",
                "text": join_lines_smart(current_para_lines),
                "page": page_no,
                "section_path": [s for s in section_stack if s],
            }
            if current_para_pending_label:
                rec["side_label"] = current_para_pending_label
            if current_para_refs:
                # De-duplicate while keeping first-seen order.
                rec["refs"] = list(dict.fromkeys(current_para_refs))
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        # Without any text, a pending label and collected refs are discarded as well.
        current_para_lines = []
        current_para_refs = []
        current_para_pending_label = None

    # === Part 1: process a single page ===
    def process_page(page, page_no: int, f):
        page_height = page.rect.height
        d = page.get_text("dict")

        pending_heading_level: Optional[int] = None
        pending_heading_lines: List[str] = []

        def flush_heading_if_any():
            """Emit the heading collected so far (if any) and reset the heading state."""
            nonlocal pending_heading_level, pending_heading_lines
            if pending_heading_level is not None and pending_heading_lines:
                title = join_lines_smart(pending_heading_lines)
                emit_heading(pending_heading_level, title, page_no, f)
            pending_heading_level = None
            pending_heading_lines = []

        # === Part 2: process a single span ===
        def handle_span(span):
            nonlocal pending_heading_level, pending_heading_lines
            nonlocal current_para_pending_label
            raw = span.get("text", "")
            if not raw.strip():
                return

            # Skip anything located in the header/footer bands.
            bbox = span.get("bbox", [0, 0, 0, 0])
            if is_header_or_footer(bbox, page_height, header_margin, footer_margin):
                return

            fontname = span.get("font", "")
            size = float(span.get("size", 0.0))
            size_snapped = snap_size(size)

            # 10pt side number
            if is_side_number_size(size_snapped) and SIDE_NUMBER_RE.match(raw.strip()):
                flush_heading_if_any()
                # The label belongs to the NEXT paragraph, so the paragraph
                # collected so far has to be closed first; otherwise it would
                # absorb the following text and end up with this label.
                flush_paragraph(page_no, f)
                current_para_pending_label = raw.strip()
                return

            # Heading?
            lvl = heading_level_from_size(size_snapped)
            if lvl is not None:
                flush_paragraph(page_no, f)
                if (pending_heading_level is not None) and (lvl != pending_heading_level):
                    flush_heading_if_any()
                pending_heading_level = lvl
                pending_heading_lines.append(raw)
                return

            # Body text or anything else ends a pending heading.
            if pending_heading_level is not None:
                flush_heading_if_any()

            if is_body_size(size_snapped):
                annotated, years = annotate_refs_in_span_text(raw, fontname)
                if years:
                    current_para_refs.extend(years)
                current_para_lines.append(annotated)
            else:
                # Any other font size is treated as a paragraph boundary.
                flush_paragraph(page_no, f)

        # Walk through the text blocks of the page (non-text blocks are skipped).
        for block in d.get("blocks", []):
            if block.get("type", 0) != 0:
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    handle_span(span)
            # End of a block: emit a heading that is still pending.
            if pending_heading_level is not None:
                flush_heading_if_any()

        # End of the page: write out the unfinished paragraph.
        flush_paragraph(page_no, f)

    # Main loop. The document is closed even if writing or processing fails.
    doc = fitz.open(pdf_path)
    try:
        with open(out_path, "w", encoding="utf-8") as f:
            for p_idx in range(len(doc)):
                process_page(doc[p_idx], p_idx + 1, f)
    finally:
        doc.close()
    print(f"✅ 提取完成：{out_path}")


In [22]:
# === Run section ===
# 1) Set the input/output paths
input_pdf = "../data/adhd_guideline.pdf"        # ← change to your PDF path
output_jsonl = "../data/adhd_guideline.jsonl"  # ← where the JSONL should be saved

# 2) Run the extraction
# extract_pdf_to_jsonl(input_pdf, output_jsonl)
# After it has run, the cells below can be used to preview a few lines of the result

In [23]:
# Run the extraction with the default header/footer margins.
extract_pdf_to_jsonl(pdf_path=input_pdf, out_path=output_jsonl)

✅ 提取完成：../data/adhd_guideline.jsonl


In [24]:
# Preview the output (if it has been generated)
def head_jsonl(path: str, n: int = 10):
    """
    Print the first `n` lines of the text file at `path`, each with trailing
    whitespace removed.

    If the file does not exist, a "file not found" message is printed instead
    of raising.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            # zip stops as soon as range(n) is exhausted, so at most n lines are printed.
            for _, row in zip(range(n), handle):
                print(row.rstrip())
    except FileNotFoundError:
        print("未找到文件：", path)

In [25]:
# Show the first 20 records of the extraction result.
head_jsonl(output_jsonl, n=20)

{"type": "paragraph", "text": "This guideline replaces CG72, TA98, ESNM19, ESUOM8 and ESNM70. This guideline is the basis of QS39.", "page": 1, "section_path": []}
{"type": "heading", "level": 1, "title": "Overview", "page": 1, "section_path": ["Overview"]}
{"type": "paragraph", "text": "This guideline covers recognising, diagnosing and managing attention deficit hyperactivity disorder (ADHD) in children, young people and adults. It aims to improve recognition and diagnosis, as well as the quality of care and support for people with ADHD.", "page": 1, "section_path": ["Overview"]}
{"type": "heading", "level": 2, "title": "Who is it for?", "page": 1, "section_path": ["Overview", "Who is it for?"]}
{"type": "paragraph", "text": "• Healthcare professionals • Commissioners and providers • People with ADHD, and their families and carers", "page": 1, "section_path": ["Overview", "Who is it for?"]}
{"type": "heading", "level": 1, "title": "Recommendations", "page": 2, "section_path": ["Recomm

In [26]:
import pandas as pd

In [27]:
# NOTE(review): this loads "adhd_guideline.preprocessed.jsonl", while the extraction above
# writes "adhd_guideline.jsonl"; no cell visible here produces the preprocessed file —
# confirm which step creates it so the notebook runs from a fresh kernel.
df = pd.read_json("../data/adhd_guideline.preprocessed.jsonl", lines=True)

In [34]:
# Print the complete "text" value of the row with index label 99.
print(df.loc[99, "text"])

The 2018 recommendations ensure that parents and carers of all children and young people with ADHD receive ADHD-focused information and support. Children and young people aged 5 years and over are offered medication by a healthcare professional with training and expertise in diagnosing and managing ADHD only if ADHD symptoms are still causing a significant impairment in at least one domain of their everyday life despite implementation of environmental modifications. This choice follows discussion with the child or young person and their parents or carers and a full baseline assessment. The recommendations make it clear that where a child has symptoms of oppositional defiant disorder or conduct disorder, parents and carers should be offered a parent-training programme in line with the recommendations in NICE's guideline on antisocial behaviour and conduct disorders . The current categorisation of ADHD focuses on the presence of significant impairment in the different domains of everyday