In [None]:
import os
from dotenv import  load_dotenv
from langchain.agents import create_agent
from langchain_groq import  ChatGroq

load_dotenv()
model = ChatGroq(model=os.environ['GROQ_MODEL'])

In [1]:
from typing import Optional
from pathlib import Path
import time

from docling.document_converter import DocumentConverter

def pdf_to_markdown(
    source_path: str,
    destination_path: Optional[str] = None,
    overwrite: bool = True,
) -> Path:
    """
    Convert a PDF file to structured Markdown using Docling.
    """

    start = time.time()

    src = Path(source_path)

    if not src.exists():
        raise FileNotFoundError(f"Source file not found: {src}")

    if src.suffix.lower() != ".pdf":
        raise ValueError("Input file must be a PDF")

    # Determine output directory
    out_dir = Path(destination_path) if destination_path else src.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    output_file = out_dir / f"{src.stem}.md"

    if output_file.exists() and not overwrite:
        print(f"Markdown already exists, skipping: {output_file}")
        return output_file

    print(f"Converting: {src.name}")

    converter = DocumentConverter()

    result = converter.convert(str(src))
    markdown = result.document.export_to_markdown()

    output_file.write_text(markdown, encoding="utf-8")

    elapsed = time.time() - start
    print(f"Saved → {output_file}")
    print(f"Completed in {elapsed:.2f}s")

    return output_file

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_file ='../docs/model-law-568.pdf'
dest_path = "../docs/markdown/"
pdf_to_markdown(pdf_file,dest_path)

Converting: model-law-568.pdf
Saved → ../docs/markdown/model-law-568.md
Completed in 3.46s


PosixPath('../docs/markdown/model-law-568.md')

In [31]:
from pypdf import PdfReader
import re
import pypdfium2 as pdfium

def extract_naic_header(pdf_path: str) -> dict:
    """
    Extract NAIC publication line + Act title from the first page.
    Returns structured metadata.
    """

    first_page = pdfium.PdfDocument(pdf_path)[0] # first page of pdf 
    text = first_page.get_textpage().get_text_range()
    lines = [line.strip() for line in text.splitlines() if line.strip()]

    header_lines = []

    for line in lines:
        # Stop when actual document body begins
        if re.search(r"Table of Contents|Section\s+1", line):
            break

        header_lines.append(line)

    # First line = publication info
    publication = header_lines[0]

    # Remaining uppercase lines usually form the Act title
    title_lines = [line for line in header_lines[1:] if line.isupper()]

    title = " ".join(title_lines)

    return {
        "publication": publication,
        "act_title": title,
    }


def get_metadata(pdf_path: str) -> str | None:
    """
    Extract title from PDF metadata (Info dictionary).
    Returns None if not present.
    """
    reader = PdfReader(pdf_path)
    meta = reader.metadata

    metadata = {}
    if meta and meta.title:
        metadata["title"] = meta.title_raw
    if meta and meta.creation_date:
        metadata["creation_date"] = meta.creation_date_raw
    if meta and meta.modification_date:
        metadata["modification_date"] = meta.modification_date_raw

    headers = extract_naic_header(pdf_path)

    return metadata | headers


In [None]:
from pathlib import Path
from langchain_core.documents import Document


def split_naic_markdown(md_path: str):
    text = Path(md_path).read_text(encoding="utf-8")

    # Split by Sections first
    section_pattern = r"(?=^##\sSection\s\d+\.)"
    sections = re.split(section_pattern, text, flags=re.MULTILINE)

    documents = []

    for section in sections:
        section = section.strip()
        if not section:
            continue

        # Extract section number
        section_match = re.search(r"Section\s(\d+)", section)
        section_num = section_match.group(1) if section_match else "unknown"

        # Now split into legal clauses A., B., (1), etc.
        clause_pattern = r"(?=\n-\s[A-Z]\.)|(?=\n-\s\(\d+\))"
        clauses = re.split(clause_pattern, section)
        for clause in clauses:
            clause = clause.strip()
            if len(clause) < 40:
                continue
            
            documents.append(
                Document(
                    page_content=clause,
                    metadata={
                        "model_law": "565",
                        "section": section_num,
                        "source": md_path,
                    },
                )
            )

    return documents


In [30]:
split_naic_markdown('../docs/markdown/model-law-568.md')

[Document(metadata={'model_law': '565', 'section': '1', 'source': '../docs/markdown/model-law-568.md'}, page_content='## Table of Contents\n\n| Section 1.   | Purpose                                                                              |\n|--------------|--------------------------------------------------------------------------------------|\n| Section 2.   | Scope                                                                                |\n| Section 3.   | Authority                                                                            |\n| Section 4.   | Exemptions                                                                           |\n| Section 5.   | Definitions                                                                          |\n| Section 6.   | Practices Declared False, Misleading, Deceptive or Unfair on a Military Installation |\n| Section 7.   | Practices Declared False, Misleading, Deceptive or Unfair Regardless of Location     |\n| Section 8.   | S

In [50]:
from typing import List, Dict, Any
from docling.document_converter import DocumentConverter
from langchain_core.documents import Document


def process_state_page(pdf_path: str) -> Dict[str, Any]:
    """
    Process an NAIC State Page (STxxx) PDF into:
      - explanations: narrative text chunks (for embeddings)
      - records: structured jurisdiction rows (JSON-style)

    Returns:
        {
            "explanations": List[Document],
            "records": List[dict]
        }
    """

    converter = DocumentConverter()
    result = converter.convert(pdf_path)
    doc = result.document

    explanations: List[Document] = []
    records: List[dict] = []

    for block in doc.texts:  # Docling text blocks (already layout-grouped)
        text = block.text.strip()

        # Skip very small or obvious header noise
        if len(text) < 120:
            continue

        # Skip table header phrases that get duplicated
        if "NAIC MEMBER" in text and "MODEL ADOPTION" in text:
            continue

        explanations.append(
            Document(
                page_content=text,
                metadata={
                    "source": pdf_path,
                    "content_type": "state_page_explanation"
                }
            )
        )

    for table in doc.tables:
        headers = [h.strip() for h in table.header]

        for row in table.rows:
            values = [c.strip() if c else None for c in row]

            if len(values) != len(headers):
                continue

            record = dict(zip(headers, values))

            jurisdiction = record.get("NAIC MEMBER")
            if not jurisdiction:
                continue

            records.append({
                "jurisdiction": jurisdiction,
                "model_adoption": record.get("MODEL ADOPTION"),
                "previous_version": record.get("PREVIOUS VERSION"),
                "related_activity": record.get("RELATED ACTIVITY"),
                "source": pdf_path
            })

    return {
        "explanations": explanations,
        "records": records
    }


In [114]:
pdf_path= '../docs/markdown/model-law-state-page-565.md'
# pdf_path= '../docs/markdown/model-law-565.md'
converter = DocumentConverter()
result = converter.convert(pdf_path)
doc = result.document

In [58]:
explanations: List[Document] = []
for block in doc.texts:  # Docling text blocks (already layout-grouped)
    text = block.text.strip()

    if len(text) < 120:
            continue

    #     # Skip table header phrases that get duplicated
    # if "NAIC MEMBER" in text and "MODEL ADOPTION" in text:
    #         continue

    explanations.append(
            Document(
                page_content=text,
                metadata={
                    "source": pdf_path,
                    "content_type": "state_page_explanation"
                }
            )
        )
    
explanations

[Document(metadata={'source': '../docs/markdown/model-law-state-page-565.md', 'content_type': 'state_page_explanation'}, page_content="This  chart  is  intended  to  provide  readers  with  additional  information  to  more  easily  access  state  statutes, regulations, bulletins or administrative rulings related to the NAIC model. Such guidance provides readers with a starting  point  from  which  they  may  review  how  each  state  has  addressed  the  model  and  the  topic  being covered. The NAIC Legal Division has reviewed each state's activity in this area and has determined whether the citation  most  appropriately  fits  in  the  Model  Adoption  column,  Previous  Version  column,  or  Related  Activity column based on the definitions listed in the key below . The NAIC's interpretation may or may not be shared by the individual states or by interested readers."),
 Document(metadata={'source': '../docs/markdown/model-law-state-page-565.md', 'content_type': 'state_page_explana

In [62]:
doc.texts

[SectionHeaderItem(self_ref='#/texts/0', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.SECTION_HEADER: 'section_header'>, prov=[], source=[], comments=[], orig='GROUP LIFE INSURANCE DEFINITION AND GROUP LIFE INSURANCE STANDARD PROVISIONS MODEL BILL', text='GROUP LIFE INSURANCE DEFINITION AND GROUP LIFE INSURANCE STANDARD PROVISIONS MODEL BILL', formatting=None, hyperlink=None, level=1),
 SectionHeaderItem(self_ref='#/texts/1', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.SECTION_HEADER: 'section_header'>, prov=[], source=[], comments=[], orig='What are the state pages?', text='What are the state pages?', formatting=None, hyperlink=None, level=1),
 TextItem(self_ref='#/texts/2', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.TEXT: 'text'>, prov=[], source=[], comments=[], or

In [87]:
for table in doc.tables:
    td = table.data 
    print("Rows:", td.num_rows)
    print("Cols:", td.num_cols)

Rows: 7
Cols: 4
Rows: 13
Cols: 4
Rows: 12
Cols: 4
Rows: 12
Cols: 4
Rows: 11
Cols: 4
Rows: 7
Cols: 4


In [130]:

table = doc.tables[0]
td = table.data
print("Rows:", td.num_rows)
print("Cols:", td.num_cols)
rows = list(td)[0]
rows


Rows: 7
Cols: 4


('table_cells',
 [TableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=0, end_col_offset_idx=1, text='NAIC MEMBER', column_header=True, row_header=False, row_section=False, fillable=False),
  TableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=1, end_col_offset_idx=2, text='MODEL ADOPTION', column_header=True, row_header=False, row_section=False, fillable=False),
  TableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=2, end_col_offset_idx=3, text='PREVIOUS VERSION', column_header=True, row_header=False, row_section=False, fillable=False),
  TableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=3, end_col_offset_idx=4, text='RELATED ACTIVITY', column_header=True, row_header=False, row_section=False, fillable=False),
  TableCell(bbox=None, row_span=1, col_

In [135]:
for kind, payload in td:
    print(kind)

table_cells
num_rows
num_cols


In [None]:
def _extract_table_records(doc, pdf_path: str):

    records = []

    for table in doc.tables:

        flat_cells = None

        # Get flattened payload
        for kind, payload in table.data:
            if kind == "table_cells":
                flat_cells = payload
                break

        if not flat_cells:
            continue

        headers = []
        header_len = 0

        for cell in flat_cells:
            if getattr(cell, "column_header", False):
                headers.append(cell.text.strip())
                header_len += 1
            else:
                break  # first non-header = start of data

        if header_len == 0:
            continue


        data_cells = flat_cells[header_len:]

        values = [
            (cell.text.strip() if cell.text else "")
            for cell in data_cells
        ]
        
        # Rebuild rows using detected column count

        for i in range(0, len(values), header_len):

            row = values[i:i + header_len]

            if len(row) < header_len:
                continue

            row_dict = dict(zip(headers, row))

            # Require first column (usually jurisdiction)
            first_value = row[0]
            if not first_value:
                continue

            row_dict["source"] = pdf_path

            records.append(row_dict)

    return records


In [145]:
_extract_records(doc,pdf_path)

[{'NAIC MEMBER': 'Alabama',
  'MODEL ADOPTION': 'ALA. CODE §§ 27-18-1 to 27-18-16 (1971) (§ 5 of model).',
  'PREVIOUS VERSION': '',
  'RELATED ACTIVITY': '',
  'source': '../docs/markdown/model-law-state-page-565.md'},
 {'NAIC MEMBER': 'Alaska',
  'MODEL ADOPTION': 'ALASKA STAT. §§ 21.48.010 to 21.48.230 (1966/2006).',
  'PREVIOUS VERSION': '',
  'RELATED ACTIVITY': '',
  'source': '../docs/markdown/model-law-state-page-565.md'},
 {'NAIC MEMBER': 'American Samoa',
  'MODEL ADOPTION': 'NO CURRENT ACTIVITY',
  'PREVIOUS VERSION': '',
  'RELATED ACTIVITY': '',
  'source': '../docs/markdown/model-law-state-page-565.md'},
 {'NAIC MEMBER': 'Arizona',
  'MODEL ADOPTION': 'ARIZ. REV. STAT. ANN. §§ 20-1251 to 20-1269 (1954/2005).',
  'PREVIOUS VERSION': '',
  'RELATED ACTIVITY': '',
  'source': '../docs/markdown/model-law-state-page-565.md'},
 {'NAIC MEMBER': 'Arkansas',
  'MODEL ADOPTION': 'ARK. CODE ANN. §§ 23-83-101 to 23-83-122 (1981/1987).',
  'PREVIOUS VERSION': '',
  'RELATED ACTIVITY':

In [160]:
output_file = "../docs/newfile.json"
josn_data =doc.export_to_dict()
Path(output_file).write_text(str(josn_data), encoding="utf-8")

199577