In [None]:
# Installer nÃ¸dvendige pakker
# !pip install pymupdf4llm pymupdf requests

In [None]:
import pymupdf4llm
import fitz
import re
import json
from pathlib import Path
from datetime import datetime

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [6]:
def clean_markdown(md: str) -> str:
    """Strip page numbers and clean spacing"""
    md = re.sub(r"(?m)^\s*\d+\s*$", "", md)  # remove number-only lines
    md = re.sub(r"\n{3,}", "\n\n", md)
    return md.strip()
    
def extract_pdf_to_jsonl(
    pdf_path: Path,
    out_dir: Path,
    source: str,
    domain: str,
    start_page: int = 0,
    end_page: int | None = None,
    source_type: str = "pdf",
) -> Path:
    """
    Konverter lokal PDF til markdown â†’ lagre som JSONL med metadata.
    Returnerer sti til .jsonl-fila.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Filnavn basert pÃ¥ tittel
    slug = source.lower().replace(" ", "-")
    out_jsonl = out_dir / f"{slug}.jsonl"

    # Finn antall sider
    with fitz.open(pdf_path) as doc:
        if end_page is None:
            end_page = doc.page_count
        print(f"ðŸ“„ {doc.page_count} sider (leser {start_page+1}â€“{end_page})")

    # PDF â†’ Markdown
    md_text = pymupdf4llm.to_markdown(str(pdf_path), pages=range(start_page, end_page))
    md_text = clean_markdown(md_text)

    # Lag JSON-record
    record = {
        "source": source,
        "domain": domain,
        "source_url": str(pdf_path),
        "source_type": source_type,
        "text_format": "markdown",
        "text": md_text,
        "retrieved_at": datetime.utcnow().isoformat(timespec="seconds") + "Z",
    }

    # Skriv utfil
    out_jsonl.write_text(json.dumps(record, ensure_ascii=False) + "\n", encoding="utf-8")
    print(f"ðŸ’¾ Skrev {out_jsonl}")
    return out_jsonl

## KjÃ¸r konvertering av Statsbudsjett PDF

Oppdater URL og metadata under for Ã¥ laste ned og konvertere statsbudsjett.pdf

In [None]:
# Konfigurasjon
PDF_PATH = Path("../../data/dokumentavgift-2025.pdf")
OUTPUT_DIR = Path("./output")
SOURCE_NAME = "Statsbudsjett 2025"
DOMAIN = "offentlig"

# KjÃ¸r konvertering
output_file = extract_pdf_to_jsonl(
    pdf_path=PDF_PATH,
    out_dir=OUTPUT_DIR,
    source=SOURCE_NAME,
    domain=DOMAIN,
    start_page=0,
    end_page=None,  # None = alle sider
)

print(f"\nâœ… Ferdig! Output: {output_file}")

ðŸ“„ 190 sider (leser 1â€“190)
ðŸ’¾ Skrev output/statsbudsjett-2025.jsonl

âœ… Ferdig! Output: output/statsbudsjett-2025.jsonl
ðŸ’¾ Skrev output/statsbudsjett-2025.jsonl

âœ… Ferdig! Output: output/statsbudsjett-2025.jsonl


  "retrieved_at": datetime.utcnow().isoformat(timespec="seconds") + "Z",


## Les og inspiser resultat

In [8]:
# Les JSONL-fila
with open(output_file, "r", encoding="utf-8") as f:
    data = json.loads(f.read())

print(f"Source: {data['source']}")
print(f"Domain: {data['domain']}")
print(f"Text length: {len(data['text'])} chars")
print(f"\nFirst 500 chars:\n{data['text'][:500]}...")

Source: Statsbudsjett 2025
Domain: offentlig
Text length: 546690 chars

First 500 chars:
(Korrigert utgave per 14.11.2025)

# **Meld. St. 1**

###### (2025 â€“ 2026) Melding til Stortinget

### Nasjonalbudsjettet 2026

# **Meld. St. 1**

###### (2025â€“2026) Melding til Stortinget

### Nasjonalbudsjettet 2026

##### **Innhold**

**1** **Hovedlinjer i den Ã¸konomiske**
**politikken og utsiktene**
**for norsk Ã¸konomi** ...................... 5

**2** **De Ã¸konomiske utsiktene** ......... 12
2.1 Internasjonal konjunktursituasjon .......................................... 12
2.2 Norsk konjun...


In [9]:
# Lagre markdown direkte til fil
md_output = OUTPUT_DIR / f"{SOURCE_NAME.lower().replace(' ', '-')}.md"
md_output.write_text(data['text'], encoding='utf-8')
print(f"âœ… Markdown lagret til: {md_output}")
print(f"ðŸ“Š StÃ¸rrelse: {len(data['text']):,} tegn")

âœ… Markdown lagret til: output/statsbudsjett-2025.md
ðŸ“Š StÃ¸rrelse: 546,690 tegn


# Chunking Strategier

Vi tester tre forskjellige chunking strategier:
1. **Naive**: Split pÃ¥ 200 tegn
2. **Overlap**: Behold overlap mellom chunks, ikke split pÃ¥ headings
3. **Metadata**: Legg til kontekst fra markdown headers

In [None]:
# Installer chunking bibliotek (kjÃ¸r hvis nÃ¸dvendig)
# !pip install langchain-text-splitters pandas

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
import pandas as pd

## Strategi 1: Naive Chunking (200 tegn)

In [13]:
def naive_chunking(text: str, chunk_size: int = 200) -> list[dict]:
    """Split text naivt pÃ¥ chunk_size tegn"""
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunk_text = text[i:i + chunk_size]
        chunks.append({
            "chunk_id": i // chunk_size,
            "text": chunk_text,
            "strategy": "naive",
            "char_count": len(chunk_text)
        })
    return chunks

# Test naive chunking
naive_chunks = naive_chunking(data['text'], chunk_size=200)
print(f"ðŸ“Š Naive chunking: {len(naive_chunks)} chunks")
print(f"\nEksempel chunk #{5}:")
print(naive_chunks[5])

ðŸ“Š Naive chunking: 2734 chunks

Eksempel chunk #5:
{'chunk_id': 5, 'text': 'finanser\n\n     - g sammenligning med\nandre land ....................................... 68\n\n**4** **Andre deler av den**\n**Ã¸konomiske politikken** .............. 78\n4.1 Pengepolitikken ...............', 'strategy': 'naive', 'char_count': 200}


## Strategi 2: Overlap Chunking (med overlap, respekter strukturer)

In [29]:
def overlap_chunking(text: str, chunk_size: int = 500, overlap: int = 100) -> list[dict]:
    """
    Split text med overlap, prÃ¸v Ã¥ ikke splitte midt i setninger
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""],  # Prioriter naturlige skilletegn
    )
    
    splits = text_splitter.split_text(text)
    chunks = []
    for i, chunk_text in enumerate(splits):
        chunks.append({
            "chunk_id": i,
            "text": chunk_text,
            "strategy": "overlap",
            "char_count": len(chunk_text),
            "overlap_size": overlap
        })
    return chunks

# Test overlap chunking
overlap_chunks = overlap_chunking(data['text'], chunk_size=500, overlap=100)
print(f"ðŸ“Š Overlap chunking: {len(overlap_chunks)} chunks")
print(f"\nEksempel chunk #{5}:")
print(overlap_chunks[5])
print(f"\nðŸ”— Overlap mellom chunk 5 og 6:")
if len(overlap_chunks) > 6:
    # Vis overlap
    chunk5_end = overlap_chunks[5]['text'][-50:]
    chunk6_start = overlap_chunks[6]['text'][:50]
    print(f"Chunk 5 slutt: ...{chunk5_end}")
    print(f"Chunk 6 start: {chunk6_start}...")

ðŸ“Š Overlap chunking: 1580 chunks

Eksempel chunk #5:
{'chunk_id': 5, 'text': '5.2 Arbeidskraft .................................... 105\n\n5.3 Produktivitet ................................... 118\n5.4 Effektiv offentlig ressursbruk ....... 133\n\n**6** **Velferd, fordeling og ulikhet** .... 141\n6.1 MÃ¥ling av velferd, livskvalitet\n\n       - g levekÃ¥r ....................................... 143\n6.2 Ã˜konomisk ulikhet i Norge .......... 145\n\n**Vedlegg**\n1 Beregning av strukturell', 'strategy': 'overlap', 'char_count': 398, 'overlap_size': 100}

ðŸ”— Overlap mellom chunk 5 og 6:
Chunk 5 slutt: ......... 145

**Vedlegg**
1 Beregning av strukturell
Chunk 6 start: **Vedlegg**
1 Beregning av strukturell

      - lj...


## Strategi 3: Metadata Chunking (med markdown headers som kontekst)

In [None]:
def metadata_chunking(text: str, chunk_size: int = 500, overlap: int = 50) -> list[dict]:
    """
    Split basert pÃ¥ markdown headers og legg til metadata fra heading-hierarkiet.
    Format: 'Dette innholdet er i kategori <h1> og handler om <h2>'
    """
    # FÃ¸rst: Split basert pÃ¥ markdown headers
    headers_to_split_on = [
        ("#", "h1"),
        ("##", "h2"),
        ("###", "h3"),
    ]
    
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False  # Behold headers i teksten
    )
    
    md_header_splits = markdown_splitter.split_text(text)
    
    # Deretter: Split hver seksjon videre med overlap hvis den er for stor
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len,
    )
    
    chunks = []
    chunk_id = 0
    
    for doc in md_header_splits:
        # Hent metadata (headers)
        metadata = doc.metadata if hasattr(doc, 'metadata') else {}
        content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
        
        # Bygg kontekst-streng fra headers
        context_parts = []
        if 'h1' in metadata:
            context_parts.append(f"kategori '{metadata['h1']}'")
        if 'h2' in metadata:
            context_parts.append(f"handler om '{metadata['h2']}'")
        if 'h3' in metadata:
            context_parts.append(f"underkategori '{metadata['h3']}'")
        
        context_string = "Dette innholdet er i " + " og ".join(context_parts) if context_parts else ""
        
        # Split videre hvis nÃ¸dvendig
        if len(content) > chunk_size:
            sub_chunks = text_splitter.split_text(content)
            for sub_chunk in sub_chunks:
                chunks.append({
                    "chunk_id": chunk_id,
                    "text": sub_chunk,
                    "context": context_string,
                    "metadata": metadata,
                    "strategy": "metadata",
                    "char_count": len(sub_chunk),
                })
                chunk_id += 1
        else:
            chunks.append({
                "chunk_id": chunk_id,
                "text": content,
                "context": context_string,
                "metadata": metadata,
                "strategy": "metadata",
                "char_count": len(content),
            })
            chunk_id += 1
    
    return chunks

# Test metadata chunking
metadata_chunks = metadata_chunking(data['text'], chunk_size=500, overlap=50)
print(f"ðŸ“Š Metadata chunking: {len(metadata_chunks)} chunks")
print(f"\nEksempel chunk med metadata:")
for i, chunk in enumerate(metadata_chunks[:5]):
    if chunk['metadata']:  # Vis fÃ¸rste chunk med metadata
        print(f"\nChunk #{chunk['chunk_id']}:")
        print(f"Context: {chunk['context']}")
        print(f"Metadata: {chunk['metadata']}")
        print(f"Text preview: {chunk['text'][:200]}...")
        break

ðŸ“Š Metadata chunking: 1260 chunks

Eksempel chunk med metadata:

Chunk #0:
Context: Dette innholdet er fra kapittel 'Innhold' og handler om 'TilrÃ¥ding fra Finansdepartementet 7. oktober 2025,'
Metadata: {'h5': '**Innhold**'}
Text preview: (Korrigert utgave per 14.11.2025)  
# **Meld. St. 1**  
###### (2025 â€“ 2026) Melding til Stortinget  
### Nasjonalbudsjettet 2026  
# **Meld. St. 1**  
###### (2025â€“2026) Melding til Stortinget  
### ...


## Sammenligning av strategier

In [31]:
# Sammenlign strategiene
comparison = pd.DataFrame([
    {
        "Strategi": "Naive",
        "Antall chunks": len(naive_chunks),
        "Gj.snitt stÃ¸rrelse": sum(c['char_count'] for c in naive_chunks) / len(naive_chunks),
        "Overlap": "Nei",
        "Metadata": "Nei"
    },
    {
        "Strategi": "Overlap",
        "Antall chunks": len(overlap_chunks),
        "Gj.snitt stÃ¸rrelse": sum(c['char_count'] for c in overlap_chunks) / len(overlap_chunks),
        "Overlap": "Ja (100 tegn)",
        "Metadata": "Nei"
    },
    {
        "Strategi": "Metadata",
        "Antall chunks": len(metadata_chunks),
        "Gj.snitt stÃ¸rrelse": sum(c['char_count'] for c in metadata_chunks) / len(metadata_chunks),
        "Overlap": "Ja (50 tegn)",
        "Metadata": "Ja (headers)"
    }
])

print("ðŸ“Š Sammenligning av chunking strategier:\n")
print(comparison.to_string(index=False))

ðŸ“Š Sammenligning av chunking strategier:

Strategi  Antall chunks  Gj.snitt stÃ¸rrelse       Overlap     Metadata
   Naive           2734          199.959766           Nei          Nei
 Overlap           1580          375.341139 Ja (100 tegn)          Nei
Metadata           1260          454.423810  Ja (50 tegn) Ja (headers)


## Lagre chunks til JSONL

In [32]:
# Lagre hver strategi til egen fil
strategies = {
    "naive": naive_chunks,
    "overlap": overlap_chunks,
    "metadata": metadata_chunks
}

for strategy_name, chunks in strategies.items():
    output_path = OUTPUT_DIR / f"chunks_{strategy_name}.jsonl"
    with open(output_path, "w", encoding="utf-8") as f:
        for chunk in chunks:
            f.write(json.dumps(chunk, ensure_ascii=False) + "\n")
    print(f"âœ… Lagret {len(chunks)} chunks til {output_path}")

âœ… Lagret 2734 chunks til output/chunks_naive.jsonl
âœ… Lagret 1580 chunks til output/chunks_overlap.jsonl
âœ… Lagret 1260 chunks til output/chunks_metadata.jsonl
