In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os
import numpy as np
import re
import uuid
import json

In [None]:
# Directory scanned for input PDFs by load_documents_one_per_pdf below. It is a
# relative path, so it is resolved against the kernel's current working directory.
# NOTE(review): the name says "json" but the loader only reads PDF files — confirm
# the folder name is intended.
pdf_folder = "data_json/"

def load_documents_one_per_pdf(pdf_folder):
    """Load every PDF in ``pdf_folder`` as one merged ``Document`` per file.

    Each file is read with ``PyPDFLoader``; the ``page_content`` of all the
    documents it returns (one per page in the loader's default mode) is joined
    with a single space into one text.

    Args:
        pdf_folder: Path of the directory to scan. Only its direct entries are
            considered; sub-directories are not traversed.

    Returns:
        A list of ``Document`` objects, one per PDF, ordered by filename. Each
        carries ``metadata={"source_file": <filename>}``.

    Raises:
        FileNotFoundError: If ``pdf_folder`` does not exist (from ``os.listdir``).
    """
    all_docs = []

    # os.listdir() returns entries in arbitrary, platform-dependent order;
    # sorting keeps the document (and later chunk) order reproducible.
    for filename in sorted(os.listdir(pdf_folder)):
        # Match the extension case-insensitively so "report.pdf" is not
        # silently skipped while "REPORT.PDF" is loaded.
        if not filename.lower().endswith(".pdf"):
            continue

        pages = PyPDFLoader(os.path.join(pdf_folder, filename)).load()
        full_text = " ".join(page.page_content for page in pages)
        all_docs.append(
            Document(
                page_content=full_text,
                metadata={"source_file": filename},
            )
        )

    return all_docs

# Build one merged Document per matching file in pdf_folder and report the count.
documents = load_documents_one_per_pdf(pdf_folder)
print(f"Loaded {len(documents)} documents.")


Loaded 3 documents.


In [None]:
""" def clean_text(documents):
    cleaned_documents = []

    for doc in documents:
        text = doc.page_content

        lines = text.splitlines()
        filtered_lines = []

        for line in lines:
            if (
                re.search(r"\d+\s+\d+\s+\d+", line) or
                line.count("|") > 1 or
                len(re.findall(r"\d", line)) > 15
            ):
                continue

            filtered_lines.append(line)

        text = " ".join(filtered_lines)

        text = re.sub(r"")

        doc.page_content = text
        cleaned_documents.append(doc)

    return cleaned_documents


documents = clean_text(documents)
 """

def clean_text(documents, max_digit_ratio=0.35, min_wide_gaps=2):
    """Strip table-like numeric rows and normalise whitespace in each document.

    The text of every document is processed line by line:

    * surrounding whitespace is stripped and empty lines are dropped;
    * a line is discarded when more than ``max_digit_ratio`` of its characters
      are digits AND it contains at least ``min_wide_gaps`` runs of two spaces
      (a heuristic for rows of a numeric table);
    * in the lines that are kept, runs of spaces/tabs collapse to one space.

    The kept lines are re-joined with a single ``"\\n"``, so the result never
    contains blank lines.

    Note:
        Documents are modified in place: ``page_content`` of each input object
        is overwritten, and the returned list holds those same objects.

    Args:
        documents: Iterable of objects exposing a writable ``page_content`` str.
        max_digit_ratio: Digit share (0-1) above which a line counts as numeric.
        min_wide_gaps: Minimum number of double-space occurrences for a line to
            count as column-aligned.

    Returns:
        A list with the same document objects, in input order.
    """
    cleaned_docs = []

    for doc in documents:
        kept_lines = []

        for raw_line in doc.page_content.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            # Measured on the stripped line *before* whitespace is collapsed,
            # otherwise the double-space column gaps would already be gone.
            digit_ratio = sum(ch.isdigit() for ch in line) / len(line)
            wide_gaps = line.count("  ")
            if digit_ratio > max_digit_ratio and wide_gaps >= min_wide_gaps:
                continue

            kept_lines.append(re.sub(r"[ \t]+", " ", line))

        # Blank lines were dropped above, so consecutive newlines cannot occur
        # and no further newline squeezing is needed.
        doc.page_content = "\n".join(kept_lines)
        cleaned_docs.append(doc)

    return cleaned_docs

# Rebind `documents` to the cleaned list. clean_text also overwrites each
# Document's page_content in place, so the uncleaned text is not kept anywhere.
documents = clean_text(documents)


  re.search(r"\d+\s+\d+\s+\d+", line) or


In [None]:
def chunk_documents(documents, chunk_size=1000, chunk_overlap=100):
    """Split documents into overlapping chunks with minimal, uniform metadata.

    Before splitting, a blank line is inserted in front of anything that looks
    like a numbered upper-case heading (digits, whitespace, then capitals, e.g.
    ``"3 RESULTS"``) so the splitter's ``"\\n\\n"`` separator prefers to break
    there. This normalisation is applied to a local copy only: the input
    document's ``page_content`` is left untouched, so calling the function
    again (e.g. re-running the notebook cell) yields the same chunks.

    Args:
        documents: Iterable of ``Document`` objects. A document without a
            truthy ``metadata["document_id"]`` gets a fresh UUID4 string stored
            in its metadata, so later calls reuse the same id.
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        A flat list of chunk ``Document`` objects in input order. Each chunk's
        metadata contains exactly ``document_id``, ``chunk_index`` (1-based,
        restarting for every source document) and ``source_file``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=[
            "\n\n",   # section breaks (inserted before headings below)
            "\n",
            ". ", "? ", "! ",
            " "
        ]
    )

    all_chunks = []

    for doc in documents:
        document_id = doc.metadata.get("document_id")
        if not document_id:
            document_id = str(uuid.uuid4())
            doc.metadata["document_id"] = document_id

        # Work on a local string. Writing this back to doc.page_content would
        # make every re-run prepend yet another newline before each heading,
        # because the pattern consumes one whitespace char and inserts two.
        # NOTE(review): [A-Z] is ASCII-only, so headings containing letters
        # such as Æ/Ø/Å are only partially matched — confirm this is acceptable.
        text = re.sub(r"\s(\d+\s+[A-Z][A-Z\s]+)", r"\n\n\1", doc.page_content)

        source_file = doc.metadata.get("source_file")

        for index, piece in enumerate(splitter.split_text(text), start=1):
            all_chunks.append(
                Document(
                    page_content=piece,
                    metadata={
                        "document_id": document_id,
                        "chunk_index": index,
                        "source_file": source_file,
                    },
                )
            )

    return all_chunks


# Split the cleaned documents with the default chunk_size / chunk_overlap and
# report the total number of chunks produced.
chunks = chunk_documents(documents)
print(f"Created {len(chunks)} chunks")


Created 96 chunks


In [None]:
# Reduce every chunk to plain dict data (text + metadata) so that the json
# module can serialise it.
serializable_chunks = [
    dict(page_content=chunk.page_content, metadata=chunk.metadata)
    for chunk in chunks
]

# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX
# escapes, hence the explicit UTF-8 encoding on the file handle.
with open("chunks.json", mode="w", encoding="utf-8") as f:
    json.dump(
        serializable_chunks,
        f,
        indent=2,
        ensure_ascii=False,
    )
