In [36]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os
import numpy as np
import re
import uuid

In [37]:
pdf_folder = "data_json/"

def load_documents_one_per_pdf(pdf_folder):
    all_docs = []
    
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".PDF"):
            loader = PyPDFLoader(os.path.join(pdf_folder, filename))
            docs = loader.load()
            
            # Slå sammen alt innhold fra PDF-en til ett Document
            full_text = " ".join([doc.page_content for doc in docs])
            new_doc = Document(
                page_content=full_text,
                metadata={"source_file": filename}
            )
            all_docs.append(new_doc)
    
    return all_docs

documents = load_documents_one_per_pdf(pdf_folder)
print(f"Loaded {len(documents)} documents.")


Loaded 3 documents.


In [38]:
import re

def clean_text(documents):
    cleaned_documents = []

    for doc in documents:
        raw_text = doc.page_content

        # Del i linjer for å kunne filtrere tabellrader
        lines = raw_text.splitlines()
        filtered_lines = []

        for line in lines:
            # Heuristikk for å hoppe over tabeller:
            # mange | eller mange mellomrom/kolonnestruktur
            if (
                line.count("|") > 2 or
                re.search(r"\s{3,}", line) or
                re.search(r";", line)
            ):
                continue  # hopp over tabellrad

            filtered_lines.append(line)

        text = " ".join(filtered_lines)

        # Normal rensing
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = ' '.join(text.split())

        doc.page_content = text
        cleaned_documents.append(doc)

    return cleaned_documents


documents = clean_text(documents)


In [39]:
def chunk_documents(documents, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    all_chunks = []

    for doc in documents:
        # Sørg for at hvert dokument har en stabil ID
        document_id = doc.metadata.get("document_id")
        if not document_id:
            document_id = str(uuid.uuid4())
            doc.metadata["document_id"] = document_id

        doc_chunks = splitter.split_documents([doc])

        for i, chunk in enumerate(doc_chunks):
            chunk.metadata["document_id"] = document_id
            chunk.metadata["chunk_index"] = i

        all_chunks.extend(doc_chunks)

    return all_chunks


chunks = chunk_documents(documents)
print(f"Created {len(chunks)} chunks")

Created 79 chunks


In [40]:
def add_metadata(chunks):
    for i, chunk in enumerate(chunks):
        chunk.metadata['chunk_index'] = i
    
    return chunks

chunks = add_metadata(chunks)


In [41]:
print(chunks[0].page_content[:500])
print(chunks[0].metadata)

Document No: 31028-J-KB-0001 Odfjell Drilling Document Number: 31028-J-KB-0001 Revision: 01 Date: 17.01.2020 Input to WSOG DSA @ ASKELADD SOUTH TEMPLATE L Input to WSOG DSA @ Askeladd South – Template L Document no: 31028-J-KB-0001 Rev: 01 Date: 17.01.20 Document No: 31028-J-KB-0001 Odfjell Drilling Document history Rev Date Description Prepared Controlled Approved 01A 17.01.2020 Issued for DIC GKVA IDKR CAST 01 17.01.2020 Issued for use GKVA IDKR CAST Document signatures Prepared by: Controlled
{'source_file': '31028-J-KB-0001_01_004.PDF', 'document_id': '72188c8e-f595-4cec-a0fa-59477ebfbc61', 'chunk_index': 0}


In [42]:
import json

serializable_chunks = [
    {"page_content": c.page_content, "metadata": c.metadata}
    for c in chunks
]

with open("chunks.json", "w", encoding="utf-8") as f:
    json.dump(serializable_chunks, f, ensure_ascii=False, indent=2)
