In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_unstructured import UnstructuredLoader

import os
import numpy as np
import re
import uuid
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_folder = "data_json/"

def load_documents_one_per_pdf(pdf_folder):
    all_docs = []
    
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".PDF"):
            loader = PyPDFLoader(os.path.join(pdf_folder, filename))
            docs = loader.load()
            
            full_text = " ".join([doc.page_content for doc in docs])
            new_doc = Document(
                page_content=full_text,
                metadata={"source_file": filename}
            )
            all_docs.append(new_doc)
    
    return all_docs

documents = load_documents_one_per_pdf(pdf_folder)
print(f"Loaded {len(documents)} documents.")


Loaded 3 documents.


In [3]:
def clean_text(documents):
    cleaned_docs = []

    for doc in documents:
        lines = doc.page_content.splitlines()
        kept_lines = []

        for line in lines:
            line = line.strip()

            if not line:
                continue

            num_ratio = sum(c.isdigit() for c in line) / max(len(line), 1)
            many_spaces = line.count("  ") >= 2

            if num_ratio > 0.35 and many_spaces:
                continue

            kept_lines.append(line)

        text = "\n".join(kept_lines)

        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)

        doc.page_content = text
        cleaned_docs.append(doc)

    return cleaned_docs

documents = clean_text(documents)


In [15]:
""" def chunk_documents(documents, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=[
            "\n\n",   # seksjoner
            "\n",
            ". ", "? ", "! ",
            " "
        ]
    )

    all_chunks = []

    for doc in documents:
        document_id = doc.metadata.get("document_id")
        if not document_id:
            document_id = str(uuid.uuid4())
            doc.metadata["document_id"] = document_id

        text = doc.page_content
        text = re.sub(r"\s(\d+\s+[A-Z][A-Z\s]+)", r"\n\n\1", text)

        doc.page_content = text

        doc_chunks = splitter.split_documents([doc])

        for i, chunk in enumerate(doc_chunks, start=1):
            chunk.metadata.clear()
            chunk.metadata["document_id"] = document_id
            chunk.metadata["chunk_index"] = i
            chunk.metadata["source_file"] = doc.metadata.get("source_file")

        all_chunks.extend(doc_chunks)

    return all_chunks


chunks = chunk_documents(documents)
print(f"Created {len(chunks)} chunks")
 """


def chunk_documents_by_chapter(documents, chunk_size=2000, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", "? ", "! ", " "]
    )

    all_chunks = []

    for doc in documents:
        document_id = doc.metadata.get("document_id") or str(uuid.uuid4())
        doc.metadata["document_id"] = document_id
        source_file = doc.metadata.get("source_file", "unknown")

        text = doc.page_content

        # Finn kapitler
        chapter_pattern = r"\n\d+\s+[A-Z][A-Z\s]+"
        chapter_texts = re.split(chapter_pattern, text)
        chapter_titles = re.findall(chapter_pattern, text)

        # Noen dokumenter starter uten kapittel, håndter det
        if len(chapter_titles) < len(chapter_texts):
            chapter_titles = ["Introduction"] + chapter_titles

        for chap_idx, (chap_text, chap_title) in enumerate(zip(chapter_texts, chapter_titles), start=1):
            # Lag en midlertidig Document med kapitteltekst
            chap_doc = type(doc)(page_content=chap_text, metadata=dict(doc.metadata))
            # Chunk kapittel
            chap_chunks = splitter.split_documents([chap_doc])
            # Legg inn kapittelinfo i metadata
            for i, chunk in enumerate(chap_chunks, start=1):
                chunk.metadata["document_id"] = document_id
                chunk.metadata["chunk_index"] = i
                chunk.metadata["source_file"] = source_file
                chunk.metadata["chapter_title"] = chap_title.strip()
            all_chunks.extend(chap_chunks)

    return all_chunks

chunks = chunk_documents_by_chapter(documents)
print(f"Created {len(chunks)} chunks")


Created 50 chunks


  text = re.sub(r"\s(\d+\s+[A-Z][A-Z\s]+)", r"\n\n\1", text)


In [16]:
serializable_chunks = [
    {"page_content": c.page_content, "metadata": c.metadata}
    for c in chunks
]

with open("chunks.json", "w", encoding="utf-8") as f:
    json.dump(serializable_chunks, f, ensure_ascii=False, indent=2)

In [17]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OllamaEmbeddings(model="embeddinggemma")

texts = [c.page_content for c in chunks]
metadatas = [c.metadata for c in chunks]

vectorstore = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas
)

vectorstore.save_local("faiss_chunks")
print("FAISS vectorstore lagret.")


FAISS vectorstore lagret.


In [9]:
query = "What is the rig heading of Askeladd South?"
results = vectorstore.similarity_search(query, k=5)

for r in results:
    print(r.metadata)
    print(r.page_content[:200])
    print("-----")


{'source_file': '31028-J-KB-0001_01_004.PDF', 'document_id': '87fc4671-6c8a-4340-89b0-1a25a7b98260', 'chunk_index': 1, 'chapter_title': '3 MAP AND LOCATION SPECIFIC REQUIREMENTS'}
3.1 Location Data
Figure 3-1: Location Area Map, Askeladd South, Template L, ref. /1/. Input to WSOG
DSA @ Askeladd South – Template L
Document no: 31028-J-KB-0001 Rev: 01 Date: 17.01.20
Document No: 
-----
{'source_file': '31028-J-KB-0001_01_004.PDF', 'document_id': '87fc4671-6c8a-4340-89b0-1a25a7b98260', 'chunk_index': 1, 'chapter_title': '4 MOORING LINE DATA'}
4.1 Mooring Spread
700m / 84mm
037.5
o à
700m / 84mm
ß
352.5o
700m / 84mm
082.5
o à
700m / 84mm
127.5 o
à700m / 84mm
172.5o à
700m / 84mm
ß 262.5
o
700m / 84mm
ß 307.5 o
Rig Heading 240.0
o
#1
#2
#3
#
-----
{'source_file': '31028-J-KB-0001_01_004.PDF', 'document_id': '87fc4671-6c8a-4340-89b0-1a25a7b98260', 'chunk_index': 2, 'chapter_title': '5 WSOG INPUT'}
5 60 222 222 278 278
6 60 212 212 265 265
7 60 222 222 277 277
8 60 210 210 263 263
Askeladd So

In [18]:
from langchain_ollama import ChatOllama

#vectorstore = FAISS.load_local("faiss_chunks", embeddings, allow_dangerous_deserialization=True)

llm = ChatOllama(model="gemma3:4b")


retriever = FAISS.load_local("faiss_chunks", embeddings, allow_dangerous_deserialization=True).as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3, "fetch_k": 10}
)




In [19]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("""
You are an AI assistant operating in a Retrieval Augmented Generation (RAG) pipeline.
You MUST answer using ONLY the provided context.
You are NOT allowed to use any external knowledge.


{context}

{question}


Rules:
- Use only information explicitly stated in the context.
- Do NOT make assumptions.
- If the context does not contain sufficient information, respond with exactly:
  "This question cannot be answered with the provided context."
- Every factual claim in your answer MUST be directly supported by a citation.

Format your answer exactly like this:

Your answer here.
When citing sources, always cite in the format:
[source_file | document_id | chunk_index]
Do NOT use internal vector IDs.

""")


In [20]:
#retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
""" retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3, "fetch_k": 10}
)  """

from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
)


In [21]:
query = "Who is project manager on project no 500011/500204/500257?"

response = rag_chain.invoke(query)

print(response)


content='Tommy Herdlevær is project manager on project no 500011/500204/500257 [994a8bdb-ca64-444f-9ca3-f2b0c3815d5b | 63bed306-ae6c-4908-a4d9-7e191d257b7d | 1]' additional_kwargs={} response_metadata={'model': 'gemma3:4b', 'created_at': '2025-12-01T13:42:58.3825688Z', 'done': True, 'done_reason': 'stop', 'total_duration': 136471806100, 'load_duration': 6338811200, 'prompt_eval_count': 1514, 'prompt_eval_duration': 113707349300, 'eval_count': 104, 'eval_duration': 16266343700, 'model_name': 'gemma3:4b', 'model_provider': 'ollama'} id='lc_run--675f9fe1-1e2d-4aff-bf83-123f12e64984-0' usage_metadata={'input_tokens': 1514, 'output_tokens': 104, 'total_tokens': 1618}


In [None]:
from neo4j import GraphDatabase
import uuid
import math
import json
import time

NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASS = "password"


use_batch_embedding = hasattr(embeddings, "embed_documents")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))

for c in chunks:
    meta = c.metadata
    if "document_id" not in meta:
        meta["document_id"] = str(uuid.uuid4())
    if "chunk_index" not in meta:
        meta["chunk_index"] = 1
    meta["global_chunk_id"] = f"{meta['document_id']}_{meta['chunk_index']}"
    if "source_file" not in meta:
        meta["source_file"] = meta.get("source", "unknown.pdf")
    if "chapter" not in meta:
        meta["chapter"] = meta.get("chapter", "Unknown")

texts = [c.page_content for c in chunks]

print("computing embeddings for", len(texts), "chunks...")
start = time.time()
if use_batch_embedding:
    vectors = embeddings.embed_documents(texts)
else:
    vectors = [embeddings.embed_query(t) for t in texts]
end = time.time()
print(f"done embeddings in {end-start:.2f}s")


if len(vectors) == 0:
    raise RuntimeError("Ingen vektorer generert!")
dim = len(vectors[0])
print("Embedding dimension:", dim)

BATCH_SIZE = 50

def chunk_iterable(iterable, n):
    for i in range(0, len(iterable), n):
        yield iterable[i:i+n]

with driver.session() as session:
    docs_by_id = {}
    for c in chunks:
        docs_by_id[c.metadata["document_id"]] = c.metadata["source_file"]
    print("Creating Document nodes...")
    for doc_id, source_file in docs_by_id.items():
        session.execute_write(lambda tx, doc_id=doc_id, source_file=source_file: tx.run(
            "MERGE (d:Document {id: $doc_id}) "
            "SET d.source_file = $source_file",
            doc_id=doc_id, source_file=source_file
        ))

    chapters = {}
    for c in chunks:
        k = (c.metadata["document_id"], c.metadata.get("chapter", "Unknown"))
        chapters.setdefault(k, []).append(c)
    print("Creating Chapter nodes...")
    for (doc_id, chapter_name), ch_list in chapters.items():
        session.execute_write(lambda tx, doc_id=doc_id, chapter_name=chapter_name: tx.run(
            "MATCH (d:Document {id: $doc_id}) "
            "MERGE (ch:Chapter {name: $chapter_name, document_id: $doc_id}) "
            "MERGE (d)-[:HAS_CHAPTER]->(ch) ",
            doc_id=doc_id, chapter_name=chapter_name
        ))

    print("Seeding Chunk nodes (batches)...")
    all_items = []
    for idx, c in enumerate(chunks):
        meta = c.metadata
        item = {
            "global_chunk_id": meta["global_chunk_id"],
            "document_id": meta["document_id"],
            "chunk_index": meta["chunk_index"],
            "source_file": meta["source_file"],
            "chapter": meta.get("chapter", "Unknown"),
            "text_preview": c.page_content[:1000],
            "full_text": c.page_content if len(c.page_content) <= 2000 else c.page_content[:2000],
            "embedding": vectors[idx]
        }
        all_items.append(item)

    for batch in chunk_iterable(all_items, BATCH_SIZE):
        session.execute_write(lambda tx, batch=batch: tx.run(
            """
            UNWIND $batch AS row
            MERGE (ch:Chunk {id: row.global_chunk_id})
            SET ch.document_id = row.document_id,
                ch.chunk_index = row.chunk_index,
                ch.source_file = row.source_file,
                ch.chapter = row.chapter,
                ch.text_preview = row.text_preview,
                ch.full_text = row.full_text,
                ch.embedding = row.embedding
            WITH ch, row
            MATCH (d:Document {id: row.document_id})
            MATCH (sec:Chapter {name: row.chapter, document_id: row.document_id})
            MERGE (sec)-[:HAS_CHUNK]->(ch)
            MERGE (d)-[:HAS_CHUNK]->(ch)
            """,
            batch=batch
        ))
    print("Chunk nodes created.")

driver.close()
print("Done seeding Neo4j.")


computing embeddings for 96 chunks...
done embeddings in 90.28s
Embedding dimension: 768
Creating Document nodes...


Transaction failed and will be retried in 1.0710337798430412s (Couldn't connect to localhost:7687 (resolved to ('[::1]:7687', '127.0.0.1:7687')):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it))
Transaction failed and will be retried in 2.192292346894349s (Couldn't connect to localhost:7687 (resolved to ('[::1]:7687', '127.0.0.1:7687')):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it))
Tra

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ('[::1]:7687', '127.0.0.1:7687')):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)