In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_unstructured import UnstructuredLoader

import os
import numpy as np
import re
import uuid
import json

In [None]:
# Folder that load_documents_one_per_pdf scans for input PDF files.
pdf_folder = "data_json/"

def load_documents_one_per_pdf(pdf_folder):
    """Load every PDF in ``pdf_folder`` as one merged Document per file.

    Each PDF is read page by page with ``PyPDFLoader``; the page texts are
    joined with a single space into one ``Document`` whose metadata holds the
    originating file name under ``"source_file"``.

    Args:
        pdf_folder: Path of the directory to scan (not recursive).

    Returns:
        A list of Documents, one per PDF, ordered by file name.

    Raises:
        FileNotFoundError: If ``pdf_folder`` does not exist (from ``os.listdir``).
    """
    all_docs = []

    # os.listdir returns entries in arbitrary order; sorting keeps document
    # order (and everything derived from it) reproducible between runs.
    for filename in sorted(os.listdir(pdf_folder)):
        # Case-insensitive match so "report.pdf" and "REPORT.PDF" are both loaded.
        if not filename.lower().endswith(".pdf"):
            continue

        pages = PyPDFLoader(os.path.join(pdf_folder, filename)).load()
        full_text = " ".join(page.page_content for page in pages)
        all_docs.append(
            Document(page_content=full_text, metadata={"source_file": filename})
        )

    return all_docs

# Build one merged Document per PDF found in pdf_folder and report how many were loaded.
documents = load_documents_one_per_pdf(pdf_folder)
print(f"Loaded {len(documents)} documents.")


In [None]:
def clean_text(documents):
    """Strip layout noise from every document's text.

    For each document the text is split into lines; each line is stripped,
    empty lines are dropped, and lines that look like numeric table rows
    (more than 35% digits AND at least two double-space gaps) are removed.
    The surviving lines are re-joined with newlines and runs of spaces/tabs
    are collapsed to one space.

    The documents are modified in place: ``page_content`` is overwritten and
    the very same objects are returned.

    Args:
        documents: Iterable of objects exposing a writable ``page_content`` str.

    Returns:
        A list containing the same document objects, in input order.
    """

    def _is_numeric_table_row(row):
        # Share of digit characters in the (already stripped) row.
        digit_share = sum(1 for ch in row if ch.isdigit()) / max(len(row), 1)
        return digit_share > 0.35 and row.count("  ") >= 2

    cleaned_docs = []
    for doc in documents:
        stripped_rows = (raw.strip() for raw in doc.page_content.splitlines())
        kept_rows = [
            row for row in stripped_rows
            if row and not _is_numeric_table_row(row)
        ]

        body = "\n".join(kept_rows)
        body = re.sub(r"[ \t]+", " ", body)
        # Blank lines were already dropped above, so this collapse is
        # normally a no-op; it is kept to preserve the original pipeline.
        body = re.sub(r"\n{3,}", "\n\n", body)

        doc.page_content = body
        cleaned_docs.append(doc)

    return cleaned_docs

# clean_text overwrites page_content on the Document objects themselves and returns
# the same objects, so the uncleaned text is no longer available after this line.
documents = clean_text(documents)


In [None]:
""" def chunk_documents(documents, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=[
            "\n\n",   # seksjoner
            "\n",
            ". ", "? ", "! ",
            " "
        ]
    )

    all_chunks = []

    for doc in documents:
        document_id = doc.metadata.get("document_id")
        if not document_id:
            document_id = str(uuid.uuid4())
            doc.metadata["document_id"] = document_id

        text = doc.page_content
        text = re.sub(r"\s(\d+\s+[A-Z][A-Z\s]+)", r"\n\n\1", text)

        doc.page_content = text

        doc_chunks = splitter.split_documents([doc])

        for i, chunk in enumerate(doc_chunks, start=1):
            chunk.metadata.clear()
            chunk.metadata["document_id"] = document_id
            chunk.metadata["chunk_index"] = i
            chunk.metadata["source_file"] = doc.metadata.get("source_file")

        all_chunks.extend(doc_chunks)

    return all_chunks


chunks = chunk_documents(documents)
print(f"Created {len(chunks)} chunks")
 """


def chunk_documents_by_chapter(documents, chunk_size=256, chunk_overlap=50):
    """Split documents into chunks that never span a chapter boundary.

    A chapter heading is a line that starts with a number followed by an
    upper-case title (e.g. ``"3 SCOPE OF WORK"``). Text before the first
    heading is filed under the synthetic chapter ``"Introduction"``.

    Compared with a naive ``\\s``-based heading regex, the pattern here only
    allows spaces/tabs inside a heading, so a match can no longer run across
    the line break and swallow the first capital letter(s) of the following
    line. A heading at the very start of the text is recognised as well.

    Each chunk's metadata receives:
        document_id:   id of the source document (generated if missing and
                       also written back to ``doc.metadata``),
        chunk_index:   1-based running number, unique within the document
                       (it does NOT restart per chapter, so
                       ``(document_id, chunk_index)`` identifies one chunk),
        chapter_index: 0 for the preamble, 1..n for detected headings,
        chapter_title: stripped heading text or ``"Introduction"``,
        source_file:   copied from the document, ``"unknown"`` if absent.

    Args:
        documents: Iterable of Document-like objects (``page_content``,
            ``metadata``) whose type accepts ``page_content=`` and ``metadata=``.
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap passed to the splitter.

    Returns:
        A flat list of chunk documents in document/chapter order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", "? ", "! ", " "]
    )

    # Heading = start of text or of a line, digits, horizontal whitespace,
    # then an upper-case title made of capitals/spaces/tabs only.
    chapter_pattern = re.compile(r"(?:\A|\n)\d+[ \t]+[A-Z][A-Z \t]+")

    all_chunks = []

    for doc in documents:
        document_id = doc.metadata.get("document_id") or str(uuid.uuid4())
        doc.metadata["document_id"] = document_id
        source_file = doc.metadata.get("source_file", "unknown")

        text = doc.page_content

        # Find chapters. split() always yields one more piece than there are
        # headings: the leading piece is the text before the first heading.
        chapter_texts = chapter_pattern.split(text)
        chapter_titles = ["Introduction"] + chapter_pattern.findall(text)

        next_chunk_index = 1
        for chap_idx, (chap_text, chap_title) in enumerate(zip(chapter_texts, chapter_titles)):
            # Nothing to chunk (e.g. the document starts directly with a heading).
            if not chap_text.strip():
                continue

            # Temporary document holding only this chapter's text.
            chap_doc = type(doc)(page_content=chap_text, metadata=dict(doc.metadata))
            chap_chunks = splitter.split_documents([chap_doc])

            # Attach chapter information and a document-wide running index.
            for chunk in chap_chunks:
                chunk.metadata["document_id"] = document_id
                chunk.metadata["chunk_index"] = next_chunk_index
                chunk.metadata["source_file"] = source_file
                chunk.metadata["chapter_index"] = chap_idx
                chunk.metadata["chapter_title"] = chap_title.strip()
                next_chunk_index += 1
            all_chunks.extend(chap_chunks)

    return all_chunks

# Chunk the cleaned documents per chapter using the function's default
# chunk_size / chunk_overlap, then report the total number of chunks.
chunks = chunk_documents_by_chapter(documents)
print(f"Created {len(chunks)} chunks")


In [None]:
# Reduce each chunk to plain dict data (text + metadata) so it can be written as JSON.
serializable_chunks = [
    {"page_content": c.page_content, "metadata": c.metadata}
    for c in chunks
]

# Persist the chunks; the Neo4j cell further down reads this file back.
# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open("chunks.json", "w", encoding="utf-8") as f:
    json.dump(serializable_chunks, f, ensure_ascii=False, indent=2)

In [None]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model served through Ollama (assumes a local Ollama instance
# that has the "embeddinggemma" model available — TODO confirm).
embeddings = OllamaEmbeddings(model="embeddinggemma")

# Parallel lists: texts[i] is embedded and stored together with metadatas[i].
texts = [c.page_content for c in chunks]
metadatas = [c.metadata for c in chunks]

# Embeds every chunk text and builds the FAISS index from the vectors.
vectorstore = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas
)

# Write the index to the "faiss_chunks" folder; a later cell reloads it from there.
vectorstore.save_local("faiss_chunks")
print("FAISS vectorstore lagret.")


In [None]:
# Quick retrieval check: fetch the 5 nearest chunks for one sample question.
query = "What is the rig heading of Askeladd South?"
results = vectorstore.similarity_search(query, k=5)

# Show each hit's metadata and the first 200 characters of its text.
for r in results:
    print(r.metadata)
    print(r.page_content[:200])
    print("-----")


In [None]:
from langchain_ollama import ChatOllama

#vectorstore = FAISS.load_local("faiss_chunks", embeddings, allow_dangerous_deserialization=True)

# Chat model used to generate answers (served through Ollama).
llm = ChatOllama(model="gemma3:4b")


# Reload the index saved earlier under "faiss_chunks" and expose it as a top-3 retriever.
# NOTE(review): allow_dangerous_deserialization=True permits loading pickled data, which
# can execute arbitrary code — only use it on an index folder written by this notebook.
# NOTE(review): "fetch_k" presumably has no effect for plain similarity search without
# a filter — confirm against the FAISS vector store documentation.
retriever = FAISS.load_local("faiss_chunks", embeddings, allow_dangerous_deserialization=True).as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3, "fetch_k": 10}
)




In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two template variables: {context} (retrieved chunk texts, each followed by
# its "[source_file | document_id | chunk_index]" citation line) and {question}.
# NOTE(review): the two variables are inserted without "Context:" / "Question:" labels,
# so the model has to infer where the context ends and the question begins.
prompt = ChatPromptTemplate.from_template("""
You are an AI assistant operating in a Retrieval Augmented Generation (RAG) pipeline.
You MUST answer using ONLY the provided context.
You are NOT allowed to use any external knowledge.


{context}

{question}


Rules:
- Use only information explicitly stated in the context.
- Do NOT make assumptions.
- If the context does not contain sufficient information, respond with exactly:
  "This question cannot be answered with the provided context."
- Every factual claim in your answer MUST be directly supported by a citation.

Format your answer exactly like this:

Your answer here.
When citing sources, always cite in the format:
[source_file | document_id | chunk_index]
Do NOT use internal vector IDs.

""")


In [None]:
#retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
""" retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3, "fetch_k": 10}
)  """

from langchain_core.runnables import RunnablePassthrough

# RAG pipeline: the input question is (a) sent to the retriever, whose documents are
# rendered as "<text>\n[source_file | document_id | chunk_index]" blocks separated by
# blank lines, and (b) passed through unchanged as "question". Both values fill the
# prompt template, and the resulting prompt is sent to the chat model.
rag_chain = (
    {
        "context": retriever | (lambda docs: "\n\n".join(
            f"{d.page_content}\n[{d.metadata.get('source_file')} | "
            f"{d.metadata.get('document_id')} | "
            f"{d.metadata.get('chunk_index')}]"
            for d in docs
        )),
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
)


In [None]:
# Ask the RAG chain a single question about the project manager.
query = "Who is project manager on project no 500011/500204/500257?"

response = rag_chain.invoke(query)

# The chain ends in the chat model, so `response` is a message object; print only
# the answer text instead of the full repr with metadata.
print(response.content)


In [None]:
# Ask the RAG chain a single question about the buoy weight.
query = "How much does 14 t buoy 4 flotation cells weigh?"

response = rag_chain.invoke(query)

# The chain ends in the chat model, so `response` is a message object; print only
# the answer text instead of the full repr with metadata.
print(response.content)


In [None]:
# Benchmark cases. Each entry has:
#   id                - short identifier reported in the result tables,
#   question          - text sent to the retriever / RAG chain,
#   expected_contains - substrings that must ALL appear (case-insensitively) in the
#                       answer for run_vector_rag_benchmark to mark it correct.
TEST_QUERIES = [
    {
        "id": "PM_001",
        "question": "Who is project manager on project no 500011/500204/500257?",
        "expected_contains": ["Tommy Herdlevær", "Torleif Frimannslund"]
    },
    {
        "id": "RIG_001",
        "question": "What is the rig heading of Askeladd South?",
        "expected_contains": ["240"]
    }
]

from time import time

def benchmark_vector_retrieval(vectorstore, queries, k=3):
    """Time pure vector retrieval for each benchmark query.

    Args:
        vectorstore: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
        queries: Iterable of dicts with the keys ``"id"`` and ``"question"``.
        k: Number of chunks requested per query.

    Returns:
        One dict per query with ``query_id``, ``question``,
        ``retrieval_time_sec`` (wall-clock seconds, rounded to 4 decimals),
        ``num_chunks`` and ``chunks`` (the ``source_file`` / ``document_id`` /
        ``chunk_index`` metadata of each hit).
    """
    searcher = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )
    citation_keys = ("source_file", "document_id", "chunk_index")

    def _timed_lookup(item):
        # Only the retriever call itself is inside the timed window.
        started = time()
        hits = searcher.invoke(item["question"])
        elapsed = time() - started

        return {
            "query_id": item["id"],
            "question": item["question"],
            "retrieval_time_sec": round(elapsed, 4),
            "num_chunks": len(hits),
            "chunks": [
                {key: hit.metadata[key] for key in citation_keys}
                for hit in hits
            ],
        }

    return [_timed_lookup(item) for item in queries]

# Run the retrieval-only benchmark on the in-memory vector store.
vector_retrieval_results = benchmark_vector_retrieval(vectorstore, TEST_QUERIES)
# ensure_ascii=False keeps non-ASCII characters (e.g. in file names) readable,
# matching how chunks.json is written earlier in the notebook.
print(json.dumps(vector_retrieval_results, indent=2, ensure_ascii=False))

In [None]:
from time import time

def run_vector_rag_benchmark(rag_chain, queries):
    """Run each benchmark query through the RAG chain and score the answer.

    An answer counts as correct when every string in the query's
    ``expected_contains`` occurs in the answer text (case-insensitive).

    The check is performed on the answer text only. When the chain returns a
    chat message object, its ``content`` is used; converting the whole message
    with ``str()`` would also render response metadata (timings, token counts,
    ids), and short expectations such as ``"240"`` could match there by accident.

    Args:
        rag_chain: Object with ``invoke(question)`` returning a message-like
            object with a ``content`` attribute, or a plain string.
        queries: Iterable of dicts with ``"id"``, ``"question"`` and
            ``"expected_contains"`` (list of strings).

    Returns:
        One dict per query with ``query_id``, ``question``, ``total_time_sec``
        (wall-clock seconds, rounded to 2 decimals), ``answer`` and ``correct``.
    """
    results = []

    for q in queries:
        t0 = time()
        answer = rag_chain.invoke(q["question"])
        total_time = time() - t0

        # Use the message text when available; fall back to the object itself.
        content = getattr(answer, "content", answer)
        answer_text = content if isinstance(content, str) else str(content)

        haystack = answer_text.lower()
        correct = all(
            expected.lower() in haystack
            for expected in q["expected_contains"]
        )

        results.append({
            "query_id": q["id"],
            "question": q["question"],
            "total_time_sec": round(total_time, 2),
            "answer": answer_text,
            "correct": correct
        })

    return results

# Run the end-to-end RAG benchmark (retrieval + generation) for every test query.
rag_benchmark_results = run_vector_rag_benchmark(rag_chain, TEST_QUERIES)
# ensure_ascii=False keeps non-ASCII characters in answers (e.g. "æ") readable,
# matching how chunks.json is written earlier in the notebook.
print(json.dumps(rag_benchmark_results, indent=2, ensure_ascii=False))

In [None]:
import pandas as pd

# One row per query for each benchmark (retrieval-only and full RAG).
df_retrieval = pd.DataFrame(vector_retrieval_results)
df_rag = pd.DataFrame(rag_benchmark_results)

# Combine both result sets side by side, matching rows on query id and question text.
df_vector_benchmark = df_retrieval.merge(df_rag, on=["query_id", "question"])
df_vector_benchmark


In [None]:
from neo4j import GraphDatabase
import json

# Set up the Neo4j connection. Connection details are read from the environment
# (NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD) so real credentials never have to be
# written into the notebook; the fallbacks match a default local installation.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")
driver = GraphDatabase.driver(uri, auth=(user, password))

# Load the chunks from the JSON file written by the serialization cell earlier.
with open("chunks.json", "r", encoding="utf-8") as f:
    chunks_data = json.load(f)

def create_graph(tx, chunks):
    """Write documents and their chunks to Neo4j inside one transaction.

    Creates (or updates) one ``Document`` node per ``document_id`` and one
    ``Chunk`` node per chunk, linked by ``(Document)-[:HAS_CHUNK]->(Chunk)``.

    Chunk nodes are keyed by ``(document_id, chunk_position)``, where
    ``chunk_position`` is the chunk's 1-based position among the chunks of its
    document in ``chunks``. The stored ``chunk_index`` metadata is not used as
    the key because it may repeat within a document (e.g. when it restarts in
    every chapter); keying on it would make MERGE fold different chunks into
    one node and overwrite their text. ``chunk_index`` is still stored as a
    property.

    NOTE: chunks already stored by an older version of this loader have no
    ``chunk_position`` property and will not be matched; load into a clean
    database.

    Args:
        tx: Neo4j transaction (anything with ``run(query, **params)``).
        chunks: List of dicts with ``"page_content"`` and ``"metadata"``;
            metadata must contain ``document_id``, ``chunk_index`` and
            ``source_file`` and may contain ``chapter_title``.

    Raises:
        KeyError: If a required metadata key is missing.
    """
    # Group chunks per document, preserving their order in the input.
    docs_grouped = {}
    for c in chunks:
        docs_grouped.setdefault(c["metadata"]["document_id"], []).append(c)

    for doc_id, doc_chunks in docs_grouped.items():
        # The first chunk's source file is used for the document and all its chunks.
        source_file = doc_chunks[0]["metadata"]["source_file"]
        tx.run("""
            MERGE (d:Document {document_id: $doc_id})
            SET d.source_file = $source_file
        """, doc_id=doc_id, source_file=source_file)

        for position, c in enumerate(doc_chunks, start=1):
            idx = c["metadata"]["chunk_index"]
            chapter = c["metadata"].get("chapter_title", "Unknown")
            content = c["page_content"]
            tx.run("""
                MERGE (ch:Chunk {document_id: $doc_id, chunk_position: $position})
                SET ch.chunk_index = $idx, ch.page_content = $content, ch.chapter_title = $chapter, ch.source_file = $source_file
                WITH ch
                MATCH (d:Document {document_id: $doc_id})
                MERGE (d)-[:HAS_CHUNK]->(ch)
            """, position=position, idx=idx, content=content, chapter=chapter,
                doc_id=doc_id, source_file=source_file)


# Run create_graph as a single managed write transaction.
# NOTE(review): the driver created above is not closed in this cell.
with driver.session() as session:
    session.execute_write(create_graph, chunks_data)

print("Grafen er opprettet i Neo4j!")


In [None]:
from neo4j import GraphDatabase
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

# --- 1. Set up the Neo4j connection ---
# Connection details are read from the environment (NEO4J_URI / NEO4J_USER /
# NEO4J_PASSWORD) so real credentials never have to be written into the notebook;
# the fallbacks match a default local installation.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")
driver = GraphDatabase.driver(uri, auth=(user, password))

# --- 2. Fetch relevant chunks from the graph ---
def fetch_relevant_chunks(session, query, limit=5):
    """Fetch the chunks that best match the keywords of a question.

    Matching the whole question as one substring practically never finds
    anything, so the question is reduced to keywords (lower-cased word tokens
    of at least three characters, minus a few question/stop words). A chunk
    qualifies when its text contains at least one keyword (case-insensitive);
    chunks containing more distinct keywords are returned first.

    If no keyword survives the filtering, the whole stripped, lower-cased
    question is used as the single search term. A blank question returns an
    empty list without querying the database.

    Args:
        session: Neo4j session (anything with ``run(cypher, **params)``
            returning an iterable of mapping-like records).
        query: The user's question.
        limit: Maximum number of chunks to return.

    Returns:
        A list of dicts with the keys ``text``, ``source_file``,
        ``document_id``, ``chunk_index`` and ``chapter_title``
        (``"Unknown"`` when the stored title is missing or null).
    """
    stopwords = {
        "the", "and", "for", "are", "was", "were", "has", "have", "had",
        "what", "who", "whom", "whose", "which", "when", "where", "why", "how",
        "does", "did", "much", "many", "with", "from", "that", "this", "there",
    }

    # Distinct keywords, in order of first appearance.
    terms = []
    for token in re.findall(r"\w+", query.lower()):
        if len(token) >= 3 and token not in stopwords and token not in terms:
            terms.append(token)

    if not terms:
        fallback = query.strip().lower()
        if not fallback:
            return []
        terms = [fallback]

    cypher = """
    MATCH (c:Chunk)
    WITH c, size([t IN $terms WHERE toLower(c.page_content) CONTAINS t]) AS hits
    WHERE hits > 0
    WITH c, hits
    ORDER BY hits DESC, c.document_id, c.chunk_index
    LIMIT $limit
    RETURN c.page_content AS text,
           c.source_file AS source_file,
           c.document_id AS document_id,
           c.chunk_index AS chunk_index,
           c.chapter_title AS chapter_title
    """
    results = session.run(cypher, terms=terms, limit=limit)

    chunks = []
    for record in results:
        chunks.append({
            "text": record["text"],
            "source_file": record["source_file"],
            "document_id": record["document_id"],
            "chunk_index": record["chunk_index"],
            # The column is always present in the record; a missing property
            # comes back as null, so fall back on the value, not on the key.
            "chapter_title": record["chapter_title"] or "Unknown"
        })
    return chunks

# --- 3. Build the context for the LLM ---
def build_context(chunks):
    """Render retrieved chunks as one context string for the LLM.

    Every chunk becomes its text followed, on the next line, by a citation
    line of the form ``[source_file | document_id | chunk_index]``; the
    rendered chunks are separated by a blank line.

    Args:
        chunks: Iterable of dicts with the keys ``text``, ``source_file``,
            ``document_id`` and ``chunk_index``.

    Returns:
        The combined context string (empty when ``chunks`` is empty).
    """
    rendered = (
        f"{entry['text']}\n[{entry['source_file']} | {entry['document_id']} | {entry['chunk_index']}]"
        for entry in chunks
    )
    return "\n\n".join(rendered)


# Retrieve context from the graph for the current `query` (set in an earlier cell).
# The result is kept in its own name so the Document chunks held in the global
# `chunks` (used by the FAISS cells) are not overwritten.
with driver.session() as session:
    graph_chunks = fetch_relevant_chunks(session, query, limit=5)
    context = build_context(graph_chunks)

# The chat model cannot be invoked with a raw dict; the dict has to fill the
# prompt template first, so the same prompt as the vector RAG chain is piped
# into the model.
response = (prompt | llm).invoke({
    "context": context,
    "question": query
})

print("Svar fra Graph-RAG:")
# Print only the answer text, not the full message repr with metadata.
print(response.content)
