# RAG Processor

## Import Libraries

In [1]:
import json
import re
from bs4 import BeautifulSoup
import networkx as nx
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

## Data Extraction

In [2]:
# Step 1: Parse HTML and extract nodes/edges
def _find_vis_dataset(script_texts, name):
    """Return the JSON array assigned as ``<name> = new vis.DataSet([...]);``.

    ``script_texts`` is searched in the order given and the first script that
    contains the assignment wins. An empty list is returned when no script
    defines the dataset.
    """
    pattern = re.compile(
        rf'{name}\s*=\s*new vis\.DataSet\((\[.*?\])\);', re.DOTALL
    )
    for text in script_texts:
        match = pattern.search(text)
        if match:
            return json.loads(match.group(1))
    return []


def extract_graph_from_html(html_path):
    """Build an undirected graph from the vis.DataSet data embedded in an HTML file.

    The ``<script>`` tags of the page are scanned from last to first for
    ``nodes = new vis.DataSet([...]);`` and ``edges = new vis.DataSet([...]);``
    assignments, whose arrays are parsed as JSON.

    Args:
        html_path: Path to the HTML file (read as UTF-8).

    Returns:
        A ``networkx.Graph`` with one node per entry of the nodes array
        (keyed by its ``'id'``, with a ``label`` attribute taken from
        ``'title'`` or ``'Type'`` when absent) and one edge per entry of the
        edges array (``'from'`` -> ``'to'``). The graph is empty when the page
        contains no matching script.

    Raises:
        json.JSONDecodeError: If a matched array is not valid JSON.
        KeyError: If a node lacks ``'id'`` or an edge lacks ``'from'``/``'to'``.
    """
    # Only the parse needs the file handle; release it before post-processing.
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Scripts without a single text child (e.g. ``<script src=...>``) have
    # ``.string`` set to None and are skipped. Scanning in reverse keeps the
    # previous preference for the last script on the page.
    script_texts = [
        tag.string for tag in reversed(soup.find_all('script')) if tag.string
    ]

    nodes = _find_vis_dataset(script_texts, 'nodes')
    edges = _find_vis_dataset(script_texts, 'edges')

    G = nx.Graph()
    for node in nodes:
        G.add_node(node['id'], label=node.get('title', 'Type'))

    for edge in edges:
        G.add_edge(edge['from'], edge['to'])

    return G

In [3]:
# HTML file whose embedded node/edge data is parsed by extract_graph_from_html.
html_file = "knowledge_graph_geo_limits.html"

In [4]:
# Graph built from the vis.DataSet nodes/edges found in the HTML file.
G = extract_graph_from_html(html_file)

## Graph to Text

In [5]:
def graph_to_text_chunks(G):
    """Render every edge of ``G`` as a ``"<u>: <v>"`` string.

    Returns:
        A list with one string per edge, in the order ``G.edges()`` yields
        the edges.
    """
    return [f"{source}: {target}" for source, target in G.edges()]

In [6]:
# One "u: v" string per graph edge; these strings are what gets embedded and retrieved.
text_chunks = graph_to_text_chunks(G)

## Vector Indexes

In [7]:
# Step 3: Create vector index for retrieval
def build_vector_index(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Embed ``text_chunks`` and store the vectors in a flat L2 FAISS index.

    Args:
        text_chunks: Non-empty sequence of strings to embed. Row ``i`` of the
            index corresponds to ``text_chunks[i]``.
        model_name: SentenceTransformer model to load. Defaults to the model
            this function has always used.

    Returns:
        A tuple ``(index, embeddings, text_chunks, model)``: the populated
        ``faiss.IndexFlatL2``, the float32 embedding matrix that was added to
        it, the input chunks unchanged, and the loaded model (needed later to
        embed queries the same way).

    Raises:
        ValueError: If ``text_chunks`` is empty. There would be no embedding
            dimension to size the index with.
    """
    if len(text_chunks) == 0:
        raise ValueError("text_chunks must contain at least one chunk to index")

    model = SentenceTransformer(model_name)

    # FAISS only accepts C-contiguous float32 matrices; make that explicit
    # instead of relying on the encoder's default output format.
    embeddings = np.ascontiguousarray(model.encode(text_chunks), dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, embeddings, text_chunks, model

In [8]:
# chunk_map is the text_chunks list handed back by build_vector_index; it is
# later passed to retrieve_info to turn index row ids back into text.
index, embeddings, chunk_map, model = build_vector_index(text_chunks)

## Retrieval Function

In [9]:
def retrieve_info(query, model, index, text_chunks, top_k=3):
    """Return the text chunks whose embeddings are closest to ``query``.

    Args:
        query: Question or search string to embed.
        model: Object with an ``encode(list_of_str)`` method returning one
            embedding row per input (the model used to build ``index``).
        index: Index exposing ``search(vectors, k)`` that returns
            ``(distances, ids)``.
        text_chunks: Sequence in which position ``i`` holds the text of index
            row ``i``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, nearest first. Fewer are returned when the
        index holds fewer than ``top_k`` vectors; an empty list is returned
        when ``top_k`` is not positive.
    """
    if top_k < 1:
        return []

    query_emb = np.asarray(model.encode([query]), dtype=np.float32)
    _distances, neighbour_ids = index.search(query_emb, top_k)

    # FAISS pads the id row with -1 when it finds fewer than top_k neighbours.
    # Indexing text_chunks with -1 would silently return the *last* chunk as
    # a bogus hit, so padded slots are dropped.
    return [text_chunks[i] for i in neighbour_ids[0] if i >= 0]

In [10]:
# Example user questions used to exercise retrieval and answer generation.
sample_questions = [
    "Why can't I add 251 curve shades to my log?",
    "What is the maximum number of data points allowed per curve?",
    "I want to use the name 'Hydrocarbon bearing zone highlighted' as my curve shade name. Why is it not allowed?",
    "What is the number of curves I can load in a data file?",
    "I have already added 20,000 modifiers to my log. Why can't I add more?",
    "How many log headers can I add to my log?",
    "How many tadpole definitions am I allowed to create?",
    "Why can't I add another layout to my log?"
]
# Note: the fourth question is "What is the maximum number of curves I can load
# in a data file?" with the word "maximum" removed.

In [11]:
# Retrieval-only check: for each sample question, print the chunks returned by
# retrieve_info (top_k is left at its default of 3).
for question in sample_questions:
    results = retrieve_info(question, model, index, chunk_map)
    print(f"\nQuestion: {question}")
    print("Top Retrieved Info:")
    for res in results:
        print(f"- {res}")


Question: Why can't I add 251 curve shades to my log?
Top Retrieved Info:
- 250: Number of curve shades per plot
- 20: Curve shade name length
- 50: Number of zones per curve shade

Question: What is the maximum number of data points allowed per curve?
Top Retrieved Info:
- Data points per curve: Unlimited
- Number of data files to form one curve: None
- Number of curves: 450

Question: I want to use the name 'Hydrocarbon bearing zone highlighted' as my curve shade name. Why is it not allowed?
Top Retrieved Info:
- Curve to lithology name: 50
- 20: Curve shade name length
- 50: Number of zones per curve shade

Question: What is the number of curves I can load in a data file?
Top Retrieved Info:
- Number of data files to form one curve: None
- Data points per curve: Unlimited
- Number of curves: 450

Question: I have already added 20,000 modifiers to my log. Why can't I add more?
Top Retrieved Info:
- 20000: Number of modifiers per plot
- 450: Number of modifier types
- Number of rows 

## Model Initialisation

In [12]:
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [13]:
# Initialize Ollama
# Chat model used by generate_ollama_answer to turn retrieved facts into an answer.
llm_model = ChatOllama(
    model="llama3.2:latest",  # or any other local model you have
    temperature=0,  # NOTE(review): presumably chosen for repeatable answers - confirm
    num_predict=150  # NOTE(review): presumably caps the generated length - confirm it is enough for full answers
)

In [14]:
# Prompt with two placeholders: {context} is meant to receive the retrieved
# facts and {question} the user's question (see generate_ollama_answer).
prompt_template = PromptTemplate(
    template="""
    You are an expert on GEO limits and constraints.

    Context:
    {context}

    Question: {question}

    Answer concisely based on the context.
    """,
    input_variables=["context", "question"]
)

In [15]:
from langchain_core.runnables import RunnablePassthrough

def generate_ollama_answer(question, retrieved_facts, llm_model, prompt_template):
    """Ask the LLM to answer ``question`` using only ``retrieved_facts``.

    Args:
        question: The user's question, substituted for ``{question}``.
        retrieved_facts: Iterable of fact strings; each is rendered as a
            ``"- <fact>"`` bullet and the bullets are substituted for
            ``{context}``.
        llm_model: Chat model runnable to pipe the rendered prompt into.
        prompt_template: Prompt runnable with ``context`` and ``question``
            input variables.

    Returns:
        The model's reply as a plain string.
    """
    context = "\n".join(f"- {fact}" for fact in retrieved_facts)

    # Feed the {"context", "question"} dict straight into the prompt. The
    # previous {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
    # step handed the *entire* input dict to both keys, so each placeholder
    # was filled with the repr of the whole dict instead of its own value.
    rag_chain = prompt_template | llm_model | StrOutputParser()

    return rag_chain.invoke({"context": context, "question": question})

In [16]:
# Loop over sample questions and get answers
# For each question: retrieve the closest chunks, have the LLM answer from
# them, then print the question, the retrieved chunks and the answer.
for question in sample_questions:
    results = retrieve_info(question, model, index, chunk_map)
    # print(type(results))
    answer = generate_ollama_answer(question, results, llm_model, prompt_template)
    print(f"\nQuestion: {question}")
    print("Top Retrieved Info:")
    for res in results:
        print(f"- {res}")
    print("Answer:", answer)


Question: Why can't I add 251 curve shades to my log?
Top Retrieved Info:
- 250: Number of curve shades per plot
- 20: Curve shade name length
- 50: Number of zones per curve shade
Answer: The limit of 250 curve shades is due to a technical constraint. This limitation prevents excessive data from being displayed, maintaining readability and usability in the log.

Question: What is the maximum number of data points allowed per curve?
Top Retrieved Info:
- Data points per curve: Unlimited
- Number of data files to form one curve: None
- Number of curves: 450
Answer: The maximum number of data points allowed per curve is Unlimited.

Question: I want to use the name 'Hydrocarbon bearing zone highlighted' as my curve shade name. Why is it not allowed?
Top Retrieved Info:
- Curve to lithology name: 50
- 20: Curve shade name length
- 50: Number of zones per curve shade
Answer: The curve shade name 'Hydrocarbon bearing zone highlighted' exceeds the maximum length of 20 characters, which is a 