In [None]:
from bs4 import BeautifulSoup
from haystack.telemetry import tutorial_running
import logging
import os
import pandas as pd
from tabulate import tabulate
from haystack import Pipeline, Document

In [2]:
tutorial_running(27)

In [3]:
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "Data")
EXCEL_FILE = os.path.join(DATA_DIR, "query_responses.xlsx")
FEEDBACK_FILE = os.path.join(DATA_DIR, "feedback_dataset.json")
PROMPT_VISUALISATION_FILE = os.path.join(DATA_DIR, "prompt_visualisation.txt")
PROCESSED_CONTENT_FILE = os.path.join(DATA_DIR, "processed_content.txt")
UPLOADED_FILE = os.path.join(DATA_DIR, "uploaded_document.txt")

In [4]:
def _list_htm_files():
    """
    Recursively finds all .htm files in the DATA_DIR and its subdirectories.
        
    Returns:
        list: A list of full file paths.
    """
    htm_files = []
    for root, _, files in os.walk(DATA_DIR):
        for file in files:
            if file.endswith(".htm"):
                full_path = os.path.join(root, file)  # Get the absolute path
                htm_files.append(full_path)  

    return htm_files

In [5]:
def extract_text(soup):
    # Extract only meaningful paragraph text
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p") if len(p.get_text(strip=True)) > 20]  # Exclude very short text
    clean_text = "\n\n".join(paragraphs)
    
    return clean_text

In [6]:
def extract_table(soup):
    tables = soup.find_all("table")
    
    formatted_tables = []
                    
    # Process and format each table
    for i, table in enumerate(tables, start=1):
        rows = []
        for row in table.find_all("tr"):
            cols = [col.get_text(strip=True) for col in row.find_all(["td", "th"])]
            rows.append(cols)
            
        # Flatten row values for filtering irrelevant tables
        flat_rows = [item.lower().strip() for sublist in rows for item in sublist]
        
        # Skip navigation tables containing only "Back" and "Forward"
        if set(flat_rows).issubset({"back", "forward", "", "-", "next", "previous"}):
            continue  # Skip this table

        # Convert to DataFrame for better readability
        df = pd.DataFrame(rows)
                        
        formatted_table = tabulate(df, headers="firstrow", tablefmt="grid")
        
        formatted_tables.append(formatted_table)
        
    formatted_tables = "\n\n".join(formatted_tables)
    
    return formatted_tables

In [7]:
def extract_list(soup):
    # Extract lists properly
    lists = []
    for ul in soup.find_all("ul"):
        items = [li.get_text(strip=True) for li in ul.find_all("li")]
        lists.append(items)
    return lists

In [8]:
def _load_content(selectedOptions=None):
    """
    Load and process all .htm files from the base directory.
    """
    htm_files = _list_htm_files()
    logging.info(f"Found {len(htm_files)} .htm files.")
        
    if selectedOptions is None:
        selectedOptions = ["text", "table", "list"]
        
    # initialise empty training web documents.
    web_documents = []
        
    page_texts = []

    for file_path in htm_files:
        try:
            with open(file_path, encoding="utf-8") as file:
                content = file.read()
                    
                # ignore the redundant header section from content
                content = content[content.find("<body>")+6:content.find("</body>")]
                    
                soup = BeautifulSoup(content, "html.parser")
                    
                page_links = [a['href'] for a in soup.find_all('a', href=True)]
                                                
                
                clean_text = extract_text(soup)
                    
                formatted_table = extract_table(soup)
                    
                lists = extract_list(soup)
                        
                page_text = f"""Tables:\n---\n{formatted_table}\n---\nText:\n---\n{clean_text}\n---\nList:\n---\n{lists}\n---"""
 
                page_texts.append(page_text)
                    
                page_data = {
                    'text': page_text,
                    'link': page_links
                }
                    
                document = Document(
                    content = page_data['text']
                )

                if file_path.endswith("GEO_Limits.htm"):
                    print(f"Content:\n{document.content}")
                    
                web_documents.append(document)
                
        except UnicodeDecodeError:
            logging.error(f"Could not read the file {file_path}. Check the file encoding.")

    return web_documents

In [9]:
_load_content()

Content:
Tables:
---
+-----+----------------------------------------------------------------+-----------+
|   0 | Types                                                          | Limits    |
|   1 | Curves                                                         |           |
+-----+----------------------------------------------------------------+-----------+
|   2 | Number of curves                                               | 450       |
+-----+----------------------------------------------------------------+-----------+
|   3 | Size of curve units                                            | 24        |
+-----+----------------------------------------------------------------+-----------+
|   4 | Size of curve name                                             | 90        |
+-----+----------------------------------------------------------------+-----------+
|   5 | Number of data files to form one curve                         | None      |
+-----+-------------------------------------

[Document(id=7c3d1fb644e578c4b30e1307d41c0f9f96185eb9183dcecae1dc19f0cb6df8b0, content: 'Tables:
 ---
 
 ---
 Text:
 ---
 *Maximize screen to view table of contents*
 
 We have support centers stra...'),
 Document(id=952795e8097983e0f9a8391f1f0313122a4c6ebcc3838d66bb755def78baf0c3, content: 'Tables:
 ---
 
 ---
 Text:
 ---
 
 ---
 List:
 ---
 []
 ---'),
 Document(id=d562eb8f38d0288d09a8e821aba52c189cd325031e9072c866d3e88834f44734, content: 'Tables:
 ---
 
 ---
 Text:
 ---
 *Maximize screen to view table of contents*
 
 In this help system you will...'),
 Document(id=6fa148850b0946198889bc10bdfa908eb7609d26aaf644cd98ff0ba609aebb3a, content: 'Tables:
 ---
 
 ---
 Text:
 ---
 The help content for this dialog box is currently not available. In the m...'),
 Document(id=a456e6ae1073b08d3eaa1323897729edf0fdb04dcfdca0cdf1c44fba3b9f2722, content: 'Tables:
 ---
 
 ---
 Text:
 ---
 
 ---
 List:
 ---
 [['', '', ''], [''], [], ['', ''], [''], []]
 ---'),
 Document(id=3a4889757fcf12dddb5871901833975

In [10]:
import os

os.environ["OPENAI_API_KEY"] = "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"

In [1]:
import os

api_key = os.getenv("OPENAI_API_KEY")
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
    print(f"API key: {api_key}")
else:
    raise ValueError("API key not found. Set OPENAI_API_KEY as an environment variable.")

API key: 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


In [11]:
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

In [12]:
documents = _load_content()

Content:
Tables:
---
+-----+----------------------------------------------------------------+-----------+
|   0 | Types                                                          | Limits    |
|   1 | Curves                                                         |           |
+-----+----------------------------------------------------------------+-----------+
|   2 | Number of curves                                               | 450       |
+-----+----------------------------------------------------------------+-----------+
|   3 | Size of curve units                                            | 24        |
+-----+----------------------------------------------------------------+-----------+
|   4 | Size of curve name                                             | 90        |
+-----+----------------------------------------------------------------+-----------+
|   5 | Number of data files to form one curve                         | None      |
+-----+-------------------------------------

In [13]:
# Use a set to track unique documents based on content and meta
unique_docs = {}
for doc in documents:
    doc_key = (doc.content.strip(), tuple(doc.meta.items()))  # Normalize content & meta
    if doc_key not in unique_docs:
        unique_docs[doc_key] = doc

In [14]:
documents = list(unique_docs.values())

In [15]:
document_store = InMemoryDocumentStore()

indexing_pipeline = Pipeline()
indexing_pipeline.add_component(
    instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="doc_embedder"
)
indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="doc_writer")

indexing_pipeline.connect("doc_embedder.documents", "doc_writer.documents")

indexing_pipeline.run({"doc_embedder": {"documents": documents}})

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

{'doc_writer': {'documents_written': 268}}

In [16]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage
from haystack.components.generators.chat import OpenAIChatGenerator

In [27]:
template = [
    ChatMessage.from_system(
        """
        You are an assistant designed to help users become more familiar with the GEO application.
        
        GEO is a PC-based well log authoring, analysis, and reporting system developed for 
        petroleum geologists, geoscientists, and engineers.
        
        Answer the user's questions accurately using retrieved information from the "Context" 
        section. This section contains help content written by software developers specifically 
        for the GEO application.
        
        Ensure that your response is concise and directly addresses the question, avoiding any 
        irrelevant information. The generated response should contain only the answer to the 
        user's question.
        
        Use the information from the section titled "---Feedback---" as guidelines for improving 
        your answers. Assess the validity and feasibility of the feedback before applying it to 
        refine future responses.

        Context:
        {% for document in documents %}
            {{ document.content }}
        {% endfor %}

        Question: {{ question }}
        Answer:
        """
    )
]

In [28]:
rag_pipe = Pipeline()
rag_pipe.add_component("embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"))
rag_pipe.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
rag_pipe.add_component("prompt_builder", ChatPromptBuilder(template=template))
rag_pipe.add_component("llm", OpenAIChatGenerator(model="gpt-4o-mini"))

In [29]:
rag_pipe.connect("embedder.embedding", "retriever.query_embedding")
rag_pipe.connect("retriever", "prompt_builder.documents")
rag_pipe.connect("prompt_builder.prompt", "llm.messages")

<haystack.core.pipeline.pipeline.Pipeline object at 0x0000015EA35A1190>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: ChatPromptBuilder
  - llm: OpenAIChatGenerator
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.messages (List[ChatMessage])

In [30]:
questions = [
    "What's the maximum number of lithology types in a log?",
    "How many tracks can you define in one ODF?",
    "How many curve shades can I create?",
    "How many curves can I load in one go?",
    "What's the maximum number of headers I can display in my log?",
    "How many tables can I have in my log?",
    "What's the maximum number of characters in a single text entry?",
    "How many symbols can I have in the plot at any one time?",
    "How many scales can I define?",
    "What's the maximum number of data files I can load?"
]

In [31]:
predicted_responses = []
for query in questions:
    response = rag_pipe.run({"embedder": {"text": query}, "prompt_builder": {"question": query}})
    print(f"Question: {query}")
    predicted_responses.append(response['llm']['replies'][0]._content[0].text)
    print(f"Response: {response['llm']['replies'][0]._content[0].text}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: What's the maximum number of lithology types in a log?
Response: The maximum number of lithology types in a log is 450.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many tracks can you define in one ODF?
Response: Up to 250 shadings can be applied in one ODF.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many curve shades can I create?
Response: You can create up to 250 curve shades in one ODF.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many curves can I load in one go?
Response: You can load an unlimited number of data files in one go.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: What's the maximum number of headers I can display in my log?
Response: The context does not specify a maximum number of headers that can be displayed in a log, so that information is not available.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many tables can I have in my log?
Response: The information regarding the maximum number of tables that can be included in a log is not specified in the provided content. Please refer to the GEO application documentation or contact support for clarification.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: What's the maximum number of characters in a single text entry?
Response: The maximum number of characters for a curve mnemonic or unit must not exceed thirty-two characters.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many symbols can I have in the plot at any one time?
Response: The maximum number of symbols per plot is 10,000.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many scales can I define?
Response: You can define multiple scales for a curve in a plot.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: What's the maximum number of data files I can load?
Response: The GEO application does not specify a maximum number of data files that can be loaded; it can load multiple ASCII files as long as they meet the formatting requirements.


In [32]:
sample_responses = [
    450, 
    200, 
    250, 
    450, 
    50, 
    100, 
    "250 / 32000 (varies per text type)", 
    10000, 
    23, 
    "unlimited"
]

print(f"sample response: {sample_responses}")

sample response: [450, 200, 250, 450, 50, 100, '250 / 32000 (varies per text type)', 10000, 23, 'unlimited']


In [33]:
print(f"predicted response: {predicted_responses}")

predicted response: ['The maximum number of lithology types in a log is 450.', 'Up to 250 shadings can be applied in one ODF.', 'You can create up to 250 curve shades in one ODF.', 'You can load an unlimited number of data files in one go.', 'The context does not specify a maximum number of headers that can be displayed in a log, so that information is not available.', 'The information regarding the maximum number of tables that can be included in a log is not specified in the provided content. Please refer to the GEO application documentation or contact support for clarification.', 'The maximum number of characters for a curve mnemonic or unit must not exceed thirty-two characters.', 'The maximum number of symbols per plot is 10,000.', 'You can define multiple scales for a curve in a plot.', 'The GEO application does not specify a maximum number of data files that can be loaded; it can load multiple ASCII files as long as they meet the formatting requirements.']


In [34]:
comparison_df = pd.DataFrame({
    "Expected Response": sample_responses,
    "Predicted Response": predicted_responses
})

In [35]:
import ace_tools_open as tools

In [36]:
tools.display_dataframe_to_user(name="Sample Question Results DataFrame", dataframe=comparison_df)

Sample Question Results DataFrame


Expected Response,Predicted Response
Loading ITables v2.2.4 from the internet... (need help?),
