In [1]:
from bs4 import BeautifulSoup
from haystack.telemetry import tutorial_running
import logging
import os
import pandas as pd
from tabulate import tabulate
import re

In [2]:
# Telemetry hook imported from haystack.telemetry; presumably reports that
# tutorial 27 is being run — TODO confirm this is still wanted outside the tutorial.
tutorial_running(27)

In [3]:
# BASE_DIR is the parent of the current working directory; every data artefact
# used by the notebook lives in its "Data" sub-directory.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "Data")


def _data_path(filename):
    """Return the path of ``filename`` inside DATA_DIR."""
    return os.path.join(DATA_DIR, filename)


EXCEL_FILE = _data_path("query_responses.xlsx")
FEEDBACK_FILE = _data_path("feedback_dataset.json")
PROMPT_VISUALISATION_FILE = _data_path("prompt_visualisation.txt")
PROCESSED_CONTENT_FILE = _data_path("processed_content.txt")
UPLOADED_FILE = _data_path("uploaded_document.txt")

In [4]:
def _list_htm_files():
    """
    Collect every ``.htm`` file found under DATA_DIR, descending into subdirectories.

    Returns:
        list: Paths built by joining each walked directory with the file name,
        in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(DATA_DIR)
        for name in names
        if name.endswith(".htm")
    ]

In [5]:
def extract_text(soup):
    """
    Join the text of every sufficiently long ``<p>`` element in ``soup``.

    Paragraphs whose stripped text is 20 characters or shorter are dropped;
    the remaining ones are separated by a blank line.
    """
    kept = []
    for paragraph in soup.find_all("p"):
        text = paragraph.get_text(strip=True)
        if len(text) > 20:
            kept.append(text)

    return "\n\n".join(kept)

In [6]:
def reformat_table(table_text):
    """
    Turn a grid-formatted table into ``key: value`` lines.

    Only rows shaped like ``| <digits> | <key> | <value> |`` are used, so border
    lines such as ``+----+`` are ignored. Rows with an empty key, an empty value,
    or a value equal to "none" (case-insensitive) are dropped.

    Args:
        table_text (str): Raw table text, one row per line.

    Returns:
        str: The retained ``key: value`` pairs joined by newlines.
    """
    row_pattern = re.compile(r"\|\s*(\d+)\s*\|\s*(.*?)\s*\|\s*(.*?)\s*\|")
    pairs = []

    for line in table_text.split("\n"):
        found = row_pattern.match(line)
        if found is None:
            continue

        # Group 1 is the leading numeric column; only the next two cells are kept.
        label = found.group(2).strip()
        content = found.group(3).strip()

        if not label or not content or content.lower() == "none":
            continue

        pairs.append(f"{label}: {content}")

    return "\n".join(pairs)

In [7]:
def extract_table(soup):
    """
    Render every relevant ``<table>`` in ``soup`` as ``key: value`` text.

    Each table's cell texts are collected row by row, rendered as a grid with
    ``tabulate`` and then condensed by ``reformat_table``. Tables whose cells
    consist solely of "Back" and "Forward" are skipped.

    Returns:
        str: The per-table results separated by a blank line.
    """
    rendered = []

    for table in soup.find_all("table"):
        cell_rows = [
            [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
            for tr in table.find_all("tr")
        ]

        # Presumably page-navigation tables; they carry nothing worth indexing.
        if {text for cells in cell_rows for text in cells} == {"Back", "Forward"}:
            continue

        # Grid text is only an intermediate form that reformat_table parses back.
        grid = tabulate(pd.DataFrame(cell_rows), headers="firstrow", tablefmt="grid")
        rendered.append(reformat_table(grid))

    return "\n\n".join(rendered)

In [8]:
def extract_list(soup):
    """
    Collect the item texts of every ``<ul>`` element in ``soup``.

    Returns:
        list: One inner list per ``<ul>``, holding the stripped text of each
        ``<li>`` found inside it.
    """
    return [
        [item.get_text(strip=True) for item in unordered.find_all("li")]
        for unordered in soup.find_all("ul")
    ]

In [9]:
def _body_markup(content):
    """
    Return the markup found between the literal ``<body>`` and ``</body>`` tags.

    ``str.find`` returns -1 for a missing tag. Slicing with that value directly
    would silently drop the first five characters (missing ``<body>``) or the
    last character (missing ``</body>``), so a missing tag falls back to the
    start or end of ``content`` instead.
    """
    opening = "<body>"
    start = content.find(opening)
    end = content.find("</body>")
    begin = 0 if start == -1 else start + len(opening)
    stop = len(content) if end == -1 else end
    return content[begin:stop]


def _load_content(selectedOptions=None):
    """
    Load every .htm file under DATA_DIR and convert each one into a Document.

    For each file the markup inside ``<body>`` is parsed, and its tables,
    paragraph text and lists are rendered into a single text blob that becomes
    the Document content. Files that cannot be decoded as UTF-8 are logged and
    skipped.

    ``Document`` must be imported before this function is called.

    Args:
        selectedOptions (list, optional): Accepted for interface compatibility.
            NOTE(review): it is not used to filter the output; the table, text
            and list sections are always emitted.

    Returns:
        list: One Document per successfully decoded .htm file.
    """
    htm_files = _list_htm_files()
    logging.info(f"Found {len(htm_files)} .htm files.")

    web_documents = []

    for file_path in htm_files:
        # Only the read can raise UnicodeDecodeError, so keep the try narrow.
        try:
            with open(file_path, encoding="utf-8") as file:
                content = file.read()
        except UnicodeDecodeError:
            logging.error(f"Could not read the file {file_path}. Check the file encoding.")
            continue

        # Ignore the redundant header section; only the page body is parsed.
        soup = BeautifulSoup(_body_markup(content), "html.parser")

        clean_text = extract_text(soup)
        formatted_table = extract_table(soup)
        lists = extract_list(soup)

        # The layout of this template (indentation included) becomes the indexed
        # document content, so it is kept exactly as is.
        page_text = f"""
                    
                Tables: 
                ---
                {formatted_table}
                ---
                    
                Text:
                ---
                {clean_text}
                ---
                    
                List:
                ---
                {lists}
                ---
                """

        document = Document(content=page_text)

        # Debugging aid: echo the rendered content of the limits page.
        if file_path.endswith("GEO_Limits.htm"):
            print(f"Content: {document.content}")

        web_documents.append(document)

    return web_documents

In [10]:
import os
from getpass import getpass

# Never hardcode API keys in a notebook: they end up in version control and in
# shared copies of the file. Use the key already present in the environment and
# only prompt (without echoing) when it is missing.
# NOTE(review): the key that was previously committed in this cell must be
# treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [11]:
from haystack.dataclasses import ChatMessage
from haystack.components.generators.chat import OpenAIChatGenerator

# Smoke test of the chat generator: one system instruction plus one user question.
messages = [
    ChatMessage.from_system("Always respond in German even if some input data is in other languages."),
    ChatMessage.from_user("What's Natural Language Processing? Be brief."),
]

# run() is the last expression, so its result (a dict with a "replies" list)
# is shown as the cell output.
chat_generator = OpenAIChatGenerator(model="gpt-4o-mini")
chat_generator.run(messages=messages)

{'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[TextContent(text='Natural Language Processing (NLP) ist ein Teilbereich der Künstlichen Intelligenz, der sich mit der Interaktion zwischen Computern und menschlicher Sprache beschäftigt. Ziel ist es, Computern zu ermöglichen, Texte und Sprache in einer Weise zu verstehen, zu interpretieren und zu generieren, die für Menschen natürlich und sinnvoll ist.')], _name=None, _meta={'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 69, 'prompt_tokens': 33, 'total_tokens': 102, 'completion_tokens_details': CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), 'prompt_tokens_details': PromptTokensDetails(audio_tokens=0, cached_tokens=0)}})]}

In [13]:
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder




In [15]:
# Build one Document per .htm page; the "Content:" text in the cell output
# comes from the debug print inside _load_content.
documents = _load_content()

Content: 
                    
                Tables: 
                ---
                Types: Limits
Number of curves: 450
Size of curve units: 24
Size of curve name: 90
Number of pen definitions: 20
Curve selection name: 60
Curve to lithology name: 50
Curve to lithology lithology types: 10
Data points per curve: Unlimited
Computed curve parameters: 250
Size of computed curve parameters name: 12
Computed curve expressions: 300
Size of computed curve expressions name: 25
Size of computed curve parameter description: 150
Number of 'curves for surfaces' definitions: 10
Number of curve synonym-pairs: 500
Number of tracks: 200
Number of qualitative tracks: 30
Size of track name: 75
Number of curve shades per plot: 250
Number of zones per curve shade: 50
Curve shade name length: 20
Number of data files: Unlimited
Columns per data file: 450
Size of file name (including the path names): 255
Size of file ID: 9
Number of file ID: 100
Auto file load definition name: 40
Number of mnemonics pe

In [17]:
# Use a set to track unique documents based on content and meta
# De-duplicate documents with a dict keyed on (stripped content, meta items);
# the first document seen for each key is the one that is kept.
unique_docs = {}
for doc in documents:
    doc_key = (doc.content.strip(), tuple(doc.meta.items()))
    unique_docs.setdefault(doc_key, doc)

In [18]:
# Rebind `documents` to the de-duplicated documents, in first-seen order.
documents = list(unique_docs.values())

In [19]:
# Store that receives the embedded documents; it is also handed to the retriever
# of the RAG pipeline later in the notebook.
document_store = InMemoryDocumentStore()

# Indexing pipeline: embed each document, then write it to the store.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(
    instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="doc_embedder"
)
indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="doc_writer")

# Route the embedder's output documents into the writer.
indexing_pipeline.connect("doc_embedder.documents", "doc_writer.documents")

# The run result reports how many documents were written (see the cell output).
indexing_pipeline.run({"doc_embedder": {"documents": documents}})

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

{'doc_writer': {'documents_written': 268}}

In [20]:
# Number of .htm files found on disk, for comparison with the
# "documents_written" count reported by the indexing pipeline.
print(len(_list_htm_files()))

269


In [21]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage
from haystack.components.generators.chat import OpenAIChatGenerator

In [22]:
# System prompt for the RAG assistant. The Jinja placeholders `documents` and
# `question` are filled in by the prompt builder.
# A trailing backslash joins a prompt line with the next one. It must be the very
# last character on the line: a backslash followed by a space is an invalid
# escape sequence that leaves a stray "\ " and a line break in the prompt.
# NOTE(review): the prompt mentions a "Documents" section and a "---Feedback---"
# section, but the template labels the retrieved documents "Context:" and renders
# no feedback section. The wording also contains typos ("integrated a PC-based
# integrated", "helping the users becoming"). The text is left unchanged here
# because rewording alters model behaviour — confirm and fix deliberately.
template = [
    ChatMessage.from_system(
        """
        You are an assistant for helping the users becoming more familiar with using the GEO   \
        application. 
            
        GEO is an integrated a PC-based integrated well log authoring, analysis and reporting  \
        system which has been developed for petroleum geologists, geoscientists and engineers.
            
        Answer the user's questions accurately using retrieved information from the Documents  \
        section precisely. The Document section contains the help content written by software  \
        developers for the GEO application. 
            
        Ensure that the answer is concise and answers the question to the point without the    \
        inclusion of any irrelevant information. Only the answer to the question should be     \
        outputted as the generated response. 
            
        Use the information from the section under the title "---Feedback---" as feedback for  \
        making improvements to your answers. Use the feedback as guidelines to determine which \
        area you need to improve your answer after assessing their validity and feasibility.

        Context:
        {% for document in documents %}
            {{ document.content }}
        {% endfor %}
        Question: {{ question }}
        Answer:
        """
    )
]

In [23]:
# Query-time pipeline: embed the question, retrieve documents from the store,
# build the chat prompt and call the LLM.
rag_pipe = Pipeline()
# Uses the same sentence-transformers model as the document embedder used for indexing.
rag_pipe.add_component("embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"))
rag_pipe.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
rag_pipe.add_component("prompt_builder", ChatPromptBuilder(template=template))
rag_pipe.add_component("llm", OpenAIChatGenerator(model="gpt-4o-mini"))

In [24]:
# Wire the components: query embedding -> retriever -> prompt builder -> LLM.
rag_pipe.connect("embedder.embedding", "retriever.query_embedding")
rag_pipe.connect("retriever", "prompt_builder.documents")
# connect() is the last expression, so the pipeline summary is shown as the cell output.
rag_pipe.connect("prompt_builder.prompt", "llm.messages")

<haystack.core.pipeline.pipeline.Pipeline object at 0x000002A48736CCD0>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: ChatPromptBuilder
  - llm: OpenAIChatGenerator
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.messages (List[ChatMessage])

In [33]:
# Benchmark questions sent through the RAG pipeline, one at a time.
# Presumably index-aligned with `sample_responses` defined further down — confirm
# when adding or reordering entries.
questions = [
    "What's the maximum number of lithology types in a log?",
    "How many tracks can you define in one ODF?",
    "How many curve shades can I create?",
    "How many curves can I load in one go?",
    "What's the maximum number of headers I can display in my log?",
    "How many tables can I have in my log?",
    "What's the maximum number of characters in a single text entry?",
    "How many symbols can I have in the plot at any one time?",
    "How many scales can I define?",
    "What's the maximum number of data files I can load?"
]

In [36]:
# Run every benchmark question through the RAG pipeline and collect the answers.
predicted_responses = []
for query in questions:
    # The same query text feeds the embedder (for retrieval) and the prompt.
    response = rag_pipe.run({"embedder": {"text": query}, "prompt_builder": {"question": query}})
    # Read the reply through the public ChatMessage.text accessor instead of the
    # private `_content` field, and only once per reply.
    answer = response["llm"]["replies"][0].text
    predicted_responses.append(answer)
    print(f"Question: {query}")
    print(f"Response: {answer}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: What's the maximum number of lithology types in a log?
Response: The maximum number of lithology types in a log is 450.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many tracks can you define in one ODF?
Response: The maximum number of tracks per ODF is 200.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many curve shades can I create?
Response: You can create up to 250 curve shades in one ODF.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many curves can I load in one go?
Response: You can load an unlimited number of curves in one go in GEO.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: What's the maximum number of headers I can display in my log?
Response: The document does not specify the maximum number of headers that can be displayed in your log. Please refer to the software’s limits on header quantities in the relevant documentation for that information.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many tables can I have in my log?
Response: You can have a maximum of 100 tables in one ODF file.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: What's the maximum number of characters in a single text entry?
Response: The maximum number of characters in a single text entry is determined by the size of the text box, which can be edited as needed.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many symbols can I have in the plot at any one time?
Response: You can have up to 250 shadings applied in one ODF.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How many scales can I define?
Response: You can define multiple scales for a curve in a plot, allowing for scale changes at different points in the plot.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: What's the maximum number of data files I can load?
Response: The maximum number of data files you can load in GEO is unlimited.


In [37]:
# Expected answers used as the reference column of the comparison table.
# Presumably index-aligned with `questions` — confirm when editing either list.
sample_responses = [
    450, 200, 250, 450, 50, 100,
    "250 / 32000 (varies per text type)",
    10000, 23,
    "unlimited",
]

print(f"sample response: {sample_responses}")

sample response: [450, 200, 250, 450, 50, 100, '250 / 32000 (varies per text type)', 10000, 23, 'unlimited']


In [38]:
# Dump the raw model answers collected by the evaluation loop.
print(f"predicted response: {predicted_responses}")

predicted response: ['The maximum number of lithology types in a log is 450.', 'The maximum number of tracks per ODF is 200.', 'You can create up to 250 curve shades in one ODF.', 'You can load an unlimited number of curves in one go in GEO.', 'The document does not specify the maximum number of headers that can be displayed in your log. Please refer to the software’s limits on header quantities in the relevant documentation for that information.', 'You can have a maximum of 100 tables in one ODF file.', 'The maximum number of characters in a single text entry is determined by the size of the text box, which can be edited as needed.', 'You can have up to 250 shadings applied in one ODF.', 'You can define multiple scales for a curve in a plot, allowing for scale changes at different points in the plot.', 'The maximum number of data files you can load in GEO is unlimited.']


In [39]:
# Side-by-side table of expected and predicted answers, one row per list position.
comparison_df = pd.DataFrame({
    "Expected Response": sample_responses,
    "Predicted Response": predicted_responses
})

In [40]:
import ace_tools_open as tools

In [41]:
# Render the comparison table through ace_tools_open; the captured output shows
# an ITables loader message.
tools.display_dataframe_to_user(name="Sample Question Results DataFrame", dataframe=comparison_df)

Sample Question Results DataFrame


Expected Response,Predicted Response
Loading ITables v2.2.4 from the internet... (need help?),
