In [1]:
%pip install beautifulsoup4 haystack-ai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from bs4 import BeautifulSoup

## List Data Files

In [3]:
base_directory = "Data"

In [4]:
def _list_htm_files():
    """
    Recursively finds all .htm files in the base directory and its subdirectories.
        
    Returns:
        list: A list of file paths relative to the base directory.
    """
    htm_files = []
    for root, _, files in os.walk(base_directory):
        for file in files:
            if file.endswith(".htm"):
                relative_path = os.path.relpath(os.path.join(root, file), start=base_directory)
                htm_files.append(os.path.join(base_directory, relative_path))
    
    return htm_files

In [5]:
htm_files = _list_htm_files() # DEMO: Listing all the htm_files in the Data folder. 

In [6]:
htm_files

['Data\\Contact_us.htm',
 'Data\\csh-redirect.htm',
 'Data\\First_Topic.htm',
 'Data\\Help_Missing.htm',
 'Data\\index.htm',
 'Data\\topic.htm',
 'Data\\Computed_Curve_Templates\\Drilling\\Cement_Volume.htm',
 'Data\\Computed_Curve_Templates\\Drilling\\D_Exponent.htm',
 'Data\\Computed_Curve_Templates\\Drilling\\Mechanical_Specific_Energy.htm',
 'Data\\Computed_Curve_Templates\\Drilling\\Temperature_Gradient.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\C1_Sum.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Balance.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Character.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Wetness.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Inverse_Oil_Indicator.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Oil_Indicator.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluati

## Process Data Files

In [7]:
%pip install pandas tabulate

Note: you may need to restart the kernel to use updated packages.


In [8]:
import re
import pandas as pd
from tabulate import tabulate

In [9]:
def extract_text(soup):
    # Define navigation-related keyword patterns
    navigation_keywords = [
        r'contact\s+us', r'click\s+(here|for)', r'guidance', r'help', r'support', r'assistance',
        r'maximize\s+screen', r'view\s+details', r'read\s+more', r'convert.*file', r'FAQ', r'learn\s+more'
    ]
    
    navigation_pattern = re.compile(r"|".join(navigation_keywords), re.IGNORECASE)

    # Remove navigation-related text
    for tag in soup.find_all("p"):
        if navigation_pattern.search(tag.text):
            tag.decompose()

    # Extract only meaningful paragraph text (excluding very short ones)
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p") if len(p.get_text(strip=True)) > 20]
    
    clean_text = "\n\n".join(paragraphs)
    
    return clean_text

In [10]:
def extract_list(soup):
    # Extract lists properly
    lists = []
    for ul in soup.find_all("ul"):
        items = [li.get_text(strip=True) for li in ul.find_all("li")]
        lists.append(items)
    return lists

In [11]:
def extract_table_as_text_block(file_path):
    """
    Extract tables from HTML as a single formatted text block for inclusion into page_text.
    Skips navigation tables and handles no-table cases.

    Args:
        file_path (str): Path to the file (for metadata).

    Returns:
        str: Formatted block of all tables from this file, or a message if no tables are found.
    """
    try:
        tables = pd.read_html(file_path)

        def is_navigation_table(table):
            """Detect if table is a 'navigation-only' table with just 'back' and 'forward'."""
            flattened = [str(cell).strip().lower() for cell in table.to_numpy().flatten()]
            navigation_keywords = {"back", "forward"}
            return set(flattened).issubset(navigation_keywords)
        
        def is_nan_only_table(table):
            """Detect if the entire table only contains NaN values."""
            return table.isna().all().all()

        table_texts = []
        table_count = 0

        for idx, table in enumerate(tables):
            if is_navigation_table(table) or is_nan_only_table(table):
                continue
            
            if table.shape[1] == 2:
                # Drop rows where both the second and third columns are NaN
                table = table.dropna(how='all')

                last_col = table.columns[-1]

                table[last_col] = table[last_col].fillna("")

            table_count += 1
            formatted_table = tabulate(table, headers="keys", tablefmt="grid")

            beautified_table = f"""
╔════════════════════════════════════════════════════╗
║            📊 Table {table_count} from {file_path}              ║
╚════════════════════════════════════════════════════╝

{formatted_table}

╔════════════════════════════════════════════════════╗
║            🔚 End of Table {table_count}                       ║
╚════════════════════════════════════════════════════╝
"""
            table_texts.append(beautified_table)

        if not table_texts:
            return ""

        return "\n".join(table_texts)

    except ValueError:
        # No tables found case
        return ""

In [12]:
%pip install lxml html5lib

Note: you may need to restart the kernel to use updated packages.


In [13]:
import logging 
from haystack import Document

In [14]:
def load_content():
    """
    Load and process all .htm files from the given base directory for information retrieval.
    
    Returns:
        list: A list of dictionaries containing processed text content and metadata.
    """

    # Find all .htm files in the directory
    htm_files = _list_htm_files()
    logging.info(f"Found {len(htm_files)} .htm files.")

    documents = []
    page_texts = []

    for file_path in htm_files:
        try:
            with open(file_path, encoding="utf-8") as file:
                content = file.read()

                # Extract content inside the <body> tag
                content = content[content.find("<body>") + 6 : content.find("</body>")]

                soup = BeautifulSoup(content, "html.parser")

                # Extract text, tables, and lists
                clean_text = extract_text(soup)
                formatted_table = extract_table_as_text_block(file_path)
                lists = extract_list(soup)

                # Combine extracted content
                page_text = "\n".join(filter(None, [clean_text, formatted_table, "\n".join(["• " + item for sublist in lists for item in sublist])]))

                page_texts.append(page_text)

                # Store the processed content as a Document Objects. 
                document = Document(content=page_text, meta={"source": os.path.basename(file_path)})

                documents.append(document)

        except UnicodeDecodeError:
            logging.error(f"Could not read the file {file_path}. Check the file encoding.")

    logging.info(f"Processed {len(documents)} documents.")
    
    return documents

In [15]:
from bs4 import XMLParsedAsHTMLWarning
import warnings

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [16]:
training_documents = load_content()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table[last_col] = table[last_col].fillna("")


## Document Query and Retrieval

In [17]:
%pip install sentence-transformers ollama-haystack

Note: you may need to restart the kernel to use updated packages.


In [18]:
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack_integrations.components.embedders.ollama import OllamaTextEmbedder
from haystack_integrations.components.embedders.ollama import OllamaDocumentEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

In [19]:
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

document_embedder = OllamaDocumentEmbedder()
documents_with_embeddings = document_embedder.run(training_documents)['documents']
document_store.write_documents(documents_with_embeddings, policy="overwrite")

Calculating embeddings: 100%|██████████| 9/9 [03:46<00:00, 25.14s/it]


269

In [20]:
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", OllamaTextEmbedder())
query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

query = "What is the limit to the number of curves in GEO?"

result = query_pipeline.run({"text_embedder":{"text": query}})

In [21]:
result

{'text_embedder': {'meta': {'model': 'nomic-embed-text'}},
 'retriever': {'documents': [Document(id=9bbcad922c0dce91cfaa93eca689ef00992f798cf01b984e4befaf0b55a916e7, content: 'Data typically stored in tabular format can be converted into a GEO Curve.
   
   On theHometab, on theTab...', meta: {'source': 'Create_Curve_Data_from_Table_Columns.htm'}, score: 0.6855712654921557),
   Document(id=c94c0c82f4687e0a0d2c6427ca94903137a59fc9f9cc9ae7baafbff7baa00d0a, content: 'GEO accepts curve data in ascii (space, comma and tab delimited), Las and XML format.
   
   On theGEOtab,...', meta: {'source': 'Load_Las_Curve_Data.htm'}, score: 0.6852733135198559),
   Document(id=3e7030e6c146860de658813e3f00c6511cd9318eb69aabd530313a01123a9e52, content: 'On theDefinetab, selectAutoLoad. The Auto Load Definitions dialog box will open.
   
   For a first-time l...', meta: {'source': 'Automatic_File_Load.htm'}, score: 0.6461052082908765),
   Document(id=cbc26c30555f7190c9ff5f67869a9af37322a084554d001c7519b4b

## Similarity Ranking Document Query

In [22]:
%pip install transformers[torch,sentencepiece]

Note: you may need to restart the kernel to use updated packages.


In [23]:
from haystack import Document
from haystack.components.rankers import TransformersSimilarityRanker

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
docs = [Document(content="Paris"), Document(content="Berlin")]

In [25]:
ranker = TransformersSimilarityRanker()
ranker.warm_up()

In [26]:
ranker.run(query="City in France", documents=docs, top_k=1)

{'documents': [Document(id=438e9f6860659f042dfb7c7b9e38ae1ebc95bec001df6a22755a8a02f3d1e964, content: 'Paris', score: 0.6908064484596252)]}

In [27]:
ranker.run(query="I have 20000 modifiers added ty log, why I can't I add anymore?", documents=training_documents, top_k=1)

{'documents': [Document(id=e15d4fe034a5d1e64a453b442a7780d74abbf674bde2c2dee966e51e612a9028, content: 'You can provide additional information for log elements (such as lithology units, modifiers, symbols...', meta: {'source': 'Object_Links_in_GEO.htm'}, score: 0.0011907320003956556, embedding: vector of size 768)]}

## Ollama Model (Offline)

In [29]:
%pip install langchain-ollama

Note: you may need to restart the kernel to use updated packages.


In [30]:
from langchain_ollama import OllamaLLM

In [33]:
offline_model = OllamaLLM(model="llama3.2:latest")

In [34]:
result = offline_model.invoke("Hello word")
print(result)

Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?
