In [45]:
from datasets import load_dataset
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter

In [46]:
from bs4 import BeautifulSoup
from haystack.telemetry import tutorial_running
import logging
import os
import pandas as pd
from tabulate import tabulate
import re

In [47]:
# All paths are resolved relative to the PARENT of the current working
# directory, so the results depend on where the notebook kernel was started.
BASE_DIR = os.path.dirname(os.getcwd())
# Folder that is walked for .htm pages and that holds the files named below.
DATA_DIR = os.path.join(BASE_DIR, "Data")
# Auxiliary files inside DATA_DIR. None of them is read or written by the
# cells visible in this part of the notebook.
EXCEL_FILE = os.path.join(DATA_DIR, "query_responses.xlsx")
FEEDBACK_FILE = os.path.join(DATA_DIR, "feedback_dataset.json")
PROMPT_VISUALISATION_FILE = os.path.join(DATA_DIR, "prompt_visualisation.txt")
PROCESSED_CONTENT_FILE = os.path.join(DATA_DIR, "processed_content.txt")
UPLOADED_FILE = os.path.join(DATA_DIR, "uploaded_document.txt")

In [48]:
def _list_htm_files():
    """
    Collect every ``.htm`` file located anywhere below ``DATA_DIR``.

    Returns:
        list: One path per matching file, formed by joining the directory
        reported by ``os.walk`` with the file name, in traversal order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(DATA_DIR)
        for name in names
        # The suffix check is case-sensitive: only a lowercase ".htm" matches.
        if name.endswith(".htm")
    ]

In [49]:
def extract_text(soup):
    """
    Gather the text of the substantial ``<p>`` elements of a parsed page.

    Args:
        soup: Parsed HTML object exposing ``find_all``.

    Returns:
        str: The stripped text of every paragraph longer than 20 characters,
        in document order, separated by blank lines.
    """
    kept = []
    for paragraph in soup.find_all("p"):
        text = paragraph.get_text(strip=True)
        # Very short paragraphs are excluded.
        if len(text) > 20:
            kept.append(text)

    return "\n\n".join(kept)

In [50]:
def reformat_table(table_text):
    """
    Turn grid-style table text into one ``key: value`` line per data row.

    Only lines shaped like ``| <digits> | <key> | <value> |`` are used;
    separator lines such as ``+----+`` do not match and are ignored. Any
    cells after the value cell are not captured.

    Args:
        table_text (str): Raw extracted table text.

    Returns:
        str: The ``key: value`` lines joined with newlines; an empty string
        when no line qualifies.
    """
    row_pattern = re.compile(r"\|\s*(\d+)\s*\|\s*(.*?)\s*\|\s*(.*?)\s*\|")
    pairs = []

    for line in table_text.split("\n"):
        found = row_pattern.match(line)
        if found is None:
            continue

        # Group 1 is the leading numeric cell, which is discarded.
        key = found.group(2).strip()
        value = found.group(3).strip()

        # Skip rows with an empty key or value, or a literal "none" value.
        if not key or not value or value.lower() == "none":
            continue

        pairs.append(f"{key}: {value}")

    return "\n".join(pairs)

In [51]:
def extract_table(soup):
    """
    Render the ``<table>`` elements of a parsed page as ``key: value`` text.

    Args:
        soup: Parsed HTML object exposing ``find_all``.

    Returns:
        str: One reformatted chunk per retained table, separated by blank
        lines. A table that yields no ``key: value`` line still contributes
        an empty chunk.
    """
    rendered = []

    for table in soup.find_all("table"):
        grid = [
            [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
            for tr in table.find_all("tr")
        ]

        # A table whose distinct cell texts are exactly "Back" and "Forward"
        # is treated as irrelevant (presumably page navigation) and dropped.
        if {text for cells in grid for text in cells} == {"Back", "Forward"}:
            continue

        # tabulate draws the rows as a grid using the first row as header;
        # reformat_table then reduces that grid to "key: value" lines.
        ascii_grid = tabulate(pd.DataFrame(grid), headers="firstrow", tablefmt="grid")
        rendered.append(reformat_table(ascii_grid))

    return "\n\n".join(rendered)

In [52]:
def extract_list(soup):
    """
    Collect the item texts of every ``<ul>`` in a parsed page.

    Args:
        soup: Parsed HTML object exposing ``find_all``.

    Returns:
        list: One inner list per ``<ul>``, holding the stripped text of each
        ``<li>`` found inside it.
    """
    return [
        [item.get_text(strip=True) for item in bullet_list.find_all("li")]
        for bullet_list in soup.find_all("ul")
    ]

In [53]:
def _load_content(selectedOptions=None):
    """
    Load every .htm file under DATA_DIR and convert each one into a Document.

    For each file the page body is parsed and its tables, paragraph text and
    bullet lists are extracted and combined into a single text block, which
    becomes the ``content`` of one ``Document``.

    Args:
        selectedOptions (list | None): Which extractors to run; any of
            ``"text"``, ``"table"`` and ``"list"``. ``None`` (the default)
            enables all three. A section that is not selected is left empty
            in the generated text.

    Returns:
        list: One ``Document`` per file that could be read as UTF-8. Files
        that fail to decode are logged and skipped.
    """
    htm_files = _list_htm_files()
    logging.info(f"Found {len(htm_files)} .htm files.")

    if selectedOptions is None:
        selectedOptions = ["text", "table", "list"]

    web_documents = []

    for file_path in htm_files:
        # Only the read can raise UnicodeDecodeError, so the handler wraps just
        # that step and the file is closed before any parsing starts.
        try:
            with open(file_path, encoding="utf-8") as file:
                content = file.read()
        except UnicodeDecodeError:
            logging.error(f"Could not read the file {file_path}. Check the file encoding.")
            continue

        soup = BeautifulSoup(content, "html.parser")

        # Work on the <body> element only, ignoring the header section. Using
        # the parsed tag (instead of slicing on the literal "<body>") also
        # handles body tags with attributes; pages without a body element are
        # processed as a whole.
        body = soup.body if soup.body is not None else soup

        formatted_table = extract_table(body) if "table" in selectedOptions else ""
        clean_text = extract_text(body) if "text" in selectedOptions else ""
        lists = extract_list(body) if "list" in selectedOptions else []

        # The whitespace below reproduces the original document layout exactly
        # (16-space indent, 20 spaces on the spacer lines), so previously
        # generated document contents stay unchanged.
        page_text = (
            "\n"
            "                    \n"
            "                Tables: \n"
            "                ---\n"
            f"                {formatted_table}\n"
            "                ---\n"
            "                    \n"
            "                Text:\n"
            "                ---\n"
            f"                {clean_text}\n"
            "                ---\n"
            "                    \n"
            "                List:\n"
            "                ---\n"
            f"                {lists}\n"
            "                ---\n"
            "                "
        )

        document = Document(content=page_text)

        # Inspection aid: echo the generated content for this one page.
        if file_path.endswith("GEO_Limits.htm"):
            print(f"Content: {document.content}")

        web_documents.append(document)

    return web_documents

In [54]:
# Build one Document per readable .htm page found under DATA_DIR, using the
# default options of _load_content().
documents = _load_content()

Content: 
                    
                Tables: 
                ---
                Types: Limits
Number of curves: 450
Size of curve units: 24
Size of curve name: 90
Number of pen definitions: 20
Curve selection name: 60
Curve to lithology name: 50
Curve to lithology lithology types: 10
Data points per curve: Unlimited
Computed curve parameters: 250
Size of computed curve parameters name: 12
Computed curve expressions: 300
Size of computed curve expressions name: 25
Size of computed curve parameter description: 150
Number of 'curves for surfaces' definitions: 10
Number of curve synonym-pairs: 500
Number of tracks: 200
Number of qualitative tracks: 30
Size of track name: 75
Number of curve shades per plot: 250
Number of zones per curve shade: 50
Curve shade name length: 20
Number of data files: Unlimited
Columns per data file: 450
Size of file name (including the path names): 255
Size of file ID: 9
Number of file ID: 100
Auto file load definition name: 40
Number of mnemonics pe

In [55]:
# Keep the first Document seen for each (stripped content, meta items) pair.
# A dict keyed by that pair is used so the Document itself stays retrievable.
unique_docs = {}
for doc in documents:
    doc_key = (doc.content.strip(), tuple(doc.meta.items()))
    # setdefault stores the Document only when the key is not present yet.
    unique_docs.setdefault(doc_key, doc)

In [56]:
# Convert back to a list of unique Document objects.
# NOTE: this step is currently disabled, so `unique_docs` is built above but not used afterwards.
# documents = list(unique_docs.values())

In [57]:
# Load the "train" split of the "bilgeyucel/seven-wonders" dataset.
dataset = load_dataset("bilgeyucel/seven-wonders", split="train")

In [58]:
# Wrap every dataset record in a Document, keeping its "content" and "meta" fields.
# NOTE(review): this rebinds `documents`, so the Documents built from the .htm
# pages by _load_content() are no longer reachable under that name from here
# on — confirm this is intended.
documents = [Document(content=doc["content"], meta=doc["meta"]) for doc in dataset]

In [59]:
import random
import textwrap

# Fixed seed so the sampled document (and the output shown below) is the same
# on every run of the notebook.
SAMPLE_SEED = 42

# Pick one Document from `documents` with a private generator, leaving the
# global `random` state untouched.
random_doc = random.Random(SAMPLE_SEED).choice(documents)

# Wrap the content at 80 columns for readability.
wrapped_content = textwrap.fill(random_doc.content, width=80)

# Show the sampled document's meta and wrapped content. The label says
# "Random content" because this is a sample, not a retrieval result.
print(f"Random Meta: {random_doc.meta}\n")
print(f"Random content:\n{wrapped_content}")

Random Meta: {'url': 'https://en.wikipedia.org/wiki/Hanging_Gardens_of_Babylon', '_split_id': 8}

Top result:
[36] There was a tradition of Assyrian royal garden building. King Ashurnasirpal
II (883–859 BC) had created a canal, which cut through the mountains. Fruit tree
orchards were planted. Also mentioned were pines, cypresses and junipers; almond
trees, date trees, ebony, rosewood, olive, oak, tamarisk, walnut, terebinth,
ash, fir, pomegranate, pear, quince, fig, and grapes. A sculptured wall panel of
Assurbanipal shows the garden in its maturity. One original panel[37] and the
drawing of another[38] are held by the British Museum, although neither is on
public display. Several features mentioned by the classical authors are
discernible on these contemporary images.  Assyrian wall relief showing gardens
in Nineveh Of Sennacherib's palace, he mentions the massive limestone blocks
that reinforce the flood defences. Parts of the palace were excavated by Austin
Henry Layard in the mid-

In [60]:
# Sentence-Transformers model name, passed to both the document embedder
# (indexing) and the text embedder (querying) further down.
model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

In [61]:
# In-memory store that the indexing pipeline writes to and the retriever reads from.
document_store = InMemoryDocumentStore()

In [62]:
from haystack.document_stores.types import DuplicatePolicy

# Indexing pipeline: embed every Document, then write it to `document_store`.
indexing_pipeline = Pipeline()

indexing_pipeline.add_component(instance=SentenceTransformersDocumentEmbedder(model=model), name="embedder")
# OVERWRITE keeps this cell re-runnable: writing the same Documents again
# replaces the stored copies instead of failing on duplicate document ids.
indexing_pipeline.add_component(
    instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
    name="writer",
)
# Feed the embedded Documents produced by the embedder into the writer.
indexing_pipeline.connect("embedder.documents", "writer.documents")

indexing_pipeline.run({"documents": documents})

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

{'writer': {'documents_written': 151}}

In [63]:
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersTextEmbedder

In [64]:
# Retriever bound to the same `document_store` the indexing pipeline wrote to.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
# Extractive reader created with its default settings.
reader = ExtractiveReader()
# warm_up() is called here, before the reader is added to a pipeline —
# presumably it loads the reader's model up front; confirm against the Haystack docs.
reader.warm_up()

In [65]:
# Query pipeline: embed the question, retrieve matching Documents, then let
# the reader extract answers from them.
extractive_qa_pipeline = Pipeline()

# The text embedder uses the same `model` as the document embedder.
extractive_qa_pipeline.add_component(instance=SentenceTransformersTextEmbedder(model=model), name="embedder")
extractive_qa_pipeline.add_component(instance=retriever, name="retriever")
extractive_qa_pipeline.add_component(instance=reader, name="reader")

# Wire the query embedding into the retriever and the retrieved Documents into the reader.
extractive_qa_pipeline.connect("embedder.embedding", "retriever.query_embedding")
# As the last expression of the cell, this call's return value is displayed
# (the pipeline summary shown in the cell output).
extractive_qa_pipeline.connect("retriever.documents", "reader.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x000002A2D4CD81D0>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: ExtractiveReader
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

In [66]:
# Question to answer; the same text is embedded for retrieval and passed to
# the reader. (Spelling corrected from "Pliney" to "Pliny".)
query = "Who was Pliny the elder"
answers = extractive_qa_pipeline.run(
    data={
        "embedder": {"text": query},
        # Number of Documents handed from the retriever to the reader.
        "retriever": {"top_k": 5},
        # Number of answers requested from the reader.
        "reader": {"query": query, "top_k": 2},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [67]:
# Print the extracted text of every answer returned by the reader. An entry
# whose data is None prints as "Answer: None".
for answer in answers["reader"]["answers"]:
    print(f"Answer: {answer.data}")

Answer: Roman writer
Answer: a Roman author
Answer: None
