In [1]:
!pip install bs4 pandas rapidfuzz tabulate

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd

In [3]:
base_directory = "Data"

In [4]:
import os

htm_files = []
for root, _, files in os.walk(base_directory):
    for file in files:
        if file.endswith(".htm"):
            relative_path = os.path.relpath(
                os.path.join(root, file), start=base_directory
            )
            htm_files.append(os.path.join(base_directory, relative_path))

In [5]:
htm_files

['Data\\Contact_us.htm',
 'Data\\csh-redirect.htm',
 'Data\\First_Topic.htm',
 'Data\\GEO_Limits.htm',
 'Data\\Help_Missing.htm',
 'Data\\index.htm',
 'Data\\topic.htm',
 'Data\\Computed_Curve_Templates\\Drilling\\Cement_Volume.htm',
 'Data\\Computed_Curve_Templates\\Drilling\\D_Exponent.htm',
 'Data\\Computed_Curve_Templates\\Drilling\\Mechanical_Specific_Energy.htm',
 'Data\\Computed_Curve_Templates\\Drilling\\Temperature_Gradient.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\C1_Sum.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Balance.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Character.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Wetness.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Inverse_Oil_Indicator.htm',
 'Data\\Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Oil_Indicator.htm',
 'Data\\Computed_Curve_Templates\\Gas\

In [6]:
import logging
from bs4 import BeautifulSoup

# Load all htm files and extract tables
def extract_tables_from_htm_files(htm_files):

    extracted_contents = []

    for file_path in htm_files:
        try:
            with open(file_path, encoding="utf-8") as file:
                content = file.read()

                # Ignore the redundant header section from content
                body_start = content.find("<body>") + 6
                body_end = content.find("</body>")
                content = content[body_start:body_end]

                soup = BeautifulSoup(content, "html.parser")

                if soup.find("table"):
                    table_data = []

                    for table in soup.find_all("table"):
                        rows = table.find_all("tr")

                        for row in rows:
                            cols = row.find_all("td")
                            cols = [col.get_text(strip=True) for col in cols]
                            if all(not col for col in cols):
                                continue
                            # Skip rows containing "Back" and "Forward" in the first two columns
                            if len(cols) > 1 and cols[0] == "Back" and cols[1] == "Forward":
                                continue
                            table_data.append(cols)

                    if len(table_data) > 1:
                        table_headings = table_data[0]
                        table_data = table_data[1:]

                        table_data_df = pd.DataFrame(table_data, columns=table_headings)
                        extracted_contents.append(table_data_df)

        except Exception as e:
            logging.error(f"Error processing {file_path}: {e}")

    return extracted_contents

In [7]:
extracted_tables = extract_tables_from_htm_files(htm_files)

In [8]:
extracted_tables

[                                         Types Limits
 0                                       Curves       
 1                             Number of curves    450
 2                          Size of curve units     24
 3                           Size of curve name     90
 4       Number of data files to form one curve   None
 ..                                         ...    ...
 113               Maximum significant decimals      4
 114                                      Zones       
 115  Number of query definitions per zone type     75
 116                                Correlation       
 117                          Correlation items     50
 
 [118 rows x 2 columns],
          Normal  \
 0  User-defined   
 1      Computed   
 2       Spliced   
 3         Table   
 4        Edited   
 
   Curves that are loaded into GEO throughImportfrom theGEOmenu. In curve lists, the file ID will follow the curve name.  
 0  Curves that are created by the user inCurveDat...               

In [9]:
from tabulate import tabulate

for table in extracted_tables:

    print(tabulate(table, headers='keys', tablefmt='grid'))

+-----+----------------------------------------------------------------+-----------+
|     | Types                                                          | Limits    |
|   0 | Curves                                                         |           |
+-----+----------------------------------------------------------------+-----------+
|   1 | Number of curves                                               | 450       |
+-----+----------------------------------------------------------------+-----------+
|   2 | Size of curve units                                            | 24        |
+-----+----------------------------------------------------------------+-----------+
|   3 | Size of curve name                                             | 90        |
+-----+----------------------------------------------------------------+-----------+
|   4 | Number of data files to form one curve                         | None      |
+-----+----------------------------------------------------------

In [10]:
def split_table_by_subheadings(df):
    sub_tables = {}
    current_subheading = None
    sub_table_data = []
    
    # skip the first row
    df = df[1:]
    
    # first row has the columns
    column_names = df.iloc[0].to_list()
    
    df = df[1:]

    for _, row in df.iterrows():
        
        if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
            if current_subheading and sub_table_data:
                
                sub_tables[current_subheading] = pd.DataFrame(sub_table_data, columns=column_names)
                # print(f"Subtable: \n{sub_tables[current_subheading]}")
                sub_table_data = []

            current_subheading = row[0]
        else:
            row_list = row.tolist()
            sub_table_data.append(row_list)

    # Add the last collected sub-table
    if current_subheading and sub_table_data:
        sub_tables[current_subheading] = pd.DataFrame(sub_table_data)

    # if there is nothing in the subtables, return nothing.
    if bool(sub_tables) == False:
        return df
    else:
        return sub_tables

In [11]:
temp_tables = []

for table in extracted_tables:

    sub_tables = split_table_by_subheadings(table)

    print(f"Subtable: \n{sub_tables}")

    temp_tables.append(sub_tables)


Subtable: 
{'Tracks':                                 Number of curves        450
0                            Size of curve units         24
1                             Size of curve name         90
2         Number of data files to form one curve       None
3                      Number of pen definitions         20
4                           Curve selection name         60
5                        Curve to lithology name         50
6             Curve to lithology lithology types         10
7                          Data points per curve  Unlimited
8                      Computed curve parameters        250
9         Size of computed curve parameters name         12
10                    Computed curve expressions        300
11       Size of computed curve expressions name         25
12  Size of computed curve parameter description        150
13   Number of 'curves for surfaces' definitions         10
14                 Number of curve synonym-pairs        500
15                

  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  current_subheading = row[0]
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  current_subheading = row[0]
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if row[1] == '':  # Identify subheadings based on NaN in the 'Limit' column
  if

In [12]:
temp_tables

[{'Tracks':                                 Number of curves        450
  0                            Size of curve units         24
  1                             Size of curve name         90
  2         Number of data files to form one curve       None
  3                      Number of pen definitions         20
  4                           Curve selection name         60
  5                        Curve to lithology name         50
  6             Curve to lithology lithology types         10
  7                          Data points per curve  Unlimited
  8                      Computed curve parameters        250
  9         Size of computed curve parameters name         12
  10                    Computed curve expressions        300
  11       Size of computed curve expressions name         25
  12  Size of computed curve parameter description        150
  13   Number of 'curves for surfaces' definitions         10
  14                 Number of curve synonym-pairs        50

In [13]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import logging
from tabulate import tabulate

# Ensure the 'tables' directory exists
output_dir = "tables"
os.makedirs(output_dir, exist_ok=True)

def extract_tables_from_htm_files(htm_files):
    extracted_tables = []

    for file_path in htm_files:
        try:
            with open(file_path, encoding="utf-8") as file:
                content = file.read()

                # Extract only the <body> content to remove redundant headers
                body_start = content.find("<body>") + 6
                body_end = content.find("</body>")
                content = content[body_start:body_end]

                soup = BeautifulSoup(content, "html.parser")

                # Extract all tables
                for idx, table in enumerate(soup.find_all("table")):
                    rows = table.find_all("tr")
                    table_data = []

                    for row in rows:
                        cols = row.find_all(["td", "th"])
                        cols = [col.get_text(strip=True) for col in cols]
                        if all(not col for col in cols):  # Ignore empty rows
                            continue
                        if len(cols) > 1 and cols[0] == "Back" and cols[1] == "Forward":
                            continue  # Skip navigation rows
                        table_data.append(cols)

                    # Convert table data to DataFrame
                    if len(table_data) > 1:
                        headers = table_data[0]
                        data_rows = table_data[1:]

                        # Handle missing headers
                        if any("Unnamed" in col for col in headers):
                            headers = [f"Column_{i}" for i in range(len(data_rows[0]))]

                        df = pd.DataFrame(data_rows, columns=headers)
                        extracted_tables.append(df)

                        # Save in multiple formats
                        base_filename = os.path.splitext(os.path.basename(file_path))[0]
                        table_filename = f"{base_filename}_table_{idx}"
                        
                        df.to_csv(f"{output_dir}/{table_filename}.csv", index=False)
                        df.to_json(f"{output_dir}/{table_filename}.json", orient="records", indent=4)
                        with open(f"{output_dir}/{table_filename}.md", "w") as md_file:
                            md_file.write(tabulate(df, headers="keys", tablefmt="pipe"))

        except Exception as e:
            logging.error(f"Error processing {file_path}: {e}")

    return extracted_tables

# Run extraction
extracted_tables = extract_tables_from_htm_files(htm_files)

# Display results
for idx, table in enumerate(extracted_tables):
    print(f"\nTable {idx+1}:\n", tabulate(table, headers="keys", tablefmt="grid"))



Table 1:
 +-----+----------------------------------------------------------------+-----------+
|     | Types                                                          | Limits    |
|   0 | Curves                                                         |           |
+-----+----------------------------------------------------------------+-----------+
|   1 | Number of curves                                               | 450       |
+-----+----------------------------------------------------------------+-----------+
|   2 | Size of curve units                                            | 24        |
+-----+----------------------------------------------------------------+-----------+
|   3 | Size of curve name                                             | 90        |
+-----+----------------------------------------------------------------+-----------+
|   4 | Number of data files to form one curve                         | None      |
+-----+-----------------------------------------------

In [14]:
from GEO_bots import GeoCurve

df = pd.read_csv("tables/Curve_Types_in_GEO_table_1.csv")

GeoCurveBot = GeoCurve.GeoCurveChatbot(df)

In [15]:
from rapidfuzz import process

In [16]:
def find_best_match(user_input, curve_list):
    best_match, score, _ = process.extractOne(user_input, curve_list)
    return best_match if score > 70 else None  # Threshold for a match

In [17]:
curve_names = ["Normal", "User-defined", "Computed", "Spliced", "Table", "Edited"]

# Get user input
request = input("Please enter your question: ").lower()  # Convert input to lowercase

# Find if any curve name is in the input text
curve_name = next((name for name in curve_names if name.lower() in request), None)

if curve_name:
    GEO_info = GeoCurveBot.get_curve_info(curve_name)
else:
    GEO_info = "Curve name not found. Please specify a valid curve name."

In [18]:
print(f"Question: {request}")
print(f"Answer: {GEO_info}")

Question: how to plot curves for tables?
Answer: Curves that are generated from the table data.
