In [1]:
import os

In [2]:
base_directory = "Data"
model_name = "llama3"

In [3]:
htm_files = []
for root, _, files in os.walk(base_directory):
    for file in files:
        if file.endswith(".htm"):
            relative_path = os.path.relpath(os.path.join(root, file), start=base_directory)
            htm_files.append(base_directory + "/" + relative_path)

In [4]:
htm_files

['Data/Contact_us.htm',
 'Data/csh-redirect.htm',
 'Data/First_Topic.htm',
 'Data/Help_Missing.htm',
 'Data/index.htm',
 'Data/topic.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Cement_Volume.htm',
 'Data/Computed_Curve_Templates\\Drilling\\D_Exponent.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Mechanical_Specific_Energy.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Temperature_Gradient.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\C1_Sum.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Balance.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Character.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Wetness.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Inverse_Oil_Indicator.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Oil_Indicator.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Pixler_C1_C2_

In [5]:
from bs4 import BeautifulSoup
import pandas as pd


dataFrame = None

for file_path in htm_files:
    if file_path.endswith("GEO_Limits.htm"):
        with open(file_path, encoding="utf-8") as file:
            content = file.read()
            
        if content:
            soup = BeautifulSoup(content, 'html.parser')
            
            data = []
            
            for table in soup.find_all('table'):
                rows = table.find_all('tr')
                for row in rows:
                    cols = row.find_all('td')
                    cols = [col.get_text(strip=True) for col in cols]
                    if cols:
                        data.append(cols)

            # Convert the list of data into a DataFrame
            dataFrame = pd.DataFrame(data, columns=['Type/Category', 'Limit'])
            
            dataFrame = dataFrame[dataFrame['Type/Category'] != '']
            break
        
if dataFrame is not None:
    print(dataFrame)
    
# store the dataFrame as a csv file
# dataFrame.to_csv(model_name + "_GEO_Limits.csv", index=False)

                                 Type/Category    Limit
0                                         Back  Forward
1                                        Types   Limits
2                                       Curves         
3                             Number of curves      450
4                          Size of curve units       24
..                                         ...      ...
131               Maximum significant decimals        4
133                                      Zones         
134  Number of query definitions per zone type       75
136                                Correlation         
137                          Correlation items       50

[120 rows x 2 columns]


In [6]:
def split_table_by_subheadings(df, column_name):
    sub_tables = {}
    current_subheading = None
    sub_table_data = []

    for _, row in df.iterrows():
        # print(f"Row data: \"{row['Limit']}\"")
        if row['Limit'] == '':  # Identify subheadings based on NaN in the 'Limit' column
            current_subheading = row[column_name]
            print(current_subheading)
            # print(sub_table_data)
            if current_subheading and sub_table_data:
                sub_tables[current_subheading] = pd.DataFrame(sub_table_data)
                sub_table_data = []
        else:
            sub_table_data.append(row)

    # Add the last collected sub-table
    if current_subheading and sub_table_data:
        sub_tables[current_subheading] = pd.DataFrame(sub_table_data)

    return sub_tables

In [7]:
sub_tables = split_table_by_subheadings(dataFrame, 'Type/Category')

Curves
Tracks
Curve Shading
Data Files
Mnemonics
Free Format Text
Track Text
Tables
Lithology (includes Structures)
Modifiers
Symbols
Lines
Headers and Trailers
Tadpole
System/General
VectDraw or Vector Object
Imaging Tools
Zones
Correlation


In [8]:
third_subheading = list(sub_tables.keys())[2]  # Get the key for the third sub-table
third_sub_table = sub_tables[third_subheading]

print(f"Subheading: {third_subheading}")
print(third_sub_table)

Subheading: Curve Shading
                   Type/Category Limit
21              Number of tracks   200
22  Number of qualitative tracks    30
23            Size of track name    75


In [9]:
# Task 2: Print every sub-table
for subheading, sub_table in sub_tables.items():
    print(f"\nSubheading: {subheading}")
    print(sub_table)


Subheading: Curves
  Type/Category    Limit
0          Back  Forward
1         Types   Limits

Subheading: Tracks
                                   Type/Category      Limit
3                               Number of curves        450
4                            Size of curve units         24
5                             Size of curve name         90
6         Number of data files to form one curve       None
7                      Number of pen definitions         20
8                           Curve selection name         60
9                        Curve to lithology name         50
10            Curve to lithology lithology types         10
11                         Data points per curve  Unlimited
12                     Computed curve parameters        250
13        Size of computed curve parameters name         12
14                    Computed curve expressions        300
15       Size of computed curve expressions name         25
16  Size of computed curve parameter descript