In [15]:
import os

In [16]:
base_directory = "Data"
model_name = "llama3"

In [17]:
htm_files = []
for root, _, files in os.walk(base_directory):
    for file in files:
        if file.endswith(".htm"):
            relative_path = os.path.relpath(os.path.join(root, file), start=base_directory)
            htm_files.append(base_directory + "/" + relative_path)

In [18]:
htm_files

['Data/Contact_us.htm',
 'Data/csh-redirect.htm',
 'Data/First_Topic.htm',
 'Data/Help_Missing.htm',
 'Data/index.htm',
 'Data/topic.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Cement_Volume.htm',
 'Data/Computed_Curve_Templates\\Drilling\\D_Exponent.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Mechanical_Specific_Energy.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Temperature_Gradient.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\C1_Sum.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Balance.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Character.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Wetness.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Inverse_Oil_Indicator.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Oil_Indicator.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Pixler_C1_C2_

In [19]:
from bs4 import BeautifulSoup
import pandas as pd


dataFrame = None

for file_path in htm_files:
    if file_path.endswith("GEO_Limits.htm"):
        with open(file_path, encoding="utf-8") as file:
            content = file.read()
            
        if content:
            soup = BeautifulSoup(content, 'html.parser')
            
            data = []
            
            for table in soup.find_all('table'):
                rows = table.find_all('tr')
                for row in rows:
                    cols = row.find_all('td')
                    cols = [col.get_text(strip=True) for col in cols]
                    print("Column: ", cols)
                    if cols:
                        data.append(cols)

            # Convert the list of data into a DataFrame
            dataFrame = pd.DataFrame(data, columns=['Type/Category', 'Limit'])
            
            dataFrame = dataFrame[dataFrame['Type/Category'] != '']
            break
        
# if dataFrame is not None:
#     print(dataFrame)
    
# store the dataFrame as a csv file
# dataFrame.to_csv(model_name + "_GEO_Limits.csv", index=False)

Column:  ['Back', 'Forward']
Column:  ['Types', 'Limits']
Column:  ['Curves', '']
Column:  ['Number of curves', '450']
Column:  ['Size of curve units', '24']
Column:  ['Size of curve name', '90']
Column:  ['Number of data files to form one curve', 'None']
Column:  ['Number of pen definitions', '20']
Column:  ['Curve selection name', '60']
Column:  ['Curve to lithology name', '50']
Column:  ['Curve to lithology lithology types', '10']
Column:  ['Data points per curve', 'Unlimited']
Column:  ['Computed curve parameters', '250']
Column:  ['Size of computed curve parameters name', '12']
Column:  ['Computed curve expressions', '300']
Column:  ['Size of computed curve expressions name', '25']
Column:  ['Size of computed curve parameter description', '150']
Column:  ["Number of 'curves for surfaces' definitions", '10']
Column:  ['Number of curve synonym-pairs', '500']
Column:  ['', '']
Column:  ['Tracks', '']
Column:  ['Number of tracks', '200']
Column:  ['Number of qualitative tracks', '30']

In [6]:
def split_table_by_subheadings(df, column_name):
    sub_tables = {}
    current_subheading = None
    sub_table_data = []

    for _, row in df.iterrows():
        # print(f"Row data: \"{row['Limit']}\"")
        if row['Limit'] == '':  # Identify subheadings based on NaN in the 'Limit' column
            current_subheading = row[column_name]
            print(current_subheading)
            # print(sub_table_data)
            if current_subheading and sub_table_data:
                sub_tables[current_subheading] = pd.DataFrame(sub_table_data)
                sub_table_data = []
        else:
            sub_table_data.append(row)

    # Add the last collected sub-table
    if current_subheading and sub_table_data:
        sub_tables[current_subheading] = pd.DataFrame(sub_table_data)

    return sub_tables

In [7]:
sub_tables = split_table_by_subheadings(dataFrame, 'Type/Category')

Curves
Tracks
Curve Shading
Data Files
Mnemonics
Free Format Text
Track Text
Tables
Lithology (includes Structures)
Modifiers
Symbols
Lines
Headers and Trailers
Tadpole
System/General
VectDraw or Vector Object
Imaging Tools
Zones
Correlation


In [8]:
third_subheading = list(sub_tables.keys())[2]  # Get the key for the third sub-table
third_sub_table = sub_tables[third_subheading]

print(f"Subheading: {third_subheading}")
print(third_sub_table)

Subheading: Curve Shading
                   Type/Category Limit
21              Number of tracks   200
22  Number of qualitative tracks    30
23            Size of track name    75


In [9]:
# Task 2: Print every sub-table
for subheading, sub_table in sub_tables.items():
    print(f"\nSubheading: {subheading}")
    print(sub_table)


Subheading: Curves
  Type/Category    Limit
0          Back  Forward
1         Types   Limits

Subheading: Tracks
                                   Type/Category      Limit
3                               Number of curves        450
4                            Size of curve units         24
5                             Size of curve name         90
6         Number of data files to form one curve       None
7                      Number of pen definitions         20
8                           Curve selection name         60
9                        Curve to lithology name         50
10            Curve to lithology lithology types         10
11                         Data points per curve  Unlimited
12                     Computed curve parameters        250
13        Size of computed curve parameters name         12
14                    Computed curve expressions        300
15       Size of computed curve expressions name         25
16  Size of computed curve parameter descript

In [10]:
from pathlib import Path

current_folder_path = Path.cwd()

for subheading, sub_table in sub_tables.items():
    # save the item as a sub-table file in a sub directory "csv-file/intro/limits"
    if not os.path.exists("csv-files/intro/limits"):
        os.makedirs("csv-files/intro/limits")
        
    try:
        sub_table.to_csv("csv-files/intro/limits/" + model_name + "_" + subheading + ".csv", index=False)
    except Exception as e:
        pass

In [11]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PandasCSVReader

documents = []

for subheading, sub_table in sub_tables.items():
    # Construct the folder path
    folder_path = current_folder_path / "csv-files" / "intro" / "limits"
    
    # Construct the full file path
    file_path = folder_path / f"{model_name}_{subheading}.csv"
    
    parser = PandasCSVReader()
    fe = {".csv":parser}
    loader = SimpleDirectoryReader(folder_path, file_extractor=fe)
    docs = loader.load_data()
    for doc in docs:
        if doc.metadata['file_type'] == "text/csv":
            # replace "," with ": " in the text
            documents.append(doc.text)
            
print(documents)

['Correlation items, 50', 'Number of tracks, 200\nNumber of qualitative tracks, 30\nSize of track name, 75', 'Back, Forward\nTypes, Limits', 'Number of curve shades per plot, 250\nNumber of zones per curve shade, 50\nCurve shade name length, 20', 'Number of mnemonics per file, 100\nNumber of mnemonics per plot, 600\nSize of curve mnemonic, 32\nSize of file mnemonic value, 250\nSize of plot mnemonic value, 1000\nSize of mnemonic description, 40', 'Number of lines per plot, 750', 'Number of points per polygon in VOB, 20\nNumber of different fonts for VOB, 50\nSize of text in VOB, 300\nMemory for all bitmaps and VOBs, 300 KB', 'Number of symbol types, 1000\nNumber of symbols per plot, 10000', "Number of tables, 100\nNumber of rows in 'operations diary' type table, 4320\nNumber of rows in 'normal' and 'operations remarks' type table, 32000\nNumber of fields in a row`, 20\nColumns per table, 4320\nSize of table name, 29\nSize of table ID, 12\nSize of table column heading, 29\nSize of postfi

In [12]:
documents

['Correlation items, 50',
 'Number of tracks, 200\nNumber of qualitative tracks, 30\nSize of track name, 75',
 'Back, Forward\nTypes, Limits',
 'Number of curve shades per plot, 250\nNumber of zones per curve shade, 50\nCurve shade name length, 20',
 'Number of mnemonics per file, 100\nNumber of mnemonics per plot, 600\nSize of curve mnemonic, 32\nSize of file mnemonic value, 250\nSize of plot mnemonic value, 1000\nSize of mnemonic description, 40',
 'Number of lines per plot, 750',
 'Number of points per polygon in VOB, 20\nNumber of different fonts for VOB, 50\nSize of text in VOB, 300\nMemory for all bitmaps and VOBs, 300 KB',
 'Number of symbol types, 1000\nNumber of symbols per plot, 10000',
 "Number of tables, 100\nNumber of rows in 'operations diary' type table, 4320\nNumber of rows in 'normal' and 'operations remarks' type table, 32000\nNumber of fields in a row`, 20\nColumns per table, 4320\nSize of table name, 29\nSize of table ID, 12\nSize of table column heading, 29\nSize o

In [13]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
question = input()

template = """
    Answer questions for users who wanted to look for help from the GEO help Guide.

    Question: {question}

    As a GEO help guide, I can help you with the following topics:
    {topics}
    
    Ensure that the answer is relevant to the topic and provide the information in a concise structured format.

    Answer: 
"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model = "llama3")

chain = prompt | model # chain the operations together.
topics = "Curve Data, MWD, LWD"
response = chain.invoke({"question": question, "topics": topics, "documents": documents})

In [14]:
print(response)

I'm happy to help!

**Limits to Number of Columns per Data File in GEO**

As a GEO Help Guide, I'd like to clarify the limits on the number of columns per data file in GEO. According to our guidelines:

* The maximum number of columns (fields) allowed in a single GEO data file is 256.
* This limit applies to both CSV and tab-delimited files.

If you're working with a larger dataset that exceeds this limit, we recommend using multiple files or consolidating your data into a smaller number of columns. If you have any further questions or concerns, feel free to ask!

Would you like me to help with anything else, such as Curve Data, MWD, or LWD?
