In [1]:
import os

In [2]:
base_directory = "Data"
model_name = "llama3"

In [3]:
htm_files = []
for root, _, files in os.walk(base_directory):
    for file in files:
        if file.endswith(".htm"):
            relative_path = os.path.relpath(os.path.join(root, file), start=base_directory)
            htm_files.append(base_directory + "/" + relative_path)

In [4]:
htm_files

['Data/Contact_us.htm',
 'Data/csh-redirect.htm',
 'Data/First_Topic.htm',
 'Data/Help_Missing.htm',
 'Data/index.htm',
 'Data/topic.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Cement_Volume.htm',
 'Data/Computed_Curve_Templates\\Drilling\\D_Exponent.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Mechanical_Specific_Energy.htm',
 'Data/Computed_Curve_Templates\\Drilling\\Temperature_Gradient.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\C1_Sum.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Balance.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Character.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Gas_Wetness.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Inverse_Oil_Indicator.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Oil_Indicator.htm',
 'Data/Computed_Curve_Templates\\Gas\\Formation_Fluid_Evaluation\\Pixler_C1_C2_

In [5]:
from bs4 import BeautifulSoup
import pandas as pd


dataFrame = None

for file_path in htm_files:
    if file_path.endswith("GEO_Limits.htm"):
        with open(file_path, encoding="utf-8") as file:
            content = file.read()
            
        if content:
            soup = BeautifulSoup(content, 'html.parser')
            
            data = []
            
            for table in soup.find_all('table'):
                rows = table.find_all('tr')
                for row in rows:
                    cols = row.find_all('td')
                    cols = [col.get_text(strip=True) for col in cols]
                    print("Column: ", cols)
                    if cols:
                        data.append(cols)

            # Convert the list of data into a DataFrame
            dataFrame = pd.DataFrame(data, columns=['Type', 'Limit'])
            
            dataFrame = dataFrame[dataFrame['Type'] != '']
            break
        
# if dataFrame is not None:
#     print(dataFrame)
    
# store the dataFrame as a csv file
# dataFrame.to_csv(model_name + "_GEO_Limits.csv", index=False)

Column:  ['Back', 'Forward']
Column:  ['Types', 'Limits']
Column:  ['Curves', '']
Column:  ['Number of curves', '450']
Column:  ['Size of curve units', '24']
Column:  ['Size of curve name', '90']
Column:  ['Number of data files to form one curve', 'None']
Column:  ['Number of pen definitions', '20']
Column:  ['Curve selection name', '60']
Column:  ['Curve to lithology name', '50']
Column:  ['Curve to lithology lithology types', '10']
Column:  ['Data points per curve', 'Unlimited']
Column:  ['Computed curve parameters', '250']
Column:  ['Size of computed curve parameters name', '12']
Column:  ['Computed curve expressions', '300']
Column:  ['Size of computed curve expressions name', '25']
Column:  ['Size of computed curve parameter description', '150']
Column:  ["Number of 'curves for surfaces' definitions", '10']
Column:  ['Number of curve synonym-pairs', '500']
Column:  ['', '']
Column:  ['Tracks', '']
Column:  ['Number of tracks', '200']
Column:  ['Number of qualitative tracks', '30']

In [6]:
def split_table_by_subheadings(df, column_name):
    sub_tables = {}
    current_subheading = None
    sub_table_data = []
    
    # skip the first row
    df = df[1:]
    
    # first row has the columns
    column_names = df.iloc[0].to_list()
    
    df = df[1:]

    for _, row in df.iterrows():
        
        if row['Limit'] == '':  # Identify subheadings based on NaN in the 'Limit' column
            if current_subheading and sub_table_data:
                
                sub_tables[current_subheading] = pd.DataFrame(sub_table_data, columns=column_names)
                print(f"Subtable: \n{sub_tables[current_subheading]}")
                sub_table_data = []

            current_subheading = row[column_name]
        else:
            row_list = row.tolist()
            print(f"Row data: {row_list}")
            sub_table_data.append(row_list)

    # Add the last collected sub-table
    if current_subheading and sub_table_data:
        sub_tables[current_subheading] = pd.DataFrame(sub_table_data)

    return sub_tables

In [7]:
# sub_tables = split_table_by_subheadings(dataFrame, 'Type/Category')
sub_tables = split_table_by_subheadings(dataFrame, 'Type')

Row data: ['Number of curves', '450']
Row data: ['Size of curve units', '24']
Row data: ['Size of curve name', '90']
Row data: ['Number of data files to form one curve', 'None']
Row data: ['Number of pen definitions', '20']
Row data: ['Curve selection name', '60']
Row data: ['Curve to lithology name', '50']
Row data: ['Curve to lithology lithology types', '10']
Row data: ['Data points per curve', 'Unlimited']
Row data: ['Computed curve parameters', '250']
Row data: ['Size of computed curve parameters name', '12']
Row data: ['Computed curve expressions', '300']
Row data: ['Size of computed curve expressions name', '25']
Row data: ['Size of computed curve parameter description', '150']
Row data: ["Number of 'curves for surfaces' definitions", '10']
Row data: ['Number of curve synonym-pairs', '500']
Subtable: 
                                           Types     Limits
0                               Number of curves        450
1                            Size of curve units         24
2

In [8]:
third_subheading = list(sub_tables.keys())[2]  # Get the key for the third sub-table
third_sub_table = sub_tables[third_subheading]

print(f"Subheading: {third_subheading}")
print(third_sub_table)

Subheading: Curve Shading
                             Types Limits
0  Number of curve shades per plot    250
1  Number of zones per curve shade     50
2          Curve shade name length     20


In [9]:
# Task 2: Print every sub-table
for subheading, sub_table in sub_tables.items():
    print(f"Subheading: {subheading}")
    print(sub_table)
    print("\n")

Subheading: Curves
                                           Types     Limits
0                               Number of curves        450
1                            Size of curve units         24
2                             Size of curve name         90
3         Number of data files to form one curve       None
4                      Number of pen definitions         20
5                           Curve selection name         60
6                        Curve to lithology name         50
7             Curve to lithology lithology types         10
8                          Data points per curve  Unlimited
9                      Computed curve parameters        250
10        Size of computed curve parameters name         12
11                    Computed curve expressions        300
12       Size of computed curve expressions name         25
13  Size of computed curve parameter description        150
14   Number of 'curves for surfaces' definitions         10
15                 Nu

In [10]:
from pathlib import Path

current_folder_path = Path.cwd()

for subheading, sub_table in sub_tables.items():
    # save the item as a sub-table file in a sub directory "csv-file/intro/limits"
    if not os.path.exists("csv-files/intro/limits"):
        os.makedirs("csv-files/intro/limits")
        
    try:
        sub_table.to_csv("csv-files/intro/limits/" + model_name + "_" + subheading + ".csv", index=False)
    except Exception as e:
        pass

In [11]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PandasCSVReader

training_data = []

for subheading, sub_table in sub_tables.items():
    # Construct the folder path
    folder_path = current_folder_path / "csv-files" / "intro" / "limits"
    
    # Construct the full file path
    file_path = folder_path / f"{model_name}_{subheading}.csv"
    
    parser = PandasCSVReader()
    fe = {".csv":parser}
    loader = SimpleDirectoryReader(folder_path, file_extractor=fe)
    docs = loader.load_data()
    for doc in docs:
        if doc.metadata['file_type'] == "text/csv":
            # replace "," with ": " in the text
            if isinstance(doc.text, str):
                rows = doc.text.splitlines()
                
                for i in range(len(rows)):
                    key = rows[i].split(",")[0]
                    value = rows[i].split(",")[1]
                    
                    train_entry = {"Question": key, "Answer": value}
                    
                    print(f"Question: {key}, Answer: {value}")
            
                    training_data.append(train_entry)

Question: Correlation items, Answer:  50
Question: Number of curve shades per plot, Answer:  250
Question: Number of zones per curve shade, Answer:  50
Question: Curve shade name length, Answer:  20
Question: Number of curves, Answer:  450
Question: Size of curve units, Answer:  24
Question: Size of curve name, Answer:  90
Question: Number of data files to form one curve, Answer:  None
Question: Number of pen definitions, Answer:  20
Question: Curve selection name, Answer:  60
Question: Curve to lithology name, Answer:  50
Question: Curve to lithology lithology types, Answer:  10
Question: Data points per curve, Answer:  Unlimited
Question: Computed curve parameters, Answer:  250
Question: Size of computed curve parameters name, Answer:  12
Question: Computed curve expressions, Answer:  300
Question: Size of computed curve expressions name, Answer:  25
Question: Size of computed curve parameter description, Answer:  150
Question: Number of 'curves for surfaces' definitions, Answer:  10

In [12]:
training_data

[{'Question': 'Correlation items', 'Answer': ' 50'},
 {'Question': 'Number of curve shades per plot', 'Answer': ' 250'},
 {'Question': 'Number of zones per curve shade', 'Answer': ' 50'},
 {'Question': 'Curve shade name length', 'Answer': ' 20'},
 {'Question': 'Number of curves', 'Answer': ' 450'},
 {'Question': 'Size of curve units', 'Answer': ' 24'},
 {'Question': 'Size of curve name', 'Answer': ' 90'},
 {'Question': 'Number of data files to form one curve', 'Answer': ' None'},
 {'Question': 'Number of pen definitions', 'Answer': ' 20'},
 {'Question': 'Curve selection name', 'Answer': ' 60'},
 {'Question': 'Curve to lithology name', 'Answer': ' 50'},
 {'Question': 'Curve to lithology lithology types', 'Answer': ' 10'},
 {'Question': 'Data points per curve', 'Answer': ' Unlimited'},
 {'Question': 'Computed curve parameters', 'Answer': ' 250'},
 {'Question': 'Size of computed curve parameters name', 'Answer': ' 12'},
 {'Question': 'Computed curve expressions', 'Answer': ' 300'},
 {'Que

In [13]:
from transformers import BertModel
# import AdamW model 
from transformers import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
import torch
model = BertModel.from_pretrained("google-bert/bert-base-uncased")
optimiser = AdamW(model.parameters(), lr=5e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [15]:
# import a library for a dataloader
from torch.utils.data import DataLoader

In [16]:
train_dataloader = DataLoader(training_data, batch_size=32, shuffle=True)

In [19]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
question = input()

template = """
    Answer questions for users who wanted to look for help from the GEO help Guide.

    Question: {question}

    As a GEO help guide, I can help you with the following topics:
    {topics}
    
    Ensure that the answer is relevant to the topic and provide the information in a concise structured format.

    Answer: 
"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model = "llama3")

chain = prompt | model # chain the operations together.
topics = "Curve Data, MWD, LWD"
response = chain.invoke({"question": question, "topics": topics})

In [None]:
print(response)