In [1]:
import json
import os
from getpass import getpass

import yaml
from bs4 import BeautifulSoup
from chromadb.utils import embedding_functions
from dash import Dash, html, dcc, callback, Input, Output, State
from embedchain import App

In [2]:
# Load the notebook settings from config.yaml.
# yaml.safe_load returns None for an empty file; fall back to an empty
# mapping so that later `.get(...)` lookups on `config` do not fail.
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file) or {}

In [3]:
# Pull the embedder settings out of the loaded configuration.
# `.get(key, {})` only covers a *missing* key; a key that is present but left
# empty in YAML comes back as None, so every level is guarded with `or {}`.
embedder_config = (config or {}).get("embedder") or {}
provider = embedder_config.get("provider")
model = (embedder_config.get("config") or {}).get("model")

In [4]:
# Secrets must not live in the notebook: anyone with access to the file (or
# its history) can read them. The key that was previously written on this
# line should be treated as compromised and revoked.
# Prefer an OPENAI_API_KEY exported before the kernel starts; otherwise ask
# for it interactively without echoing it into the notebook output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [5]:
# OpenAI embedding function: the key comes from the environment and the
# model name from the embedder settings read out of config.yaml.
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name=model,
)

In [6]:
# embedchain App that the documents are added to and that answers queries.
# NOTE(review): App() is created with its defaults; neither the values read
# from config.yaml nor `embedding_function` are passed to it here — confirm
# whether that is intended.
ai_bot = App()

In [7]:
# Placeholder; replaced by the result of list_htm_files() once it is called.
file_paths = []

def list_htm_files(base_directory):
    """
    Recursively find all .htm files in a directory and its subdirectories.

    Args:
        base_directory (str): The path of the directory to start searching.

    Returns:
        list: Sorted file paths, each one prefixed with ``base_directory``
            so it can be opened as-is. Empty when the directory does not
            exist or contains no .htm files.
    """
    htm_files = []
    for root, _, files in os.walk(base_directory):
        # `root` already begins with base_directory, so joining it with the
        # file name yields the usable path directly. Gluing the pieces with a
        # literal "/" produced "Data//x.htm" when the base ended in a
        # separator.
        htm_files.extend(
            os.path.join(root, name) for name in files if name.endswith(".htm")
        )
    # os.walk makes no ordering promise; sort so repeated runs process the
    # files in the same order.
    return sorted(htm_files)

In [8]:
# Folder that is scanned for the .htm pages to load.
base_dir = "Data"

# Every .htm file found under base_dir (subfolders included).
file_paths = list_htm_files(base_dir)

# Print the list so the set of files about to be loaded is visible.
print(file_paths)

['Data/Append_Curve_Data.htm', 'Data/GEO_Limits.htm']


In [9]:
from bs4 import BeautifulSoup
import json

# Trial run on a single page: turn its limits table into tabular records.
demo_html_path = "Data/GEO_Limits.htm"

with open(demo_html_path, "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# One dict per limit row found in the table.
limits_data = []
table = soup.find("table", class_="Table_Style_1")

if table:
    rows = table.find_all("tr")
    current_section = None

    for row in rows:
        cols = row.find_all("td")
        # Only rows with exactly two cells carry a name/value pair.
        if len(cols) != 2:
            continue

        key, value = (cell.get_text(strip=True) for cell in cols)

        if key and not value:
            # A name with an empty second cell starts a new section.
            current_section = key
        elif key and current_section:
            # Regular row: record it under the section seen most recently.
            limits_data.append(dict(Section=current_section, Type=key, Limit=value))

# Write the collected rows next to the source pages as JSON.
limits_json_path = "Data/geo_limits.json"
with open(limits_json_path, "w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(limits_data, indent=4, ensure_ascii=False))

# Render the rows as a table for a visual check.
import pandas as pd

limits_df = pd.DataFrame(limits_data)
import ace_tools_open as tools
tools.display_dataframe_to_user(name="GEO Limits Data", dataframe=limits_df)

GEO Limits Data


Section,Type,Limit
Loading ITables v2.2.4 from the internet... (need help?),,


In [11]:
def extract_limits(soup):
    """Collect the two-column limit rows from the first table of a page.

    A row whose first cell has text and whose second cell is empty is taken
    as a section header; every later two-cell row with a non-empty first
    cell is recorded under that section.

    Args:
        soup: Parsed page (a BeautifulSoup object).

    Returns:
        list: Dicts with the keys "Section", "Type" and "Limit"; empty when
            the page has no table or no rows under a section header.
    """
    # Look the table up in *this* page. Reading a `table` variable left over
    # from an earlier cell would attach another page's rows to every file.
    table = soup.find("table")
    if table is None:
        return []

    limits = []
    current_section = None
    for row in table.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) != 2:
            continue
        key = cols[0].get_text(strip=True)
        value = cols[1].get_text(strip=True)
        if key and not value:
            current_section = key
        elif key and current_section:
            limits.append({"Section": current_section, "Type": key, "Limit": value})
    return limits


# Feed every page to the bot: the structured limits table (when the page has
# one) and then the raw page content.
for file_path in file_paths:
    try:
        # Read the file using the correct encoding
        with open(file_path, encoding="utf-8") as html_file:
            content = html_file.read()
    except UnicodeDecodeError:
        print(f"Error: Could not read the file {file_path}. Please check the file encoding.")
        continue

    limits_data = extract_limits(BeautifulSoup(content, "html.parser"))
    if limits_data:
        # Only add the table when something was extracted; an empty "[]"
        # document would just add noise to the store.
        ai_bot.add(json.dumps(limits_data, indent=4))

    # Add the content to the app
    ai_bot.add(content)

Inserting batches in chromadb: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]


In [12]:
app = Dash()

# Interactive pieces of the page: the question box, the submit button and
# the answer area (wrapped in a loading indicator).
question_box = dcc.Textarea(
    id="question-area",
    value=None,
    style={"width": "25%", "height": 100},
)
submit_button = html.Button(id="submit-btn", children="Submit")
response_panel = dcc.Loading(
    id="load",
    children=html.Div(id="response-area", children=""),
)

# Page layout, top to bottom.
app.layout = html.Div(
    [
        html.H1("A1 Bot Curve Data"),
        html.H3("This data uses GEO's Append Curve Data Guide for helping you understand the data."),
        html.Label("Enter your question:"),
        html.Br(),
        question_box,
        html.Br(),
        submit_button,
        response_panel,
    ]
)

In [13]:
@callback(
    Output('response-area', 'children'),
    Input('submit-btn', 'n_clicks'),
    State('question-area', 'value'),
    prevent_initial_call=True
)
def create_response(_, question):
    """Answer the question from the text area when Submit is clicked.

    Args:
        _: Click count of the submit button; only used as the trigger.
        question (str | None): Current content of the question text area.

    Returns:
        The result of ``ai_bot.query(question)``, or a short prompt when no
        question has been entered.
    """
    # The text area starts out with value=None, so Submit can fire before
    # anything was typed; do not forward an empty question to the bot.
    if not question or not question.strip():
        return "Please enter a question."

    # Answers question relating to the Curve Data
    return ai_bot.query(question)

In [14]:
# Start the Dash server when the notebook/script is executed directly.
# `app.run` is the supported entry point; `app.run_server` is deprecated and
# is no longer available in Dash 3.
if __name__ == '__main__':
    app.run(debug=False)

