We are only interested in the HTML element that contains the text; remove all the others.

To activate the environment:
conda activate vikunja-data

What to return from LLM?
- user instructions
- element to be highlighted

We are looking for:
- min element to locate it
- remove svg, styles, scripts, etc.
    - remove only the path elements within svg, but keep its other children

# Return a more concise version of the html 
- we will feed that cleaned html file into the LLM
- keep the DOM tree structure to provide more context for LLM
- the expected interaction with the LLM is:
    - input: the cleaned html + prompt: "show the user how to do {requirement}"
    - output: instructions + the html element to be highlighted

In [31]:
from bs4 import BeautifulSoup

def make_soup(input_path):
    """Read an HTML file and parse it into a BeautifulSoup tree.

    Args:
        input_path: Path of the HTML file to read.

    Returns:
        The BeautifulSoup object built with the built-in 'html.parser'.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the parsed text does not depend on the
    # platform's locale encoding.
    with open(input_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # No truthiness check on the result: a BeautifulSoup object is always
    # truthy, so the former `if not soup: exit()` guard could never fire.
    return BeautifulSoup(html_content, 'html.parser')

def clean_tags(soup):
    """Strip non-content markup from the tree, in place.

    Every <style> and <script> element is removed, together with every
    <path> found inside an <svg>; the <svg> elements themselves and their
    other children stay. Returns the same soup object it was given.
    """
    # Whole-element removals: stylesheets first, then scripts.
    for tag_name in ('style', 'script'):
        for node in soup.find_all(tag_name):
            node.decompose()

    # Drop only the <path> descendants of each <svg>.
    for svg_node in soup.find_all('svg'):
        for path_node in svg_node.find_all('path'):
            path_node.decompose()

    return soup


def clean_tag_attributes(soup):
    """
    reduce every element's attributes to its class plus any Vue.js
    component identifiers (data-v-*); everything else is dropped in place
    """
    for element in soup.find_all(True):
        original = dict(element.attrs)

        # 'class' goes first (when present), followed by the data-v-* keys
        # in the order they appeared on the element.
        kept = {'class': original['class']} if 'class' in original else {}
        kept.update(
            (name, value)
            for name, value in original.items()
            if name.startswith('data-v-')
        )

        element.attrs = kept

    return soup

def export_soup(soup, output_path):
    """Write the prettified HTML of `soup` to `output_path`.

    Args:
        soup: Parsed tree; only its `prettify()` method is used here.
        output_path: Destination file, opened in 'w' mode (an existing file
            is overwritten).

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    cleaned_html = soup.prettify()
    # Encode explicitly as UTF-8: with the locale default, non-ASCII page
    # text can fail to encode or be written in a platform-specific charset.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_html)

    print(f"Cleaned HTML file saved to {output_path}")

In [32]:
# Source page, and where its cleaned copy goes (same folder, "_cleaned" suffix).
file_name = "Current_Tasks_Vikunja.html"
input_path = f"data/Current_Tasks_Vikunja_files/{file_name}"
output_path = input_path.replace(".html", "_cleaned.html")

# Pipeline: parse -> strip tags -> strip attributes -> write to disk.
soup = clean_tag_attributes(clean_tags(make_soup(input_path)))
export_soup(soup, output_path)

Cleaned HTML file saved to data/Current_Tasks_Vikunja_files/Current_Tasks_Vikunja_cleaned.html


# Return elements that contain the text we are interested in

In [33]:
# BeautifulSoup offers multiple parsers, with 'html.parser' (built-in), 'lxml' (faster), and 'html5lib' (most accurate) being the common options.

from bs4 import BeautifulSoup

# File path
file_path = "data/Current_Tasks_Vikunja_files/Current_Tasks_Vikunja.html"

# Read the page with an explicit encoding so the decoded text does not
# depend on the platform's locale, then parse it.
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
soup = BeautifulSoup(html_content, 'html.parser')
# No `if not soup` guard: a BeautifulSoup object is always truthy, so that
# check (and its exit()) could never trigger.

# First find and remove all style tags
for style_tag in soup.find_all('style'):
    style_tag.decompose()

# # Find all elements that have some text content (not empty)
# elements_with_text = [element for element in soup.find_all() if element.string and element.string.strip()]

text = "import"
# Lower-case the search term once: the candidate strings are lower-cased
# below, so a term containing capitals would otherwise never match.
needle = text.lower()

# Find text nodes containing the term (case-insensitive partial match)
elements = soup.find_all(string=lambda t: needle in t.lower())

# Display
print("Elements found: ", len(elements))

for element in elements:
    print(element)

# Then get their parent elements, i.e. the tags that hold the matched text
elements_with_tags = [e.parent for e in elements]

# Display
print("Elements with tags: ", len(elements_with_tags))

for element in elements_with_tags:
    print(element)


Elements found:  2
Import your projects and tasks from
                other services into Vikunja:
Import your
                data into Vikunja
Elements with tags:  2
<p class="mt-4" data-v-746c9d2b="">Import your projects and tasks from
                other services into Vikunja:</p>
<a class="base-button button is-primary has-no-shadow" data-v-746c9d2b="" data-v-fefdc249="" href="http://localhost:3456/user/settings/migrate" style="--button-white-space: break-spaces;"><!-- -->Import your
                data into Vikunja</a>
