# Functions

In [11]:
from llama_index.core import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser
from bs4 import BeautifulSoup, NavigableString
import cohere

def generate_xpath(element, path=""): # used to generate dict nodes
    """Build the absolute XPath of ``element`` by walking up through its parents.

    Args:
        element: A BeautifulSoup-style node exposing ``name`` and ``parent``;
            each parent must expose ``children``.
        path: XPath suffix that is appended after the steps generated here.

    Returns:
        The XPath string, e.g. ``/html/body/div[2]/a``. A step carries a
        1-based ``[n]`` index only when the parent has several children with
        the same name. The top-most node (whose ``parent`` is ``None``)
        contributes no step.
    """
    # Iterative walk instead of recursion: same result, no recursion-depth
    # limit on deeply nested documents.
    while element.parent is not None:
        same_name = [sib for sib in element.parent.children if sib.name == element.name]
        if len(same_name) > 1:
            # Locate the element by identity. bs4 tags compare equal when their
            # name, attributes and contents match, so ``list.index`` would give
            # every identical sibling the position of the first one.
            position = next(
                i for i, sib in enumerate(same_name, start=1) if sib is element
            )
            path = f"/{element.name}[{position}]{path}"
        else:
            path = f"/{element.name}{path}"
        element = element.parent
    return path

def create_xpath_dict(html, only_body=True, max_length=200): # used to generate dict nodes
    """Collect an XPath and an attribute dict for every informative element of ``html``.

    An element is kept when it has direct text (text nodes that are its own
    children, not text of nested tags) or at least one attribute. For each
    kept element the dict holds its attributes, plus ``'text'`` when direct
    text exists, plus ``'element'`` with the tag name.

    Args:
        html: HTML markup to parse with ``html.parser``.
        only_body: When true, traverse from ``<body>``; otherwise from ``<html>``.
        max_length: Every value longer than this is cut to its first
            ``max_length`` items (characters for strings; list-valued
            attributes are sliced the same way).

    Returns:
        A tuple ``(element_xpath_list, element_attributes_list)`` of two lists
        of equal length, index-aligned. Both are empty when the chosen root
        tag is absent from the markup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    root = soup.body if only_body else soup.html
    element_xpath_list = []
    element_attributes_list = []
    if root is None:
        # Fragment without the requested root tag: nothing to traverse.
        return element_xpath_list, element_attributes_list

    stack = [root]  # depth-first traversal; last pushed child is visited first
    while stack:
        element = stack.pop()
        if element.name is None:
            continue
        element_attrs = dict(element.attrs)
        direct_text_content = ''.join(
            str(content).strip()
            for content in element.contents
            if isinstance(content, NavigableString) and content.strip()
        )
        if direct_text_content:
            element_attrs['text'] = direct_text_content
        if element_attrs:
            element_attrs['element'] = element.name
            # Cap every value so a single huge attribute cannot dominate a document.
            element_attrs = {
                key: value[:max_length] if len(value) > max_length else value
                for key, value in element_attrs.items()
            }
            element_xpath_list.append(generate_xpath(element))
            element_attributes_list.append(element_attrs)
        stack.extend(child for child in element.children if child.name is not None)

    return element_xpath_list, element_attributes_list

def get_results(cohere, query, html, top_n=5, model="rerank-english-v3.0", rank_fields=None, batch_size=1000):
    """Rank the elements of ``html`` against ``query`` with a rerank client.

    The element dicts produced by ``create_xpath_dict`` are sent to
    ``cohere.rerank`` in batches; the per-batch hits are merged and sorted by
    ``relevance_score`` so the final ranking spans the whole page.

    Args:
        cohere: Client object exposing ``rerank(...)`` (note: this parameter
            shadows the ``cohere`` module inside the function).
        query: Natural-language query to rank against.
        html: HTML markup of the page.
        top_n: Number of hits requested per batch and returned overall.
        model: Rerank model name forwarded to the client.
        rank_fields: Document fields forwarded to the client as ``rank_fields``.
        batch_size: Maximum number of documents sent per rerank call. The
            default of 1000 is the value previously hard-coded here.

    Returns:
        A tuple ``(xpaths, results)``: the XPaths of the best ``top_n``
        elements and the corresponding result dicts, whose ``'index'`` refers
        to the position in the full element list (not within the batch).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    xpath_list, attributes_list = create_xpath_dict(html)
    assert len(xpath_list) == len(attributes_list)

    ranked = []
    for offset in range(0, len(attributes_list), batch_size):
        batch = attributes_list[offset:offset + batch_size]
        response = cohere.rerank(model=model, query=query, documents=batch, top_n=top_n, return_documents=True, rank_fields=rank_fields)
        for item in response.results:
            hit = item.dict()
            # The client reports positions within the batch; shift them so
            # they index into the complete xpath/attribute lists.
            hit['index'] += offset
            ranked.append(hit)

    ranked.sort(key=lambda hit: hit['relevance_score'], reverse=True)
    best = ranked[:top_n]
    xpaths = [xpath_list[hit['index']] for hit in best]
    return xpaths, best



def match_element(attributes, element_specs):
    """Return the index of the first spec that ``attributes`` satisfies, else ``None``.

    A spec is satisfied when every one of its keys is present in
    ``attributes`` and the values agree: when the attribute value is a list,
    the spec value must be a subset of it; otherwise the two must be equal.
    An empty spec is satisfied by any ``attributes``.
    """
    def _satisfies(spec):
        for field in spec:
            if field not in attributes:
                return False
            actual = attributes[field]
            if isinstance(actual, list):
                # Multi-valued attribute: the spec only needs to be contained in it.
                if not set(spec[field]) <= set(actual):
                    return False
            elif actual != spec[field]:
                return False
        return True

    for position, spec in enumerate(element_specs):
        if _satisfies(spec):
            return position
    return None

def return_nodes_with_xpath(nodes, xpaths, results_dict, max_length=200):
    """Annotate ``nodes`` with the XPaths of ranked elements they contain.

    Each node's text is parsed as HTML and every tag in it is compared with
    the element specs in ``results_dict`` via ``match_element``. For each
    matching tag the XPath and the spec are appended to
    ``node.metadata['xpath']`` and ``node.metadata['element']``. Both metadata
    lists are reset on every node, including nodes that end up with no match.

    Args:
        nodes: Objects exposing ``text`` (an HTML chunk) and a ``metadata`` dict.
        xpaths: XPaths index-aligned with ``results_dict``.
        results_dict: List of element specs (dicts) to look for.
        max_length: Values longer than this are cut before matching, mirroring
            the truncation ``create_xpath_dict`` applies (default 200) so that
            elements with long attributes or text can still match their spec.

    Returns:
        The nodes that contain at least one matching tag, each listed once and
        in their original order.

    Raises:
        IndexError: If ``xpaths`` is shorter than ``results_dict``.
    """
    returned_nodes = []
    for node in nodes:
        node.metadata['xpath'] = []
        node.metadata['element'] = []
        soup = BeautifulSoup(node.text, 'html.parser')
        for element in soup.descendants:
            # Text nodes, comments and the like carry no attributes to match.
            if isinstance(element, NavigableString):
                continue
            # Copy so the parsed tree is not modified while it is iterated.
            attribute = dict(element.attrs)
            direct_text_content = ''.join(
                str(content).strip()
                for content in element.contents
                if isinstance(content, NavigableString) and content.strip()
            )
            if direct_text_content:
                attribute['text'] = direct_text_content
            attribute['element'] = element.name
            attribute = {
                key: value[:max_length] if len(value) > max_length else value
                for key, value in attribute.items()
            }
            try:
                indice = match_element(attribute, results_dict)
            except TypeError:
                # A spec value that cannot be turned into a set (e.g. ``None``)
                # cannot match; keep the best-effort behaviour and move on.
                continue
            if indice is not None:
                node.metadata['xpath'].append(xpaths[indice])
                node.metadata['element'].append(results_dict[indice])
        # Report a node once, however many of its tags matched.
        if node.metadata['xpath']:
            returned_nodes.append(node)
    return returned_nodes

def get_nodes_sm(cohere, query, html, top_n=5, model="rerank-english-v3.0", rank_fields=['element', 'placeholder', 'text', 'name']):
    """Split ``html`` into chunks and keep those containing the best-ranked elements.

    The page is wrapped in a single ``Document`` and split with an HTML-aware
    ``RecursiveCharacterTextSplitter``. The page elements are ranked against
    ``query`` through ``get_results``, and ``return_nodes_with_xpath`` selects
    and annotates the chunks that contain the ranked elements.

    The ``rank_fields`` default list is only forwarded, never mutated here.
    """
    html_splitter = RecursiveCharacterTextSplitter.from_language(
        language="html",
    )
    parser = LangchainNodeParser(lc_splitter=html_splitter)
    nodes = parser.get_nodes_from_documents([Document(text=html)])

    xpaths, results = get_results(cohere, query, html, top_n=top_n, model=model, rank_fields=rank_fields)
    results_dict = [hit['document'] for hit in results]

    return return_nodes_with_xpath(nodes, xpaths, results_dict)

def get_nodes_sm_with_xpath(cohere, query, html, top_n=5, model="rerank-english-v3.0", rank_fields=['element', 'placeholder', 'text', 'name']): # used to add xpaths to the returned nodes
    """Return the selected HTML chunks as strings, each followed by its XPath list.

    Delegates chunk selection to ``get_nodes_sm`` and renders every returned
    node as its text plus a short note listing ``node.metadata['xpath']``.
    """
    selected = get_nodes_sm(cohere, query, html, top_n, model, rank_fields)
    return [
        node.text + f"""\n
        Here is a list of some xpaths of element of previous text:
        {node.metadata['xpath']}
        \n\n
        """
        for node in selected
    ]

# Example

In [12]:
import os

import requests

# Read the key from the environment (export COHERE_API_KEY=...) so the example
# is valid Python and no secret has to be written into the file.
cohere_api_key = os.environ["COHERE_API_KEY"]

# A timeout keeps the example from hanging forever on a stalled connection.
html = requests.get('https://github.com', timeout=30).text
query = 'Click on start a free entreprise trial'
co = cohere.Client(cohere_api_key)

In [13]:
# Rank the page elements against the query and collect the matching HTML
# chunks; each returned item is a string: chunk text followed by its xpaths.
nodes = get_nodes_sm_with_xpath(co, query, html, top_n=5, model="rerank-english-v3.0", rank_fields=['element', 'placeholder', 'text', 'name'])
# Bare expression so the notebook cell displays the result.
nodes

['<div class="HeaderMenu-dropdown dropdown-menu rounded m-0 p-0 py-2 py-lg-4 position-relative position-lg-absolute left-0 left-lg-n3 px-lg-4">\n          <div class="border-bottom pb-3 mb-3">\n              <span class="d-block h4 color-fg-default my-1" id="solutions-for-heading">For</span>\n            <ul class="list-style-none f5" aria-labelledby="solutions-for-heading">\n                <li>\n  <a class="HeaderMenu-dropdown-link lh-condensed d-block no-underline position-relative py-2 Link--secondary" data-analytics-event="{&quot;category&quot;:&quot;Header dropdown (logged out), Solutions&quot;,&quot;action&quot;:&quot;click to go to Enterprise&quot;,&quot;label&quot;:&quot;ref_cta:Enterprise;&quot;}" href="/enterprise">\n      Enterprise\n\n    \n</a></li>\n\n                <li>\n  <a class="HeaderMenu-dropdown-link lh-condensed d-block no-underline position-relative py-2 Link--secondary" data-analytics-event="{&quot;category&quot;:&quot;Header dropdown (logged out), Solutions&