In [1]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser, HTMLNodeParser, CodeSplitter, SentenceWindowNodeParser
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import VectorStoreIndex, Document
from langchain.docstore.document import Document as Doc
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import numpy as np
from tqdm import tqdm
import spacy
from spacy_html_tokenizer import create_html_tokenizer
from nltk.stem import PorterStemmer
from llama_index.core.indices.keyword_table.utils import simple_extract_keywords

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the evaluation dataset and preview its first ten rows.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index; reading with index_col=0 may be cleaner — confirm.
df = pd.read_csv('dataset.csv')
df.head(10)

Unnamed: 0,url,html,query,node,code
0,https://github.com,"<html lang=""en"" data-a11y-animated-images=""sys...",click on start a free entreprise trial,"<a class=""btn-mktg home-campaign-enterprise bt...",# Let's proceed step by step.\n# First we need...
1,https://huggingface.co/,"<html class=""""><head>\n\t\t<meta charset=""utf-...",click on datasets,"<li><a class=""group flex items-center px-2 py-...",# Let's proceed step by step.\n# First we need...
2,https://www.irs.gov,"<html lang=""en"" dir=""ltr"" prefix=""content: htt...",click on criminal investigation,</ul>\n \n </li>\n ...,# Let's proceed step by step.\n# First we need...
3,https://www.kaggle.com,"<html lang=""en""><head><meta http-equiv=""origin...",click on competitions,"<div class=""sc-jiSpbx kcFuZv""><div class=""sc-h...",# Let's proceed step by step.\n# First we need...
4,https://en.wikipedia.org/wiki/Main_Page,"<html class=""client-js vector-feature-language...",click on create account,"<div id=""p-vector-user-menu-overflow"" class=""v...",# Let's proceed step by step.\n# First we need...
5,https://www.google.fr/,"<html itemscope="""" itemtype=""http://schema.org...",click on gmail,"<div class=""gb_Id gb_J gb_3f gb_Tf"" data-ogbl=...",# Let's proceed step by step.\n# First we need...
6,https://www.salesforce.com/,"<html lang=""en"" style=""--xsf-chat-agent-image:...",click on start free trial,"<pbc-button class=""cta_button__wrapper"" data-l...",# Let's proceed step by step.\n# First we need...
7,https://www.ldlc.com,"<html class=""js cssanimations csstransitions p...",click on configurateur pc,"<li><a href=""https://www.ldlc.com/configurateu...",# Let's proceed step by step.\n# First we need...
8,https://www.semrush.com/,"<html lang=""en""><head>\n <meta charset=...",click on search bar enter domain,"<div class=""index-search__input""><input type=""...",# Let's proceed step by step.\n# First we need...
9,https://www.hubspot.com,"<html lang=""en""><head>\n <meta charset=""utf...",click on get started free,"<a class=""\n cl-button -secondary -large wf-p...",# Let's proceed step by step.\n# First we need...


In [3]:
# Show the 'url' column for every row of the dataset.
df['url']

0                         https://github.com
1                    https://huggingface.co/
2                        https://www.irs.gov
3                     https://www.kaggle.com
4    https://en.wikipedia.org/wiki/Main_Page
5                     https://www.google.fr/
6                https://www.salesforce.com/
7                       https://www.ldlc.com
8                   https://www.semrush.com/
9                    https://www.hubspot.com
Name: url, dtype: object

In [10]:
# Model identifier handed to HuggingFaceEmbedding below.
DEFAULT_EMBED_MODEL = "BAAI/bge-small-en-v1.5"
# Embedding model instance built from the identifier above.
embed = HuggingFaceEmbedding(DEFAULT_EMBED_MODEL)

In [23]:
def get_nodes_code(html, chunk_lines=60, max_chars=2000, chunk_lines_overlap=15):
    """Split one HTML string into nodes with llama-index's ``CodeSplitter``.

    Args:
        html: HTML source of a single page, as a string.
        chunk_lines: Forwarded to ``CodeSplitter`` as ``chunk_lines``.
        max_chars: Forwarded to ``CodeSplitter`` as ``max_chars``.
        chunk_lines_overlap: Forwarded to ``CodeSplitter`` as
            ``chunk_lines_overlap``. Defaults to 15, the value that used to
            be hard-coded, so existing callers are unaffected.

    Returns:
        The nodes produced by the splitter, with empty-text nodes removed.
    """
    splitter = CodeSplitter(
        language="html",
        chunk_lines=chunk_lines,
        chunk_lines_overlap=chunk_lines_overlap,
        max_chars=max_chars,
    )

    # The splitter works on Document objects, so wrap the single page in one.
    nodes = splitter.get_nodes_from_documents([Document(text=html)])
    return [node for node in nodes if node.text]

def get_nodes_recursive(html):
    """Split one HTML string with LangChain's recursive character splitter.

    The LangChain splitter is configured for the "html" language and wrapped
    in a ``LangchainNodeParser`` so that it yields llama-index nodes.

    Args:
        html: HTML source of a single page, as a string.

    Returns:
        The nodes produced by the parser, with empty-text nodes removed.
    """
    docs = [Document(text=html)]

    lc_splitter = RecursiveCharacterTextSplitter.from_language(language="html")
    parser = LangchainNodeParser(lc_splitter=lc_splitter)

    return [n for n in parser.get_nodes_from_documents(docs) if n.text]

def get_nodes_html(html):
    """Split one HTML string with a default-configured ``HTMLNodeParser``.

    Args:
        html: HTML source of a single page, as a string.

    Returns:
        The nodes produced by the parser, with empty-text nodes removed.
    """
    docs = [Document(text=html)]
    parser = HTMLNodeParser()
    return [n for n in parser.get_nodes_from_documents(docs) if n.text]

def get_nodes_sentence(html):
    """Build sentence-window nodes from one HTML string.

    A ``CodeSplitter`` configured for HTML supplies the "sentence" splitting
    function, and ``SentenceWindowNodeParser`` builds the window nodes on top
    of the resulting pieces.

    Args:
        html: HTML source of a single page, as a string.

    Returns:
        The window nodes produced by the parser, with empty-text nodes removed.
    """
    docs = [Document(text=html)]

    # NOTE(review): chunk_lines_overlap (200) is larger than chunk_lines (40);
    # confirm this is intentional.
    html_splitter = CodeSplitter(
        language="html",
        chunk_lines=40,
        chunk_lines_overlap=200,
        max_chars=1000,
    )
    window_parser = SentenceWindowNodeParser(
        sentence_splitter=html_splitter.split_text,
        window_size=2,
        window_stride=1,
    )

    window_nodes = window_parser.build_window_nodes_from_documents(docs)
    return [n for n in window_nodes if n.text]

In [26]:
# Chunking strategies under evaluation, keyed 'index_code<max_chars>_<chunk_lines>'.
# Each value takes an HTML string and returns the nodes from get_nodes_code.
# The settings are bound as lambda defaults so every entry keeps its own pair.
indexs = {
    f'index_code{max_chars}_{chunk_lines}': (
        lambda x, _lines=chunk_lines, _chars=max_chars: get_nodes_code(x, _lines, _chars)
    )
    for chunk_lines in (50, 100)
    for max_chars in (2000, 3000)
}

In [8]:
def longest_common_substring(s1, s2):
    """Find the longest contiguous run of characters shared by two strings.

    Uses the classic dynamic-programming recurrence, but keeps only the
    previous and current rows of the table. Memory is therefore proportional
    to ``len(s2)`` rather than ``len(s1) * len(s2)``, which matters when both
    inputs are multi-kilobyte HTML fragments.

    Args:
        s1: First string; the returned substring is sliced from it.
        s2: Second string.

    Returns:
        A tuple ``(length, substring)``. If nothing is shared (including when
        either input is empty) the result is ``(0, '')``. When several matches
        have the same maximal length, the one ending earliest in ``s1`` wins.
    """
    n = len(s2)
    # prev_row[j] = length of the common suffix of s1[:i-1] and s2[:j].
    prev_row = [0] * (n + 1)
    longest, end_pos = 0, 0

    for i, ch1 in enumerate(s1, start=1):
        # Cells default to 0, which is also the value for a mismatch.
        curr_row = [0] * (n + 1)
        for j, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                run = prev_row[j - 1] + 1
                curr_row[j] = run
                # Strict '>' keeps the first maximal match encountered.
                if run > longest:
                    longest = run
                    end_pos = i
        prev_row = curr_row
    return longest, s1[end_pos - longest:end_pos]

def compute_lcs_scores(true_node, output_node):
    """Score how much of each string is covered by their longest common substring.

    Args:
        true_node: Reference string (the expected node).
        output_node: Candidate string (a retrieved or chunked node).

    Returns:
        ``np.array([true_node_score, output_node_score])`` where each score is
        the longest-common-substring length divided by that string's length.
        If either input is empty (or ``None``) both scores are ``0.0``.
    """
    # An empty input shares nothing with the other string; return early rather
    # than dividing by a zero length.
    if not true_node or not output_node:
        return np.array([0.0, 0.0])

    lcs_length, _ = longest_common_substring(true_node, output_node)
    return np.array([lcs_length / len(true_node), lcs_length / len(output_node)])

def calculate_metrics(indexs, df):
    """Evaluate each chunking strategy against the expected nodes in ``df``.

    For every row and every strategy, the row's 'html' is chunked and each
    chunk is scored against the row's 'node' with ``compute_lcs_scores``. The
    chunk with the highest first score (share of the true node covered) is
    kept. Per-strategy averages over all rows are printed.

    Args:
        indexs: Mapping from strategy name to a callable that takes an HTML
            string and returns nodes exposing ``.text`` (and
            ``.metadata['window']`` for the 'index_window' strategy).
        df: DataFrame with 'html' and 'node' columns.

    Returns:
        Dict mapping each strategy name to the list of best-matching node
        texts, one per row ('' when no chunk scored above zero).
    """
    score = {index_name: np.array([0.0, 0.0]) for index_name in indexs}
    texts = {index_name: [] for index_name in indexs}

    for row_idx in range(len(df)):
        for index_name, build_nodes in indexs.items():
            true_node = df.iloc[row_idx]['node']
            best_scores = np.array([0.0, 0.0])
            best_text = ''
            for node in tqdm(build_nodes(df.iloc[row_idx]['html'])):
                # Window nodes are scored on their surrounding window, but the
                # text recorded for the row is still the node's own text.
                if index_name == 'index_window':
                    candidate = node.metadata['window']
                else:
                    candidate = node.text
                node_scores = compute_lcs_scores(true_node, candidate)
                if node_scores[0] > best_scores[0]:
                    best_scores, best_text = node_scores, node.text
            score[index_name] += best_scores
            texts[index_name].append(best_text)

    # Turn the accumulated sums into per-row averages and report them.
    for index_name in indexs:
        score[index_name] /= len(df)
        print(f'{index_name} - true_in_output_score: {score[index_name][0]}, output_in_true_score: {score[index_name][1]}')
    return texts
            

In [58]:
def find_nodes(index, text):
    """Return the text of every node whose text contains ``text``.

    Args:
        index: Iterable of nodes exposing a ``.text`` attribute.
        text: Substring to look for.

    Returns:
        List of matching node texts, in iteration order.
    """
    return [node.text for node in tqdm(index) if text in node.text]