In [89]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser, CodeSplitter
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import VectorStoreIndex, Document
from langchain.docstore.document import Document as Doc
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import numpy as np
from tqdm import tqdm
from llama_index.core.retrievers import QueryFusionRetriever
import os
from llama_index.llms.azure_openai import AzureOpenAI

In [90]:
# Load the evaluation set from the working directory and preview its first rows.
# The calls further down read the 'html', 'query' and 'node' columns of this frame.
# NOTE(review): the preview shows an 'Unnamed: 0' column, so the CSV was presumably
# written together with its index — confirm whether index_col=0 is wanted.
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,url,html,query,node,code
0,https://github.com,"<html lang=""en"" data-a11y-animated-images=""sys...",click on start a free entreprise trial,"<a class=""btn-mktg home-campaign-enterprise bt...",# Let's proceed step by step.\n# First we need...
1,https://huggingface.co/,"<html class=""""><head>\n\t\t<meta charset=""utf-...",click on datasets,"<li><a class=""group flex items-center px-2 py-...",# Let's proceed step by step.\n# First we need...
2,https://www.irs.gov,"<html lang=""en"" dir=""ltr"" prefix=""content: htt...",click on criminal investigation,</ul>\n \n </li>\n ...,# Let's proceed step by step.\n# First we need...
3,https://www.kaggle.com,"<html lang=""en""><head><meta http-equiv=""origin...",click on competitions,"<div class=""sc-jiSpbx kcFuZv""><div class=""sc-h...",# Let's proceed step by step.\n# First we need...
4,https://en.wikipedia.org/wiki/Main_Page,"<html class=""client-js vector-feature-language...",click on create account,"<div id=""p-vector-user-menu-overflow"" class=""v...",# Let's proceed step by step.\n# First we need...


In [91]:
# Name of the Hugging Face embedding model, and the embedding object built from it.
# `embed` is what the index builders receive and forward as `embed_model`.
DEFAULT_EMBED_MODEL = "BAAI/bge-small-en-v1.5"
embed = HuggingFaceEmbedding(DEFAULT_EMBED_MODEL)

In [92]:
def get_index_recursive(embed, html):
    """Index `html` after chunking it with LangChain's recursive HTML splitter.

    Args:
        embed: Embedding model passed to `VectorStoreIndex` as `embed_model`.
        html: HTML source of one page, as a string.

    Returns:
        A `VectorStoreIndex` built from the chunks whose text is non-empty.
    """
    page = Document(text=html)

    lc_splitter = RecursiveCharacterTextSplitter.from_language(language="html")
    parser = LangchainNodeParser(lc_splitter=lc_splitter)

    # Keep only chunks that actually carry text.
    chunks = []
    for chunk in parser.get_nodes_from_documents([page]):
        if chunk.text:
            chunks.append(chunk)

    return VectorStoreIndex(chunks, embed_model=embed)
    
def get_index_code(embed, html):
    """Index `html` after chunking it with LlamaIndex's `CodeSplitter`.

    Args:
        embed: Embedding model passed to `VectorStoreIndex` as `embed_model`.
        html: HTML source of one page, as a string.

    Returns:
        A `VectorStoreIndex` built from the chunks whose text is non-empty.
    """
    page = Document(text=html)

    splitter_options = {
        "language": "html",
        "chunk_lines": 50,  # lines per chunk
        "chunk_lines_overlap": 15,  # lines overlap between chunks
        "max_chars": 2000,  # max chars per chunk
    }
    parser = CodeSplitter(**splitter_options)

    # Keep only chunks that actually carry text.
    chunks = []
    for chunk in parser.get_nodes_from_documents([page]):
        if chunk.text:
            chunks.append(chunk)

    return VectorStoreIndex(chunks, embed_model=embed)

In [93]:
# Azure OpenAI configuration for the LLM that generates the extra fusion queries.
#
# SECURITY: the API key is a secret and must not be written in the notebook.
# Export AZURE_OPENAI_KEY in the environment before starting the kernel.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and rotated.
api_key = os.getenv("AZURE_OPENAI_KEY")
if not api_key:
    raise RuntimeError(
        "AZURE_OPENAI_KEY is not set; export it in the environment before running this notebook."
    )

# Non-secret settings keep the values this notebook has always used.
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://canada-mith-oai.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "gpt-4-turbo"
api_version = "2023-05-15"
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
model = "gpt-4"
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "gpt-4-turbo")

# temperature=0.0 is passed through to the client unchanged.
llm = AzureOpenAI(
    model=model,
    deployment_name=deployment_name,
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    temperature=0.0,
)

In [94]:
def get_retriever_recursive(embed, html):
    """Return a BM25 retriever (top 3 results) over the recursive-splitter index of `html`."""
    return BM25Retriever.from_defaults(
        index=get_index_recursive(embed, html),
        similarity_top_k=3,
    )

def get_retriever_code(embed, html):
    """Return a BM25 retriever (top 3 results) over the code-splitter index of `html`."""
    return BM25Retriever.from_defaults(
        index=get_index_code(embed, html),
        similarity_top_k=3,
    )

In [95]:
# Prompt template handed to the fusion retrievers as `query_gen_prompt`.
# It exposes two placeholders: {num_queries} and {query}.
QUERY_GEN_PROMPT = "".join(
    [
        "You are a helpful assistant that generates multiple queries ",
        "for net browsing based on a single input query. ",
        "Generate {num_queries} queries, one on each line, ",
        "to accomplish the same action as the following input query:\n",
        "Query: {query}\n",
        "Queries:\n",
    ]
)

In [96]:
def get_retriever_recursive_fusion(embed, html):
    """Build a fusion retriever over the recursive-splitter index of `html`.

    The index's default retriever and a BM25 retriever (top 3) are combined by
    `QueryFusionRetriever` in "reciprocal_rerank" mode, using the module-level
    `llm` and `QUERY_GEN_PROMPT` for query generation. Verbose output is on.

    Args:
        embed: Embedding model used to build the index.
        html: HTML source of one page, as a string.

    Returns:
        The configured `QueryFusionRetriever`.
    """
    index = get_index_recursive(embed, html)
    bm25_retriever = BM25Retriever.from_defaults(index=index, similarity_top_k=3)
    vector_retriever = index.as_retriever()

    fusion_options = {
        "similarity_top_k": 3,
        "num_queries": 6,  # set this to 1 to disable query generation
        "mode": "reciprocal_rerank",
        "verbose": True,
        "use_async": False,
        "llm": llm,
        "query_gen_prompt": QUERY_GEN_PROMPT,  # overrides the default query generation prompt
    }
    return QueryFusionRetriever([vector_retriever, bm25_retriever], **fusion_options)

def get_retriever_code_fusion(embed, html):
    """Build a fusion retriever over the code-splitter index of `html`.

    The index's default retriever and a BM25 retriever (top 3) are combined by
    `QueryFusionRetriever` in "reciprocal_rerank" mode, using the module-level
    `llm` and `QUERY_GEN_PROMPT` for query generation.

    Args:
        embed: Embedding model used to build the index.
        html: HTML source of one page, as a string.

    Returns:
        The configured `QueryFusionRetriever`.
    """
    index = get_index_code(embed, html)
    bm25_retriever = BM25Retriever.from_defaults(index=index, similarity_top_k=3)
    vector_retriever = index.as_retriever()

    fusion_options = {
        "similarity_top_k": 3,
        "num_queries": 6,  # set this to 1 to disable query generation
        "mode": "reciprocal_rerank",
        "use_async": False,
        "llm": llm,
        "query_gen_prompt": QUERY_GEN_PROMPT,  # overrides the default query generation prompt
    }
    return QueryFusionRetriever([vector_retriever, bm25_retriever], **fusion_options)

In [97]:
def get_retriever_recursive_as_retriever(embed, html):
    """Return the recursive-splitter index's own retriever, limited to the top 3 results."""
    return get_index_recursive(embed, html).as_retriever(similarity_top_k=3)

In [98]:
# Non-fusion retriever factories, keyed by label. Each value is a function that is
# called as factory(embed, html) and returns a retriever.
retrievers = {'retriever_recursive': get_retriever_recursive,
              'get_retriever_recursive_as_retriever': get_retriever_recursive_as_retriever,
              }

In [99]:
# Fusion retriever factories, keyed by label. Each value is a function that is
# called as factory(embed, html) and returns a QueryFusionRetriever.
retrievers_fusion = {'retriever_recursive_fusion': get_retriever_recursive_fusion,
                     'get_retriever_code_fusion': get_retriever_code_fusion,
                        }

In [100]:
def longest_common_substring(s1, s2):
    """Find the longest contiguous run shared by `s1` and `s2`.

    Uses the classic dynamic-programming recurrence, but keeps only the previous
    and current rows of the table, so memory is proportional to len(s2) instead
    of len(s1) * len(s2). Results are identical to the full-table version.

    Args:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        (length, substring): the length of the longest common substring and the
        substring itself, sliced from `s1`. When several substrings tie, the one
        that ends first in `s1` is returned. Empty input yields (0, s1[0:0]).
    """
    width = len(s2)
    longest, end_pos = 0, 0
    previous_row = [0] * (width + 1)

    for i, ch1 in enumerate(s1, start=1):
        current_row = [0] * (width + 1)
        for j, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                # Extend the run that ended on the previous character of both inputs.
                run = previous_row[j - 1] + 1
                current_row[j] = run
                if run > longest:
                    longest, end_pos = run, i
        previous_row = current_row

    return longest, s1[end_pos - longest: end_pos]

def compute_lcs_scores(true_node, output_node):
    """Score how much of each string is covered by their longest common substring.

    Args:
        true_node: Reference text.
        output_node: Retrieved text to compare against the reference.

    Returns:
        [true_node_score, output_node_score]: the longest-common-substring length
        divided by len(true_node) and by len(output_node) respectively.
        [0.0, 0.0] is returned when either input is empty, since the ratios are
        undefined and would otherwise raise ZeroDivisionError.
    """
    # Guard against empty input before normalising by the lengths.
    if not true_node or not output_node:
        return [0.0, 0.0]

    lcs_length, _ = longest_common_substring(true_node, output_node)
    return [lcs_length / len(true_node), lcs_length / len(output_node)]

def calculate_metrics(embed, get_retrievers, df):
    """Score every retriever factory in `get_retrievers` on every row of `df`.

    For each row and each factory, a retriever is built from the row's 'html'
    and queried with the row's 'query'. Every result is compared with the row's
    'node' through `compute_lcs_scores`, and the pair with the highest first
    component is kept for that row. A mean summary is printed per retriever.

    Args:
        embed: Embedding model forwarded to each factory.
        get_retrievers: Mapping of label -> factory, called as factory(embed, html).
        df: DataFrame providing the 'query', 'node' and 'html' columns.

    Returns:
        (score, text): `score[label]` is a NumPy array holding one
        [true_node_score, output_node_score] pair per row; `text[label]` is the
        list of raw retrieval results per row.
    """
    score = {label: [] for label in get_retrievers}
    text = {label: [] for label in get_retrievers}

    for row_idx in tqdm(range(len(df))):
        for retriever_name in get_retrievers:
            row = df.iloc[row_idx]
            query, true_node = row['query'], row['node']
            retriever = get_retrievers[retriever_name](embed, row['html'])
            results = retriever.retrieve(query)

            best = [0.0, 0.0]
            for result in results:
                # The 'retriever_window' label is scored on metadata['window'];
                # every other label is scored on the result's own text.
                if retriever_name == 'retriever_window':
                    candidate_text = result.metadata['window']
                else:
                    candidate_text = result.get_text()
                candidate = compute_lcs_scores(true_node, candidate_text)
                if candidate[0] > best[0]:
                    best = candidate

            score[retriever_name].append(best)
            text[retriever_name].append(results)

    for retriever_name in get_retrievers:
        score[retriever_name] = np.array(score[retriever_name])
        print(f'{retriever_name} - true_in_output_score: {score[retriever_name].mean(axis=0)[0]}, output_in_true_score: {score[retriever_name].mean(axis=0)[1]}')
    return score, text

In [101]:
# Evaluate the fusion retrievers on every row of df. This calls the LLM for query
# generation, so it is slow (the recorded run took about 16 s per row).
score, text = calculate_metrics(embed, retrievers_fusion, df)

  0%|          | 0/10 [00:00<?, ?it/s]

Generated queries:
1. How to initiate a free enterprise trial
2. Steps to begin a free trial for business software
3. Guide to starting a free enterprise version trial
4. Instructions for activating a free trial of enterprise solutions
5. Process for enrolling in a free enterprise package trial


 10%|█         | 1/10 [00:19<02:54, 19.39s/it]

Generated queries:
1. Find datasets to download
2. Access datasets online
3. Locate downloadable datasets
4. Search for datasets to click on
5. Navigate to dataset repositories


 20%|██        | 2/10 [00:30<01:56, 14.62s/it]

Generated queries:
1. How to start a criminal investigation process
2. Steps involved in conducting a criminal investigation
3. Criminal investigation techniques and procedures
4. Best practices for law enforcement in criminal investigations
5. Criminal investigation training and resources online


 30%|███       | 3/10 [00:52<02:04, 17.72s/it]

Generated queries:
1. Find online competitions to participate in
2. Search for current competitions to click on
3. Locate active competitions registration page
4. Access recent competitions entry forms
5. Navigate to ongoing competitions sign-up links


 40%|████      | 4/10 [01:03<01:31, 15.25s/it]

Generated queries:
1. How to create a new account online
2. Steps to register for an account on a website
3. Guide for signing up for a new user account
4. Instructions to make an account on a platform
5. Process for account creation on the internet


 50%|█████     | 5/10 [01:15<01:10, 14.14s/it]

Generated queries:
1. How to access Gmail account
2. Open Gmail inbox online
3. Gmail sign in page
4. Check Gmail messages
5. Gmail login portal


 60%|██████    | 6/10 [01:27<00:53, 13.43s/it]

Generated queries:
1. How to begin a free trial subscription
2. Steps to activate free trial offer
3. Guide to starting a trial period for a service
4. Instructions for enrolling in a free trial
5. Process for initiating a free trial membership


 70%|███████   | 7/10 [01:51<00:50, 16.88s/it]

Generated queries:
1. How to use a PC configurator tool online
2. Steps to configure a custom PC using a configurator
3. Guide to selecting components on a PC builder website
4. Instructions for clicking through a PC configuration process
5. Tutorial for using an online PC configurator to build a computer


 80%|████████  | 8/10 [02:02<00:30, 15.05s/it]

Generated queries:
1. How to focus on a search bar using keyboard shortcuts?
2. Steps to input a website address in a search bar.
3. Guide to entering a domain name in a browser's search field.
4. Instructions for clicking into a search bar and typing a URL.
5. Methods to navigate to a search bar and enter a web address manually.


 90%|█████████ | 9/10 [02:22<00:16, 16.50s/it]

Generated queries:
1. How to start using a service for free
2. Steps to sign up for a free trial online
3. Guide to accessing free version of a software
4. Instructions for clicking the "Get Started" button for free services
5. Process for initiating a free service subscription online


100%|██████████| 10/10 [02:42<00:00, 16.28s/it]

retriever_recursive_fusion - true_in_output_score: 0.8211294358507735, output_in_true_score: 0.17218202468251703
get_retriever_code_fusion - true_in_output_score: 0.815338726947284, output_in_true_score: 0.47379441734014033





In [103]:
# Display the per-row [true_node_score, output_node_score] pairs for each retriever.
score

{'retriever_recursive_fusion': array([[1.        , 0.41101152],
        [0.18035714, 0.14346591],
        [1.        , 0.45072574],
        [0.03093722, 0.01156069],
        [1.        , 0.22163309],
        [1.        , 0.11308517],
        [1.        , 0.13822246],
        [1.        , 0.03766578],
        [1.        , 0.0954557 ],
        [1.        , 0.09899418]]),
 'get_retriever_code_fusion': array([[0.99454829, 0.99454829],
        [0.18035714, 0.11542857],
        [0.90169492, 0.90424929],
        [1.        , 0.55115346],
        [1.        , 0.5005988 ],
        [1.        , 0.87363834],
        [1.        , 0.41557377],
        [0.77464789, 0.07959479],
        [1.        , 0.20557682],
        [0.30213904, 0.09758204]])}

In [105]:
# Build one column per fusion retriever holding, for every evaluated row, the
# text of its first three retrieved results. Each text is followed by a single
# space, so the joined string keeps a trailing space.
# Slicing with [:3] tolerates rows where fewer than three results came back,
# where indexing positions 0..2 directly would raise IndexError.
predicted_texts = {}
for retriever_name in retrievers_fusion:
    predicted_texts[retriever_name] = [
        "".join(result.get_text() + " " for result in results[:3])
        for results in text[retriever_name]
    ]

# Create the frame once from the collected columns instead of growing it column by column.
df2 = pd.DataFrame(predicted_texts)
    
    
    

In [117]:
# NOTE(review): this rebinds df2 to the contents of 'predicted_nodes.csv' and drops
# the frame built in the previous cell. No visible cell writes that file, and the
# execution count jumps from 105 to 117 — presumably it was saved in a cell that
# has since been removed. TODO confirm, so the notebook survives Restart & Run All.
df2 = pd.read_csv('predicted_nodes.csv')