# Identify the 3 top matching chunks using LlamaIndex

LlamaIndex:
- GitHub: https://github.com/run-llama/llama_index
- Documentation: https://docs.llamaindex.ai/en/latest/

In [1]:
import pandas as pd

# Load the annotated citation dataset from the Excel workbook into a DataFrame.
annotation_workbook = "../data/ReferenceErrorDetection_data_extended_annotation.xlsx"
df = pd.read_excel(annotation_workbook)

In [2]:
# Display the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Citation ID,Statement with Citation,Corrected Statement,...,Reference Article Title,Reference Article Abstract,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation,Error Type,Added,Previously Partially Substantiated
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,cit001_1,Others have aimed to reduce irreversibility or...,Others have aimed to reduce irreversibility or...,...,A Fault Analysis Method for Three-Phase Induct...,The fault prediction and abductive fault diagn...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,cit001_2,Some researchers have also studied various hea...,Some researchers have also studied various hea...,...,Develop 24 dissimilar ANNs by suitable archite...,The artificial neural network optimization met...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,cit002_1,The relative content of total flavonoids in th...,The relative content of total flavonoids in th...,...,Lipid Data Acquisition for devices Treatment o...,"Recently, the widespread deployment of smart p...",Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,cit003_1,Research has shown that remimazolam tosylate e...,Research has shown that remimazolam tosylate e...,...,"Effect of propofol on breast cancer cell, the ...",Breast cancer is the second leading cause of c...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,cit004_1,if the efficiency of the routing algorithm is ...,If the efficiency of the routing algorithm is ...,...,Analysis and research hotspots of ceramic mate...,"From the perspective of scientometrics, comb t...",Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,


## Get text from reference article

In [3]:
# File type of the extracted reference articles. "txt" reads plain-text files
# from the `only_text/` subfolder; "xml" parses XML extractions instead. The
# value also selects the `only_text_` prefix used in the output directory names.
extension = "txt"

In [4]:
import glob

def get_file_path(reference_article_id):
    """Return the path of the extraction file for a reference article.

    Looks for files under ``../data/extractions/`` (inside ``only_text/`` when
    the global ``extension`` is ``"txt"``) whose name starts with
    ``reference_article_id`` and ends with ``.{extension}``.

    Args:
        reference_article_id: ID prefix of the reference article, e.g. ``"r001"``.

    Returns:
        The matching path, or ``None`` (after printing a notice) when nothing
        matches. If several files match, the lexicographically smallest path is
        returned so the choice is reproducible across runs and platforms.
    """
    subfolder = 'only_text/' if extension == 'txt' else ''
    file_pattern = f"../data/extractions/{subfolder}{reference_article_id}*.{extension}"

    # glob.glob returns matches in filesystem-dependent order, so sort before
    # picking one to keep the result deterministic.
    matches = sorted(glob.glob(file_pattern))
    if not matches:
        print("No matching file found.")
        return None
    return matches[0]

In [5]:
import xml.etree.ElementTree as ET

def get_reference_text(reference_article_id):
    """Return the full text of a reference article, or ``None`` if unavailable.

    The file is located via ``get_file_path``. Depending on the global
    ``extension`` it is read as plain text (``"txt"``) or parsed as XML and
    flattened to the concatenation of all text nodes (``"xml"``).

    Args:
        reference_article_id: ID of the reference article to read.

    Returns:
        The article text as a string, or ``None`` when no file was found or
        ``extension`` is neither ``"txt"`` nor ``"xml"``.
    """
    file_path = get_file_path(reference_article_id)
    if not file_path:
        return None

    if extension == "txt":
        # Explicit encoding: without it open() falls back to the locale
        # default, which differs between platforms.
        # NOTE(review): assumes the extracted .txt files are UTF-8 - confirm.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    if extension == "xml":
        # Flatten the XML tree into plain text by joining all text nodes.
        root = ET.parse(file_path).getroot()
        return ''.join(root.itertext())

    # Unsupported extension: keep the original "no text" result, but explicitly.
    return None

## Set OpenAI key

In [6]:
# Load the OpenAI API key from ../open_ai_key.txt, stripping surrounding whitespace
with open('../open_ai_key.txt', 'r') as key_file:
    open_ai_key = key_file.read().strip()

## Setting up vector index

### Reloading or generating index

In [7]:
# Name of the OpenAI embedding model; used for the global Settings.embed_model
# and for the embedding step of the ingestion pipeline.
model_embeddings = "text-embedding-3-large"

In [8]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Set the OpenAI embedding model on LlamaIndex's global Settings object.
Settings.embed_model = OpenAIEmbedding(model=model_embeddings, api_key=open_ai_key)

In [9]:
from llama_index.core import VectorStoreIndex, Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

def create_index(reference_text, chunk_size, chunk_overlap):
    """Chunk and embed ``reference_text`` and wrap the result in a vector index.

    Args:
        reference_text: Text of one reference article.
        chunk_size: Passed to ``SentenceSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``SentenceSplitter`` as ``chunk_overlap``.

    Returns:
        A ``VectorStoreIndex`` built from the nodes produced by the pipeline.
    """
    # Two-stage ingestion: split into chunks, then embed each chunk.
    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    embedder = OpenAIEmbedding(model=model_embeddings, api_key=open_ai_key)
    ingestion = IngestionPipeline(transformations=[splitter, embedder])

    # The whole article is fed in as a single document.
    article = Document(text=reference_text)
    embedded_nodes = ingestion.run(documents=[article])
    return VectorStoreIndex(embedded_nodes)

In [10]:
from llama_index.core import StorageContext, load_index_from_storage
import os

def load_or_create_index(article_id, reference_text, chunk_size, chunk_overlap, only_checking=False):
    """Load the persisted vector index of an article, building it if necessary.

    Args:
        article_id: Reference article ID; names the index sub-directory.
        reference_text: Article text. Only needed when the index has to be
            built; may be ``None`` or empty if a persisted index can be loaded.
        chunk_size: Chunk size used for building and as part of the index path.
        chunk_overlap: Chunk overlap, likewise part of the index path.
        only_checking: If ``True`` and a non-empty index directory already
            exists, print a notice and return ``True`` without loading it.
            If no such directory exists, the index is loaded/built as usual.

    Returns:
        ``True`` in the ``only_checking`` shortcut; otherwise the loaded or
        newly created index, or ``None`` when creation failed.

    Raises:
        AssertionError: If the index must be built but ``reference_text`` is
            ``None`` or empty.
    """
    index_path = f"../data/vector_indices/{'only_text_' if extension == 'txt' else ''}{chunk_size}_{chunk_overlap}/{article_id}/"

    if only_checking and os.path.exists(index_path) and os.listdir(index_path):
        print(article_id + ": Index exists.")
        return True

    try:
        storage_context = StorageContext.from_defaults(persist_dir=index_path)
        index = load_index_from_storage(storage_context)
    except Exception as e:
        # Any load failure (missing or unreadable index) falls through to a rebuild.
        print(e)
    else:
        print(article_id + ": Loaded existing index.")
        return index

    # The text is validated only here, so an already persisted index stays
    # usable even when the source text is missing or empty.
    assert reference_text is not None and reference_text != '', "Reference text cannot be None or empty."

    print(article_id + ": Creating a new index.")
    index = None
    try:
        index = create_index(reference_text, chunk_size, chunk_overlap)
        index.storage_context.persist(persist_dir=index_path)
    except Exception as e:
        # Best effort: report the failure and return whatever was built (None
        # if creation itself failed) so that a batch run can continue.
        print(e)
        print(article_id + ": Failed to create index.")
        print(reference_text)
    return index

### Create Indices for all reference articles

In [11]:
# Chunking parameters handed to SentenceSplitter; they are also part of the
# directory names used for indices, similar-chunk files and saved DataFrames.
chunk_size = 256
chunk_overlap = 20

Takes around 9 minutes per chunk_size.

In [12]:
%%time

# Visit every citation whose reference article was downloaded and make sure a
# vector index is available for that article (existing ones are only reported).
downloaded_rows = df[df['Reference Article Downloaded'] == 'Yes']
for reference_article_id in downloaded_rows['Reference Article ID']:
    if not reference_article_id:
        continue
    reference_text = get_reference_text(reference_article_id)
    index = load_or_create_index(reference_article_id, reference_text, chunk_size, chunk_overlap, only_checking=True)

r001: Index exists.
r002: Index exists.
r003: Index exists.
r004: Index exists.
r005: Index exists.
r006: Index exists.
r007: Index exists.
r008: Index exists.
r009: Index exists.
r010: Index exists.
r011: Index exists.
r012: Index exists.
r013: Index exists.
r013: Index exists.
r014: Index exists.
r015: Index exists.
r005: Index exists.
r018: Index exists.
r019: Index exists.
r020: Index exists.
r021: Index exists.
r022: Index exists.
r023: Index exists.
r024: Index exists.
r013: Index exists.
r025: Index exists.
r026: Index exists.
r028: Index exists.
r027: Index exists.
r029: Index exists.
r030: Index exists.
r031: Index exists.
r032: Index exists.
r033: Index exists.
r034: Index exists.
r035: Index exists.
r036: Index exists.
r037: Index exists.
r038: Index exists.
r039: Index exists.
r040: Index exists.
r041: Index exists.
r042: Index exists.
r043: Index exists.
r044: Index exists.
r045: Index exists.
r046: Index exists.
r047: Index exists.
r048: Index exists.
r049: Index exists.


## Identifying top 3 chunks

In [13]:
from llama_index.core.retrievers import VectorIndexRetriever

def get_top_k_similar_chunks(statement, index, k=3):
    """Query ``index`` with ``statement`` and return the retrieved nodes.

    Args:
        statement: Query text.
        index: Vector index to search.
        k: Passed to ``VectorIndexRetriever`` as ``similarity_top_k``.

    Returns:
        The result of ``VectorIndexRetriever.retrieve(statement)``.
    """
    top_k_retriever = VectorIndexRetriever(index=index, similarity_top_k=k)
    return top_k_retriever.retrieve(statement)

In [14]:
import json
import os

def save_similar_chunks(doc_ids, reference_id, chunk_size, chunk_overlap):
    """Write ``doc_ids`` as JSON to the similar-chunks file of ``reference_id``.

    The file lives in a directory named after the chunking parameters (with an
    ``only_text_`` prefix when the global ``extension`` is ``"txt"``). The
    directory is created if needed; an existing file is overwritten.
    """
    dir_prefix = 'only_text_' if extension == 'txt' else ''
    target_path = f"../data/similar_chunks/{dir_prefix}{chunk_size}_{chunk_overlap}/{reference_id}.json"
    target_dir = os.path.dirname(target_path)
    os.makedirs(target_dir, exist_ok=True)
    with open(target_path, 'w') as out:
        json.dump(doc_ids, out)

def load_similar_chunks(reference_id, chunk_size, chunk_overlap):
    """Read the JSON similar-chunks file of ``reference_id`` and return its content.

    The path is built the same way as in ``save_similar_chunks``. A missing
    file surfaces as the ``FileNotFoundError`` raised by ``open``.
    """
    dir_prefix = 'only_text_' if extension == 'txt' else ''
    source_path = f"../data/similar_chunks/{dir_prefix}{chunk_size}_{chunk_overlap}/{reference_id}.json"
    with open(source_path, 'r') as src:
        return json.load(src)

### Save the document ids and text contents of the top k chunks of all reference articles to the df

In [15]:
def get_doc_ids(response):
    """Return the node IDs of the retrieved results, in the order given.

    Args:
        response: Iterable of retrieval results; each item exposes the wrapped
            node as ``.node`` (as LlamaIndex's ``NodeWithScore`` does).

    Returns:
        List with the ``id_`` of each wrapped node.
    """
    # Read the ID straight from the node instead of serialising every result
    # with ``.dict()`` (deprecated under pydantic v2, and it converts the whole
    # node payload) just to pick out a single field.
    return [scored_node.node.id_ for scored_node in response]

In [18]:
def save_top_k_chunk_ids(df, chunk_size, chunk_overlap, k=3):
    """Retrieve and store the top-``k`` chunk IDs for downloaded reference articles.

    For each row of ``df`` whose 'Reference Article Downloaded' is 'Yes', the
    stored similar-chunks file of the reference article is looked up. Only if
    it is missing is the row's 'Corrected Statement' used to query the
    article's index, and the resulting node IDs are saved. Retrieval errors
    are printed and the loop continues. Nothing is returned.

    NOTE(review): the stored file is keyed by reference article ID alone. Rows
    that cite a reference for which a file already exists skip retrieval, so
    their own statement is never queried against the index.
    """
    for _, citation in df.iterrows():
        # Rows without a downloaded reference article are ignored entirely.
        if citation['Reference Article Downloaded'] != 'Yes':
            continue

        reference_article_id = citation['Reference Article ID']
        print(f"------ Starting {reference_article_id} ------")

        try:
            # A readable chunk file means this reference is already done.
            load_similar_chunks(reference_article_id, chunk_size, chunk_overlap)
            print("Loaded similar chunks successfully.")
        except FileNotFoundError:
            # Nothing stored yet: fetch the article text and its index ...
            article_text = get_reference_text(reference_article_id)
            article_index = load_or_create_index(reference_article_id, article_text, chunk_size, chunk_overlap)

            # ... and query it with the row's corrected statement.
            query = citation["Corrected Statement"]

            print("Receiving top chunks")

            try:
                hits = get_top_k_similar_chunks(query, article_index, k)
                hit_ids = get_doc_ids(hits)

                print("Saving top chunks")
                save_similar_chunks(hit_ids, reference_article_id, chunk_size, chunk_overlap)
            except Exception as e:
                print(e)
                print("Failed to get top chunks.")
        print("")

Takes around 6.5 minutes per chunk_size.

In [19]:
%%time 

# Store the top-3 chunk IDs for every downloaded reference article; references
# that already have a stored chunk file are skipped.
save_top_k_chunk_ids(df, chunk_size, chunk_overlap, k=3)

------ Starting r001 ------
Loaded similar chunks successfully.

------ Starting r002 ------
Loaded similar chunks successfully.

------ Starting r003 ------
Loaded similar chunks successfully.

------ Starting r004 ------
Loaded similar chunks successfully.

------ Starting r005 ------
Loaded similar chunks successfully.

------ Starting r006 ------
Loaded similar chunks successfully.

------ Starting r007 ------
Loaded similar chunks successfully.

------ Starting r008 ------
Loaded similar chunks successfully.

------ Starting r009 ------
Loaded similar chunks successfully.

------ Starting r010 ------
Loaded similar chunks successfully.

------ Starting r011 ------
Loaded similar chunks successfully.

------ Starting r012 ------
Loaded similar chunks successfully.

------ Starting r013 ------
Loaded similar chunks successfully.

------ Starting r013 ------
Loaded similar chunks successfully.

------ Starting r014 ------
Loaded similar chunks successfully.

------ Starting r015 ----

In [20]:
import pandas as pd

# Directory for the exported DataFrames of the current chunking configuration
dfs_dir_prefix = 'only_text_' if extension == 'txt' else ''
output_dir = f"../data/dfs/{dfs_dir_prefix}{chunk_size}_{chunk_overlap}/"
os.makedirs(output_dir, exist_ok=True)
# df2 = pd.read_pickle(os.path.join(output_dir, f"ReferenceErrorDetection_data_with_chunk_info.pkl"))

In [21]:
def add_top_k_chunk_ids_and_texts_to_df(df, chunk_size, chunk_overlap, k=3):
    """Add the stored top-chunk IDs and their texts as two new columns of ``df``.

    For each row whose 'Reference Article Downloaded' is 'Yes', the stored
    similar-chunk IDs of the reference article are loaded and resolved to chunk
    texts through the docstore of the article's persisted index. All other rows
    get ``None`` in both columns.

    Args:
        df: DataFrame with 'Reference Article Downloaded' and
            'Reference Article ID' columns. It is modified in place.
        chunk_size: Chunk size identifying the index / chunk-file directories.
        chunk_overlap: Chunk overlap identifying the same directories.
        k: Only used to name the new columns (``Top_{k}_Chunk_IDs`` and
            ``Top_{k}_Chunk_Texts``); the number of chunks per row is whatever
            the stored file contains.

    Returns:
        The same ``df`` object, with the two columns added.

    NOTE(review): similar-chunk files are keyed by reference article ID, so
    rows citing the same reference receive identical chunks regardless of
    their statement.
    """
    doc_ids_list = []
    doc_texts_list = []
    # Chunk IDs and texts depend only on the reference article ID, so each
    # article's index is loaded from disk once even if several rows cite it.
    chunks_by_reference = {}

    for _, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            doc_ids_list.append(None)
            doc_texts_list.append(None)
            continue

        reference_article_id = row['Reference Article ID']
        print(f"------ Starting {reference_article_id} ------")

        if reference_article_id not in chunks_by_reference:
            # load index
            index_path = f"../data/vector_indices/{'only_text_' if extension == 'txt' else ''}{chunk_size}_{chunk_overlap}/{reference_article_id}/"
            storage_context = StorageContext.from_defaults(persist_dir=index_path)
            index = load_index_from_storage(storage_context)

            # load similar chunks and resolve their IDs to texts
            doc_ids = load_similar_chunks(reference_article_id, chunk_size, chunk_overlap)
            doc_texts = [index.docstore.docs[doc_id].text for doc_id in doc_ids]
            chunks_by_reference[reference_article_id] = (doc_ids, doc_texts)

        doc_ids, doc_texts = chunks_by_reference[reference_article_id]
        # Append copies so that rows never share one mutable list object.
        doc_ids_list.append(list(doc_ids))
        doc_texts_list.append(list(doc_texts))

    df[f'Top_{k}_Chunk_IDs'] = doc_ids_list
    df[f'Top_{k}_Chunk_Texts'] = doc_texts_list
    return df

Takes around 4.5 minutes per chunk_size.

In [22]:
# NOTE: the function adds the columns to `df` in place and returns it, so
# `df2` refers to the same DataFrame object as `df`.
df2 = add_top_k_chunk_ids_and_texts_to_df(df, chunk_size, chunk_overlap, k=3)

------ Starting r001 ------
------ Starting r002 ------
------ Starting r003 ------
------ Starting r004 ------
------ Starting r005 ------
------ Starting r006 ------
------ Starting r007 ------
------ Starting r008 ------
------ Starting r009 ------
------ Starting r010 ------
------ Starting r011 ------
------ Starting r012 ------
------ Starting r013 ------
------ Starting r013 ------
------ Starting r014 ------
------ Starting r015 ------
------ Starting r005 ------
------ Starting r018 ------
------ Starting r019 ------
------ Starting r020 ------
------ Starting r021 ------
------ Starting r022 ------
------ Starting r023 ------
------ Starting r024 ------
------ Starting r013 ------
------ Starting r025 ------
------ Starting r026 ------
------ Starting r028 ------
------ Starting r027 ------
------ Starting r029 ------
------ Starting r030 ------
------ Starting r031 ------
------ Starting r032 ------
------ Starting r033 ------
------ Starting r034 ------
------ Starting r035

In [23]:
# Display the first rows, now including the two Top_3_Chunk_* columns.
df2.head()

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Citation ID,Statement with Citation,Corrected Statement,...,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation,Error Type,Added,Previously Partially Substantiated,Top_3_Chunk_IDs,Top_3_Chunk_Texts
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,cit001_1,Others have aimed to reduce irreversibility or...,Others have aimed to reduce irreversibility or...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[15e4928e-d5b4-4358-a37a-0e2eef963320, 4472c31...",[68. (8 ) σ 1 σ 17 σ 18 σ 31 σ 30 σ 41 σ 46 σ ...
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,cit001_2,Some researchers have also studied various hea...,Some researchers have also studied various hea...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[1815ea07-43b1-4b3f-bbf0-35876f1f2e2f, cfeb9f2...",[The evaluations of nanofluid thermo-physical ...
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,cit002_1,The relative content of total flavonoids in th...,The relative content of total flavonoids in th...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[3d0ce96f-8ffb-4347-b469-d35015b7ef3a, 00c7040...",[Note when patients have undergone different m...
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,cit003_1,Research has shown that remimazolam tosylate e...,Research has shown that remimazolam tosylate e...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[c836b980-93f4-4dff-ac29-1735fa77227b, 0b41ff6...",[The second study analyzed 325 patients with 1...
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,cit004_1,if the efficiency of the routing algorithm is ...,If the efficiency of the routing algorithm is ...,...,Yes,No,Yes,Unsubstantiated,Irrelevant,Irrelevant,No,,"[1ecb8356-826f-491a-984a-722919362c0e, c0e8850...","[In the table, China's intermediary centrality..."


In [24]:
import os

# Make sure the output directory for this chunking configuration exists
output_dir = f"../data/dfs/{'only_text_' if extension == 'txt' else ''}{chunk_size}_{chunk_overlap}/"
os.makedirs(output_dir, exist_ok=True)

# Persist the enriched DataFrame twice: as a pickle and as an Excel workbook
export_stem = os.path.join(output_dir, "ReferenceErrorDetection_data_with_chunk_info")
df2.to_pickle(export_stem + ".pkl")
df2.to_excel(export_stem + ".xlsx", index=False)