# Identify the top 3 matching chunks using LlamaIndex

LlamaIndex:
- GitHub: https://github.com/run-llama/llama_index
- Documentation: https://docs.llamaindex.ai/en/latest/

In [2]:
import pandas as pd

# Selects which workbook is read below ('..._annotated.xlsx' when True). The same
# flag is read again further down in this notebook to add an '/annotated_data'
# component to the index, chunk and DataFrame output paths.
annotated = True

# read the xlsx data into a pandas dataframe
df = pd.read_excel(f"../data/ReferenceErrorDetection_data{'_annotated' if annotated else ''}.xlsx")

In [3]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Statement with Citation,Corrected Statement,Citation Statement Section,...,Reference Article Title,Reference Article Abstract,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation,Error Type,Supporting Sentences,Suited for Task
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Others have aimed to reduce irreversibility or...,,,...,A Fault Analysis Method for Three-Phase Induct...,The fault prediction and abductive fault diagn...,Yes,No,Yes,Unsubstantiate,Irrelevant,,,
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Some researchers have also studied various hea...,,,...,Develop 24 dissimilar ANNs by suitable archite...,The artificial neural network optimization met...,Yes,No,Yes,Unsubstantiate,Irrelevant,,,
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,The relative content of total flavonoids in th...,,,...,Lipid Data Acquisition for devices Treatment o...,"Recently, the widespread deployment of smart p...",Yes,No,Yes,Unsubstantiate,Irrelevant,,,
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,Research has shown that remimazolam tosylate e...,,,...,"Effect of propofol on breast cancer cell, the ...",Breast cancer is the second leading cause of c...,Yes,No,Yes,Unsubstantiate,Irrelevant,,,
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,if the efficiency of the routing algorithm is ...,,,...,Analysis and research hotspots of ceramic mate...,"From the perspective of scientometrics, comb t...",Yes,No,Yes,Unsubstantiate,Irrelevant,,,


## Get text from reference article

In [4]:
# Sub-directory of ../data/extractions/ that the reference files are read from;
# also used as a path component of the index, chunk and DataFrame output paths.
grobid_model = "full_model"
# File type of the extracted reference articles. get_reference_text handles
# "txt" (file read as-is) and "xml" (all text nodes concatenated).
extension = "xml"

In [5]:
import glob

def get_file_path(reference_article_id, model=None, file_extension=None, extractions_dir="../data/extractions"):
    """Locate the extracted file that belongs to a reference article.

    Files are searched with the glob pattern
    ``{extractions_dir}/{model}/{reference_article_id}*.{file_extension}``.

    Args:
        reference_article_id: ID the file name has to start with (e.g. "r001").
        model: Name of the extraction sub-directory. Defaults to the notebook
            global ``grobid_model``.
        file_extension: Extension without the dot. Defaults to the notebook
            global ``extension``.
        extractions_dir: Directory that contains the per-model sub-directories.

    Returns:
        The path of the first match in sorted order, or None (after printing a
        message) when nothing matches.
    """
    # Fall back to the notebook-level configuration only when no override is given.
    if model is None:
        model = grobid_model
    if file_extension is None:
        file_extension = extension

    # Escape the ID so that characters such as '[' or '*' are matched literally.
    escaped_id = glob.escape(str(reference_article_id))
    file_pattern = f"{extractions_dir}/{model}/{escaped_id}*.{file_extension}"

    # glob gives no ordering guarantee; sorting makes the chosen file
    # reproducible when several files share the same ID prefix.
    file_list = sorted(glob.glob(file_pattern))
    if file_list:
        return file_list[0]

    print("No matching file found.")
    return None

In [6]:
import xml.etree.ElementTree as ET

def get_reference_text(reference_article_id):
    """Return the text of a reference article, or None when it cannot be read.

    The file is located with get_file_path and interpreted according to the
    notebook global ``extension``:

    * "txt": the file content is returned unchanged.
    * "xml": all text nodes of the document are concatenated in document order,
      without any separator between them.

    Args:
        reference_article_id: ID of the reference article (e.g. "r001").

    Returns:
        The article text as a string, or None when no file was found or the
        configured extension is not supported.

    Raises:
        xml.etree.ElementTree.ParseError: If an XML file is not well-formed.
    """
    # `extension` is only read, so no `global` declaration is required.
    file_path = get_file_path(reference_article_id)
    if not file_path:
        return None

    if extension == "txt":
        # Explicit encoding: the locale default differs between platforms.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    if extension == "xml":
        root = ET.parse(file_path).getroot()
        return ''.join(root.itertext())

    # Make the unsupported case visible instead of silently returning None.
    print(f"Unsupported extension: {extension}")
    return None

## Set OpenAI key

In [7]:
# Read the content of open_ai_key.txt into a variable
# The key file sits one directory above the notebook; surrounding whitespace
# (such as a trailing newline) is stripped before the key is used.
with open('../open_ai_key.txt', 'r') as file:
    open_ai_key = file.read().strip()

## Setting up vector index

### Reloading or generating index

In [8]:
# Short alias of the embedding model ("te3l" or "te3s"). It is translated into
# the OpenAI model name in the next cell and is also used as a directory name
# in the index, chunk and DataFrame output paths.
embedding = "te3l"

In [9]:
# Translate the short alias in `embedding` into the OpenAI embedding model name.
# An unknown alias must fail here: otherwise `model_embeddings` would stay
# undefined (or keep the value of an earlier run in the same kernel) while the
# output paths are already named after the new alias.
if embedding == "te3l":
    model_embeddings = "text-embedding-3-large"
elif embedding == "te3s":
    model_embeddings = "text-embedding-3-small"
else:
    raise ValueError(f"Unknown embedding alias {embedding!r}; expected 'te3l' or 'te3s'.")

In [10]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Store the selected OpenAI embedding model on LlamaIndex's Settings object.
# NOTE(review): presumably this is what index construction and retrieval fall
# back to when no embedding model is passed explicitly — confirm.
Settings.embed_model = OpenAIEmbedding(model=model_embeddings, api_key=open_ai_key)

In [11]:
from llama_index.core import VectorStoreIndex, Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

def create_index(reference_text, chunk_size, chunk_overlap):
    """Build a vector index from the full text of one reference article.

    The text is wrapped in a single Document and run through an ingestion
    pipeline that first applies a SentenceSplitter and then an OpenAIEmbedding
    configured from the notebook globals ``model_embeddings`` and
    ``open_ai_key``.

    Args:
        reference_text: Text of the reference article.
        chunk_size: Forwarded to SentenceSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to SentenceSplitter as ``chunk_overlap``.

    Returns:
        A VectorStoreIndex constructed from the nodes the pipeline returned.
    """
    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    embedder = OpenAIEmbedding(model=model_embeddings, api_key=open_ai_key)
    ingestion = IngestionPipeline(transformations=[splitter, embedder])

    # The whole article goes in as one document; splitting happens in the pipeline.
    source_document = Document(text=reference_text)
    embedded_nodes = ingestion.run(documents=[source_document])
    return VectorStoreIndex(embedded_nodes)

In [12]:
from llama_index.core.node_parser import TokenTextSplitter

def create_chunks(text, chunk_size=256, chunk_overlap=20):
    """Split ``text`` into chunks with a TokenTextSplitter.

    Args:
        text: The text to split.
        chunk_size: Forwarded to TokenTextSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to TokenTextSplitter as ``chunk_overlap``.

    Returns:
        The value returned by ``TokenTextSplitter.split_text`` for ``text``.
    """
    return TokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)

In [13]:
def create_index_with_chunking_first(reference_text, chunk_size, chunk_overlap):
    """Build a vector index from chunks that are created before indexing.

    The text is split with create_chunks, every chunk becomes its own Document,
    and the index is built with ``VectorStoreIndex.from_documents``.

    Args:
        reference_text: Text of the reference article.
        chunk_size: Forwarded to create_chunks.
        chunk_overlap: Forwarded to create_chunks.

    Returns:
        The VectorStoreIndex built from the per-chunk documents.
    """
    chunk_documents = [
        Document(text=piece)
        for piece in create_chunks(reference_text, chunk_size, chunk_overlap)
    ]
    return VectorStoreIndex.from_documents(chunk_documents)

In [17]:
from llama_index.core import StorageContext, load_index_from_storage
import os

def load_or_create_index(article_id, reference_text, no_prev_chunking, chunk_size=256, chunk_overlap=20, only_checking=False):
    """Load the persisted vector index of a reference article, building it if needed.

    The index directory is derived from the notebook globals ``annotated``,
    ``embedding`` and ``grobid_model`` together with ``no_prev_chunking`` and
    ``article_id``.

    Args:
        article_id: Reference Article ID; last component of the index directory.
        reference_text: Article text used when a new index has to be built.
        no_prev_chunking: True builds the index with create_index, False with
            create_index_with_chunking_first. Also part of the directory name.
        chunk_size: Forwarded to the index builder.
        chunk_overlap: Forwarded to the index builder.
        only_checking: If True and the index directory exists and is non-empty,
            return True without loading the index. If the directory is missing
            or empty, the call continues with the normal load/create path.

    Returns:
        True when ``only_checking`` short-circuits, otherwise the loaded or
        newly built index, or None when no index could be obtained.
    """
    index_path = f"../data/vector_indices{'/annotated_data' if annotated else ''}/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/{article_id}/"

    # f-strings are used for all messages so that a non-string ID (e.g. NaN from
    # an empty spreadsheet cell) cannot raise a TypeError while reporting.
    if only_checking and os.path.exists(index_path) and os.listdir(index_path):
        print(f"{article_id}: Index exists.")
        return True

    try:
        storage_context = StorageContext.from_defaults(persist_dir=index_path)
        index = load_index_from_storage(storage_context)
        print(f"{article_id}: Loaded existing index.")
        return index
    except Exception as load_error:
        # Any load problem is treated as "no usable index on disk".
        print(load_error)

    # Without text there is nothing to embed; skip the build attempt.
    if not reference_text:
        print(f"{article_id}: No reference text available, cannot create index.")
        return None

    print(f"{article_id}: Creating a new index.")
    index = None
    try:
        if no_prev_chunking:
            index = create_index(reference_text, chunk_size, chunk_overlap)
        else:
            index = create_index_with_chunking_first(reference_text, chunk_size, chunk_overlap)
        index.storage_context.persist(persist_dir=index_path)
    except Exception as build_error:
        print(build_error)
        print(f"{article_id}: Failed to create index.")
        print(reference_text)
    # If only persisting failed, the in-memory index is still returned.
    return index

### Create Indices for all reference articles

In [23]:
%%time

# Restrict the run to these Reference Article IDs; an empty list means "all".
ids_to_index = []
# True: build indices with create_index, False: with create_index_with_chunking_first.
# This variable is also read as a notebook global by later cells when they
# build their output paths.
no_prev_chunking = True

for _, row in df.iterrows():
    if row['Reference Article Downloaded'] != 'Yes':
        continue

    reference_article_id = row['Reference Article ID']
    # An empty spreadsheet cell is read as NaN, which is truthy, so a plain
    # truthiness test would let rows without an ID through.
    if pd.isna(reference_article_id) or not reference_article_id:
        continue
    if ids_to_index and reference_article_id not in ids_to_index:
        continue

    reference_text = get_reference_text(reference_article_id)
    # only_checking=True: existing indices are not loaded, missing ones are built.
    index = load_or_create_index(reference_article_id, reference_text, no_prev_chunking, only_checking=True)

[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/annotated_data/te3l_no_prev_chunking/full_model/r001/docstore.json'
r001: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/annotated_data/te3l_no_prev_chunking/full_model/r002/docstore.json'
r002: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/annotated_data/te3l_no_prev_chunking/full_model/r003/docstore.json'
r003: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/annotated_data/te3l_no_prev_chunking/full_model/r004/docstore.json'
r004: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/annotated_d

## Identifying top 3 chunks

In [24]:
from llama_index.core.retrievers import VectorIndexRetriever

def get_top_k_similar_chunks(statement, index, k=3):
    """Retrieve the ``k`` most similar chunks of ``index`` for ``statement``.

    Args:
        statement: Query text passed to the retriever.
        index: Vector index to search.
        k: Passed to VectorIndexRetriever as ``similarity_top_k``.

    Returns:
        Whatever ``VectorIndexRetriever.retrieve`` returns for the statement.
    """
    return VectorIndexRetriever(index=index, similarity_top_k=k).retrieve(statement)

In [33]:
import json
import os

def _similar_chunks_path(reference_id, corrected_statements):
    """Return the JSON path that stores the top chunk IDs of one reference article.

    Besides the arguments, the path depends on the notebook globals
    ``annotated``, ``embedding``, ``no_prev_chunking`` and ``grobid_model``.
    """
    return f"../data/similar_chunks{'/annotated_data' if annotated else ''}{'/corrected_statements' if corrected_statements else ''}/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/{reference_id}.json"

def save_similar_chunks(doc_ids, reference_id, corrected_statements):
    """Write ``doc_ids`` as JSON to the file belonging to ``reference_id``.

    Args:
        doc_ids: JSON-serialisable list of chunk IDs.
        reference_id: Reference Article ID; becomes the file name.
        corrected_statements: Adds a '/corrected_statements' path component
            when truthy.
    """
    file_path = _similar_chunks_path(reference_id, corrected_statements)
    # The target directory depends on the configuration and may not exist yet.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w') as file:
        json.dump(doc_ids, file)

def load_similar_chunks(reference_id, corrected_statements):
    """Read the chunk IDs previously stored for ``reference_id``.

    Args:
        reference_id: Reference Article ID; determines the file name.
        corrected_statements: Must match the value used when saving.

    Returns:
        The deserialised JSON content of the file.

    Raises:
        FileNotFoundError: If nothing has been saved for this configuration.
    """
    with open(_similar_chunks_path(reference_id, corrected_statements), 'r') as file:
        return json.load(file)

### Save the document ids and text contents of the top k chunks of all reference articles to the df

In [32]:
def get_doc_ids(response):
    """Collect the node IDs of the retrieved results.

    Args:
        response: Iterable of retrieval results, each exposing the retrieved
            node as ``.node`` with its ID in ``.id_``.

    Returns:
        List of node IDs in the order of ``response``.
    """
    # Read the ID directly from the node instead of serialising the whole
    # result (text, metadata, embedding) with the deprecated pydantic `.dict()`.
    return [scored_node.node.id_ for scored_node in response]

In [46]:
def save_top_k_chunk_ids(df, no_prev_chunking, corrected_statements=False, k=3):
    """Retrieve and store the IDs of the top-k chunks for every downloaded reference.

    For each row whose 'Reference Article Downloaded' is 'Yes', the stored
    chunk IDs are looked up first. Only when no file exists is the index loaded
    (or built), queried with the row's statement, and the result saved.

    NOTE(review): the stored file is keyed by the Reference Article ID only,
    while retrieval depends on the row's statement. A row citing a reference
    that already has a file is therefore not queried itself — confirm that this
    is intended for references cited by several rows.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded',
            'Reference Article ID', 'Statement with Citation' and, when
            ``corrected_statements`` is True, 'Corrected Statement'.
        no_prev_chunking: Forwarded to load_or_create_index.
        corrected_statements: If True, query with 'Corrected Statement' unless
            that value is missing or the string "null".
        k: Number of chunks to retrieve.

    Returns:
        None. Results are written with save_similar_chunks.
    """
    for _, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            continue

        reference_article_id = row['Reference Article ID']
        print(f"------ Starting {reference_article_id} ------")

        # Try to load similar chunks first
        try:
            # Only the existence of the file matters; the content is not used here.
            load_similar_chunks(reference_article_id, corrected_statements)
            print("Loaded similar chunks successfully.")
        except FileNotFoundError:
            # Load reference text, then load or create the index
            reference_text = get_reference_text(reference_article_id)
            index = load_or_create_index(reference_article_id, reference_text, no_prev_chunking)

            # Pick the statement used as query
            statement = row["Statement with Citation"]
            if corrected_statements:
                corrected = row["Corrected Statement"]
                if not (pd.isna(corrected) or corrected == "null"):
                    statement = corrected
                    print(statement)

            if index is None:
                # Report the real cause instead of letting the retriever fail on None.
                print("No index available.")
                print("Failed to get top chunks.")
            else:
                print("Receiving top chunks")
                try:
                    response = get_top_k_similar_chunks(statement, index, k)
                    doc_ids = get_doc_ids(response)

                    # Save the top chunks
                    print("Saving top chunks")
                    save_similar_chunks(doc_ids, reference_article_id, corrected_statements)
                except Exception as e:
                    # Best effort: one failing article must not stop the whole run.
                    print(e)
                    print("Failed to get top chunks.")
        print("")

In [None]:
%%time 

# False: query with 'Statement with Citation'. True would query with
# 'Corrected Statement' where that column holds a value.
corrected_statements = False

# no_prev_chunking is the notebook-level variable set in the index-creation cell.
save_top_k_chunk_ids(df, no_prev_chunking, corrected_statements=corrected_statements)

------ Starting r001 ------
Loaded similar chunks successfully.

------ Starting r002 ------
Loaded similar chunks successfully.

------ Starting r003 ------
Loaded similar chunks successfully.

------ Starting r004 ------
Loaded similar chunks successfully.

------ Starting r005 ------
r005: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r006 ------
r006: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r007 ------
r007: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r008 ------
r008: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r009 ------
r009: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r010 ------
r010: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r011 ------
r011: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r012 ------
r012: Loaded existing index.
Rece

In [48]:
import pandas as pd

# Directory for the pickled DataFrame; its components follow the current
# configuration (annotated, corrected_statements, embedding, no_prev_chunking,
# grobid_model).
output_dir = f"../data/dfs{'/annotated_data' if annotated else ''}{'/corrected_statements' if corrected_statements else ''}/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/"
os.makedirs(output_dir, exist_ok=True)
# Disabled: reload a previously saved DataFrame instead of recomputing it.
# df2 = pd.read_pickle(os.path.join(output_dir, f"ReferenceErrorDetection_data_with_chunk_info.pkl"))

In [51]:
def add_top_k_chunk_ids_and_texts_to_df(df, corrected_statements=False, k=3):
    """Add the stored top-k chunk IDs and their texts as two columns of ``df``.

    For every row whose 'Reference Article Downloaded' is 'Yes', the chunk IDs
    saved for the reference article are loaded and resolved to chunk texts via
    the docstore of the persisted index. Other rows receive None.

    When the notebook global ``ids_to_index`` is non-empty, rows whose reference
    is not listed keep the values they already have in the two columns; those
    columns must then already exist in ``df``.

    Args:
        df: DataFrame to extend. It is modified in place.
        corrected_statements: Selects which stored chunk files are read.
        k: Only used for the column names 'Top_{k}_Chunk_IDs' and
            'Top_{k}_Chunk_Texts'.

    Returns:
        The same ``df`` object with both columns set.
    """
    ids_column = f'Top_{k}_Chunk_IDs'
    texts_column = f'Top_{k}_Chunk_Texts'

    # A reference can be cited by several rows; load its index and chunk file
    # from disk only once per call.
    chunks_by_reference = {}

    doc_ids_list = []
    doc_texts_list = []
    for _, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            doc_ids_list.append(None)
            doc_texts_list.append(None)
            continue

        reference_article_id = row['Reference Article ID']

        if ids_to_index and reference_article_id not in ids_to_index:
            # Not selected for this run: keep what the row already holds.
            doc_ids_list.append(row[ids_column])
            doc_texts_list.append(row[texts_column])
            continue

        print(f"------ Starting {reference_article_id} ------")

        if reference_article_id not in chunks_by_reference:
            # load index
            index_path = f"../data/vector_indices{'/annotated_data' if annotated else ''}/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/{reference_article_id}/"
            storage_context = StorageContext.from_defaults(persist_dir=index_path)
            index = load_index_from_storage(storage_context)

            # load similar chunks
            doc_ids = load_similar_chunks(reference_article_id, corrected_statements)
            doc_texts = [index.docstore.docs[doc_id].text for doc_id in doc_ids]
            chunks_by_reference[reference_article_id] = (doc_ids, doc_texts)

        doc_ids, doc_texts = chunks_by_reference[reference_article_id]
        # Copies keep the rows independent of each other, as before.
        doc_ids_list.append(list(doc_ids))
        doc_texts_list.append(list(doc_texts))

    df[ids_column] = doc_ids_list
    df[texts_column] = doc_texts_list
    return df

In [55]:
# Empty list: process every downloaded reference article. With IDs listed here,
# rows of other references keep their existing Top_3_* values.
ids_to_index = []
corrected_statements = False
# The function writes the new columns into `df` itself and returns that same
# object, so df2 and df refer to the same DataFrame.
df2 = add_top_k_chunk_ids_and_texts_to_df(df, corrected_statements=corrected_statements)

------ Starting r001 ------
------ Starting r002 ------
------ Starting r003 ------
------ Starting r004 ------
------ Starting r005 ------
------ Starting r006 ------
------ Starting r007 ------
------ Starting r008 ------
------ Starting r009 ------
------ Starting r010 ------
------ Starting r011 ------
------ Starting r012 ------
------ Starting r013 ------
------ Starting r013 ------
------ Starting r014 ------
------ Starting r015 ------
------ Starting r005 ------
------ Starting r017 ------
------ Starting r018 ------
------ Starting r019 ------
------ Starting r020 ------
------ Starting r021 ------
------ Starting r022 ------
------ Starting r023 ------
------ Starting r024 ------
------ Starting r013 ------
------ Starting r025 ------
------ Starting r026 ------
------ Starting r027 ------
------ Starting r028 ------
------ Starting r029 ------
------ Starting r030 ------
------ Starting r031 ------
------ Starting r032 ------
------ Starting r033 ------
------ Starting r034

In [56]:
# Preview the first rows, now including the two Top_3_Chunk_* columns.
df2.head()

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Statement with Citation,Corrected Statement,Citation Statement Section,...,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation,Error Type,Supporting Sentences,Suited for Task,Top_3_Chunk_IDs,Top_3_Chunk_Texts
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Others have aimed to reduce irreversibility or...,,,...,Yes,No,Yes,Unsubstantiate,Irrelevant,,,,"[e3f23238-791d-4749-81e5-fd3c55c2686b, af18d41...",[Nomenclature\nES:Expert system ANN:Artificial...
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Some researchers have also studied various hea...,,,...,Yes,No,Yes,Unsubstantiate,Irrelevant,,,,"[66e4b712-fc01-4965-a835-5bd9dcdb40cb, 5957595...",[Heat transfer improvement of water/single-wal...
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,The relative content of total flavonoids in th...,,,...,Yes,No,Yes,Unsubstantiate,Irrelevant,,,,"[e53a16b7-7a90-4656-b2c6-ad1c105cc881, 01a8b83...",[The correspondence curve for our photochemica...
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,Research has shown that remimazolam tosylate e...,,,...,Yes,No,Yes,Unsubstantiate,Irrelevant,,,,"[934c19f1-dd5d-4427-bf55-8a4d1aeb9f5a, ec73144...",[Determination of the median effective concent...
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,if the efficiency of the routing algorithm is ...,,,...,Yes,No,Yes,Unsubstantiate,Irrelevant,,,,"[f38e8c2d-1f59-4744-9cdd-7dd775bca1ed, c2fe9ce...","[In the table, China's intermediary centrality..."


In [57]:
import os

# Ensure the directory exists
# (the path components follow the current configuration: annotated,
# corrected_statements, embedding, no_prev_chunking, grobid_model)
output_dir = f"../data/dfs{'/annotated_data' if annotated else ''}{'/corrected_statements' if corrected_statements else ''}/{embedding}{'_no_prev_chunking' if no_prev_chunking else ''}/{grobid_model}/"
os.makedirs(output_dir, exist_ok=True)

# Save the DataFrame to a pickle file
# (df2 carries the Top_3_Chunk_IDs and Top_3_Chunk_Texts columns added above)
df2.to_pickle(os.path.join(output_dir, f"ReferenceErrorDetection_data_with_chunk_info.pkl"))