# Identify the 3 top matching chunks using LlamaIndex

LlamaIndex:
- Github: https://github.com/run-llama/llama_index
- Documentation: https://docs.llamaindex.ai/en/latest/

In [83]:
import pandas as pd

# read the xlsx data into a pandas dataframe
df = pd.read_excel('../data/ReferenceErrorDetection_data.xlsx')

In [84]:
df.head()

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Statement with Citation,Reference Article ID,Reference Article DOI,Reference Article Title,Reference Article Abstract,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Others have aimed to reduce irreversibility or...,r001,10.1155/2021/2087027,A Fault Analysis Method for Three-Phase Induct...,The fault prediction and abductive fault diagn...,Yes,No,Yes,Unsubstantiate,Irrelevant
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Some researchers have also studied various hea...,r002,10.1016/j.physa.2018.12.031,Develop 24 dissimilar ANNs by suitable archite...,The artificial neural network optimization met...,Yes,No,Yes,Unsubstantiate,Irrelevant
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,The relative content of total flavonoids in th...,r003,10.1088/1742-6596/1937/1/012038,Lipid Data Acquisition for devices Treatment o...,"Recently, the widespread deployment of smart p...",Yes,No,Yes,Unsubstantiate,Irrelevant
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,Research has shown that remimazolam tosylate e...,r004,10.1186/s12871-018-0543-3,"Effect of propofol on breast cancer cell, the ...",Breast cancer is the second leading cause of c...,Yes,No,Yes,Unsubstantiate,Irrelevant
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,if the efficiency of the routing algorithm is ...,r005,10.36410/jcpr.2022.23.3.312,Analysis and research hotspots of ceramic mate...,"From the perspective of scientometrics, comb t...",Yes,No,Yes,Unsubstantiate,Irrelevant


In [85]:
first_entry = df.query("`Reference Article Retracted` == 'No' and `Label` == 'Fully substantiate'").iloc[0]
first_entry

Source                                                     Smith & Cumberledge, 2020
Citing Article ID                                                               c058
Citing Article DOI                                           10.1126/science.aan0177
Citing Article Title               High dislocation density-induced large ductili...
Citing Article Retracted                                                          No
Citing Article Downloaded                                                        Yes
Domain                                                                       Physics
Statement with Citation            Tensile properties of our steel compared with ...
Reference Article ID                                                            r091
Reference Article DOI                                     10.2320/matertrans.46.1839
Reference Article Title            The role of retained austenite on tensile prop...
Reference Article Abstract         In high-carbon, silicon-rich s

In [86]:
first_entry_id = first_entry['Reference Article ID']
first_entry_id

'r091'

## Get text from reference article

In [87]:
grobid_model = "small_model"
extension = "xml"

In [88]:
import glob

def get_file_path(reference_article_id):
    # Construct the file path pattern using the Reference Article ID of the first entry
    file_pattern = f"../data/extractions/{grobid_model}/{reference_article_id}*.{extension}"

    # Find the file that matches the pattern
    file_list = glob.glob(file_pattern)
    if file_list:
        file_path = file_list[0]
        return file_path
    else: 
        print("No matching file found.")
        return None

In [89]:
import xml.etree.ElementTree as ET

def get_reference_text(reference_article_id):
    global extension
    
    # Get the file path
    file_path = get_file_path(reference_article_id)
    
    if file_path:
        if extension == "txt":
            # Read the text file
            with open(file_path, 'r') as file:
                reference_text = file.read()
            return reference_text

        elif extension == "xml":
            # Parse the XML file
            tree = ET.parse(file_path)
            root = tree.getroot()

            # Extract the text content from the XML file
            reference_text = ''.join(root.itertext())
            return reference_text

In [90]:
reference_text = get_reference_text(first_entry_id)

In [91]:
reference_text

'\n\t\n\t\t\n\t\t\t\n\t\t\t\tThe Role of Retained Austenite on Tensile Properties of Steels with Bainitic Microstructures\n\t\t\t\t\n\t\t\t\t\tSpanish Ministerio de Ciencia y Tecnologı\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\tEPSRC/MOD\n\t\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\t\t\t\tAugust 15, 2005\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tCarlosGarcı ´a-Mateo\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tDepartment of Physical Metallurgy\n\t\t\t\t\t\t\t\tMateralia Research Group. Centro Nacional de Investigaciones Metalu ´rgicas (CENIM-CSIC)\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tAvda. Gregorio del Amo, 8\n\t\t\t\t\t\t\t\t\t28040\n\t\t\t\t\t\t\t\t\tMadrid\n\t\t\t\t\t\t\t\t\tSpain\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tFranciscaGCaballero\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\tDepartment of Physical Metallurgy\n\t\t\t\t\t\t\t\tMateralia Research Group. Centro Nacional de Investigaciones Metalu ´rgicas (CENIM-CSIC)\n\t\t\t\t\t

## Set OpenAI key

In [92]:
# Read the content of open_ai_key.txt into a variable
with open('../open_ai_key.txt', 'r') as file:
    open_ai_key = file.read().strip()

## Setting up vector index

### Chunk splitting

In [93]:
from llama_index.core.node_parser import TokenTextSplitter

def create_chunks(text, chunk_size=256, chunk_overlap=20):
    token_text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = token_text_splitter.split_text(text)
    return chunks

In [94]:
reference_chunks = create_chunks(reference_text)

### Reloading or generating index

In [95]:
model = "gpt-3.5-turbo-0125"
model_embeddings = "text-embedding-3-small"

In [96]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

Settings.embed_model = OpenAIEmbedding(model=model_embeddings, api_key=open_ai_key)
Settings.llm = OpenAI(model=model, temperature=0)

In [97]:
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex, Document

def load_or_create_index(article_id, reference_chunks):
    index_path = f"../data/vector_indices/{grobid_model}/{article_id}/"
    index = None

    try:
        storage_context = StorageContext.from_defaults(persist_dir=index_path)
        index = load_index_from_storage(storage_context)
        print(article_id + ": Loaded existing index.")
    except Exception as e:
        print(e)
        print(article_id + ": Creating a new index.")
        documents = [Document(text=chunk) for chunk in reference_chunks]
        index = VectorStoreIndex.from_documents(documents)
        index.storage_context.persist(persist_dir=index_path)
    
    return index

In [98]:
index = load_or_create_index(first_entry_id, reference_chunks)

r091: Loaded existing index.


### Create Indices for all reference articles

In [54]:
for _, row in df.iterrows():
    if row['Reference Article Downloaded'] == 'Yes':
        reference_article_id = row['Reference Article ID']
        reference_text = get_reference_text(reference_article_id)
        reference_chunks = create_chunks(reference_text)
        index = load_or_create_index(reference_article_id, reference_chunks)

[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/full_model_texts/r001/docstore.json'
r001: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/full_model_texts/r002/docstore.json'
r002: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/full_model_texts/r003/docstore.json'
r003: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/full_model_texts/r004/docstore.json'
r004: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/citation-verification/notebooks/../data/vector_indices/full_model_texts/r005/docstore.json'
r005: Creating a new index.
[Errno 2] No such file or directory: '/home/ibelter/master_thesis/cita

## Identifying top 3 chunks

In [55]:
statement = first_entry["Statement with Citation"]
statement

'Tensile properties of our steel compared with those of other existing high strength metallic materials. These include nanobainite steel [citation 36].'

In [56]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

def get_top_k_similar_statements(statement, index, k=3):
    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=k,
    )
    response_synthesizer = get_response_synthesizer(
        response_mode="no_text",
    )
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
    )
    response = query_engine.query(statement)
    return response

In [57]:
response = get_top_k_similar_statements(statement, index)
response

Response(response='', source_nodes=[NodeWithScore(node=TextNode(id_='8c3b39ce-ab7d-4020-8e44-2067f0c8bb15', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09c468d8-a386-423f-9853-a0257dc6abf9', node_type='4', metadata={}, hash='f8b60ca3481c74f17510a46adc72b0094e923b122ced363bba1f03c5eb48d376')}, metadata_template='{key}: {value}', metadata_separator='\n', text='optical axis is free of dispersion of the resonator, as we could keep the desired anisotropic behavior. To verify our proposed scheme, we have performed full-wave simulations using CST Microwave Studio, based on the finite difference time domain (FDTD) algorithm  [28] . We utilize the perfect conductor approximation (PCA) for simplicity. The results presented here can be extended to other frequency regimes, when the dispersion of metal is taken into consideration. We focus on the wavelength λ = 2.21a(3.68b), wher

In [58]:
def get_doc_ids(response, index):
    doc_ids = []
    for source_node in response.source_nodes:
        node_id = source_node.dict()['node']['relationships']['1']['node_id']
        doc_ids.append([doc_id for doc_id, doc in index.docstore.docs.items() if doc.dict()['relationships']['1']['node_id'] == node_id][0])
    return doc_ids

In [59]:
get_doc_ids(response, index)

['8c3b39ce-ab7d-4020-8e44-2067f0c8bb15',
 'de796540-5d9a-4d01-b50e-96075b1cce6f',
 'fa63a1b3-8228-4614-80a9-952bb48f4ed9']

In [60]:
import json
import os

def save_similar_chunks(doc_ids, reference_id):
    file_path = f"../data/similar_chunks/{grobid_model}/{reference_id}.json"
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w') as file:
        json.dump(doc_ids, file)

def load_similar_chunks(reference_id):
    file_path = f"../data/similar_chunks/{grobid_model}/{reference_id}.json"
    with open(file_path, 'r') as file:
        doc_ids = json.load(file)
    return doc_ids

In [30]:
save_similar_chunks(get_doc_ids(response, index), first_entry_id)

### Save the document ids and text contents of the top k chunks of all reference articles to the df

In [99]:
def save_top_k_chunk_ids(df, k=3):
    for _, row in df.iterrows():
        if row['Reference Article Downloaded'] == 'Yes':
            reference_article_id = row['Reference Article ID']
            print(f"------ Starting {reference_article_id} ------")
            
            # Try to load similar chunks first
            try:
                doc_ids = load_similar_chunks(reference_article_id)
                print("Loaded similar chunks successfully.")
            except FileNotFoundError:
                # Load reference text and create chunks
                reference_text = get_reference_text(reference_article_id)
                reference_chunks = create_chunks(reference_text)
                
                # Load or create index
                index = load_or_create_index(reference_article_id, reference_chunks)
                
                # Get the statement and retrieve top chunks
                statement = row["Statement with Citation"]
                print("Receiving top chunks")
                response = get_top_k_similar_statements(statement, index, k)
                doc_ids = get_doc_ids(response, index)
                
                # Save the top chunks
                print("Saving top chunks")
                save_similar_chunks(doc_ids, reference_article_id)
            
            print("")

In [100]:
save_top_k_chunk_ids(df)

------ Starting r001 ------
r001: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r002 ------
r002: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r003 ------
r003: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r004 ------
r004: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r005 ------
r005: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r006 ------
r006: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r007 ------
r007: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r008 ------
r008: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r009 ------
r009: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r010 ------
r010: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r011 ------
r0

In [48]:
doc_ids = load_similar_chunks(reference_article_id)
doc_texts = [index.docstore.docs[doc_id].text for doc_id in doc_ids]

In [63]:
def add_top_k_chunk_ids_and_texts_to_df(df, k=3):
    doc_ids_list = []
    doc_texts_list = []
    for _, row in df.iterrows():
        if row['Reference Article Downloaded'] == 'Yes':
            reference_article_id = row['Reference Article ID']
            print(f"------ Starting {reference_article_id} ------")

            # load index
            index_path = f"../data/vector_indices/{grobid_model}/{reference_article_id}/"
            storage_context = StorageContext.from_defaults(persist_dir=index_path)
            index = load_index_from_storage(storage_context)

            # load similar chunks
            doc_ids = load_similar_chunks(reference_article_id)
            doc_texts = [index.docstore.docs[doc_id].text for doc_id in doc_ids]

            # add to lists
            doc_ids_list.append(doc_ids)
            doc_texts_list.append(doc_texts)
        else:
            doc_ids_list.append(None)
            doc_texts_list.append(None)
    
    df[f'Top_{k}_Chunk_IDs'] = doc_ids_list
    df[f'Top_{k}_Chunk_Texts'] = doc_texts_list
    return df

In [101]:
df2 = add_top_k_chunk_ids_and_texts_to_df(df)

------ Starting r001 ------
------ Starting r002 ------
------ Starting r003 ------
------ Starting r004 ------
------ Starting r005 ------
------ Starting r006 ------
------ Starting r007 ------
------ Starting r008 ------
------ Starting r009 ------
------ Starting r010 ------
------ Starting r011 ------
------ Starting r012 ------
------ Starting r013 ------
------ Starting r013 ------
------ Starting r014 ------
------ Starting r015 ------
------ Starting r005 ------
------ Starting r017 ------
------ Starting r018 ------
------ Starting r019 ------
------ Starting r020 ------
------ Starting r021 ------
------ Starting r022 ------
------ Starting r023 ------
------ Starting r024 ------
------ Starting r013 ------
------ Starting r025 ------
------ Starting r026 ------
------ Starting r027 ------
------ Starting r028 ------
------ Starting r029 ------
------ Starting r030 ------
------ Starting r031 ------
------ Starting r032 ------
------ Starting r033 ------
------ Starting r034

In [102]:
df2.head()

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Statement with Citation,Reference Article ID,Reference Article DOI,Reference Article Title,Reference Article Abstract,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation,Top_3_Chunk_IDs,Top_3_Chunk_Texts
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Others have aimed to reduce irreversibility or...,r001,10.1155/2021/2087027,A Fault Analysis Method for Three-Phase Induct...,The fault prediction and abductive fault diagn...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[66ce2c05-0e4b-48dd-a146-b131119b32b4, 23ff46b...",[Transactions on Cybernetics\n\t\t\n\t\t\t51\n...
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Some researchers have also studied various hea...,r002,10.1016/j.physa.2018.12.031,Develop 24 dissimilar ANNs by suitable archite...,The artificial neural network optimization met...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[1eee1d68-3311-453d-9b1f-003bfd410db9, ba602d1...",[J. Exergy\n\t\t\n\t\t\t22\n\t\t\t2\n\t\t\t\n\...
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,The relative content of total flavonoids in th...,r003,10.1088/1742-6596/1937/1/012038,Lipid Data Acquisition for devices Treatment o...,"Recently, the widespread deployment of smart p...",Yes,No,Yes,Unsubstantiate,Irrelevant,"[5c343544-9684-47c6-9a60-04b3fbfaece4, f602dee...",[AND RESULTPhotochemical blood performance of ...
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,Research has shown that remimazolam tosylate e...,r004,10.1186/s12871-018-0543-3,"Effect of propofol on breast cancer cell, the ...",Breast cancer is the second leading cause of c...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[ae9ef0d1-eb32-4246-a315-da0067c2060a, 416cfc7...",[Rep\n\t\t\n\t\t\t37\n\t\t\t2\n\t\t\t\n\t\t\t2...
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,if the efficiency of the routing algorithm is ...,r005,10.36410/jcpr.2022.23.3.312,Analysis and research hotspots of ceramic mate...,"From the perspective of scientometrics, comb t...",Yes,No,Yes,Unsubstantiate,Irrelevant,"[69e069c2-d1de-4017-8660-0de9f5e6a36e, be0d8b8...",[with high centrality are usually the main nod...


In [103]:
df2.to_pickle(f'../data/dfs/{grobid_model}/ReferenceErrorDetection_data_with_chunk_info.pkl')