# Identify the top 3 matching chunks using LlamaIndex

LlamaIndex:
- Github: https://github.com/run-llama/llama_index
- Documentation: https://docs.llamaindex.ai/en/latest/

In [162]:
import pandas as pd

# Read the reference-error-detection spreadsheet into a pandas DataFrame.
# The path is relative to the directory the notebook is run from.
df = pd.read_excel('../data/ReferenceErrorDetection_data.xlsx')

In [163]:
# Display the loaded DataFrame as this cell's output.
df

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Statement with Citation,Reference Article ID,Reference Article DOI,Reference Article Title,Reference Article Abstract,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Others have aimed to reduce irreversibility or...,r001,10.1155/2021/2087027,A Fault Analysis Method for Three-Phase Induct...,The fault prediction and abductive fault diagn...,Yes,No,Yes,Unsubstantiate,Irrelevant
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Some researchers have also studied various hea...,r002,10.1016/j.physa.2018.12.031,Develop 24 dissimilar ANNs by suitable archite...,The artificial neural network optimization met...,Yes,No,Yes,Unsubstantiate,Irrelevant
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,The relative content of total flavonoids in th...,r003,10.1088/1742-6596/1937/1/012038,Lipid Data Acquisition for devices Treatment o...,"Recently, the widespread deployment of smart p...",Yes,No,Yes,Unsubstantiate,Irrelevant
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,Research has shown that remimazolam tosylate e...,r004,10.1186/s12871-018-0543-3,"Effect of propofol on breast cancer cell, the ...",Breast cancer is the second leading cause of c...,Yes,No,Yes,Unsubstantiate,Irrelevant
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,if the efficiency of the routing algorithm is ...,r005,10.36410/jcpr.2022.23.3.312,Analysis and research hotspots of ceramic mate...,"From the perspective of scientometrics, comb t...",Yes,No,Yes,Unsubstantiate,Irrelevant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,"Smith & Cumberledge, 2020",c175,10.1038/ncomms15218,Fast oxygen diffusion and iodide defects media...,No,Yes,Chemistry,The impact of film microstructure on charge ca...,r239,10.1063/1.4889845,Role of the crystallization substrate on the p...,We have fabricated CH3NH3PbI3−xClx perovskite ...,Yes,No,Yes,Fully substantiate,
246,"Smith & Cumberledge, 2020",c176,10.1038/ncomms16007,Muscle-specific CRISPR/Cas9 dystrophin gene ed...,No,Yes,Medicine,Mutations in the dystrophin (DMD) gene result ...,r240,10.1038/345315a0,Deficiency of a glycoprotein component of the ...,"Dystrophin, the protein encoded by the Duchenn...",Yes,No,Yes,Fully substantiate,
247,"Smith & Cumberledge, 2020",c085,10.1038/s41467-017-00519-2,Nanodiamonds suppress the growth of lithium de...,No,Yes,Chemistry,Aggregation of nanodiamond particles cannot be...,r241,10.1016/j.diamond.2008.01.033,Deagglomeration and functionalisation of deton...,We have achieved the covalent functionalisatio...,Yes,No,Yes,Fully substantiate,
248,"Smith & Cumberledge, 2020",c177,10.1038/ncomms15081,Single-cell RNA-seq enables comprehensive tumo...,No,Yes,Medicine,Single-cell genome analysis is expected to hav...,r242,10.1101/gr.191098.115,The first five years of single-cell cancer gen...,Single-cell sequencing (SCS) is a powerful new...,Yes,No,Yes,Fully substantiate,


In [3]:
# Pick an example row: the first one whose reference article is not retracted
# and whose label is 'Fully substantiate'.
not_retracted = df['Reference Article Retracted'] == 'No'
fully_substantiated = df['Label'] == 'Fully substantiate'
first_entry = df[not_retracted & fully_substantiated].iloc[0]
first_entry

Source                                                     Smith & Cumberledge, 2020
Citing Article ID                                                               c058
Citing Article DOI                                           10.1126/science.aan0177
Citing Article Title               High dislocation density-induced large ductili...
Citing Article Retracted                                                          No
Citing Article Downloaded                                                        Yes
Domain                                                                       Physics
Statement with Citation            Tensile properties of our steel compared with ...
Reference Article ID                                                            r091
Reference Article DOI                                     10.2320/matertrans.46.1839
Reference Article Title            The role of retained austenite on tensile prop...
Reference Article Abstract         In high-carbon, silicon-rich s

In [127]:
# Keep the Reference Article ID of the example row for the lookups below.
first_entry_id = first_entry['Reference Article ID']
first_entry_id

'r091'

## Get text from reference article

In [122]:
import glob
import xml.etree.ElementTree as ET

def get_reference_text(reference_article_id):
    """Return the plain text of a reference article's extracted XML file.

    Searches for files matching
    ``../data/extractions/{reference_article_id}*.xml`` and concatenates all
    text nodes of the first match (in sorted order).

    Args:
        reference_article_id: Reference Article ID used as the file-name
            prefix, e.g. ``'r091'``.

    Returns:
        The concatenated text content of the XML document, or ``None`` (after
        printing a notice) when no file matches the pattern.

    Raises:
        xml.etree.ElementTree.ParseError: If the matched file is not
            well-formed XML.
    """
    # Construct the file path pattern from the Reference Article ID
    file_pattern = f"../data/extractions/{reference_article_id}*.xml"

    # glob returns matches in arbitrary order; sort them so the selected file
    # is deterministic when several files share the same prefix.
    file_list = sorted(glob.glob(file_pattern))
    if not file_list:
        print("No matching file found.")
        return None

    # Parse the XML file and join the text of every element, dropping the tags
    root = ET.parse(file_list[0]).getroot()
    return ''.join(root.itertext())

In [128]:
# Load the extracted text of the example reference article.
reference_text = get_reference_text(first_entry_id)

## Set OpenAI key

In [12]:
# Read the OpenAI API key from a local text file instead of hard-coding it in
# the notebook; surrounding whitespace (e.g. a trailing newline) is stripped.
with open('../open_ai_key.txt', 'r') as file:
    open_ai_key = file.read().strip()

In [None]:
# import os
# import openai

# os.environ["OPENAI_API_KEY"] = open_ai_key
# openai.api_key = os.environ["OPENAI_API_KEY"]

# import nest_asyncio

# nest_asyncio.apply()

## Setting up vector index

### Chunk splitting

In [121]:
from llama_index.core.node_parser import TokenTextSplitter

def create_chunks(text, chunk_size=512, chunk_overlap=50):
    """Split ``text`` into chunks with LlamaIndex's ``TokenTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to ``TokenTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``TokenTextSplitter`` as ``chunk_overlap``.

    Returns:
        Whatever ``TokenTextSplitter.split_text`` returns for ``text``
        (presumably a list of chunk strings -- confirm against LlamaIndex).
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return TokenTextSplitter(**splitter_options).split_text(text)

In [129]:
# Split the example article's text using create_chunks' default settings.
reference_chunks = create_chunks(reference_text)

### Reloading or generating index

In [8]:
# OpenAI model names: `model` is the chat/completion model and
# `model_embeddings` is the embedding model.
model = "gpt-3.5-turbo-0125"
model_embeddings = "text-embedding-3-small"

In [20]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

# Register the embedding model and the LLM as LlamaIndex global defaults.
# Both clients receive the API key explicitly: the cell that would export
# OPENAI_API_KEY to the environment is commented out in this notebook, so the
# LLM must not depend on that variable being set.
Settings.embed_model = OpenAIEmbedding(model=model_embeddings, api_key=open_ai_key)
Settings.llm = OpenAI(model=model, temperature=0, api_key=open_ai_key)

In [137]:
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex, Document

def load_or_create_index(article_id, reference_chunks):
    """Load the persisted vector index of an article, or build and persist it.

    The index lives in ``../data/vector_indices/{article_id}/``. If loading
    from that directory fails for any reason, the error is printed and a new
    ``VectorStoreIndex`` is built with one ``Document`` per chunk and persisted
    to the same directory.

    Args:
        article_id: Reference Article ID (a string); used as the directory
            name and as the prefix of the progress messages.
        reference_chunks: Iterable of chunk strings; only used when a new
            index has to be created.

    Returns:
        The loaded or newly created index.
    """
    index_path = f"../data/vector_indices/{article_id}/"

    try:
        storage_context = StorageContext.from_defaults(persist_dir=index_path)
        index = load_index_from_storage(storage_context)
    except Exception as e:
        # Best-effort fallback kept from the original: any load failure
        # (typically a missing directory) triggers a rebuild.
        print(e)
        print(article_id + ": Creating a new index.")
        documents = [Document(text=chunk) for chunk in reference_chunks]
        index = VectorStoreIndex.from_documents(documents)
        index.storage_context.persist(persist_dir=index_path)
    else:
        # Report success only after the index has actually been loaded.
        print(article_id + ": Loaded existing index.")

    return index

In [150]:
# Load the persisted index of the example article, or build it from its chunks.
index = load_or_create_index(first_entry_id, reference_chunks)

r091: Loaded existing index.


### Create Indices for all reference articles

In [139]:
# Build (or load) the vector index of every downloaded reference article.
# Each ID is processed once, even if several rows cite the same reference.
# Loop-local names are used on purpose: rebinding `reference_text`,
# `reference_chunks` or `index` here would silently replace the example
# article's objects that later cells query.
downloaded_ids = df.loc[
    df['Reference Article Downloaded'] == 'Yes', 'Reference Article ID'
].unique()
for article_id in downloaded_ids:
    article_text = get_reference_text(article_id)
    if article_text is None:
        # No extraction file on disk; get_reference_text already reported it.
        continue
    load_or_create_index(article_id, create_chunks(article_text))

r001: Loaded existing index.
r002: Loaded existing index.
r003: Loaded existing index.
r004: Loaded existing index.
r005: Loaded existing index.
r006: Loaded existing index.
r007: Loaded existing index.
r008: Loaded existing index.
r009: Loaded existing index.
r010: Loaded existing index.
r011: Loaded existing index.
r012: Loaded existing index.
r013: Loaded existing index.
r013: Loaded existing index.


KeyboardInterrupt: 

## Identifying top 3 chunks

In [136]:
# The citing statement of the example row; used below as the retrieval query.
statement = first_entry["Statement with Citation"]
statement

'Tensile properties of our steel compared with those of other existing high strength metallic materials. These include nanobainite steel [citation 36].'

In [141]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

def get_top_k_similar_statements(statement, index, k=3):
    """Query ``index`` with ``statement`` and return the query response.

    Args:
        statement: Query text (the citing statement).
        index: Vector index handed to ``VectorIndexRetriever``.
        k: Passed as ``similarity_top_k`` to the retriever.

    Returns:
        The object returned by ``RetrieverQueryEngine.query``. The response
        synthesizer is created with ``response_mode="no_text"``; presumably
        this means the retrieved source nodes, not a generated answer, are the
        useful part of the response -- confirm against the LlamaIndex docs.
    """
    query_engine = RetrieverQueryEngine(
        retriever=VectorIndexRetriever(index=index, similarity_top_k=k),
        response_synthesizer=get_response_synthesizer(response_mode="no_text"),
    )
    return query_engine.query(statement)

In [151]:
# Query the index with the example statement (default k=3).
response = get_top_k_similar_statements(statement, index)

In [157]:
def get_doc_ids(response, index):
    """Map the source nodes of a query response to docstore entry IDs.

    For every source node, the node id stored under relationship key ``'1'``
    of its serialised node is read. The result contains, in the same order,
    the ID of the first docstore entry whose own relationship ``'1'`` refers
    to that node id.

    Args:
        response: Query response exposing ``source_nodes``.
        index: Index exposing ``docstore.docs`` (a mapping of ID to entry).

    Returns:
        A list with one docstore ID per source node.

    Raises:
        IndexError: If a source node has no matching docstore entry.
    """
    source_nodes = response.source_nodes
    if not source_nodes:
        return []

    # Serialise every docstore entry once instead of once per source node.
    # NOTE(review): key '1' is presumably LlamaIndex's SOURCE relationship --
    # confirm. Entries without it cannot match and are skipped.
    first_doc_id_by_source = {}
    for doc_id, doc in index.docstore.docs.items():
        source = doc.dict().get('relationships', {}).get('1')
        if source is not None:
            # setdefault keeps the first entry per source, as the original did.
            first_doc_id_by_source.setdefault(source['node_id'], doc_id)

    doc_ids = []
    for source_node in source_nodes:
        node_id = source_node.dict()['node']['relationships']['1']['node_id']
        if node_id not in first_doc_id_by_source:
            raise IndexError(f"No docstore entry refers to source node id {node_id!r}.")
        doc_ids.append(first_doc_id_by_source[node_id])
    return doc_ids

In [158]:
# Show the docstore IDs resolved for the example query's source nodes.
get_doc_ids(response, index)

['ee55ca02-8ba4-45f3-82fe-c88826eef2b2',
 '1cead63a-77a9-4c16-bffd-c1d774443bc4',
 'c79667d2-37d3-4b58-a132-aa1efba9dc9a']

In [159]:
import json
import os

def save_similar_chunks(doc_ids, reference_id):
    """Write ``doc_ids`` as JSON to ``../data/similar_chunks/{reference_id}.json``.

    The target directory is created when missing; an existing file for the
    same ``reference_id`` is overwritten.
    """
    target = f"../data/similar_chunks/{reference_id}.json"
    parent_dir = os.path.dirname(target)
    os.makedirs(parent_dir, exist_ok=True)
    with open(target, 'w') as handle:
        json.dump(doc_ids, handle)

def load_similar_chunks(reference_id):
    """Return the JSON content of ``../data/similar_chunks/{reference_id}.json``.

    Raises:
        FileNotFoundError: If no file exists for ``reference_id``.
    """
    with open(f"../data/similar_chunks/{reference_id}.json", 'r') as handle:
        return json.load(handle)

In [160]:
# Store the example article's retrieved chunk IDs as JSON.
save_similar_chunks(get_doc_ids(response, index), first_entry_id)

### Save the document IDs and text contents of the top-k chunks of all reference articles to the DataFrame

In [161]:
def save_top_k_chunk_ids(df, k=3):
    """Retrieve and save the top-k chunk IDs for every downloaded reference.

    For each row whose 'Reference Article Downloaded' is 'Yes', the reference
    article's index is loaded (or created), queried with the row's
    'Statement with Citation', and the resulting docstore IDs are written via
    ``save_similar_chunks`` under the row's 'Reference Article ID'.

    Rows whose reference text cannot be found are skipped with a message
    instead of failing during chunking.

    NOTE(review): results are stored per Reference Article ID, so when several
    rows cite the same reference, later rows overwrite the file written for
    earlier ones.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded',
            'Reference Article ID' and 'Statement with Citation'.
        k: Number of chunks to retrieve per statement.

    Returns:
        None. Progress is printed and results are written to disk.
    """
    for _, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            continue

        reference_article_id = row['Reference Article ID']
        print(f"------ Starting {reference_article_id} ------")
        reference_text = get_reference_text(reference_article_id)
        if reference_text is None:
            # Nothing to chunk or index; passing None on would fail in the splitter.
            print("Skipping: no reference text available.")
            print("")
            continue

        reference_chunks = create_chunks(reference_text)
        index = load_or_create_index(reference_article_id, reference_chunks)
        statement = row["Statement with Citation"]
        print("Receiving top chunks")
        response = get_top_k_similar_statements(statement, index, k)
        doc_ids = get_doc_ids(response, index)
        print("Saving top chunks")
        save_similar_chunks(doc_ids, reference_article_id)
        print("")

In [164]:
# Retrieve and save chunk IDs for all downloaded reference articles (default k=3).
save_top_k_chunk_ids(df)

------ Starting r001 ------
r001: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r002 ------
r002: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r003 ------
r003: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r004 ------
r004: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r005 ------
r005: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r006 ------
r006: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r007 ------
r007: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r008 ------
r008: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r009 ------
r009: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r010 ------
r010: Loaded existing index.
Receiving top chunks
Saving top chunks

------ Starting r011 ------
r0

In [168]:
def add_top_k_chunk_ids_and_texts_to_df(df, k=3):
    """Add the top-k chunk IDs and chunk texts of each row's reference to ``df``.

    For each row whose 'Reference Article Downloaded' is 'Yes', the reference
    article's index is loaded (or created) and queried with the row's
    'Statement with Citation'. The resulting docstore IDs and the ``text`` of
    the corresponding docstore entries are collected per row.

    Rows that are not downloaded, or whose reference text cannot be found,
    receive ``None`` in both new columns so the columns stay aligned with the
    rows.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded',
            'Reference Article ID' and 'Statement with Citation'.
        k: Number of chunks to retrieve per statement; also part of the new
            column names.

    Returns:
        The same ``df`` object, modified in place with the new columns
        ``Top_{k}_Chunk_IDs`` and ``Top_{k}_Chunk_Texts``.
    """
    doc_ids_list = []
    doc_texts_list = []
    for _, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            doc_ids_list.append(None)
            doc_texts_list.append(None)
            continue

        reference_article_id = row['Reference Article ID']
        print(f"------ Starting {reference_article_id} ------")
        reference_text = get_reference_text(reference_article_id)
        if reference_text is None:
            # Nothing to chunk or index; record the gap instead of failing
            # inside the splitter.
            print("Skipping: no reference text available.")
            print("")
            doc_ids_list.append(None)
            doc_texts_list.append(None)
            continue

        reference_chunks = create_chunks(reference_text)
        index = load_or_create_index(reference_article_id, reference_chunks)
        statement = row["Statement with Citation"]
        print("Receiving top chunks")
        response = get_top_k_similar_statements(statement, index, k)
        doc_ids = get_doc_ids(response, index)
        doc_texts = [index.docstore.docs[doc_id].text for doc_id in doc_ids]
        doc_ids_list.append(doc_ids)
        doc_texts_list.append(doc_texts)
        print("")

    df[f'Top_{k}_Chunk_IDs'] = doc_ids_list
    df[f'Top_{k}_Chunk_Texts'] = doc_texts_list
    return df

In [170]:
# NOTE: the function adds the new columns to `df` itself and returns that same
# object, so `df2` and `df` refer to one DataFrame after this cell.
df2 = add_top_k_chunk_ids_and_texts_to_df(df)

------ Starting r001 ------
r001: Loaded existing index.
Receiving top chunks

------ Starting r002 ------
r002: Loaded existing index.
Receiving top chunks

------ Starting r003 ------
r003: Loaded existing index.
Receiving top chunks

------ Starting r004 ------
r004: Loaded existing index.
Receiving top chunks

------ Starting r005 ------
r005: Loaded existing index.
Receiving top chunks

------ Starting r006 ------
r006: Loaded existing index.
Receiving top chunks

------ Starting r007 ------
r007: Loaded existing index.
Receiving top chunks

------ Starting r008 ------
r008: Loaded existing index.
Receiving top chunks

------ Starting r009 ------
r009: Loaded existing index.
Receiving top chunks

------ Starting r010 ------
r010: Loaded existing index.
Receiving top chunks

------ Starting r011 ------
r011: Loaded existing index.
Receiving top chunks

------ Starting r012 ------
r012: Loaded existing index.
Receiving top chunks

------ Starting r013 ------
r013: Loaded existing in

In [171]:
# Display the DataFrame including the new chunk ID / chunk text columns.
df2

Unnamed: 0,Source,Citing Article ID,Citing Article DOI,Citing Article Title,Citing Article Retracted,Citing Article Downloaded,Domain,Statement with Citation,Reference Article ID,Reference Article DOI,Reference Article Title,Reference Article Abstract,Reference Article PDF Available,Reference Article Retracted,Reference Article Downloaded,Label,Explanation,Top_3_Chunk_IDs,Top_3_Chunk_Texts
0,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Others have aimed to reduce irreversibility or...,r001,10.1155/2021/2087027,A Fault Analysis Method for Three-Phase Induct...,The fault prediction and abductive fault diagn...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[5cb7b532-0bf6-42e8-b245-5ce118330981, 9c03161...",[the winding and lead wirep 15Connection box j...
1,PubPeer,c001,10.1016/j.est.2021.103553,Heating a residential building using the heat ...,Yes,Yes,Engineering,Some researchers have also studied various hea...,r002,10.1016/j.physa.2018.12.031,Develop 24 dissimilar ANNs by suitable archite...,The artificial neural network optimization met...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[cfec4c95-2610-4b0d-b81e-3c46710e02e3, 17e8175...",[Therm. Anal. Calorim\n\t\t\n\t\t\t131\n\t\t\t...
2,PubPeer,c002,10.1155/2022/4601350,Oxidative Potential and Nanoantioxidant Activi...,Yes,Yes,Chemistry,The relative content of total flavonoids in th...,r003,10.1088/1742-6596/1937/1/012038,Lipid Data Acquisition for devices Treatment o...,"Recently, the widespread deployment of smart p...",Yes,No,Yes,Unsubstantiate,Irrelevant,"[58d642c1-5f8e-4ac3-bc25-c74eb9c6f5bf, b00c81c...",[mmol/l and 6.96 mmol/l TG calculation concent...
3,PubPeer,c003,10.1155/2022/2408685,The Choice of Anesthetic Drugs in Outpatient H...,Yes,Yes,Medicine,Research has shown that remimazolam tosylate e...,r004,10.1186/s12871-018-0543-3,"Effect of propofol on breast cancer cell, the ...",Breast cancer is the second leading cause of c...,Yes,No,Yes,Unsubstantiate,Irrelevant,"[021c7cfc-9329-4667-8653-753f07eb787d, 4aac749...",[J Anaesthesiol\n\t\t\n\t\t\t22\n\t\t\t8\n\t\t...
4,PubPeer,c004,10.1155/2022/4783847,A Fault-Tolerant Structure for Nano-Power Comm...,Yes,Yes,Engineering,if the efficiency of the routing algorithm is ...,r005,10.36410/jcpr.2022.23.3.312,Analysis and research hotspots of ceramic mate...,"From the perspective of scientometrics, comb t...",Yes,No,Yes,Unsubstantiate,Irrelevant,"[716a8a3a-eed7-4c86-89a3-d63beecb4ff3, 7776e6e...","[size indicator is high, reflecting the import..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,"Smith & Cumberledge, 2020",c175,10.1038/ncomms15218,Fast oxygen diffusion and iodide defects media...,No,Yes,Chemistry,The impact of film microstructure on charge ca...,r239,10.1063/1.4889845,Role of the crystallization substrate on the p...,We have fabricated CH3NH3PbI3−xClx perovskite ...,Yes,No,Yes,Fully substantiate,,"[3306f87a-b14e-4401-bc90-03dde2664c51, a5d6c1b...",[Export Citation\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t...
246,"Smith & Cumberledge, 2020",c176,10.1038/ncomms16007,Muscle-specific CRISPR/Cas9 dystrophin gene ed...,No,Yes,Medicine,Mutations in the dystrophin (DMD) gene result ...,r240,10.1038/345315a0,Deficiency of a glycoprotein component of the ...,"Dystrophin, the protein encoded by the Duchenn...",Yes,No,Yes,Fully substantiate,,"[719ddf4d-fa3e-4936-92d0-a4f0caa289d2, 8b307a7...",[concentration of one of these is greatly redu...
247,"Smith & Cumberledge, 2020",c085,10.1038/s41467-017-00519-2,Nanodiamonds suppress the growth of lithium de...,No,Yes,Chemistry,Aggregation of nanodiamond particles cannot be...,r241,10.1016/j.diamond.2008.01.033,Deagglomeration and functionalisation of deton...,We have achieved the covalent functionalisatio...,Yes,No,Yes,Fully substantiate,,"[fe36d6e9-fb79-4fee-bc96-0788267f4ce8, 8e6543a...",[nanoparticles have found considerable interes...
248,"Smith & Cumberledge, 2020",c177,10.1038/ncomms15081,Single-cell RNA-seq enables comprehensive tumo...,No,Yes,Medicine,Single-cell genome analysis is expected to hav...,r242,10.1101/gr.191098.115,The first five years of single-cell cancer gen...,Single-cell sequencing (SCS) is a powerful new...,Yes,No,Yes,Fully substantiate,,"[a37b0d35-5a7c-4d14-ba49-22c39f815626, 9b399e3...",[Rev Cancer\n\t\t\n\t\t\t7\n\t\t\t\n\t\t\t2007...


In [174]:
# Save the DataFrame with the added chunk columns as a pickle file.
df2.to_pickle('../data/ReferenceErrorDetection_data_with_chunk_info.pkl')