In [None]:
!pip install llama_index
!pip install openai
!pip install PyPDF2
!pip install fitz
!pip install PyMuPDF
!pip install --upgrade pymupdf
!pip install fuzzywuzzy




In [None]:
import os
os.environ['OPENAI_API_KEY']=""

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader('/content/data').load_data()
index = VectorStoreIndex.from_documents(documents)

In [None]:
documents

[Document(id_='46323171-d9fe-4cf9-9ce1-4580db9c6b72', embedding=None, metadata={'page_label': '1', 'file_name': 'ak_prostate.pdf', 'file_path': '/content/data/ak_prostate.pdf', 'file_type': 'application/pdf', 'file_size': 10052254, 'creation_date': '2025-04-28', 'last_modified_date': '2025-04-28'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='iScience\nArticle\nMultiplex imaging of localized prostate tumors\nreveals altered spatial organization of AR-positive\ncells in the microenvironment\nC¸i /C21gdem Ak,\nZeynep Sayar,\nGuillaume\nThibault, ..., Young\nHwan Chang,\nVasilis Stavrinides,\nSebnem Ece Eksi\neksi@ohsu.edu\nHighlights\nWe 

In [None]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7e6890c4e850>

In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("which cellular neighborhoods were positive clinical variable association and had no negative association?")
print(response)

Cellular neighborhoods enriched in M1-like MDM showed positive clinical variable association and had no negative association.


In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("which cellular neighborhoods were positive clinical variable association and had no negative association?")
print(response)

Cellular neighborhoods enriched in M1-like MDM showed positive clinical variable association and had no negative association.


In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("list the unique cellular neighborhoods identified with details of the type of cells")
print(response)


The unique cellular neighborhoods identified include:
1. Neutrophil-Enriched neighborhood: characterized by a high density of neutrophils associated with vasculature and innate immune cells.
2. Neuroendocrine-Enriched neighborhood: a mixture of epithelial and immune cell types enriched, with denser neuroendocrine cells compared to other epithelial neighborhoods.
3. Paneth-Cell-Enriched neighborhood: observed only within the small intestine, enriched with Paneth cells known to be restricted to the small intestine and enriched within the intestinal crypt.


In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("which cellular neighborhood was striclty negative clinical variable association?")
print(response)

Vimentin+ and fibronectin+ fibroblast abundances were not correlated, but their respective neighborhoods were inversely correlated, suggesting a negative clinical variable association.


In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("Disease full name")
print(response)

Precision Oncology


In [None]:
query_engine = index.as_query_engine()
response = query_engine.query("what factors does the paper focus on? : survival,response to immunotherapym,histology type,None")
print(response)

The paper focuses on survival, response to immunotherapy, and histology type.


In [None]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.postprocessor import SimilarityPostprocessor
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=4,
)
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.8)
query_engine = RetrieverQueryEngine(
    retriever=retriever,node_postprocessors=[postprocessor])

In [None]:
from llama_index.core.response.pprint_utils import pprint_response
pprint_response(response, show_source=True)

Final Response: The paper focuses on survival, response to
immunotherapy, and histology type.
______________________________________________________________________
Source Node 1/2
Node ID: 5e09b69e-7c50-448f-838e-4ffdb9b0ec6f
Similarity: 0.8244290984390621
Text: comparisons. If the ANOVA result was signi ﬁcant, a Tukey
honestly signiﬁcant difference post-hoc test was conducted to
determine which groups were signi ﬁcantly different from one another.
A Benjamini–Hochberg correction was used to account for multiple
hypothesis testing in analyses that involved systematically testing
multiple variables. p-va...
______________________________________________________________________
Source Node 2/2
Node ID: 219e162f-df82-4f55-b5d1-9d9c0c5074c9
Similarity: 0.824426433156154
Text: As opposed to Vodnala et al., we did not detect a complete
reduction for the 2HC-treated T cell condition in the B16F10 model,
which could be attributed to the aggressive nature of the B16F10
model.11 Three days post

In [None]:
'''Storing indexed data in a disk to avoid relying on in-memory storage and retrieving data from disk'''
import os.path
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)

PERSIST_DIR = "./storage"
if not os.path.exists(PERSIST_DIR):
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

query_engine = index.as_query_engine()
response = query_engine.query("Cellular neighbourhoods")
print(response)

Cellular neighbourhoods play a crucial role in understanding the spatial organization and interactions of different cell types within tissues. By analyzing cellular neighbourhoods, researchers can identify patterns of cell distribution, cell-cell interactions, and how these neighborhoods may impact various biological processes such as survival, immune responses, and disease progression.


In [40]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from llama_index.core import (
    VectorStoreIndex,
    Document,
)

def update_csv_from_extracted_data(csv_path, extracted_data):
    """
    Updates a CSV file with extracted data from the query responses.
    """
    columns_order = ["paper_name", "disease_name", "clinical_variable", "unique_neighborhoods", "clinical_variable_association"]
    extracted_data = {col: extracted_data.get(col, "") for col in columns_order}

    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
    else:
        df = pd.DataFrame(columns=extracted_data.keys())


    new_row_df = pd.DataFrame([extracted_data])


    df = pd.concat([df, new_row_df], ignore_index=True)

    df.to_csv(csv_path, index=False)
    print(f"Row added to CSV: {csv_path}")

def extract_title_from_pdf(pdf_path):
    """
    Attempts to extract the title from the first page of the PDF, accounting for multi-line titles.
    """
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0].extract_text()


    lines = first_page.splitlines()


    filtered_lines = [line.strip() for line in lines if len(line.split()) > 4]


    if len(filtered_lines) >= 2:
        title = f"{filtered_lines[0]} {filtered_lines[1]}".strip()
    elif filtered_lines:
        title = filtered_lines[0].strip()
    else:
        title = "Unknown Title"  # Default if no title is found


    non_title_keywords = ["ARTICLE", "OPEN", "CHECK FOR UPDATES"]
    for keyword in non_title_keywords:
        title = title.replace(keyword, "").strip()

    return title

queries = {
    "disease_name": "What is the disease focus of the study?",
    "clinical_variable": "What are the clinical variables analyzed?",
    "unique_neighborhoods": "list the unique cellular neighborhoods identified with details of the type of cells",
    "clinical_variable_association": "list the unique cellular neighborhoods and the kind of clinical variable association they had. whethe the association was positive negative or no association at all",
}

def query_fields(queries, query_engine):
    """
    Queries the engine for each field and collects the responses.
    """
    extracted_data = {}
    for field, query in queries.items():
        response = query_engine.query(query)
        extracted_data[field] = str(response).strip()  # Clean the response text
        print(f"Extracted for {field}: {response}")
    return extracted_data


csv_path = "./result_data2.csv"


pdf_paths = [
  '/content/data/ak_prostate.pdf',
  '/content/data/blise_head_and_neck.pdf',
  '/content/data/blise_pancreatic.pdf',
  '/content/data/eng_breast.pdf',
  '/content/data/hickey_intestine.pdf',
  '/content/data/hickey_melanoma.pdf',
  '/content/data/jin_lymphoma.pdf',
  '/content/data/karimi_brain.pdf',
  '/content/data/lake_kidney.pdf',
  '/content/data/lemaitre_hepatocellular.pdf'
]



for pdf_path in pdf_paths:
    print(f"Processing PDF: {pdf_path}")


    title = extract_title_from_pdf(pdf_path)
    print(f"Extracted Title: {title}")


    reader = PdfReader(pdf_path)
    pdf_content = ""
    for page in reader.pages:
        pdf_content += page.extract_text()


    document = Document(text=pdf_content, metadata={"file_name": os.path.basename(pdf_path)})


    index = VectorStoreIndex.from_documents([document])


    query_engine = index.as_query_engine()


    extracted_data = query_fields(queries, query_engine)


    extracted_data["paper_name"] = title


    update_csv_from_extracted_data(csv_path, extracted_data)

print(f"CSV file updated with data from {len(pdf_paths)} PDFs!")


Processing PDF: /content/data/ak_prostate.pdf
Extracted Title: Multiplex imaging of localized prostate tumors reveals altered spatial organization of AR-positivecells in the microenvironment
Extracted for disease_name: The disease focus of the study is prostate cancer.
Extracted for clinical_variable: The clinical variables analyzed in the study include AR + rCN frequency between TAN and tumors (G3 and G4), as well as the mean AR + rCN frequency differences between TAN and tumors (G3 and G4) in subsampling analysis. Additionally, the study also looked at the AR + rCN (CN1) frequency differences between TAN and tumors (G3 and G4) in spatial significance tests 1 and 2.
Extracted for unique_neighborhoods: Four neighborhoods were identified:
1. AR+ non-immune stromal cells interacting with neuroendocrine cells.
2. Mesenchymal/endothelial cells interacting with immune cells.
3. Smooth muscle and M2 macrophages.
4. Lack of AR+ stroma interacting with various cell types such as luminal epithe

In [39]:
import os
import pandas as pd
import fitz
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document


def update_csv(csv_path, data_row):
    """
    Updates the CSV file with extracted data row by row.
    Ensures correct formatting and missing field handling.
    """
    columns = ["paper_name", "disease_name", "clinical_variable", "unique_neighborhoods",
               "clinical_variable_association"]


    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
    else:
        df = pd.DataFrame(columns=columns)


    data_row = {col: data_row.get(col, "Not Available") for col in columns}


    df = pd.concat([df, pd.DataFrame([data_row])], ignore_index=True)
    df.to_csv(csv_path, index=False)
    print(f"CSV updated: {csv_path}")




def extract_title_with_font_size(pdf_path):

    doc = fitz.open(pdf_path)
    first_page = doc[0]

    text_instances = []
    blocks = first_page.get_text("dict")["blocks"]
    for block in blocks:
        if "lines" in block:  # Ensure "lines" exists
            for line in block["lines"]:
                for span in line["spans"]:
                    text_instances.append({
                        "text": span["text"].strip(),
                        "font_size": span["size"],
                        "y_position": span["bbox"][1]
                    })

    if text_instances:
        text_instances.sort(key=lambda x: (-x["font_size"], x["y_position"]))

        largest_font_size = text_instances[0]["font_size"]


        largest_font_spans = [t for t in text_instances if t["font_size"] == largest_font_size]

        largest_font_spans.sort(key=lambda x: x["y_position"])
        top_three_spans = largest_font_spans[:5]

        title = " ".join([t["text"] for t in top_three_spans])
        return title.strip()

    return "Unknown Title"





def query_fields(queries, query_engine):
    """
    Executes the queries using the query engine and extracts the data.
    """
    extracted_data = {}
    for field, query in queries.items():
        try:
            response = query_engine.query(query)
            extracted_data[field] = str(response).strip()
            print(f"Extracted for {field}: {response}")
        except Exception as e:
            print(f"Error extracting {field}: {e}")
            extracted_data[field] = "Not Available"
    return extracted_data


def process_pdf(pdf_path, csv_path, queries):
    """
    Processes a single PDF file to extract relevant data and save it to the CSV.
    """
    print(f"Processing PDF: {pdf_path}")


    title = extract_title_with_font_size(pdf_path)
    print(f"Extracted Title: {title}")


    doc = fitz.open(pdf_path)
    pdf_content = ""
    for page in doc:
        pdf_content += page.get_text()


    document = Document(text=pdf_content, metadata={"file_name": os.path.basename(pdf_path)})
    index = VectorStoreIndex.from_documents([document])
    query_engine = index.as_query_engine()


    extracted_data = query_fields(queries, query_engine)


    extracted_data["paper_name"] = title


    update_csv(csv_path, extracted_data)


queries = {
    "disease_name": "Disease Name",
    "clinical_variable": "what factors does the paper focus on? Choose one of these option: survival,response to immunotherapym,histology type,None",
    "unique_neighborhoods": "Just List the unique cellular neighborhoods identified with details of the type of cells.format example tumor cell,bcell",
    "clinical_variable_association" :"what associations did the cellular neighborhoods have to clinical variable. format example positive,negative,none maintain the order with respect to the unique neighborhoods",
}

csv_path = "./result_data1.csv"

data_folder = "/content/data"
pdf_files = [os.path.join(data_folder, f) for f in os.listdir(data_folder) if f.endswith(".pdf")]

for pdf_path in pdf_files:
    process_pdf(pdf_path, csv_path, queries)

print("All documents processed and results saved to CSV.")


Processing PDF: /content/data/blise_pancreatic.pdf
Extracted Title: Machine learning links T cell function and spatial localization to neoadjuvant immunotherapy and clinical outcome in pancreatic cancer  Katie E. Blise , Shamilene Sivagnanam
Extracted for disease_name: Pancreatic Cancer
Extracted for clinical_variable: response to immunotherapy
Extracted for unique_neighborhoods: neoplastic epithelial cell, B cell
neoplastic epithelial cell, T cell
neoplastic epithelial cell, myeloid cell
neoplastic epithelial cell, mesenchymal fibroblast-like cell
Extracted for clinical_variable_association: positive,positive,none,negative,negative,negative,negative
CSV updated: ./result_data1.csv
Processing PDF: /content/data/hickey_melanoma.pdf
Extracted Title: T cell-mediated curation and restructuring of tumor tissue coordinates an effective immune response
Extracted for disease_name: Melanoma
Extracted for clinical_variable: response to immunotherapy
Extracted for unique_neighborhoods: Tumor cell

In [None]:
import pandas as pd
from fuzzywuzzy import fuzz

expected_results_path = "/content/labelpapers.csv"
generated_results_path = "/content/result_data.csv"

expected_results = pd.read_csv(expected_results_path)
generated_results = pd.read_csv(generated_results_path)


if expected_results.shape != generated_results.shape:
    print("Mismatch in shape between the two files.")
    print(f"Expected shape: {expected_results.shape}")
    print(f"Generated shape: {generated_results.shape}")
else:

    total_cells = expected_results.size
    matches = 0
    threshold = 80


    for col in expected_results.columns:
        for i in range(len(expected_results)):
            expected = str(expected_results.at[i, col])
            generated = str(generated_results.at[i, col])


            similarity = fuzz.ratio(expected, generated)

            if similarity >= threshold:
                matches += 1


    accuracy = (matches / total_cells) * 100


    print(f"Fuzzy Matching Accuracy: {accuracy:.2f}%")
    print(f"Total Matches: {matches}")
    print(f"Total Cells: {total_cells}")


    mismatches = []
    for col in expected_results.columns:
        for i in range(len(expected_results)):
            expected = str(expected_results.at[i, col])
            generated = str(generated_results.at[i, col])
            similarity = fuzz.ratio(expected, generated)
            if similarity < threshold:
                mismatches.append({
                    "Row": i + 1,
                    "Column": col,
                    "Expected": expected,
                    "Generated": generated,
                    "Similarity": similarity
                })


    if mismatches:
        mismatches_df = pd.DataFrame(mismatches)
        mismatches_df.to_csv("mismatch_details_fuzzy.csv", index=False)
        print("Mismatch details saved to 'mismatch_details_fuzzy.csv'")
    else:
        print("All entries are sufficiently similar!")


Fuzzy Matching Accuracy: 33.33%
Total Matches: 5
Total Cells: 15
Mismatch details saved to 'mismatch_details_fuzzy.csv'
