In [1]:
import openai
import os
from scripts.get_abstract_epmc import get_abstract
import pandas as pd
import numpy as np
import re
import json

## What essential data is missing?

Which data went unreported in the table, and how can it be retrieved? Examples include how the nanomaterial (NM) was introduced into the organism, the bioassay parameters, the type of size measurement, and other descriptors. For named nanomaterials, retrieve their descriptors if they have been made available elsewhere.

Also missing: more data about the studies themselves and their metadata (journal, etc.).

In [3]:
# Load the source table; later cells read its 'provided_identifier' and
# 'pmcid' columns.
df = pd.read_csv(filepath_or_buffer="../data/perc_id_g.csv")

## Get abstracts



In [4]:
# Fetch one abstract per distinct article (PMCID) and store it on the first
# row of `df` that carries that PMCID; other rows of the same article are
# left untouched.
count = 0

# nunique() skips missing identifiers and, unlike np.unique, never has to sort
# a column that may mix strings with NaN.
total = df['provided_identifier'].nunique()

# drop_duplicates() keeps the first occurrence, so every PMCID is requested
# exactly once and `index` is the label of the first row that mentions it.
first_rows_by_pmcid = df['pmcid'].dropna().drop_duplicates()

for index, pmcid in first_rows_by_pmcid.items():
    text = get_abstract(pmcid)
    # NOTE(review): get_abstract's failure value is not visible here; the
    # previous check only excluded "". Treat None the same way so a failed
    # lookup is never counted as a retrieved abstract.
    if text:
        df.at[index, 'abstract'] = text
        count += 1

print(f'{count} abstracts retrieved out of {total} different journal articles')


Not available: ('abstractText') for PMC4127427
55 abstracts retrieved out of 116 different journal articles


In [5]:
# Read the OpenAI key from a local text file, publish it through the process
# environment, then hand the environment's value to the openai client.
with open('resources/openAI_key.txt', 'r') as key_file:
    api_key = key_file.read().strip()

os.environ.update(OPENAI_API_KEY=api_key)
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Construct prompt

# Free-form prompt template. The single `{}` placeholder marks where an
# abstract is meant to be inserted (e.g. with str.format); the model is asked
# to name the nanomaterial and the experimental conditions as a list.
# NOTE(review): no cell visible here uses `prompt`; the later cells send
# `query` instead.
prompt = """Scan the following abstract describing the use of animal models to investigate nanomaterial or nanoparticle biodistribution in organs: \n{} \n\nAnswer the questions:
- Which nanomaterial is assessed?
- Experimental conditions?
Return answer as a list
"""


In [6]:
# OpenAI model identifiers used by this notebook.
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

# Abstract stored at positional row 330 of the table; used as the sample
# input for the single-abstract trial further down.
text_test = df['abstract'].iloc[330]


# Test 1:

A systematic study of in vitro and in vivo behavior of biodegradable mesoporous silica nanoparticles (bMSNs), designed to carry multiple cargos (both small and macromolecular drugs) and subsequently self-destruct following release of their payloads, is presented. Complete degradation of bMSNs is seen within 21 d of incubation in simulated body fluid. The as-synthesized bMSNs are intrinsically radiolabeled with oxophilic zirconium-89 (<sup>89</sup>Zr, <i>t</i><sub>1/2</sub> = 78.4 h) radionuclide to track their in vivo pharmacokinetics via positron emission tomography imaging. Rapid and persistent CD105 specific tumor vasculature targeting is successfully demonstrated in murine model of metastatic breast cancer by using TRC105 (an anti-CD105 antibody)-conjugated bMSNs. This study serves to illustrate a simple, versatile, and readily tunable approach to potentially overcome the current challenges facing nanomedicine and further the goals of personalized nanotheranostics.

In [7]:
# Key-value extraction template sent to the chat model. The `{}` between the
# escaped triple quotes is the slot for the abstract text; the model is
# expected to complete each "key:" line with a value on the same line.
# The keys listed here are the ones later cells look up in the parsed reply.
query = """Scan the following scientific article abstract describing the use of animal models to investigate nanomaterial or nanoparticle biodistribution in organs to fill up the following key-value pairs.

Abstract:
\"\"\"
{}
\"\"\"

Key-value pairs:

assessed nanomaterial:
commercial or synthesized:
labelling used for the in vivo assay:
instrumental equipment used for measurements for the in vivo assay:
animal model used: 
route of administration of nanomaterial:
age/sex of the animal model:
fate of the nanomaterial observed:
"""


def _parse_key_value_lines(response_text):
    """Parse ``key: value`` lines of a model reply into a dict.

    Each line is split on its first colon only, so values that themselves
    contain colons are kept whole and a key with nothing after the colon maps
    to an empty string. Lines without a colon, or with an empty key, are
    ignored.
    """
    pairs = {}
    for line in response_text.strip().split("\n"):
        key, separator, value = line.partition(":")
        key = key.strip()
        if not separator or not key:
            continue
        pairs[key] = value.strip()
    return pairs


def question_answer(query, abstract, model, temperature):
    """Ask the chat model to fill in the key-value template for one abstract.

    Parameters
    ----------
    query : str
        Prompt template; its ``{}`` placeholder is replaced by ``abstract``
        using ``str.format``.
    abstract : str
        Abstract text to insert into the template.
    model : str
        Name of the chat model passed to ``openai.ChatCompletion.create``.
    temperature : float
        Sampling temperature passed to ``openai.ChatCompletion.create``.

    Returns
    -------
    dict
        ``{'values': {key: value, ...}, 'response': <raw API response>}``,
        where ``values`` holds the parsed ``key: value`` lines of the reply.
    """
    # The template carries a `{}` slot; without this substitution the model
    # would never see the abstract it is asked about.
    filled_query = query.format(abstract)

    response = openai.ChatCompletion.create(
        messages=[
            {'role': 'system', 'content': 'You answer questions about the abstract of a journal article.'},
            {'role': 'user', 'content': filled_query},
        ],
        model=model,
        temperature=temperature,
    )

    response_text = response['choices'][0]['message']['content']
    return {
        'values': _parse_key_value_lines(response_text),
        'response': response,
    }

# Trial run on the sample abstract: keep only the parsed key-value pairs and
# pretty-print them as JSON.
test_reply = question_answer(query=query, abstract=text_test, model=GPT_MODEL, temperature=0)
test = test_reply['values']
print(json.dumps(test, indent=4))

assessed nanomaterial: biodegradable mesoporous silica nanoparticles (bMSNs)

commercial or synthesized: synthesized

labelling used for the in vivo assay: oxophilic zirconium-89 (<sup>89</sup>Zr)

instrumental equipment used for measurements for the in vivo assay: positron emission tomography (PET) imaging

animal model used: murine model of metastatic breast cancer

route of administration of nanomaterial: not specified

age/sex of the animal model: not specified

fate of the nanomaterial observed: complete degradation of bMSNs within 21 days of incubation in simulated body fluid, rapid and persistent CD105 specific tumor vasculature targeting successfully demonstrated in the murine model of metastatic breast cancer.


In [11]:
# Query the model once per article and copy selected answers onto every row
# of `df` that shares the article's PMCID.

# Output column -> key of the parsed model reply that feeds it.
llm_columns = {
    'nm_llm': 'assessed nanomaterial',
    'commercial_synthesized_llm': 'commercial or synthesized',
    'model_llm': 'animal model used',
    'route_admin_llm': 'route of administration of nanomaterial',
}

# One (pmcid, abstract) pair per article: the first row that has an abstract
# for a given PMCID, in table order.
articles = (
    df.loc[df['abstract'].notnull() & df['pmcid'].notnull(), ['pmcid', 'abstract']]
    .drop_duplicates(subset='pmcid')
)

for pmcid, abstract in zip(articles['pmcid'], articles['abstract']):
    # Bind the reply to its own name: assigning it to `question_answer` would
    # replace the function with a dict and break the next iteration.
    answers = question_answer(query, abstract, GPT_MODEL, 0)['values']
    condition = df['pmcid'] == pmcid
    for column, key in llm_columns.items():
        # The model may omit a key; leave those cells unset instead of
        # aborting the whole run with a KeyError.
        if key in answers:
            df.loc[condition, column] = answers[key]
            


{'assessed nanomaterial': 'biodegradable mesoporous silica nanoparticles (bMSNs)', 'commercial or synthesized': 'synthesized', 'labelling used for the in vivo assay': 'oxophilic zirconium-89 (<sup>89</sup>Zr)', 'instrumental equipment used for measurements for the in vivo assay': 'positron emission tomography (PET) imaging', 'animal model used': 'murine model of metastatic breast cancer', 'route of administration of nanomaterial': 'not specified', 'age/sex of the animal model': 'not specified', 'fate of the nanomaterial observed': 'complete degradation of bMSNs within 21 days of incubation in simulated body fluid, rapid and persistent CD105 specific tumor vasculature targeting successfully demonstrated in the murine model of metastatic breast cancer.'}


Now for all abstracts