In [2]:
import json
import os
import re
import time

import numpy as np
import openai
import pandas as pd

from scripts.get_abstract_epmc import get_abstract

## What essential data is missing?

Which data went unreported in the table, and how can it be retrieved? Examples: how the NM was introduced into the organism, the bioassay parameters, the type of size measurement and other descriptors, etc. For named nanomaterials, retrieve their descriptors if they have been made available elsewhere.

Also missing: more data about the studies themselves and their metadata (journal, etc.).

In [3]:
# Load the working table; later cells read its 'pmcid', 'provided_identifier'
# and 'Name' columns and add new columns to it in place.
df = pd.read_csv(filepath_or_buffer="../data/perc_id_g.csv")

## Get abstracts



In [4]:
# Fetch one abstract per distinct PMCID and store it on the first row that
# carries that PMCID (other rows of the same article are left untouched).
seen = []
count = 0
total = len(np.unique(df['provided_identifier']))

# First row of every non-null PMCID, in original row order.
first_rows = df[df['pmcid'].notnull()].drop_duplicates(subset='pmcid', keep='first')
for index, pmcid in first_rows['pmcid'].items():
    seen.append(pmcid)
    text = get_abstract(pmcid)
    if text == "":
        # Empty string is treated as "no abstract retrieved".
        continue
    df.at[index, 'abstract'] = text
    count += 1
print(f'{count} abstracts retrieved out of {total} different journal articles')


Not available: ('abstractText') for PMC4127427
55 abstracts retrieved out of 116 different journal articles


In [5]:
# Read the OpenAI key from a local text file and expose it both through the
# process environment and through the openai module.
with open('resources/openAI_key.txt', 'r') as f:
    api_key = f.read().strip()
os.environ.update(OPENAI_API_KEY=api_key)
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Free-form prompt template; `{}` is the slot for the abstract text.
# NOTE(review): the later cells shown here use `query`, not `prompt`.

prompt = """Scan the following abstract describing the use of animal models to investigate nanomaterial or nanoparticle biodistribution in organs: \n{} \n\nAnswer the questions:
- Which nanomaterial is assessed?
- Experimental conditions?
Return answer as a list
"""


In [10]:
# Model identifiers used when calling the OpenAI API.
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

# Abstract stored at row position 330, used as the sample input for the
# prompt experiments in the next cells.
text_test = df['abstract'].iloc[330]
text_test


'A systematic study of in vitro and in vivo behavior of biodegradable mesoporous silica nanoparticles (bMSNs), designed to carry multiple cargos (both small and macromolecular drugs) and subsequently self-destruct following release of their payloads, is presented. Complete degradation of bMSNs is seen within 21 d of incubation in simulated body fluid. The as-synthesized bMSNs are intrinsically radiolabeled with oxophilic zirconium-89 (<sup>89</sup>Zr, <i>t</i><sub>1/2</sub> = 78.4 h) radionuclide to track their in vivo pharmacokinetics via positron emission tomography imaging. Rapid and persistent CD105 specific tumor vasculature targeting is successfully demonstrated in murine model of metastatic breast cancer by using TRC105 (an anti-CD105 antibody)-conjugated bMSNs. This study serves to illustrate a simple, versatile, and readily tunable approach to potentially overcome the current challenges facing nanomedicine and further the goals of personalized nanotheranostics.'

# Test 1:

A systematic study of in vitro and in vivo behavior of biodegradable mesoporous silica nanoparticles (bMSNs), designed to carry multiple cargos (both small and macromolecular drugs) and subsequently self-destruct following release of their payloads, is presented. Complete degradation of bMSNs is seen within 21 d of incubation in simulated body fluid. The as-synthesized bMSNs are intrinsically radiolabeled with oxophilic zirconium-89 (<sup>89</sup>Zr, <i>t</i><sub>1/2</sub> = 78.4 h) radionuclide to track their in vivo pharmacokinetics via positron emission tomography imaging. Rapid and persistent CD105 specific tumor vasculature targeting is successfully demonstrated in murine model of metastatic breast cancer by using TRC105 (an anti-CD105 antibody)-conjugated bMSNs. This study serves to illustrate a simple, versatile, and readily tunable approach to potentially overcome the current challenges facing nanomedicine and further the goals of personalized nanotheranostics.

In [18]:
# Prompt template passed to question_answer(): the single `{}` placeholder is
# filled with the abstract via str.format, and the model is asked to complete
# the "key: value" lines listed below. question_answer() parses the reply back
# into a dict keyed by the text before the colon, so downstream lookups
# depend on these key spellings.
query = """Scan the following scientific article abstract describing the use of animal models to investigate nanomaterial or nanoparticle biodistribution in organs to fill up the following key-value pairs.

Abstract:
\"\"\"
{}
\"\"\"

Key-value pairs:

assessed nanomaterial:
commercial or synthesized:
labelling used for the in vivo assay:
instrumental equipment used for measurements for the in vivo assay:
animal model used: 
route of administration of nanomaterial:
age/sex of the animal model:
fate of the nanomaterial observed:
"""


def _parse_key_values(response_text):
    """Turn a block of ``key: value`` lines into a dict; lines without a colon are skipped."""
    values = {}
    for raw_line in response_text.strip().split("\n"):
        line = raw_line.strip()
        if ":" not in line:
            continue
        if ": " in line:
            # Split only on the FIRST separator so values that themselves
            # contain ": " (e.g. "half-life: 78.4 h") stay intact.
            key, value = line.split(": ", 1)
        else:
            # Bare trailing colon ("key:") or "key:value" without a space.
            key, _, value = line.partition(":")
        key = key.strip()
        if key:
            values[key] = value.strip()
    return values


def question_answer(query, abstract, model, temperature):
    """Ask a chat model to fill in the key-value template for one abstract.

    Parameters
    ----------
    query : str
        Prompt template with a single ``{}`` placeholder; it is filled with
        ``abstract`` through ``str.format``.
    abstract : str
        Abstract text inserted into the template.
    model : str
        Model name forwarded to ``openai.ChatCompletion.create``.
    temperature : float
        Sampling temperature forwarded to the API call.

    Returns
    -------
    dict
        ``{'values': {...}, 'response': ...}`` where ``values`` maps each
        parsed key to its (stripped) value and ``response`` is the object
        returned by the API call. A key with nothing after the colon maps
        to an empty string.

    Notes
    -----
    Exceptions raised by the API call are not caught here.
    """
    prompt_text = query.format(abstract)
    response = openai.ChatCompletion.create(
        messages=[
            {'role': 'system', 'content': 'You answer questions about the abstract of a journal article.'},
            {'role': 'user', 'content': prompt_text},
        ],
        model=model,
        temperature=temperature,
    )

    response_text = response['choices'][0]['message']['content']
    return {
        'values': _parse_key_values(response_text),
        'response': response,
    }

# Run the template once on the sample abstract and pretty-print the parsed
# values together with the raw API response.
test = question_answer(
    query=query,
    abstract=text_test,
    model=GPT_MODEL,
    temperature=0,
)
print(json.dumps(test, indent=4))

{
    "values": {
        "assessed nanomaterial": "biodegradable mesoporous silica nanoparticles (bMSNs)",
        "commercial or synthesized": "synthesized",
        "labelling used for the in vivo assay": "oxophilic zirconium-89 (<sup>89</sup>Zr)",
        "instrumental equipment used for measurements for the in vivo assay": "positron emission tomography (PET) imaging",
        "animal model used": "murine model of metastatic breast cancer",
        "route of administration of nanomaterial": "not specified",
        "age/sex of the animal model": "not specified",
        "fate of the nanomaterial observed": "rapid and persistent CD105 specific tumor vasculature targeting demonstrated in the animal model of metastatic breast cancer. Complete degradation of bMSNs seen within 21 d of incubation in simulated body fluid."
    },
    "response": {
        "id": "chatcmpl-7OjQ96GGxXmFrQ4iE7CmQNrxQJCmF",
        "object": "chat.completion",
        "created": 1686128825,
        "model": "g

In [19]:
# Query the LLM once per article (PMCID) that has an abstract and copy the
# parsed answers onto every row sharing that PMCID.

# Output column -> key expected in the parsed model reply.
LLM_COLUMNS = {
    'nm_llm': 'assessed nanomaterial',
    'commercial_synthesized_llm': 'commercial or synthesized',
    'model_llm': 'animal model used',
    'route_admin_llm': 'route of administration of nanomaterial',
}
MAX_ATTEMPTS = 5        # total tries per article before giving up
BACKOFF_SECONDS = 2     # base delay; doubled after every rate-limited try

seen = set()
for index, row in df.iterrows():
    if pd.isnull(row['abstract']):
        continue
    pmcid = row['pmcid']
    if pmcid in seen:
        continue
    seen.add(pmcid)

    # The API intermittently answers with RateLimitError ("model is currently
    # overloaded"); retry with exponential backoff instead of aborting the
    # whole run, and re-raise only once the attempts are exhausted.
    for attempt in range(MAX_ATTEMPTS):
        try:
            answer = question_answer(query, row['abstract'], GPT_MODEL, 0)['values']
            break
        except openai.error.RateLimitError:
            if attempt == MAX_ATTEMPTS - 1:
                raise
            time.sleep(BACKOFF_SECONDS * 2 ** attempt)

    condition = df['pmcid'] == pmcid
    for column, key in LLM_COLUMNS.items():
        # .get: the model may omit a key; store a missing value rather than
        # stopping the loop with a KeyError.
        df.loc[condition, column] = answer.get(key)
            


RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID d2d748360d1d44d24d818dea54b35bbc in your message.)

In [32]:
# Print the distinct LLM answers for each extracted field, one bullet per
# value. Values are sorted so the listing is stable between runs.
summary_fields = [
    ("NMs identified:", 'nm_llm'),
    ("commercial/synthesized?", 'commercial_synthesized_llm'),
    ("animal model identified:", 'model_llm'),
    ("How was the NM administered:", 'route_admin_llm'),
]
for heading, column in summary_fields:
    print(heading)
    for value in sorted(set(df[column].dropna())):
        print(f"\t-{value}")

NMs identified:
	-
mesoporous silica nanoparticles (MSNs)
	-Hollow mesoporous silica nanoparticle (HMSN)
	-double-PEGylated biocompatible reduced graphene oxide nanosheets anchored with iron oxide nanoparticles (RGO-IONP-(1st)PEG-(2nd)PEG)
	-Au-IO hetero-nanostructures (Au-IONPs)
	-nano-graphene oxide (GO) sheets
	-Mesoporous silica nanoparticles (MSN)
	-graphene oxide (GO) nanoconjugates
	-biocompatible mesoporous silica (mSiO2) nanoparticles
	-nanographene
	-(64)Cu-doped AuNCs
	-gold nanoparticles (AuNPs)
	-nano-graphene oxide (GO) conjugated to a monoclonal antibody (mAb) against follicle-stimulating hormone receptor (FSHR)
	-Gold nanorods (AuNR)
	-Gold nanorod (GNR)-based nanoplatform
	-Gold nanoparticles doped with (199)Au atoms
	-Gold nanorods
	-Au-tripods
	-Polyethylene glycol (PEG)-coated gold nanoparticles (AuNPs)
	-multimodal silica nanoparticle
	-gold surface-enhanced Raman scattering (SERS) nanoparticles
	-Au nanostructures
	-PEGylated reduced graphene oxide - iron oxide na

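Using the nanomaterial (NM) type as a benchmark for accuracy: for each article, compare the `Name` value from the table with the nanomaterial extracted by the LLM (`nm_llm`).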

In [38]:
# For each article with an abstract, print its PMCID, the 'Name' value from
# the table and the nanomaterial extracted by the LLM, once per PMCID.
seen = []
count = 0
total = len(set(df['abstract']))
for index, row in df.iterrows():
    if pd.isnull(row['abstract']):
        continue
    abstract, pmcid = row['abstract'], row['pmcid']
    if pmcid in seen:
        continue
    seen.append(pmcid)
    fields = (str(row['pmcid']), ' | ', str(row['Name']), " -----> " + str(row['nm_llm']))
    print(*fields)

PMC4437573  |  Peptide Au NP 5nm  -----> epidermal growth factor peptide-targeted gold nanoparticles (EGF<sub>pep</sub>-Au NPs)
PMC4180787  |  64Cu-doped AuNCs 2.5nm  -----> (64)Cu-doped AuNCs
PMC3985880  |  Gold tripods <20nm  -----> Au-tripods
PMC4358630  |  Gold nanospheres 56.8nm  -----> Au nanostructures
PMC3404261  |  64Cu-DOTA-PEGAuNCs (55 nm)  -----> Gold nanocages
PMC3211348  |  43 nm AuNP-PEG5000 (Kennedy et al. 2011)  -----> gold nanoparticles (AuNPs)
PMC3379889  |  50nm Au PEG  -----> Spherical and rod-shaped gold nanoparticles with surface poly(ethylene glycol) (PEG) chains
PMC3563754  |  64Cu-NOTA-Au-IONP-Affibody 24.4nm  -----> Au-IO hetero-nanostructures (Au-IONPs)
PMC4151626  |  SERS nanoparticles Gold 120nm  -----> gold surface-enhanced Raman scattering (SERS) nanoparticles
PMC2745599  |  20-nm AuNPs coated with PEG5000-TA  -----> Polyethylene glycol (PEG)-coated gold nanoparticles (AuNPs)
PMC4836969  |   5 nm 199Au-AuNP-PEG  -----> Gold nanoparticles doped with (199)