In [4]:
import openai
import time
import os
from scripts.get_abstract_epmc import get_abstract
from scripts.get_full_text_epmc import get_full_text
import pandas as pd
import numpy as np
import re
import json

# Using the OpenAI API for text completion

This notebook uses the OpenAI API to assess the GPT-3.5-turbo model performance for extracting assay-specific data from the abstracts of the available open access journal articles sourced in [Nanoparticle biodistribution coefficients: A quantitative approach for understanding the tissue distribution of nanoparticles](https://doi.org/10.1016/j.addr.2023.114708) (continues [exploration-clean](00_exploration-clean.ipynb)).

In [5]:
df = pd.read_csv("../data/perc_id_g.csv")
tokens = 0 #TODO add up all used tokens * pricing

## Get abstracts
The Europe PMC API is used to retrieve the abstracts for the available Open Access journal articles used to build the dataset.


In [6]:
seen = []
count = 0
total = len(np.unique(df['provided_identifier']))
for index, row in df.iterrows():
    if pd.notnull(row['pmcid']):     
        pmcid = row['pmcid']
        if pmcid not in seen:
            seen.append(pmcid)
            text = get_abstract(pmcid)
            if text != "":
                df.at[index, 'abstract'] = text
                count += 1
print(f'{count} abstracts retrieved out of {total} different journal articles')


Not available: ('abstractText') for PMC4127427
55 abstracts retrieved out of 116 different journal articles


## Set up the OpenAI text completion API

Needed API key:

In [7]:
with open('resources/openAI_key.txt', 'r') as f:
    api_key = f.read().strip()
os.environ['OPENAI_API_KEY'] = api_key
openai.api_key = os.getenv("OPENAI_API_KEY")
# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

## Test

The abstract used for the test belongs to [PMC5102673](https://doi.org/10.1002/advs.201600122). The abstract is:

    A systematic study of in vitro and in vivo behavior of biodegradable mesoporous silica nanoparticles (bMSNs), designed to carry multiple cargos (both small and macromolecular drugs) and subsequently self-destruct following release of their payloads, is presented. Complete degradation of bMSNs is seen within 21 d of incubation in simulated body fluid. The as-synthesized bMSNs are intrinsically radiolabeled with oxophilic zirconium-89 (<sup>89</sup>Zr, <i>t</i><sub>1/2</sub> = 78.4 h) radionuclide to track their in vivo pharmacokinetics via positron emission tomography imaging. Rapid and persistent CD105 specific tumor vasculature targeting is successfully demonstrated in murine model of metastatic breast cancer by using TRC105 (an anti-CD105 antibody)-conjugated bMSNs. This study serves to illustrate a simple, versatile, and readily tunable approach to potentially overcome the current challenges facing nanomedicine and further the goals of personalized nanotheranostics.

In [8]:
text_test = df.iloc[330]['abstract']
id_test = df.iloc[330]['pmcid']
doi_test = df.iloc[330]['doi']
print(id_test, " ", doi_test)
print(text_test)


PMC5102673   10.1002/advs.201600122
A systematic study of in vitro and in vivo behavior of biodegradable mesoporous silica nanoparticles (bMSNs), designed to carry multiple cargos (both small and macromolecular drugs) and subsequently self-destruct following release of their payloads, is presented. Complete degradation of bMSNs is seen within 21 d of incubation in simulated body fluid. The as-synthesized bMSNs are intrinsically radiolabeled with oxophilic zirconium-89 (<sup>89</sup>Zr, <i>t</i><sub>1/2</sub> = 78.4 h) radionuclide to track their in vivo pharmacokinetics via positron emission tomography imaging. Rapid and persistent CD105 specific tumor vasculature targeting is successfully demonstrated in murine model of metastatic breast cancer by using TRC105 (an anti-CD105 antibody)-conjugated bMSNs. This study serves to illustrate a simple, versatile, and readily tunable approach to potentially overcome the current challenges facing nanomedicine and further the goals of personalize

Setting up the query and function used for the API call:

In [9]:
query = """Scan the following scientific article abstract describing the use of animal models to investigate nanomaterial or nanoparticle biodistribution in organs to fill up the following key-value pairs, respecting the formatting:

Abstract:
\"\"\"
{}
\"\"\"

Key-value pairs:

1. assessed nanomaterial:
2. commercial or synthesized:
3. labelling used for the in vivo assay:
4. instrumental equipment used for measurements for the in vivo assay:
5. animal model used: 
6. route of administration of nanomaterial:
7. age/sex of the animal model:
8. fate of the nanomaterial observed: (3 words max)
9. time points included in the in vivo assay
"""


def question_answer(query, text, model, temperature):
    query = query.format(text)
    messages = [
        {'role': 'system', 'content': 'You answer questions about the abstract of a journal article.'},
        {'role': 'user', 'content': query},
    ]
    retries = 3
    wait_time = 30

    while retries > 0:
        try:
            response = openai.ChatCompletion.create(
                messages=messages,
                model=model,
                temperature=temperature
            )
            tokens = int(response['usage']['total_tokens'])
            print(f'Used {tokens} total tokens')
            response_text = response['choices'][0]['message']['content']
            print(response_text)
            dictionary = {'values': {}, 'response': response_text}

            # Split the string by lines
            lines = response_text.strip().split("\n")

            # Iterate over each line
            for line in lines:
                if ":" in line:
                    key, value = line.strip().split(": ")
                    dictionary['values'][key] = value

            return dictionary

        except openai.error.RateLimitError:
            print('RateLimitError: Too many requests. Retrying after {} seconds...'.format(wait_time))
            time.sleep(wait_time)
            retries -= 1

    raise Exception('Max retries exceeded. Request could not be completed.')


test_abstract = question_answer(query, text_test, model=GPT_MODEL, temperature=0)
print(json.dumps(test_abstract, indent = 4))

RateLimitError: Too many requests. Retrying after 30 seconds...
Used 539 total tokens
1. assessed nanomaterial: biodegradable mesoporous silica nanoparticles (bMSNs)
2. commercial or synthesized: synthesized
3. labelling used for the in vivo assay: oxophilic zirconium-89 (<sup>89</sup>Zr)
4. instrumental equipment used for measurements for the in vivo assay: positron emission tomography (PET) imaging
5. animal model used: murine model of metastatic breast cancer
6. route of administration of nanomaterial: not specified
7. age/sex of the animal model: not specified
8. fate of the nanomaterial observed: complete degradation
9. time points included in the in vivo assay: not specified, but degradation observed within 21 days
{
    "values": {
        "1. assessed nanomaterial": "biodegradable mesoporous silica nanoparticles (bMSNs)",
        "2. commercial or synthesized": "synthesized",
        "3. labelling used for the in vivo assay": "oxophilic zirconium-89 (<sup>89</sup>Zr)",
        

Now performing the request for all available open access abstracts:

In [10]:
seen = []
count = 0
total = len(set(df['abstract']))
for index, row in df.iterrows():
    if pd.notnull(row['abstract']):     
        abstract = row['abstract']
        pmcid = row['pmcid']
        if pmcid not in seen:
            seen.append(pmcid)
            condition = df['pmcid'] == pmcid
            print(f'####{pmcid}####')
            # Map keys back
            answer = question_answer(query, abstract, GPT_MODEL, 0)['values']
            for value in answer:
                # Check if the value matches the regex pattern
                if re.search(r'assessed nanomaterial', value):
                    df.loc[condition, 'nm_llm'] = answer[value]
                if re.search(r'commercial', value):
                    df.loc[condition, 'commercial_synthesized_llm'] = answer[value]
                if re.search(r'animal model', value):
                    df.loc[condition, 'model_llm'] = answer[value]
                if re.search(r'route of administration', value):
                    df.loc[condition, 'route_admin_llm'] = value
                if re.search(r'fate', value):
                    df.loc[condition, 'fate_llm'] = answer[value] 
                if re.search(r'time points', value):
                    df.loc[condition, 'timepoints_llm'] = answer[value]

####PMC4437573####
Used 628 total tokens
1. assessed nanomaterial: gold nanoparticles (Au NPs)
2. commercial or synthesized: not specified
3. labelling used for the in vivo assay: intrinsic fluorescence of photodynamic therapy (PDT) drug Pc 4
4. instrumental equipment used for measurements for the in vivo assay: not specified
5. animal model used: not specified
6. route of administration of nanomaterial: intravenous (IV) injection
7. age/sex of the animal model: not specified
8. fate of the nanomaterial observed: biodistribution impact
9. time points included in the in vivo assay: 4 hours after IV injection
####PMC4180787####
Used 601 total tokens
1. assessed nanomaterial: gold nanocluster ((64)Cu-doped AuNCs)
2. commercial or synthesized: synthesized
3. labelling used for the in vivo assay: PET radionuclide (64)Cu
4. instrumental equipment used for measurements for the in vivo assay: dual-modality positron emission tomography (PET) and near-infrared (NIR) fluorescence imaging
5. anima

In [11]:
print("NMs identified:\n\t-")
print("\n\t-".join(set(df['nm_llm'].dropna())))
print("commercial/synthesized?\n\t-")
print("\n\t-".join(set(df['commercial_synthesized_llm'].dropna())))
print("animal model identified:\n\t-")
print("\n\t-".join(set(df['model_llm'].dropna())))
print("How was the NM administered:\n\t-")
print("\n\t-".join(set(df['route_admin_llm'].dropna())))

NMs identified:
	-
laser-ablated dextran-coated AuNP (AuNPd)
	-PEGylated and heparinized magnetic iron oxide nano-platform (DNPH)
	-gold nanoparticles (Au NPs)
	-polystyrene-b-poly(ethylene oxide) diblock copolymer micelles
	-Gold nanoparticles (AuNPs)
	-nano-graphene oxide (GO)
	-polymeric nanoparticles
	-liposomes
	-biocompatible mesoporous silica (mSiO2) nanoparticles
	-Hollow mesoporous silica nanoparticles (HMSNs)
	-Cur-loaded nanoparticles
	-biodegradable mesoporous silica nanoparticles (bMSNs)
	-gold surface-enhanced Raman scattering (SERS) nanoparticles
	-Gold nanocages
	-reduced graphene oxide (RGO)
	-amorphous silica nanoparticles (SNPs)
	-Reduced graphene oxide nanosheets anchored with iron oxide nanoparticles
	-Gold nanorods
	-ultrasmall porous silica nanoparticles (UPSN)
	-nanographene oxide (GO)
	-ultrasmall porous silica nanoparticles (UPSNs)
	-Nano-graphene oxide (GO) sheets
	-Gold nanorods (AuNR)
	-K237/FA-PEG-PLGA nanoparticles
	-PEGylated liposomes derivatized with v

Overview of results:

Using the NM type as a benchmark for accuracy:

In [12]:
seen = []
count = 0
total = len(set(df['abstract']))
for index, row in df.iterrows():
    if pd.notnull(row['abstract']):     
        abstract = row['abstract']
        pmcid = row['pmcid']
        if pmcid not in seen:
            seen.append(pmcid)
            print(str(row['pmcid']) , ' | ' , str(row['Name']) ," -----> " + str(row['nm_llm']))

PMC4437573  |  Peptide Au NP 5nm  -----> gold nanoparticles (Au NPs)
PMC4180787  |  64Cu-doped AuNCs 2.5nm  -----> gold nanocluster ((64)Cu-doped AuNCs)
PMC3985880  |  Gold tripods <20nm  -----> Au-tripods
PMC4358630  |  Gold nanospheres 56.8nm  -----> Au nanostructures
PMC3404261  |  64Cu-DOTA-PEGAuNCs (55 nm)  -----> Gold nanocages
PMC3211348  |  43 nm AuNP-PEG5000 (Kennedy et al. 2011)  -----> Gold nanoparticles (AuNPs)
PMC3379889  |  50nm Au PEG  -----> gold nanoparticles
PMC3563754  |  64Cu-NOTA-Au-IONP-Affibody 24.4nm  -----> Au-IO hetero-nanostructures (Au-IONPs)
PMC4151626  |  SERS nanoparticles Gold 120nm  -----> gold surface-enhanced Raman scattering (SERS) nanoparticles
PMC2745599  |  20-nm AuNPs coated with PEG5000-TA  -----> Polyethylene glycol (PEG)-coated gold nanoparticles (AuNPs)
PMC4836969  |   5 nm 199Au-AuNP-PEG  -----> Gold nanoparticles doped with (199)Au atoms
PMC3492114  |  PEG-modified gold nanorods AP 5.0 10.6*49.6  -----> Gold nanorods
PMC3425121  |  AuNR - D

Data is saved to [perc_id_g_llm.csv](../data/perc_id_g_llm.csv)

In [13]:
df.to_csv('../data/perc_id_g_llm.csv', index=False)

## Full texts

### Test
Same journal article as before

In [14]:
id_test = df.iloc[330]['pmcid']
text_test = get_full_text(id_test)
condition = df['pmcid'] == id_test
df.loc[condition, 'full_text'] = text_test
len(text_test)

46329

In [15]:
text_test = text_test[0:15000]

Need to get around the [token limit](https://platform.openai.com/docs/models/gpt-4): set up a function that splits the full text into chunks to generate a synthesized version

In [16]:
query_ftext = """Scan the following scientific article abstract describing the use of animal models to investigate nanomaterial or nanoparticle biodistribution in organs to fill up the following key-value pairs, respecting the formatting:

Text:
\"\"\"
{}
\"\"\"

Key-value pairs:

1. assessed nanomaterial:
2. commercial or synthesized:
3. labelling used for the in vivo assay:
4. instrumental equipment used for measurements for the in vivo assay:
5. animal model used: 
6. route of administration of nanomaterial:
7. age/sex of the animal model:
8. fate of the nanomaterial observed: (3 words max)
9. time points included in the in vivo assay
"""

def ftext_tochunks(text, max_tokens = 4096, chars_per_token = 0.75):
    chars = len(text)
    chunk_size = int(chars_per_token * max_tokens)
    chunks = [text[i:i+chunk_size] for i in range(0, chars, chunk_size)]
    print("Text divided into", len(chunks), "chunks")
    return chunks
    
def query_chat_openai(query, text, model, temperature):
    query = query.format(text)
    messages = [
        {'role': 'system', 'content': 'You answer questions about the abstract of a journal article.'},
        {'role': 'user', 'content': query},
    ]
    retries = 3
    wait_time = 30

    while retries > 0:
        try:
            response = openai.ChatCompletion.create(
                messages=messages,
                model=model,
                temperature=temperature
            )

            response_text = response['choices'][0]['message']['content']
            #print(response_text)
            tokens = int(response['usage']['total_tokens'])
            print(f'Used {tokens} total tokens')
            dictionary = {'values': {}, 'response': response_text}

            # Split the string by lines
            lines = response_text.strip().split("\n")

            # Iterate over each line
            for line in lines:
                if ":" in line:
                    key, value = line.strip().split(": ")
                    dictionary['values'][key] = value

            return dictionary, response_text

        except openai.error.RateLimitError:
            print('RateLimitError: Too many requests. Retrying after {} seconds...'.format(wait_time))
            time.sleep(wait_time)
            retries -= 1

    raise Exception('Max retries exceeded. Request could not be completed.')

def long_query_answer(query, text, model, temperature):
    chunks = ftext_tochunks(text)
    query_synthesize = 'Synthesize this excerpt of a scientific journal article into less than {} words. It will be concatenated to other synthesized chunks, so make sure to :\n\"\"\"\n{}\n\"\"\"'
    synthesized = ''
    i = 1
    words = int(4096/len(chunks))
    query_synthesize =query_synthesize.format(words, '{}')
    print(f'Synthesizing into chunks of less than {words} words')
    for chunk in chunks:
        print(f'Synthesizing chunk #{i}')
        i+=1
        synthesized += " " 
        synthesized += query_chat_openai(query_synthesize, chunk, GPT_MODEL, temperature)[1]
    #print(synthesized)
    response = query_chat_openai(query = query, text = synthesized, model=GPT_MODEL, temperature=0)[0]
    return response

test = long_query_answer(query=query_ftext, text=text_test, model=GPT_MODEL, temperature=0)



Text divided into 5 chunks
Synthesizing into chunks of less than 819 words
Synthesizing chunk #1
Used 1147 total tokens
Synthesizing chunk #2
Used 977 total tokens
Synthesizing chunk #3
Used 990 total tokens
Synthesizing chunk #4
Used 1043 total tokens
Synthesizing chunk #5
RateLimitError: Too many requests. Retrying after 30 seconds...
Used 960 total tokens
RateLimitError: Too many requests. Retrying after 30 seconds...
Used 1726 total tokens


In [17]:
print(json.dumps(test, indent = 4))

{
    "values": {
        "1. assessed nanomaterial": "Mesoporous silica nanoparticles (MSNs)",
        "2. commercial or synthesized": "Synthesized",
        "3. labelling used for the in vivo assay": "Chelator-free labeling of oxophilic position emitting isotopes",
        "4. instrumental equipment used for measurements for the in vivo assay": "PET imaging",
        "5. animal model used": "4T1 breast cancer murine models",
        "6. route of administration of nanomaterial": "Not specified",
        "7. age/sex of the animal model": "Not specified",
        "8. fate of the nanomaterial observed": "Self-destructing into small fragments",
        "9. time points included in the in vivo assay": "Not specified"
    },
    "response": "1. assessed nanomaterial: Mesoporous silica nanoparticles (MSNs)\n2. commercial or synthesized: Synthesized\n3. labelling used for the in vivo assay: Chelator-free labeling of oxophilic position emitting isotopes\n4. instrumental equipment used for measu

In [18]:
for i in range(len(test['values'])):
    key = list(test['values'].keys())[i]
    abstract  = test_abstract['values'][key]
    full_text = test['values'][key]
    print(f"{key}\n\tABSTRACT: {abstract}\n\tFULL TEXT: {full_text}")

1. assessed nanomaterial
	ABSTRACT: biodegradable mesoporous silica nanoparticles (bMSNs)
	FULL TEXT: Mesoporous silica nanoparticles (MSNs)
2. commercial or synthesized
	ABSTRACT: synthesized
	FULL TEXT: Synthesized
3. labelling used for the in vivo assay
	ABSTRACT: oxophilic zirconium-89 (<sup>89</sup>Zr)
	FULL TEXT: Chelator-free labeling of oxophilic position emitting isotopes
4. instrumental equipment used for measurements for the in vivo assay
	ABSTRACT: positron emission tomography (PET) imaging
	FULL TEXT: PET imaging
5. animal model used
	ABSTRACT: murine model of metastatic breast cancer
	FULL TEXT: 4T1 breast cancer murine models
6. route of administration of nanomaterial
	ABSTRACT: not specified
	FULL TEXT: Not specified
7. age/sex of the animal model
	ABSTRACT: not specified
	FULL TEXT: Not specified
8. fate of the nanomaterial observed
	ABSTRACT: complete degradation
	FULL TEXT: Self-destructing into small fragments
9. time points included in the in vivo assay
	ABSTRACT: 