In [1]:
from scripts.excel_imports import append_sheets_with_organ_column
from scripts.get_full_text_epmc import id_convert, get_identifier_type
import csv
import re
import numpy as np

# Exploration

This notebook explores the content of the supplementary materials to [Nanoparticle biodistribution coefficients: A quantitative approach for understanding the tissue distribution of nanoparticles](https://doi.org/10.1016/j.addr.2023.114708).


## Downloading the data

The data is available as a supplementary material from the article text: https://ars.els-cdn.com/content/image/1-s2.0-S0169409X23000236-mmc1.xlsx

In [2]:
%%bash

if [ ! -f "../data/perc_id_g_organ.xlsx" ]; then
  curl https://ars.els-cdn.com/content/image/1-s2.0-S0169409X23000236-mmc1.xlsx --output ../data/perc_id_g_organ.xlsx
    echo "xlsx data successfully downloaded and saved to ../data/perc_id_g_organ.xlsx."
    else
        echo "Data (../data/perc_id_g_organ.xlsx) already exists"
fi


Data (../data/perc_id_g_organ.xlsx) already exists


In [3]:
file_path = '../data/perc_id_g_organ.xlsx'
df = append_sheets_with_organ_column(file_path)
df.describe(include='object')


Column 'ID': 0 NaN values; datatype: int64
Column 'Time_h': 0 NaN values; datatype: float64
Column 'perc_ID_g': 17 NaN values; datatype: float64
Column 'Species': 0 NaN values; datatype: object
Column 'Age/weight': 994 NaN values; datatype: float64
Column 'Strain': 0 NaN values; datatype: object
Column 'Organ': 0 NaN values; datatype: object
Column 'Size_nm': 12 NaN values; datatype: float64
Column 'Analysis method': 0 NaN values; datatype: object
Column 'NP_Type': 0 NaN values; datatype: object
Column 'NP_Shape': 0 NaN values; datatype: object
Column 'Ligand': 360 NaN values; datatype: object
Column 'Charge': 2576 NaN values; datatype: object
Column 'PEG cover': 56 NaN values; datatype: object
Column 'PMID': 0 NaN values; datatype: object
Column 'Name': 3720 NaN values; datatype: object
Column 'Charge ': 5651 NaN values; datatype: object
Number of rows with at least one NaN value: 5703/5703
Organs: Muscle, Brain, Tail, Stomach, Liver, Spleen, Heart, Blood, Kidney, Intestine, Lung, Pla

Unnamed: 0,Species,Strain,Organ,Analysis method,NP_Type,NP_Shape,Ligand,Charge,PEG cover,PMID,Name,Charge.1
count,5703,5703,5703,5703,5703,5703,5343,3127,5647,5703,1983,52
unique,1,17,16,29,9,18,44,9,24,116,235,4
top,Mouse,Balb/c mice,Blood,PET,Gold,Nanoparticle,No,Neutral,5000,17962085,SPIO,Neutral
freq,5703,2357,809,962,1986,4194,3095,1731,2234,220,36,28


The provided 'pmids' (d'f['PMID']) are not only PMIDs. Also, to request full text from ePMC, we need the PMC identifier (if it exists)

In [4]:
df.rename(columns={'PMID': 'provided_identifier'}, inplace=True)
df['provided_identifier_type'] = df['provided_identifier'].apply(get_identifier_type)

In [5]:
seen = []
for index, row in df.iterrows():
    id = row['provided_identifier']
    if id not in seen:
        seen.append(id)
        converted = id_convert(id)
        if converted != False:
            pmcid = converted[0]
            doi = converted[1]
            df.at[index, 'pmcid'] = pmcid
            df.at[index, 'doi'] = doi


## Mappings
Retrieving the list of terms to map:

(TODO)

In [7]:

columns_to_exclude = ['index', 'perc_ID_g', 'ID', 'Time_h', 'Age/weight', 'Size_nm', 'PMID', 'doi', 'url', 'pmcid']

file_path = "../data/mappings.csv"
column_names = ['column', 'term', 'mapping']

with open(file_path, "w", newline='') as file:
    writer = csv.writer(file)
    writer.writerow(column_names)
    for column in df.columns:
        writer.writerow(["", column, ""])
    total = len(list(df.columns))
    for column in df.columns:
        if column not in columns_to_exclude:
            unique_terms = df[column].unique()
            for term in unique_terms:
                total +=1
                writer.writerow([column, term, ""])
print(f"Total terms to map: {total}")

Total terms to map: 550


In [8]:
df.to_csv("../data/perc_id_g.csv", index=False)

[01_text_processing.ipynb](01_text_processing.ipynb) uses OpenAI models to try to retrieve more information for those articles with available (abstracts? full texts?)