In [101]:
from collections import defaultdict
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from rdkit import Chem
from tqdm import tqdm

In [39]:
# Load data
# pathlib never expands "~" on its own, so resolve the home directory explicitly;
# this keeps data_dir usable by anything that does not expand it implicitly
# (e.g. Path.exists(), open()).
data_dir = Path("~/Downloads/uniTox Ratings").expanduser()

# One ratings CSV per toxicity type.
cardio = pd.read_csv(data_dir / "Friday_DICT_direct_4o_cardiotoxicity_comprehensive.csv")
derm = pd.read_csv(data_dir / "Thursday_Derm_direct_4o_dermtoxicity_comprehensive.csv")
hema = pd.read_csv(data_dir / "Thursday_Hema_direct_4o_hematoxicity_comprehensive.csv")
inf = pd.read_csv(data_dir / "Thursday_Inf_direct_4o_inftoxicity_comprehensive.csv")
liver = pd.read_csv(data_dir / "Thursday_DILI_direct_4o_livertoxicity_comprehensive.csv")
oto = pd.read_csv(data_dir / "Thursday_Oto_direct_4o_ototoxicity_comprehensive.csv")
pulm = pd.read_csv(data_dir / "Thursday_Pulm_direct_4o_pulmtoxicity_comprehensive.csv")
renal = pd.read_csv(data_dir / "Thursday_DIRIL_direct_4o_renaltoxicity_comprehensive.csv")

In [40]:
# Relabel datasets
# The relabelled frames are built as new objects instead of mutating the raw
# frames in place, so this cell can be re-run without a KeyError from dropping
# columns that an earlier run already removed.
raw_datasets = {
    "cardio_toxicity": cardio,
    "dermatologic_toxicity": derm,
    "hematotoxicity": hema,
    "infertility": inf,
    "liver_toxicity": liver,
    "ototoxicity": oto,
    "pulmonary_toxicity": pulm,
    "renal_toxicity": renal,
}

# Columns that are not carried into the merged table.
columns_to_drop = ["Unnamed: 0", "initial_prompts", "no_less_most_prompts", "yes_no_prompts", "urls"]

name_to_dataset = {}
for name, raw_dataset in raw_datasets.items():
    # Prefix the shared column names with the toxicity name so they stay
    # distinct after the merge below.
    relabeled = raw_dataset.drop(columns=columns_to_drop).rename(columns={
        "reasoning": f"{name}_reasoning",
        "ternary_rating": f"{name}_ternary_rating",
        "binary_rating": f"{name}_binary_rating",
    })

    # Numeric encodings: 1 for the positive label, 0 for "No", NaN for anything else
    # (Series.map yields NaN for values missing from the mapping).
    # NOTE(review): this reads `{name}_confident_ternary_rating`, which the rename above
    # does not create; presumably the input CSVs already contain that column — confirm,
    # otherwise this should probably read `{name}_ternary_rating`.
    relabeled[f"{name}_confident_ternary_rating_0_1"] = relabeled[f"{name}_confident_ternary_rating"].map({"Most": 1, "No": 0})
    relabeled[f"{name}_binary_rating_0_1"] = relabeled[f"{name}_binary_rating"].map({"Yes": 1, "No": 0})

    name_to_dataset[name] = relabeled

# Merge datasets on generic name
# DataFrame.merge defaults to an inner join, so only generic names present in
# every dataset survive.
datasets = list(name_to_dataset.values())
data = datasets[0]
for dataset in datasets[1:]:
    data = data.merge(dataset, on="generic_name")

In [42]:
# Write the merged ratings table to uniTox.csv in the data directory, without the index column.
data.to_csv(data_dir.joinpath("uniTox.csv"), index=False)

In [106]:
# Get SMILES
def get_smiles_from_pubchem(drug_name: str) -> str | None:
    drug_name = drug_name.lower()

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drug_name}/property/CanonicalSMILES,Title/JSON"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()

        all_smiles = [Chem.MolToSmiles(Chem.MolFromSmiles(prop['CanonicalSMILES'])) for prop in data['PropertyTable']['Properties']]
        titles = [prop['Title'].lower() for prop in data['PropertyTable']['Properties']]

        title_to_smiles = defaultdict(list)
        for smiles, title in zip(all_smiles, titles):
            title_to_smiles[title].append(smiles)

        if len(all_smiles) == 0:
            raise ValueError(f"No SMILES found for {drug_name}")

        selected_smiles = title_to_smiles.get(drug_name, all_smiles)

        return "|".join(selected_smiles)
    else:
        return None

In [107]:
# Look up every generic name in order, with a tqdm progress bar over the column.
smiles = list(map(get_smiles_from_pubchem, tqdm(data["generic_name"])))

100%|██████████| 2418/2418 [22:14<00:00,  1.81it/s]


In [109]:
# Number of lookups that produced no result (None entries).
print(smiles.count(None))
# Number of lookups whose result holds more than one "|"-separated SMILES.
print(sum(1 for s in smiles if s is not None and "|" in s))

499
34


In [113]:
# Store the lookup result for each row: a "|"-joined SMILES string, or None
# where the lookup returned nothing.
data["all_smiles"] = smiles

In [114]:
# Keep only the first SMILES of each "|"-joined entry. The vectorised .str
# accessor propagates missing values, so this works whether a failed lookup is
# stored as None or as NaN (e.g. after reloading the column from CSV); missing
# entries come out as NaN. A single-character pattern is split literally.
data["smiles"] = data["all_smiles"].str.split("|").str[0]

In [115]:
# Write uniTox.csv again, now including the all_smiles and smiles columns.
data.to_csv(data_dir.joinpath("uniTox.csv"), index=False)

In [116]:
# Keep only the rows whose "smiles" value is present.
data_with_smiles = data[data["smiles"].notna()]

In [117]:
# Write the SMILES-only subset to its own CSV, without the index column.
data_with_smiles.to_csv(data_dir.joinpath("uniTox_smiles.csv"), index=False)

In [119]:
# Number of rows that have a SMILES string.
data_with_smiles.shape[0]

1919