In [1]:
from tqdm import tqdm
import requests
import pandas as pd
import numpy as np

In [2]:
# Paths of the per-assay CSV files, one per assay name listed below.
# Each name is expanded to "data/<name>.csv".
assays = [
    f"data/{assay}.csv"
    for assay in (
        "ATG_PPARg_TRANS_dn",
        "ATG_PPARg_TRANS_up",
        "TOX21_PPARg_BLA_antagonist_ratio",
        "TOX21_PPARg_BLA_Agonist_ratio",
        "NVS_NR_hPPARg",
        "ERF_ENZ_hCYP19A1_dn",
        "NVS_ADME_hCYP19A1",
        "NVS_ADME_hCYP19A1_Activator",
        "TOX21_Aromatase_Inhibition",
        "CEETOX_H295R_ESTRADIOL_dn",
        "CEETOX_H295R_ESTRADIOL_noMTC_dn",
        "CEETOX_H295R_ESTRADIOL_noMTC_up",
        "CEETOX_H295R_ESTRADIOL_up",
        "CEETOX_H295R_ESTRONE_dn",
        "CEETOX_H295R_ESTRONE_noMTC_dn",
        "CEETOX_H295R_ESTRONE_noMTC_up",
        "CEETOX_H295R_ESTRONE_up",
    )
]

# The processing cell iterates over `to_process`; here it is the same list
# object as `assays` (an alias, not a copy).
to_process = assays

In [5]:
# Chemical-list CSV files to run through the processing cell.
# Running this cell re-points `to_process` at this list, replacing whatever an
# earlier cell assigned; both names refer to the same list object.
to_process = test_data = [
    "skeec/Chemical List EDSP21LIST2-2021-12-28.csv",
    "skeec/Chemical List EDSPUOC-2022-06-16.csv",
    "skeec/Chemical List OSHA-2021-12-28.csv",
    "skeec/Chemical List UBAPMT-2021-12-28.csv",
    "skeec/Chemical List EDSP21LIST1-2021-12-28.csv",
]

In [6]:
# Memo of CAS number -> SMILES for successful lookups. It lives outside the
# loop so a CAS number that appears in several files is fetched only once.
cache = dict()

# Seconds to wait for the lookup service before a request is abandoned.
# Without a timeout a stalled connection would hang the notebook indefinitely.
LOOKUP_TIMEOUT_S = 30


def fix_smiles(row):
    """Fill in ``row["smiles"]`` from a CAS-number lookup when it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` for one DataFrame row)
        Must provide the keys ``"smiles"`` and ``"casrn"``.

    Returns
    -------
    The same ``row`` object. ``row["smiles"]`` is overwritten only when it was
    missing and a SMILES string was found (in ``cache`` or via the web
    service); otherwise the row is returned untouched.

    Raises
    ------
    requests.RequestException
        Network failures and timeouts are deliberately not swallowed: the
        caller drops unresolved rows and overwrites its input file, so a
        transient outage should abort the run rather than silently lose rows.
    """
    # Nothing to do if a SMILES is already present, and nothing to look up if
    # there is no CAS number (str(NaN) would otherwise be queried as "nan").
    if not pd.isna(row["smiles"]) or pd.isna(row["casrn"]):
        return row

    cas = str(row["casrn"]).strip()

    if cas in cache:
        smiles = cache[cas]
        print(cas, "--(cache)-->", smiles)
        row["smiles"] = smiles
        return row

    url = f"https://chem.nlm.nih.gov/api/data/number/startswith/{cas}?data=smiles"
    resp = requests.get(url, timeout=LOOKUP_TIMEOUT_S)
    if resp.status_code != 200:
        print("ERROR for cas:", cas)
        return row

    # Guard against an empty or absent result list instead of indexing [0]
    # blindly, which would raise and abort the whole run.
    results = resp.json().get("results") or []
    if results and "structureDetails" in results[0]:
        smiles = results[0]["structureDetails"]["s"]
        print(cas, "->", smiles)
        row["smiles"] = smiles
        cache[cas] = smiles
    else:
        print("no SMILES found for cas:", cas)
    return row


for filename in to_process:

    df = pd.read_csv(filename)

    print(filename)

    print("before cas/smile check", len(df))
    # Rows with no SMILES and a placeholder "NOCAS..." identifier cannot be
    # resolved, so they are removed up front. na=False makes a missing casrn
    # count as "not NOCAS" explicitly instead of leaking NaN into the mask.
    unresolvable = df["smiles"].isna() & df["casrn"].str.startswith("NOCAS", na=False)
    df = df.loc[~unresolvable].copy()
    print("after cas/smile check", len(df))

    missing = df["smiles"].isna()
    print("remaining smiles nulls", missing.sum())

    # Only rows that actually lack a SMILES are visited; the common case of a
    # fully populated file skips the row-wise pass entirely.
    if missing.any():
        # Cast to object so each row Series (and the target column) can hold a
        # string even when the values read from the CSV were all-NaN floats.
        filled = df.loc[missing].astype(object).apply(fix_smiles, axis=1)
        df["smiles"] = df["smiles"].astype(object)
        df.loc[missing, "smiles"] = filled["smiles"]

    print("final smiles nulls", df["smiles"].isna().sum())

    # NOTE(review): rows whose lookup did not yield a SMILES are dropped here
    # and the input file is then overwritten in place, so those rows are gone
    # from disk after this cell runs. Consider writing to a separate output
    # path if the original lists need to be preserved.
    df = df.dropna(subset=["smiles"])

    print("final length", len(df))

    df.to_csv(filename, index=False)

    print("saved", filename)

skeec/Chemical List EDSP21LIST2-2021-12-28.csv
before cas/smile check 106
after cas/smile check 106
remaining smiles nulls 0
final smiles nulls 0
final length 106
saved skeec/Chemical List EDSP21LIST2-2021-12-28.csv
skeec/Chemical List EDSPUOC-2022-06-16.csv
before cas/smile check 8437
after cas/smile check 8437
remaining smiles nulls 0
final smiles nulls 0
final length 8437
saved skeec/Chemical List EDSPUOC-2022-06-16.csv
skeec/Chemical List OSHA-2021-12-28.csv
before cas/smile check 811
after cas/smile check 811
remaining smiles nulls 0
final smiles nulls 0
final length 811
saved skeec/Chemical List OSHA-2021-12-28.csv
skeec/Chemical List UBAPMT-2021-12-28.csv
before cas/smile check 297
after cas/smile check 297
remaining smiles nulls 0
final smiles nulls 0
final length 297
saved skeec/Chemical List UBAPMT-2021-12-28.csv
skeec/Chemical List EDSP21LIST1-2021-12-28.csv
before cas/smile check 67
after cas/smile check 67
remaining smiles nulls 0
final smiles nulls 0
final length 67
saved