In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from jazzy.api import molecular_vector_from_smiles as mol_vect
from sklearn.preprocessing import MinMaxScaler
from tdc.single_pred import ADME

In [2]:
# short names of the TDC-ADME benchmarks this notebook works with
tdc_benchmarks = [
    "obach",
    "microsome",
    "hepatocyte",
]
# benchmark short name -> dataset split; starts empty
tdc_datasets = dict()

In [3]:
# Fetch (or load the local copy of) each TDC-ADME benchmark, take its
# default split via get_split(), and register that split in tdc_datasets
# under the benchmark's short name.

# Obach dataset
obach = ADME(name="Half_Life_Obach")
obach_split = tdc_datasets["obach"] = obach.get_split()

# AZ microsome clearance dataset
microsome = ADME(name="Clearance_Microsome_AZ")
microsome_split = tdc_datasets["microsome"] = microsome.get_split()

# AZ hepatocyte clearance dataset
hepatocyte = ADME(name="Clearance_Hepatocyte_AZ")
hepatocyte_split = tdc_datasets["hepatocyte"] = hepatocyte.get_split()

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


In [5]:
# create csv files with Jazzy features if not already created

# Directory that holds the per-benchmark "<benchmark>_train.csv" / "<benchmark>_test.csv" files.
JAZZY_SPLITS_DIR = "project_resources/jazzy_splits"
# NOTE(review): assumes mol_vect returns a mapping with exactly these six
# entries, in this order -- confirm against the Jazzy API.
JAZZY_FEATURE_COLUMNS = ['sdc', 'sdx', 'sa', 'dga', 'dgp', 'dgtot']


def _collect_split(splits, names):
    """Concatenate the "Drug" and "Y" columns of the named splits.

    Parameters
    ----------
    splits : mapping of split name -> table with "Drug" and "Y" columns
    names : iterable of split names, concatenated in the given order

    Returns
    -------
    (smiles, targets) : two 1-D numpy arrays of equal length
    """
    smiles = np.array([smi for name in names for smi in splits[name]["Drug"]])
    targets = np.array([y for name in names for y in splits[name]["Y"]])
    return smiles, targets


def _jazzy_features(smiles, targets):
    """Compute Jazzy vectors, keeping only molecules with a complete vector.

    A molecule is dropped (together with its target) when mol_vect raises,
    returns an empty/None result, or returns a vector containing NaN.

    Returns
    -------
    (features, kept_targets) : two lists of equal length; each feature is a
        1-D numpy array of the values returned by mol_vect.
    """
    features = []
    kept_targets = []
    for smi, target in zip(smiles, targets):
        try:
            jazzy_fp = mol_vect(smi)
        except Exception:
            # Best-effort featurisation: skip molecules Jazzy cannot handle,
            # but let KeyboardInterrupt / SystemExit propagate.
            continue
        if not jazzy_fp:
            continue
        values = np.array(list(jazzy_fp.values()))
        if np.isnan(values).any():
            continue
        features.append(values)
        kept_targets.append(target)
    return features, kept_targets


def _write_jazzy_csv(csv_path, features, targets):
    """Write one split to csv_path: target column first, then the Jazzy features."""
    df = pd.DataFrame(features, columns=JAZZY_FEATURE_COLUMNS)
    # The target column is called "half-life" for every benchmark.
    df.insert(0, "half-life", targets)
    df.to_csv(csv_path, index=False)
    print(f"{csv_path} was successfully created")


for benchmark in tdc_benchmarks:
    # get the smiles and targets from the dataset; "valid" is merged into train
    splits = tdc_datasets[benchmark]
    train_smiles, train_halflives = _collect_split(splits, ("train", "valid"))
    test_smiles, test_halflives = _collect_split(splits, ("test",))

    # Fit the scaler on the training targets only and reuse it for the test
    # targets, so both splits share one scale. Test values outside the
    # training range therefore map outside [0, 1].
    scaler = MinMaxScaler().fit(train_halflives.reshape(-1, 1))
    train_halflives_scaled = scaler.transform(train_halflives.reshape(-1, 1)).ravel()
    test_halflives_scaled = scaler.transform(test_halflives.reshape(-1, 1)).ravel()

    print(train_smiles.shape, train_halflives_scaled.shape, test_smiles.shape, test_halflives_scaled.shape)

    train_jazzy_csv = f"{JAZZY_SPLITS_DIR}/{benchmark}_train.csv"
    test_jazzy_csv = f"{JAZZY_SPLITS_DIR}/{benchmark}_test.csv"

    # Existing files are treated as a cache and never overwritten. CSVs
    # written before the scaling fix above hold test targets scaled with
    # their own min/max -- delete them to regenerate.
    if Path(train_jazzy_csv).exists() and Path(test_jazzy_csv).exists():
        print(f"both {benchmark}_train.csv and {benchmark}_test.csv already exist in {JAZZY_SPLITS_DIR}")
        continue

    # generate Jazzy features and save to csv files
    train_jazzy_fps, train_jazzy_thalfs = _jazzy_features(train_smiles, train_halflives_scaled)
    test_jazzy_fps, test_jazzy_thalfs = _jazzy_features(test_smiles, test_halflives_scaled)

    print(np.array(train_jazzy_fps).shape, np.array(train_jazzy_thalfs).shape, np.array(test_jazzy_fps).shape, np.array(test_jazzy_thalfs).shape)
    # Report how many molecules were dropped instead of skipping them silently.
    print(
        f"{benchmark}: skipped {len(train_smiles) - len(train_jazzy_fps)} train and "
        f"{len(test_smiles) - len(test_jazzy_fps)} test molecules without a complete Jazzy vector"
    )

    # to_csv does not create missing directories
    Path(JAZZY_SPLITS_DIR).mkdir(parents=True, exist_ok=True)
    _write_jazzy_csv(train_jazzy_csv, train_jazzy_fps, train_jazzy_thalfs)
    _write_jazzy_csv(test_jazzy_csv, test_jazzy_fps, test_jazzy_thalfs)

(534,) (534,) (133,) (133,)
both obach_train.csv and obach_test.csv already exist in project_resources/jazzy_splits/TDC
(882,) (882,) (220,) (220,)
both microsome_train.csv and microsome_test.csv already exist in project_resources/jazzy_splits/TDC
(970,) (970,) (243,) (243,)
both hepatocyte_train.csv and hepatocyte_test.csv already exist in project_resources/jazzy_splits/TDC
