In [4]:
# isozyme.csv ... three columns in this order: index of mol, smiles, half-life

In [5]:
# isozyme_split.csv ... same three columns, but the mols and their indexes are shuffled
# after using ScaffoldSplitter from deepchem on the list of smiles or train_test_split from sklearn

In [6]:
# orig 3A4 csv file from ChEMBL -> only mols w/ "Standard Type" == "T1/2" (must be uppercase*)
# and only "Standard Value" column -> 3A4.csv w/ 70 mols
# * ... mols w/ "t1/2" don't have a "Standard Value"

In [7]:
# orig RLM csv file from PubChem -> remove invalid smiles (how?*) -> RLM.csv w/ 2524 mols
# * ... one way would be to convert them to jazzy molecule features
# and remove mols with empty dict

In [8]:
# orig HLC csv file from PubChem -> convert ">30" to 30 -> HLC.csv w/ 189 mols

In [19]:
import sys
import numpy as np
import pandas as pd
import deepchem as dc
from tdc.single_pred import ADME
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from jazzy.api import molecular_vector_from_smiles as mol_vect
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import isz_csv_data_formatting, split_csv_data_formatting

In [13]:
# datasets handled by this notebook
isozymes = ["3A4", "RLM", "HLC"]
tdc_benchmarks = ["obach", "microsome", "hepatocyte"]

# raw export file and its column separator for every isozyme
_raw_exports = {
    "3A4": (r"project_resources/ChEMBL_3A4.csv", ";"),
    "RLM": (r"project_resources/AID_1508591_datatable_all.csv", ","),
    "HLC": (r"project_resources/AID_1508603_datatable_all.csv", ","),
}
# key suffix -> directory name of the split method
_split_dirs = {"scaff": "scaffold_splitter", "rand": "random"}

# rel_paths keys per isozyme:
#   "<isz>_source", "<isz>_sep", "<isz>", "<isz>_{train,test}_{scaff,rand}"
# NOTE(review): the save cells of this notebook report files under
# project_resources/base_splits/..., while the split entries here point to
# project_resources/data_splits/... -- confirm which location is current.
rel_paths = {}
for _isz, (_raw_csv, _sep) in _raw_exports.items():
    rel_paths[f"{_isz}_source"] = _raw_csv
    rel_paths[f"{_isz}_sep"] = _sep
    rel_paths[_isz] = f"project_resources/{_isz}.csv"
    for _tag, _dir in _split_dirs.items():
        for _part in ("train", "test"):
            rel_paths[f"{_isz}_{_part}_{_tag}"] = f"project_resources/data_splits/{_dir}/{_isz}_{_part}.csv"

# per-isozyme containers filled by the cells below
smiles = {}
halflives = {}
years = {}
smiles_as_index = {}  # structure: {isozyme: {"smi": (idx, halflife, published),...}}
sorted_smiles = {}
smi_as_idx_time_sorted = {}  # same structure as smiles_as_index, entries inserted in order of publication year

# train/test splits, structure: {isozyme: {"train": [...], "test": [...]}}
years_split = {}
rand_split_smiles = {}
scaff_split_smiles = {}
time_split_smiles = {}

# TDC benchmark splits keyed by benchmark name
tdc_datasets = {}

In [3]:
# run the project's csv formatter on every raw export
# (it is given the source path, the isozyme name and the column separator)
for isozyme in isozymes:
    source_csv = rel_paths[f"{isozyme}_source"]
    separator = rel_paths[f"{isozyme}_sep"]
    isz_csv_data_formatting(source_csv, isozyme, sep=separator)

3A4.csv already exists in dir
RLM.csv already exists in dir
HLC.csv already exists in dir


In [4]:
# load every isozyme csv into the per-isozyme containers:
#   smiles / halflives / years ... plain lists in file order
#   smiles_as_index            ... {smiles: (mol_idx, half-life, published)}
for isozyme in isozymes:
    df = pd.read_csv(rel_paths[isozyme])
    smiles[isozyme] = df["smiles"].tolist()
    halflives[isozyme] = df["half-life"].tolist()
    years[isozyme] = df["published"].tolist()
    # a SMILES that occurs more than once keeps the values of its last row
    smiles_as_index[isozyme] = {
        smi: (idx, val, year)
        for idx, smi, val, year in zip(
            df["mol_idx"].tolist(), smiles[isozyme], halflives[isozyme], years[isozyme]
        )
    }
    # preview: first five smiles, first five half-lives and the record of the first molecule
    first_smiles = smiles[isozyme][0]
    print(f"{smiles[isozyme][:5]}\n{halflives[isozyme][:5]}\n{smiles_as_index[isozyme][first_smiles]}")

['COc1ccc2[nH]cc(CCNC(C)=O)c2c1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3nnc(Cc4ccc(F)cc4F)cc32)[C@@H](CN2Cc3c(F)cccc3C2=O)CN1', 'O=c1[nH]c2ccccc2n1C1CCN(CCCC(c2ccc(F)cc2)c2ccc(F)cc2)CC1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3ncc([C@@H](O)c4ccc(F)cc4)cc32)[C@@H](CN2[C@H](C)COC[C@H]2C)CN1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3ncc(Cc4ccc(F)cc4)cc32)[C@@H](CN2Cc3ccccc3C2=O)CN1']
[0.0550250664243987, 0.1940522960504559, 0.0069483598138766, 0.0772707572389795, 0.1245136673422085]
(1, 0.0550250664243987, 2004)
['COc1cc2c3cc1Oc1c(O)c(OC)cc4c1[C@@H](Cc1ccc(cc1)Oc1cc(ccc1O)C[C@H]3N(C)CC2)N(C)CC4', 'O=C(NCCc1ccccn1)C1CC(=O)N(c2n[nH]c3cc(Br)ccc23)C1', 'Nc1nc2c(s1)C(c1ccc(F)c(F)c1)CC(=O)N2', 'Cc1nc(-c2ccco2)cc([C@H]2CN3CC[C@H]2C[C@@H]3CNC(=O)NC2CCCCC2)n1', 'Cc1ccc(-n2c(=O)c3oc4ccccc4c3n(CC(=O)Nc3ccc(F)c(Cl)c3)c2=O)cc1C']
[1.0, 0.9955079474775398, 0.9937802349689012, 0.9917069799585349, 0.990324809951624]
(1, 1.0, 2005)
['Cc1cc2cc(-c3c(C)noc3C)nc(Nc3cccc(F)c3)c2o1', 'CN(Cc1cccc(F)c1)C1CCOc2ccccc21', 'Fc1cccc(-c2cnc(Nc3c

In [5]:
# build the publication-year-ordered counterparts of smiles and smiles_as_index
for isozyme in isozymes:
    sorted_df = pd.read_csv(rel_paths[isozyme]).sort_values(by="published", axis=0)
    _sorted_smiles = sorted_df["smiles"].tolist()
    sorted_halflife = sorted_df["half-life"].tolist()
    sorted_smiles[isozyme] = _sorted_smiles
    # same {smiles: (mol_idx, half-life, published)} layout, inserted oldest first
    smi_as_idx_time_sorted[isozyme] = {
        smi: (idx, val, year)
        for idx, smi, val, year in zip(
            sorted_df["mol_idx"].tolist(),
            _sorted_smiles,
            sorted_halflife,
            sorted_df["published"].tolist(),
        )
    }
    # preview: five oldest smiles, their half-lives and the record of the oldest molecule
    first_smiles = _sorted_smiles[0]
    print(f"{_sorted_smiles[:5]}\n{sorted_halflife[:5]}\n{smi_as_idx_time_sorted[isozyme][first_smiles]}")

['O=C(NC1CCN(CCc2c[nH]c3ccccc23)CC1)c1ccccc1', 'COc1cc(N)c(Cl)cc1C(=O)NC1CCN(Cc2ccccc2)CC1', 'O=C(CCCN1CCC(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F)cc1', 'COc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1', 'CC1(C)O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]5(C)[C@@]4(F)[C@@H](O)C[C@]3(C)[C@]2(C(=O)CO)O1']
[0.0439188969472542, 0.0133402438387857, 0.0086159528284628, 0.003613173784704, 0.0327960515399637]
(25, 0.0439188969472542, 1980)
['O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12', 'Nc1cc2nc3ccccc3oc-2cc1=O', 'NC[C@H](O)c1ccc(O)c(O)c1', 'COc1cc2c3cc1Oc1c(O)c(OC)cc4c1[C@@H](Cc1ccc(cc1)Oc1cc(ccc1O)C[C@H]3N(C)CC2)N(C)CC4', 'CCOC(=O)N1CCN(Cc2nc3c(c(=O)n(C)c(=O)n3C)n2Cc2cccc(C)c2)CC1']
[0.1807187284035936, 0.2671043538355218, 0.4305459571527297, 1.0, 0.138217000691085]
(865, 0.1807187284035936, 2004)
['O=C(Nc1cnc(-c2ccccc2)nc1)c1ccccc1', 'COC(=O)[C@@H]1Cc2ncn(C)c2CN1C(=O)c1ccc(C#N)cc1', 'Cc1ccc(-c2noc([C@@H]3Cc4nc[nH]c4CN3Cc3cccc(C#N)c3)n2)cc1', 'O=C(Nc1cnc(-c2ccccc2)nc1)c1cccc(F)c1', 'Cc1ccc(-c2noc([C@@H

In [6]:
# random 80/20 train-test split of the smiles (fixed seed, so the split is reproducible)
for isozyme in isozymes:
    split_parts = train_test_split(smiles[isozyme], test_size=0.2, random_state=42)
    # train_test_split returns the train part first, then the test part
    rand_split_smiles[isozyme] = dict(zip(("train", "test"), split_parts))
    print(f"{isozyme}\ntrain: {len(rand_split_smiles[isozyme]['train'])}\ntest: {len(rand_split_smiles[isozyme]['test'])}\n")

3A4
train: 56
test: 14

RLM
train: 1421
test: 356

HLC
train: 151
test: 38



In [7]:
# hand every isozyme's random split to the project's csv writer
for isozyme in isozymes:
    random_split = rand_split_smiles[isozyme]
    split_csv_data_formatting(isozyme, smiles_as_index, random_split, "random")

3A4_train.csv already exists in project_resources/base_splits/random
3A4_test.csv already exists in project_resources/base_splits/random
RLM_train.csv already exists in project_resources/base_splits/random
RLM_test.csv already exists in project_resources/base_splits/random
HLC_train.csv already exists in project_resources/base_splits/random
HLC_test.csv already exists in project_resources/base_splits/random


In [8]:
# scaffold-based train-test split using deepchem's ScaffoldSplitter
for isozyme in isozymes:
    # the smiles are passed as dataset ids; the feature array is only a zero-filled placeholder
    identifiers = smiles[isozyme]
    dataset = dc.data.DiskDataset.from_numpy(X=np.zeros(len(identifiers)), ids=identifiers)
    scaffoldsplitter = dc.splits.ScaffoldSplitter()
    train, test = scaffoldsplitter.train_test_split(dataset)
    scaffold = scaffoldsplitter.generate_scaffolds(dataset)
    scaff_split_smiles[isozyme] = {
        "train": train.ids.tolist(),
        "test": test.ids.tolist(),
    }
    print(f"{isozyme}\ntrain: {len(scaff_split_smiles[isozyme]['train'])}\ntest: {len(scaff_split_smiles[isozyme]['test'])}\n")

    # sanity check: size of every scaffold group and the total number of molecules
    lens = [len(part) for part in scaffold]
    print(lens)
    total = sum(lens)
    print(total)
#print(scaffold)  # use this to check the structure of molecules in large groups (the numbers are mol_ids)

3A4
train: 56
test: 14

[9, 8, 7, 5, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
70
RLM
train: 1421
test: 356

[39, 31, 30, 22, 18, 15, 15, 15, 15, 12, 11, 10, 10, 10, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [9]:
# hand every isozyme's scaffold split to the project's csv writer
for isozyme in isozymes:
    scaffold_split = scaff_split_smiles[isozyme]
    split_csv_data_formatting(isozyme, smiles_as_index, scaffold_split, "scaffold_splitter")

3A4_train.csv already exists in project_resources/base_splits/scaffold_splitter
3A4_test.csv already exists in project_resources/base_splits/scaffold_splitter
RLM_train.csv already exists in project_resources/base_splits/scaffold_splitter
RLM_test.csv already exists in project_resources/base_splits/scaffold_splitter
HLC_train.csv already exists in project_resources/base_splits/scaffold_splitter
HLC_test.csv already exists in project_resources/base_splits/scaffold_splitter


In [10]:
# time split: the oldest 80 % of molecules form the train set, the newest 20 % the test set
for isozyme in isozymes:
    ordered_smiles = sorted_smiles[isozyme]  # already ordered by publication year
    train_size = int(np.floor(len(ordered_smiles) * 0.8))
    train_smiles, test_smiles = ordered_smiles[:train_size], ordered_smiles[train_size:]
    time_split_smiles[isozyme] = {"train": train_smiles, "test": test_smiles}

    # publication year of every molecule, in the same order as the split lists
    records = smi_as_idx_time_sorted[isozyme]
    years_split[isozyme] = {
        split: [records[smi][2] for smi in time_split_smiles[isozyme][split]]
        for split in ["train", "test"]
    }
    print(f"{isozyme}\ntrain: {len(train_smiles)}\ntest: {len(test_smiles)}\n")

3A4
train: 56
test: 14

RLM
train: 1421
test: 356

HLC
train: 151
test: 38



In [11]:
# hand every isozyme's time split (plus the publication years) to the project's csv writer
for isozyme in isozymes:
    time_split = time_split_smiles[isozyme]
    split_csv_data_formatting(
        isozyme,
        smi_as_idx_time_sorted,
        time_split,
        "time_split",
        include_year=True,
        years=years_split,
    )

3A4_train.csv already exists in project_resources/base_splits/time_split
3A4_test.csv already exists in project_resources/base_splits/time_split
RLM_train.csv already exists in project_resources/base_splits/time_split
RLM_test.csv already exists in project_resources/base_splits/time_split
HLC_train.csv already exists in project_resources/base_splits/time_split
HLC_test.csv already exists in project_resources/base_splits/time_split


In [23]:
# download/load benchmark datasets from TDC-ADME and cache their Jazzy features as csv files
obach = ADME(name='Half_Life_Obach')
obach_split = obach.get_split()
tdc_datasets["obach"] = obach_split
microsome = ADME(name='Clearance_Microsome_AZ')
microsome_split = microsome.get_split()
tdc_datasets["microsome"] = microsome_split
hepatocyte = ADME(name='Clearance_Hepatocyte_AZ')
hepatocyte_split = hepatocyte.get_split()
tdc_datasets["hepatocyte"] = hepatocyte_split

# column names used for the Jazzy feature vector in the csv files
jazzy_feature_names = ['sdc', 'sdx', 'sa', 'dga', 'dgp', 'dgtot']


def _jazzy_features(smiles_list, targets):
    """Featurize molecules with Jazzy and keep only the usable ones.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to featurize.
    targets : iterable
        Target value of each molecule, aligned with ``smiles_list``.

    Returns
    -------
    tuple(list, list)
        Feature vectors (one numpy array per kept molecule) and the matching targets.
        A molecule is dropped when Jazzy raises for it, returns an empty result,
        or returns a vector containing NaN.
    """
    kept_fps = []
    kept_targets = []
    for smi, target in zip(smiles_list, targets):
        try:
            jazzy_fp = mol_vect(smi)
        except Exception:
            # Jazzy could not process this molecule -> skip it. Exception (not a bare
            # except) so that KeyboardInterrupt still stops a long featurization run.
            continue
        if not jazzy_fp:
            continue
        fp_values = np.array(list(jazzy_fp.values()))
        if np.isnan(fp_values).any():
            continue
        kept_fps.append(fp_values)
        kept_targets.append(target)
    return kept_fps, kept_targets


def _save_jazzy_csv(csv_path, jazzy_fps, jazzy_thalfs):
    """Write the targets ("half-life", first column) and Jazzy features to ``csv_path``."""
    jazzy_df = pd.DataFrame(jazzy_fps, columns=jazzy_feature_names)
    jazzy_df.insert(0, "half-life", jazzy_thalfs)
    jazzy_df.to_csv(csv_path, index=False)
    print(f"{csv_path} was successfully created")


for benchmark in tdc_benchmarks:
    # get the smiles and targets from the dataset; TDC's "valid" part is merged into the train set
    split = tdc_datasets[benchmark]
    train_smiles = np.array(list(split["train"]["Drug"]) + list(split["valid"]["Drug"]))
    train_halflives = np.array(list(split["train"]["Y"]) + list(split["valid"]["Y"]))
    test_smiles = np.array(list(split["test"]["Drug"]))
    test_halflives = np.array(list(split["test"]["Y"]))

    # Scale the targets with ONE scaler fitted on the train set only. Fitting a second
    # scaler on the test targets would put train and test on different scales, so test
    # errors would not be comparable with what a model learned. Test values may
    # therefore fall slightly outside [0, 1].
    scaler = MinMaxScaler().fit(train_halflives.reshape(-1, 1))
    train_halflives_scaled = scaler.transform(train_halflives.reshape(-1, 1)).ravel()
    test_halflives_scaled = scaler.transform(test_halflives.reshape(-1, 1)).ravel()

    print(train_smiles.shape, train_halflives_scaled.shape, test_smiles.shape, test_halflives_scaled.shape)

    train_jazzy_csv = f"project_resources/jazzy_splits/TDC/{benchmark}_train.csv"
    test_jazzy_csv = f"project_resources/jazzy_splits/TDC/{benchmark}_test.csv"

    try:
        # both files must be readable, otherwise both are regenerated
        # NOTE: csv files written before the scaler fix above still hold test targets
        # scaled with a test-fitted scaler; delete them to regenerate.
        pd.read_csv(train_jazzy_csv)
        pd.read_csv(test_jazzy_csv)
        print(f"both {benchmark}_train.csv and {benchmark}_test.csv already exist in project_resources/jazzy_splits/TDC")

    except FileNotFoundError:
        # generate Jazzy features and save to csv files
        train_jazzy_fps, train_jazzy_thalfs = _jazzy_features(train_smiles, train_halflives_scaled)
        test_jazzy_fps, test_jazzy_thalfs = _jazzy_features(test_smiles, test_halflives_scaled)

        print(np.array(train_jazzy_fps).shape, np.array(train_jazzy_thalfs).shape, np.array(test_jazzy_fps).shape, np.array(test_jazzy_thalfs).shape)

        _save_jazzy_csv(train_jazzy_csv, train_jazzy_fps, train_jazzy_thalfs)
        _save_jazzy_csv(test_jazzy_csv, test_jazzy_fps, test_jazzy_thalfs)

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


(534,) (534,) (133,) (133,)
both obach_train.csv and obach_test.csv already exist in project_resources/jazzy_splits/TDC
(882,) (882,) (220,) (220,)
both microsome_train.csv and microsome_test.csv already exist in project_resources/jazzy_splits/TDC
(970,) (970,) (243,) (243,)
both hepatocyte_train.csv and hepatocyte_test.csv already exist in project_resources/jazzy_splits/TDC
