In [1]:
# isozyme.csv ... columns: mol_idx (index of mol), smiles, half-life (in this order), plus published (publication year)

In [2]:
# isozyme_split.csv ... same three columns, but mols and their indexes are shuffled
# after using ScaffoldSplitter from deepchem on the list of smiles or train_test_split from sklearn

In [3]:
# orig 3A4 csv file from ChEMBL -> only mols w/ "Standard Type" == "T1/2" (must be uppercase*)
# and only "Standard Value" column -> 3A4.csv w/ 70 mols
# * ... mols w/ "t1/2" don't have a "Standard Value"

In [4]:
# orig RLM csv file from PubChem -> remove invalid smiles (how?*) -> RLM.csv w/ 2524 mols
# * ... one way would be to convert them to jazzy molecule features
# and remove mols with empty dict

In [5]:
# orig HLC csv file from PubChem -> convert ">30" to 30 -> HLC.csv w/ 189 mols

In [3]:
import sys
import numpy as np
import pandas as pd
import deepchem as dc
from sklearn.model_selection import train_test_split
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import isz_csv_data_formatting, split_csv_data_formatting, abs_file_path

In [4]:
isozymes = ["3A4", "RLM", "HLC"]

# raw download of every isozyme: (file name inside project_resources, column separator of that file)
_raw_sources = {
    "3A4": ("ChEMBL_3A4.csv", ";"),
    "RLM": ("AID_1508591_datatable_all.csv", ","),
    "HLC": ("AID_1508603_datatable_all.csv", ","),
}

# rel_paths holds, per isozyme, the keys
#   "<isz>_source", "<isz>_sep", "<isz>",
#   "<isz>_train_scaff", "<isz>_test_scaff", "<isz>_train_rand", "<isz>_test_rand"
# NOTE(review): the split entries point into data_splits/, while the saved cell outputs
# mention base_splits/ -- confirm which directory the helper functions really use.
rel_paths = {}
for _isz in isozymes:
    _raw_file, _raw_sep = _raw_sources[_isz]
    rel_paths[f"{_isz}_source"] = f"project_resources/{_raw_file}"
    rel_paths[f"{_isz}_sep"] = _raw_sep
    rel_paths[_isz] = f"project_resources/{_isz}.csv"
    for _tag, _folder in (("scaff", "scaffold_splitter"), ("rand", "random")):
        for _part in ("train", "test"):
            rel_paths[f"{_isz}_{_part}_{_tag}"] = f"project_resources/data_splits/{_folder}/{_isz}_{_part}.csv"

# per-isozyme column values as read from the csv files
smiles = {}
halflives = {}
years = {}

# lookups keyed by smiles; structure: {isozyme: {"smi": (idx, halflife, published),...}}
smiles_as_index = {}
# same structure as smiles_as_index but tuples in each isozyme are sorted by publication date
smi_as_idx_time_sorted = {}
sorted_smiles = {}

# train/test partitions produced by the individual splitting strategies
rand_split_smiles = {}
scaff_split_smiles = {}
time_split_smiles = {}
years_split = {}

In [5]:
# build the formatted <isozyme>.csv from its raw source file (read with that file's separator);
# per the cell output the helper only reports when the csv already exists
for isozyme in isozymes:
    raw_path = rel_paths[f"{isozyme}_source"]
    raw_sep = rel_paths[f"{isozyme}_sep"]
    isz_csv_data_formatting(raw_path, isozyme, sep=raw_sep)

3A4.csv already exists in dir
RLM.csv already exists in dir
HLC.csv already exists in dir


In [6]:
# load every isozyme's csv and fill the module-level containers:
#   smiles / halflives / years ... column values in file order
#   smiles_as_index            ... {smiles: (mol_idx, half-life, published)}
for isozyme in isozymes:
    isz_df = pd.read_csv(rel_paths[isozyme])
    smiles[isozyme] = list(isz_df["smiles"])
    halflives[isozyme] = list(isz_df["half-life"])
    years[isozyme] = list(isz_df["published"])
    # a smiles string occurring in several rows keeps only the values of its last row
    smiles_as_index[isozyme] = {
        smi: (mol_idx, t_half, pub_year)
        for mol_idx, smi, t_half, pub_year in zip(
            list(isz_df["mol_idx"]), smiles[isozyme], halflives[isozyme], years[isozyme]
        )
    }
    # preview: first five smiles, first five half-lives and the record stored for the first smiles
    preview_key = smiles[isozyme][0]
    print(f"{smiles[isozyme][:5]}\n{halflives[isozyme][:5]}\n{smiles_as_index[isozyme][preview_key]}")

['COc1ccc2[nH]cc(CCNC(C)=O)c2c1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3nnc(Cc4ccc(F)cc4F)cc32)[C@@H](CN2Cc3c(F)cccc3C2=O)CN1', 'O=c1[nH]c2ccccc2n1C1CCN(CCCC(c2ccc(F)cc2)c2ccc(F)cc2)CC1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3ncc([C@@H](O)c4ccc(F)cc4)cc32)[C@@H](CN2[C@H](C)COC[C@H]2C)CN1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3ncc(Cc4ccc(F)cc4)cc32)[C@@H](CN2Cc3ccccc3C2=O)CN1']
[0.3333, 1.167, 0.045, 0.4667, 0.75]
(1, 0.3333, 2004)
['CCCCOc1ccc(CC(=O)NO)cc1', 'COc1ccc(CCn2c(N)cc(=O)[nH]c2=S)cc1OC', 'COc1ccc(Cl)cc1C(=O)Nc1ccc(NC(=O)c2ccco2)cc1', 'CC(C)(C)n1nc(-c2ccc(Cl)cc2)c2c(N)ncnc21', 'Cn1c2ccccc2c(=O)c2c(=O)n(-c3ccccc3)c(C3CCCCC3)nc21']
[30.0, 30.0, 30.0, 30.0, 30.0]
(1, 30.0, 2005)
['Cc1cc2cc(-c3c(C)noc3C)nc(Nc3cccc(F)c3)c2o1', 'CN(Cc1cccc(F)c1)C1CCOc2ccccc21', 'Fc1cccc(-c2cnc(Nc3cncnc3)c3c2CCO3)c1', 'O=C(c1ccc(-c2ncc3cnc(-c4cccc(F)c4)cn23)cc1)N1CCc2cccnc21', 'Cc1ccc(NC(=O)c2nc3sccn3c2-c2cccc(C#N)c2)cc1']
[8.2, 8.6, 9.6, 17.3, 17.4]
(1, 8.2, 2018)


In [7]:
# same lookup as smiles_as_index, but built from the rows ordered by ascending "published" value;
# the ordered smiles themselves are kept in sorted_smiles
for isozyme in isozymes:
    by_year = pd.read_csv(rel_paths[isozyme]).sort_values(by="published", axis=0)
    smiles_by_year = list(by_year["smiles"])
    halflife_by_year = list(by_year["half-life"])
    sorted_smiles[isozyme] = smiles_by_year
    # values are (mol_idx, half-life, published) tuples keyed by smiles
    records_by_year = zip(list(by_year["mol_idx"]), halflife_by_year, list(by_year["published"]))
    smi_as_idx_time_sorted[isozyme] = dict(zip(smiles_by_year, records_by_year))
    # preview: five oldest smiles, their half-lives and the record of the very first one
    oldest_smiles = smiles_by_year[0]
    print(f"{smiles_by_year[:5]}\n{halflife_by_year[:5]}\n{smi_as_idx_time_sorted[isozyme][oldest_smiles]}")

['O=C(NC1CCN(CCc2c[nH]c3ccccc23)CC1)c1ccccc1', 'COc1cc(N)c(Cl)cc1C(=O)NC1CCN(Cc2ccccc2)CC1', 'O=C(CCCN1CCC(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F)cc1', 'COc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2)cc1', 'CC1(C)O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]5(C)[C@@]4(F)[C@@H](O)C[C@]3(C)[C@]2(C(=O)CO)O1']
[0.2667, 0.08333, 0.055, 0.025, 0.2]
(25, 0.2667, 1980)
['O=c1c(-c2ccc(O)cc2)coc2cc(O)cc(O)c12', 'O=c1c(-c2ccc(O)cc2)coc2cc(O)ccc12', 'Nc1cc2nc3ccccc3oc-2cc1=O', 'CC(C)=CCNc1ncnc2nc[nH]c12', 'NC[C@H](O)c1ccc(O)c(O)c1']
[30.0, 30.0, 8.63, 30.0, 13.36]
(51, 30.0, 2004)
['O=C(Nc1cnc(-c2ccccc2)nc1)c1ccccc1', 'COC(=O)[C@@H]1Cc2ncn(C)c2CN1C(=O)c1ccc(C#N)cc1', 'Cc1ccc(-c2noc([C@@H]3Cc4nc[nH]c4CN3Cc3cccc(C#N)c3)n2)cc1', 'O=C(Nc1cnc(-c2ccccc2)nc1)c1cccc(F)c1', 'Cc1ccc(-c2noc([C@@H]3Cc4nc[nH]c4CN3Cc3ccc(C(=O)O)cc3)n2)cc1']
[120.0, 120.0, 63.4, 52.0, 102.3]
(107, 120.0, 2007)


In [8]:
# random train-test split of the smiles: 20 % test, fixed random_state for repeatability
for isozyme in isozymes:
    train_part, test_part = train_test_split(smiles[isozyme], test_size=0.2, random_state=42)
    rand_split_smiles[isozyme] = {"train": train_part, "test": test_part}
    print(f"{isozyme}\ntrain: {len(train_part)}\ntest: {len(test_part)}\n")

3A4
train: 56
test: 14

RLM
train: 2024
test: 507

HLC
train: 151
test: 38



In [9]:
# write the random train/test splits of every isozyme to csv files
for isozyme in isozymes:
    random_split = rand_split_smiles[isozyme]
    split_csv_data_formatting(isozyme, smiles_as_index, random_split, "random")

3A4_train.csv already exists in project_resources/base_splits/random
3A4_test.csv already exists in project_resources/base_splits/random
RLM_train.csv already exists in project_resources/base_splits/random
RLM_test.csv already exists in project_resources/base_splits/random
HLC_train.csv already exists in project_resources/base_splits/random
HLC_test.csv already exists in project_resources/base_splits/random


In [10]:
# train-test split with deepchem's ScaffoldSplitter (default split fractions)
for isozyme in isozymes:
    isz_smiles = smiles[isozyme]
    # the smiles are passed as dataset ids; X is only a zero-filled placeholder of matching length
    placeholder_X = np.zeros(len(isz_smiles))
    scaff_dataset = dc.data.DiskDataset.from_numpy(X=placeholder_X, ids=isz_smiles)
    scaff_train, scaff_test = dc.splits.ScaffoldSplitter().train_test_split(scaff_dataset)
    scaff_split_smiles[isozyme] = {
        "train": scaff_train.ids.tolist(),
        "test": scaff_test.ids.tolist(),
    }
    n_train = len(scaff_split_smiles[isozyme]["train"])
    n_test = len(scaff_split_smiles[isozyme]["test"])
    print(f"{isozyme}\ntrain: {n_train}\ntest: {n_test}\n")

3A4
train: 56
test: 14

RLM
train: 2024
test: 507

HLC
train: 151
test: 38



In [11]:
# save the ScaffoldSplitter splits to csv files
for isozyme in isozymes:
    split_csv_data_formatting(isozyme, smiles_as_index, scaff_split_smiles[isozyme], "scaffold_splitter")

3A4_train.csv already exists in project_resources/base_splits/scaffold_splitter
3A4_test.csv already exists in project_resources/base_splits/scaffold_splitter
RLM_train.csv already exists in project_resources/base_splits/scaffold_splitter
RLM_test.csv already exists in project_resources/base_splits/scaffold_splitter
HLC_train.csv already exists in project_resources/base_splits/scaffold_splitter
HLC_test.csv already exists in project_resources/base_splits/scaffold_splitter


In [12]:
# scratch check: floor of 80 % of 189 molecules, i.e. the train-set size used by the time split for a 189-molecule set
np.floor(189 * 0.8)

151.0

In [13]:
# time split: the oldest 80 % of molecules form the train set, the newest 20 % the test set
for isozyme in isozymes:
    ordered_smiles = sorted_smiles[isozyme]
    cut = int(np.floor(len(ordered_smiles) * 0.8))
    time_split_smiles[isozyme] = {
        "train": ordered_smiles[:cut],
        "test": ordered_smiles[cut:],
    }
    # publication year of every molecule in each part, looked up by smiles in the time-sorted table
    years_split[isozyme] = {
        part: [smi_as_idx_time_sorted[isozyme][smi][2] for smi in part_smiles]
        for part, part_smiles in time_split_smiles[isozyme].items()
    }
    n_train = len(time_split_smiles[isozyme]["train"])
    n_test = len(time_split_smiles[isozyme]["test"])
    print(f"{isozyme}\ntrain: {n_train}\ntest: {n_test}\n")

3A4
train: 56
test: 14

RLM
train: 2024
test: 507

HLC
train: 151
test: 38



In [17]:
# write the time-based train/test splits to csv files, passing the publication years along
for isozyme in isozymes:
    split_csv_data_formatting(
        isozyme,
        smi_as_idx_time_sorted,
        time_split_smiles[isozyme],
        "time_split",
        include_year=True,
        years=years_split,
    )

3A4_train.csv was successfully created in project_resources/base_splits/time_split
3A4_test.csv was successfully created in project_resources/base_splits/time_split
RLM_train.csv was successfully created in project_resources/base_splits/time_split
RLM_test.csv was successfully created in project_resources/base_splits/time_split
HLC_train.csv was successfully created in project_resources/base_splits/time_split
HLC_test.csv was successfully created in project_resources/base_splits/time_split
