In [1]:
# isozyme.csv ... three columns in this order: index of mol, smiles, half-life

In [2]:
# isozyme_data-split.csv ... same three columns, but the mols and their indexes are shuffled
# after running deepchem's ScaffoldSplitter on the list of smiles
# (open question: how to preserve the pairing between each smiles and its index?)
# data-splits are ["train", "test"]

In [3]:
# orig 3A4 csv file from ChEMBL -> only mols w/ "Standard Type" == "T1/2" (must be uppercase*)
# and only "Standard Value" column -> 3A4.csv w/ 70 mols
# * ... mols w/ "t1/2" don't have a "Standard Value"

In [4]:
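# orig RLM csv file from PubChem -> remove invalid smiles (how?*) -> RLM.csv w/ 2524 mols
# * ... one way would be to convert them to jazzy molecule features
# and remove mols with an empty dict
# TODO: the split cells in this notebook report 2024 + 507 = 2531 RLM mols - confirm which count is current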

In [5]:
# orig HLC csv file from PubChem -> convert ">30" to 30 -> HLC.csv w/ 189 mols

In [9]:
import sys
import numpy as np
import pandas as pd
import deepchem as dc
from sklearn.model_selection import train_test_split
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import *

In [10]:
# Raw export of every isozyme dataset: (file name inside project_resources, column delimiter).
# This table is the single place to edit when a dataset is added; `isozymes` and
# `rel_paths` below are both derived from it.
_ISOZYME_SOURCES = {
    "3A4": ("ChEMBL_3A4.csv", ";"),
    "RLM": ("AID_1508591_datatable_all.csv", ","),
    "HLC": ("AID_1508603_datatable_all.csv", ","),
}


def _isozyme_rel_paths(isozyme, source_file, sep):
    """Return the `rel_paths` entries that belong to one isozyme.

    Keys produced (in this order): "<isozyme>_source", "<isozyme>_sep", "<isozyme>",
    then "<isozyme>_<train|test>_<scaff|rand>" for the two split directories.
    All values are plain strings relative to the notebook's working directory.
    """
    entries = {
        f"{isozyme}_source": f"project_resources/{source_file}",
        f"{isozyme}_sep": sep,
        isozyme: f"project_resources/{isozyme}.csv",
    }
    # key suffix -> sub-directory of project_resources/data_splits
    for suffix, split_dir in (("scaff", "scaffold_splitter"), ("rand", "random")):
        for subset in ("train", "test"):
            entries[f"{isozyme}_{subset}_{suffix}"] = (
                f"project_resources/data_splits/{split_dir}/{isozyme}_{subset}.csv"
            )
    return entries


# Isozyme names in processing order: ["3A4", "RLM", "HLC"]
isozymes = list(_ISOZYME_SOURCES)

# Flat lookup of every relative path / delimiter used by the notebook.
rel_paths = {
    key: value
    for name, (source_file, sep) in _ISOZYME_SOURCES.items()
    for key, value in _isozyme_rel_paths(name, source_file, sep).items()
}

# Per-isozyme accumulators filled by the cells below.
smiles = {}              # {isozyme: [smiles, ...]}
halflife = {}            # {isozyme: [half-life, ...]}
smiles_as_index = {}     # structure: {isozyme: {"smi": (idx, halflife),...}}
scaff_split_smiles = {}  # {isozyme: {"train": [...], "test": [...]}}
rand_split_smiles = {}   # {isozyme: {"train": [...], "test": [...]}}

In [3]:
# Build <isozyme>.csv from its raw export; each export has its own column delimiter,
# so both the path and the separator are looked up per isozyme.
for isozyme in isozymes:
    source_path = rel_paths[f"{isozyme}_source"]
    delimiter = rel_paths[f"{isozyme}_sep"]
    isz_csv_data_formatting(source_path, isozyme, sep=delimiter)

3A4.csv already exists in dir
RLM.csv was successfully created
HLC.csv already exists in dir


In [4]:
# Load each isozyme's csv into three lookups:
#   smiles[isozyme]          -> list of smiles in file order
#   halflife[isozyme]        -> list of half-lives in the same order
#   smiles_as_index[isozyme] -> {smiles: (mol_idx, half-life)}
for isozyme in isozymes:
    df = pd.read_csv(rel_paths[isozyme])
    smiles[isozyme] = list(df["smiles"])
    halflife[isozyme] = list(df["half-life"])
    smiles_as_index[isozyme] = {
        smi: (idx, val)
        for idx, smi, val in zip(df["mol_idx"], smiles[isozyme], halflife[isozyme])
    }
    # A dict holds one value per key, so a smiles string that occurs more than once in the
    # csv keeps only its last (mol_idx, half-life). Report it instead of losing rows silently,
    # because this mapping is what later gets passed to split_csv_data_formatting.
    n_repeated = len(smiles[isozyme]) - len(smiles_as_index[isozyme])
    if n_repeated:
        print(f"WARNING: {isozyme}.csv has {n_repeated} repeated smiles; "
              "smiles_as_index keeps only the last (mol_idx, half-life) of each")
    # preview: first five smiles, first five half-lives, lookup entry of the first smiles
    first_smiles = smiles[isozyme][0]
    print(f"{smiles[isozyme][:5]}\n{halflife[isozyme][:5]}\n{smiles_as_index[isozyme][first_smiles]}")

['COc1ccc2[nH]cc(CCNC(C)=O)c2c1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3nnc(Cc4ccc(F)cc4F)cc32)[C@@H](CN2Cc3c(F)cccc3C2=O)CN1', 'O=c1[nH]c2ccccc2n1C1CCN(CCCC(c2ccc(F)cc2)c2ccc(F)cc2)CC1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3ncc([C@@H](O)c4ccc(F)cc4)cc32)[C@@H](CN2[C@H](C)COC[C@H]2C)CN1', 'C[C@@H]1CN(CC(=O)N2CC(C)(C)c3ncc(Cc4ccc(F)cc4)cc32)[C@@H](CN2Cc3ccccc3C2=O)CN1']
[0.3333, 1.167, 0.045, 0.4667, 0.75]
(1, 0.3333)
['CCCCOc1ccc(CC(=O)NO)cc1', 'COc1ccc(CCn2c(N)cc(=O)[nH]c2=S)cc1OC', 'COc1ccc(Cl)cc1C(=O)Nc1ccc(NC(=O)c2ccco2)cc1', 'CC(C)(C)n1nc(-c2ccc(Cl)cc2)c2c(N)ncnc21', 'Cn1c2ccccc2c(=O)c2c(=O)n(-c3ccccc3)c(C3CCCCC3)nc21']
[30.0, 30.0, 30.0, 30.0, 30.0]
(1, 30.0)
['Cc1cc2cc(-c3c(C)noc3C)nc(Nc3cccc(F)c3)c2o1', 'CN(Cc1cccc(F)c1)C1CCOc2ccccc21', 'Fc1cccc(-c2cnc(Nc3cncnc3)c3c2CCO3)c1', 'O=C(c1ccc(-c2ncc3cnc(-c4cccc(F)c4)cn23)cc1)N1CCc2cccnc21', 'Cc1ccc(NC(=O)c2nc3sccn3c2-c2cccc(C#N)c2)cc1']
[8.2, 8.6, 9.6, 17.3, 17.4]
(1, 8.2)


In [5]:
# Scaffold-based train/test split via deepchem's ScaffoldSplitter
# (no split fractions are passed, so the library defaults apply)
for isozyme in isozymes:
    identifiers = smiles[isozyme]
    # the smiles travel as the dataset ids; X is only an all-zero placeholder
    dataset = dc.data.DiskDataset.from_numpy(X=np.zeros(len(identifiers)), ids=identifiers)
    train, test = dc.splits.ScaffoldSplitter().train_test_split(dataset)
    scaff_split_smiles[isozyme] = {
        "train": train.ids.tolist(),
        "test": test.ids.tolist(),
    }
    print(f"{isozyme}\ntrain: {len(scaff_split_smiles[isozyme]['train'])}\ntest: {len(scaff_split_smiles[isozyme]['test'])}\n")

3A4
train: 56
test: 14

RLM
train: 2024
test: 507

HLC
train: 151
test: 38



In [6]:
# Write the scaffold-based train/test smiles of every isozyme to csv
for isozyme in isozymes:
    scaffold_split = scaff_split_smiles[isozyme]
    split_csv_data_formatting(isozyme, smiles_as_index, scaffold_split, "scaffold_splitter")

3A4_train.csv was successfully created in project_resources/data_splits/scaffold_splitter
3A4_test.csv was successfully created in project_resources/data_splits/scaffold_splitter
RLM_train.csv was successfully created in project_resources/data_splits/scaffold_splitter
RLM_test.csv was successfully created in project_resources/data_splits/scaffold_splitter
HLC_train.csv was successfully created in project_resources/data_splits/scaffold_splitter
HLC_test.csv was successfully created in project_resources/data_splits/scaffold_splitter


In [7]:
# Random 80/20 train/test split of the smiles; random_state is fixed so reruns give the same split
for isozyme in isozymes:
    rand_train, rand_test = train_test_split(smiles[isozyme], test_size=0.2, random_state=42)
    rand_split_smiles[isozyme] = {"train": rand_train, "test": rand_test}
    print(f"{isozyme}\ntrain: {len(rand_split_smiles[isozyme]['train'])}\ntest: {len(rand_split_smiles[isozyme]['test'])}\n")

3A4
train: 56
test: 14

RLM
train: 2024
test: 507

HLC
train: 151
test: 38



In [8]:
# Write the random train/test smiles of every isozyme to csv
for isozyme in isozymes:
    random_split = rand_split_smiles[isozyme]
    split_csv_data_formatting(isozyme, smiles_as_index, random_split, "random")

3A4_train.csv was successfully created in project_resources/data_splits/random
3A4_test.csv was successfully created in project_resources/data_splits/random
RLM_train.csv was successfully created in project_resources/data_splits/random
RLM_test.csv was successfully created in project_resources/data_splits/random
HLC_train.csv was successfully created in project_resources/data_splits/random
HLC_test.csv was successfully created in project_resources/data_splits/random
