In [1]:
import optuna
import joblib
import numpy as np
import pandas as pd
import sys
from concurrent.futures import ThreadPoolExecutor
from optuna.storages import JournalStorage, JournalFileStorage
from tdc.single_pred import ADME
from project_resources.import_utils import NotebookFinder
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import fp_from_smiles, parse_jazzy_df, HyperparamTuner

  from .autonotebook import tqdm as notebook_tqdm


importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
# Benchmark identifiers; every container below is keyed by these names.
tdc_benchmarks = [
    "obach",
    "microsome",
    "hepatocyte",
]

# Empty, independent containers that the following cells fill in:
#   tdc_datasets  - the split returned by each TDC loader
#   smiles        - molecule strings per benchmark and split
#   halflives     - regression targets
#   mol_features  - feature matrices per featurisation
tdc_datasets, smiles, halflives, mol_features = {}, {}, {}, {}

In [3]:
# Load the three TDC ADME datasets and keep the split produced by
# get_split() (called with its defaults) under the matching benchmark key.
# Later cells index each stored split with "train" and "test".
# The loader and split objects stay available as notebook-level names.

# Half-life (Obach)
obach = ADME(name='Half_Life_Obach')
tdc_datasets["obach"] = obach_split = obach.get_split()

# Microsomal clearance (AZ)
microsome = ADME(name='Clearance_Microsome_AZ')
tdc_datasets["microsome"] = microsome_split = microsome.get_split()

# Hepatocyte clearance (AZ)
hepatocyte = ADME(name='Clearance_Hepatocyte_AZ')
tdc_datasets["hepatocyte"] = hepatocyte_split = hepatocyte.get_split()

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


In [4]:
# For every benchmark, copy the "Drug" and "Y" columns of the train and test
# splits into NumPy arrays: "Drug" goes to `smiles`, "Y" goes to `halflives`.
#
# NOTE(review): the targets are stored as halflives[benchmark]["morgan"][split],
# whereas the jazzy cell uses halflives["jazzy"][benchmark][split] and
# mol_features uses mol_features["morgan"][benchmark][split]. The key order is
# inconsistent; unify it once all consumers of `halflives` have been checked.
# NOTE(review): the dict is called `halflives`, but two of the loaded datasets
# are named Clearance_* - confirm that "Y" means what downstream code expects.
for benchmark in tdc_benchmarks:
    print(benchmark)
    train_part = tdc_datasets[benchmark]["train"]
    test_part = tdc_datasets[benchmark]["test"]

    train_drug, test_drug = train_part["Drug"], test_part["Drug"]
    train_y, test_y = train_part["Y"], test_part["Y"]

    smiles[benchmark] = {
        "train": np.array(train_drug),
        "test": np.array(test_drug),
    }
    halflives[benchmark] = {
        "morgan": {
            "train": np.array(train_y),
            "test": np.array(test_y),
        },
    }

    # Sanity check: molecule and target counts must agree within each split.
    print(train_drug.shape, train_y.shape, test_drug.shape, test_y.shape)

obach
(467,) (467,) (133,) (133,)
microsome
(772,) (772,) (220,) (220,)
hepatocyte
(849,) (849,) (243,) (243,)


In [5]:
# Build the "morgan" feature matrices: run fp_from_smiles over the train and
# test SMILES of each benchmark and store the results as NumPy arrays under
# mol_features["morgan"][benchmark][split].
# (presumably Morgan fingerprints, going by the key name - fp_from_smiles is
# defined in project_resources/cytochrome_P450; confirm there.)
mol_features["morgan"] = {}
for benchmark in tdc_benchmarks:
    print(benchmark)
    fps_by_split = {
        split: np.array(fp_from_smiles(smiles[benchmark][split]))
        for split in ("train", "test")
    }
    mol_features["morgan"][benchmark] = fps_by_split
    # Shapes of the train and test matrices, for a quick visual check.
    print(fps_by_split["train"].shape, fps_by_split["test"].shape)

obach
(467, 124) (133, 124)
microsome
(772, 124) (220, 124)
hepatocyte
(849, 124) (243, 124)


In [6]:
# Load the precomputed jazzy CSV splits for each benchmark and store the
# parsed features and targets under mol_features["jazzy"][benchmark][split]
# and halflives["jazzy"][benchmark][split].
#
# NOTE(review): parse_jazzy_df returns a third value (`contains_nan`). The
# train-split value is overwritten by the test-split call and neither is read
# in this cell - confirm whether it should be checked.
# NOTE(review): going by the recorded outputs, these CSV splits do not have
# the same row counts as the TDC splits used for the "morgan" features
# (e.g. 525 vs 467 train rows for obach), so the jazzy targets must be used
# together with the jazzy features.
mol_features["jazzy"] = {}
halflives["jazzy"] = {}
for benchmark in tdc_benchmarks:
    print(benchmark)
    train_csv = f"project_resources/jazzy_splits/TDC/{benchmark}_train.csv"
    test_csv = f"project_resources/jazzy_splits/TDC/{benchmark}_test.csv"
    jazzy_train_df, jazzy_test_df = pd.read_csv(train_csv), pd.read_csv(test_csv)

    # Parse train first, then test.
    train_fts, train_targets, contains_nan = parse_jazzy_df(jazzy_train_df, no_idx_smi=True)
    test_fts, test_targets, contains_nan = parse_jazzy_df(jazzy_test_df, no_idx_smi=True)

    mol_features["jazzy"][benchmark] = {"train": train_fts, "test": test_fts}
    halflives["jazzy"][benchmark] = {"train": train_targets, "test": test_targets}

    # Shapes in the order: train features, train targets, test features, test targets.
    print(*(np.array(part).shape for part in (train_fts, train_targets, test_fts, test_targets)))

obach
     525, [0.0033169983665033, 11.3305, 0.0, 3.3351, -4.054, -69.5826, -60.2942]
     130, [0.0078064793190067, 10.9705, 1.8136, 5.8249, -16.4281, -118.9807, -121.6516]
(525, 6) (525,) (130, 6) (130,)
microsome
     882, [0.0652380952380952, 10.5072, 1.4478, 4.6964, -13.5025, -92.8889, -106.3914]
     220, [0.0884353741496598, 3.5084, 3.5128, 4.5042, -12.6756, -80.3865, -84.2257]
(882, 6) (882,) (220, 6) (220,)
hepatocyte
     970, [0.0, 9.8552, 1.4451, 4.4407, -15.1209, -91.0733, -102.926]
     243, [0.0825850340136054, 10.2098, 1.292, 5.2199, -18.4498, -95.9609, -98.199]
(970, 6) (970,) (243, 6) (243,)
