In [1]:
from deepmol.loaders import CSVLoader
from deepmol.splitters import MultiTaskStratifiedSplitter
from rdkit import RDLogger
import logging
import warnings

warnings.filterwarnings("ignore")
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
RDLogger.DisableLog('rdApp.*')

dataset = CSVLoader("final_dataset_top.csv", labels_fields = ['C00073', 'C00078', 'C00079', 'C00082', 'C00235', 'C00341',
                                                 'C00353', 'C00448', 'C01789', 'C03506',
                                                 'C00047', 'C00108', 'C00187', 'C00148', 'C00041',
                                                 'C00129', 'C00062', 'C01852', 'C00049', 'C00135'],
          id_field="ids", smiles_field="smiles").create_dataset()

train, test = MultiTaskStratifiedSplitter().train_test_split(dataset, seed=123)

2023-07-03 18:22:39,261 — INFO — Assuming multitask since y has more than one dimension. If otherwise, explicitly set the mode to 'classification' or 'regression'!


In [2]:
train.y.shape

(1131, 20)

In [3]:
from deepmol.pipeline_optimization._feature_selector_objectives import _get_feature_selector
from deepmol.base import PassThroughTransformer
from deepmol.pipeline_optimization._scaler_objectives import _get_scaler
from deepmol.pipeline_optimization._featurizer_objectives import _get_featurizer
from deepmol.pipeline_optimization._standardizer_objectives import _get_standardizer
from deepmol.models import SklearnModel
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier


def get_hyperparameters_for_models(model, trial):
    if model.__name__ == "DecisionTreeClassifier":
        criterion = trial.suggest_categorical("model__criterion", ["gini", "entropy"])
        max_depth = trial.suggest_int("model__max_depth", 10, 500)
        min_samples_split = trial.suggest_int("model__min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("model__min_samples_leaf", 1, 10)
        model(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split,
              min_samples_leaf=min_samples_leaf)
        return SklearnModel(model, model_dir="model")
    elif model.__name__ == "ExtraTreeClassifier":
        criterion = trial.suggest_categorical("model__criterion", ["gini", "entropy"])
        max_depth = trial.suggest_int("model__max_depth", 10, 500)
        min_samples_split = trial.suggest_int("model__min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("model__min_samples_leaf", 1, 10)
        model(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split,
              min_samples_leaf=min_samples_leaf)
        return SklearnModel(model, model_dir="model")
    elif model.__name__ == "ExtraTreesClassifier":
        criterion = trial.suggest_categorical("model__criterion", ["gini", "entropy"])
        max_depth = trial.suggest_int("model__max_depth", 10, 500)
        min_samples_split = trial.suggest_int("model__min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("model__min_samples_leaf", 1, 10)
        model(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split,
              min_samples_leaf=min_samples_leaf)
        return SklearnModel(model, model_dir="model")
    elif model.__name__ == "KNeighborsClassifier":
        n_neighbors =  trial.suggest_int("model__n_neighbors", 1, 100)
        weights =  trial.suggest_categorical("model__weights", ["uniform", "distance"])
        algorithm =  trial.suggest_categorical("model__algorithm", ["auto", "ball_tree", "kd_tree", "brute"])
        model(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm)
        return SklearnModel(model, model_dir="model")
    elif model.__name__ == "MLPClassifier":
        hidden_layer_sizes = trial.suggest_categorical("model__hidden_layer_sizes", [(100,), (100, 100), (100, 100, 100)])
        activation = trial.suggest_categorical("model__activation", ["identity", "logistic", "tanh", "relu"])
        solver = trial.suggest_categorical("model__solver", ["lbfgs", "sgd", "adam"])
        batch_size = trial.suggest_categorical("model__batch_size", [16, 32, 64, 128, 256])
        early_stopping = True
        if solver == "adam" or solver == "sgd":
            learning_rate_init = trial.suggest_categorical("model__learning_rate", [0.00001, 0.0001, 0.001, 0.01, 0.1])
            learning_rate = trial.suggest_categorical("model__learning_rate", ["constant", "invscaling", "adaptive"])

            model(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, batch_size=batch_size,
              learning_rate_init=learning_rate_init, learning_rate=learning_rate, early_stopping=early_stopping)
        else:
            model(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, batch_size=batch_size,
              early_stopping=early_stopping)

        return SklearnModel(model, model_dir="model")
    elif model.__name__ == "RadiusNeighborsClassifier":
        radius = trial.suggest_int("model__radius", 1, 5)
        weights = trial.suggest_categorical("model__weights", ["uniform", "distance"])
        algorithm = trial.suggest_categorical("model__algorithm", ["auto", "ball_tree", "kd_tree", "brute"])
        model(radius=radius, weights=weights, algorithm=algorithm)
        return SklearnModel(model, model_dir="model")
    elif model.__name__ == "RandomForestClassifier":
        criterion = trial.suggest_categorical("model__criterion", ["gini", "entropy"])
        max_depth = trial.suggest_int("model__max_depth", 10, 500)
        min_samples_split = trial.suggest_int("model__min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("model__min_samples_leaf", 1, 10)
        n_estimators = trial.suggest_int("model__n_estimators", 10, 1000)
        model(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split,
              min_samples_leaf=min_samples_leaf, n_estimators=n_estimators)
        return SklearnModel(model, model_dir="model")
    elif model.__name__ == "RidgeClassifier":
        alpha = trial.suggest_categorical("model__alpha", [0.1, 1.0, 10.0])
        solver = trial.suggest_categorical("model__solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"])
        tol = trial.suggest_categorical("model__tol", [0.0001, 0.001, 0.01, 0.1])
        fit_intercept = trial.suggest_categorical("model__fit_intercept", [True, False])
        model(alpha=alpha, solver=solver, tol=tol, fit_intercept=fit_intercept)
        return SklearnModel(model, model_dir="model")
    elif model.__name__ == "RidgeClassifierCV":
        alpha = trial.suggest_categorical("model__alpha", [0.1, 1.0, 10.0])
        solver = trial.suggest_categorical("model__solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"])
        tol = trial.suggest_categorical("model__tol", [0.0001, 0.001, 0.01, 0.1])
        fit_intercept = trial.suggest_categorical("model__fit_intercept", [True, False])
        model(alpha=alpha, solver=solver, tol=tol, fit_intercept=fit_intercept)
        return SklearnModel(model, model_dir="model")



def objective(trial):
    models = [DecisionTreeClassifier, ExtraTreeClassifier, ExtraTreesClassifier, KNeighborsClassifier,
              MLPClassifier, RadiusNeighborsClassifier, RandomForestClassifier, RidgeClassifier, RidgeClassifierCV]
    model = trial.suggest_categorical('model', models)
    model = get_hyperparameters_for_models(model, trial)
    standardizer = _get_standardizer(trial)
    featurizer = _get_featurizer(trial, '1D')
    if featurizer.__class__.__name__ == 'TwoDimensionDescriptors' or \
            featurizer.__class__.__name__ == 'All3DDescriptors':
        scaler = _get_scaler(trial)
    else:
        scaler = PassThroughTransformer()
    feature_selection = _get_feature_selector(trial, "classification")
    final_steps = [('standardizer', standardizer), ('featurizer', featurizer), ('scaler', scaler),
                   ('feature_selector', feature_selection), ('model', model)]
    return final_steps


2023-07-03 18:22:45.410796: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-03 18:22:45.439640: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-03 18:22:45.440054: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from deepmol.pipeline_optimization import PipelineOptimization
from deepmol.metrics import Metric

po = PipelineOptimization(direction='maximize', study_name='test_predictor_pipeline')
def f1_score_macro(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro')
metric = Metric(f1_score_macro)

po.optimize(train_dataset=train, test_dataset=test, objective_steps=objective,
            metric=metric, n_trials=10, save_top_n=2)

[32m[I 2023-07-03 18:23:01,271][0m A new study created in memory with name: test_predictor_pipeline[0m
[32m[I 2023-07-03 18:23:01,273][0m Trial 0 finished with value: -inf and parameters: {'model': <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>, 'model__hidden_layer_sizes': (100, 100, 100), 'model__activation': 'relu', 'model__solver': 'sgd', 'model__batch_size': 16, 'model__learning_rate': 0.01}. Best is trial 0 with value: -inf.[0m
[32m[I 2023-07-03 18:23:01,275][0m Trial 1 finished with value: -inf and parameters: {'model': <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>, 'model__hidden_layer_sizes': (100, 100, 100), 'model__activation': 'identity', 'model__solver': 'adam', 'model__batch_size': 256, 'model__learning_rate': 0.001}. Best is trial 0 with value: -inf.[0m


CategoricalDistribution does not support dynamic value space.
CategoricalDistribution does not support dynamic value space.
2023-07-03 18:23:01,276 — INFO — Standardizer CustomStandardizer initialized with -1 jobs.


[33m[W 2023-07-03 18:23:02,477][0m Trial 2 failed with parameters: {'model': <class 'sklearn.ensemble._forest.ExtraTreesClassifier'>, 'model__criterion': 'entropy', 'model__max_depth': 170, 'model__min_samples_split': 10, 'model__min_samples_leaf': 8, 'standardizer': 'custom_standardizer', 'standardization_type': 'heavy_standardisation', '1D_featurizer': 'maccs', 'feature_selector': 'pass_through_transformer'} because of the following error: TypeError("fit() missing 1 required positional argument: 'y'").[0m
Traceback (most recent call last):
  File "/home/bisbii/anaconda3/envs/sm_precursor_predictor/lib/python3.9/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/home/bisbii/anaconda3/envs/sm_precursor_predictor/lib/python3.9/site-packages/deepmol/pipeline_optimization/objective_wrapper.py", line 64, in __call__
    pipeline.fit(train_dataset)
  File "/home/bisbii/anaconda3/envs/sm_precursor_predictor/lib/python3.9/site-packa

TypeError: fit() missing 1 required positional argument: 'y'

In [10]:
pipeline.evaluate(test, metrics=[Metric(f1_score_macro)], per_task_metrics=True)

({'f1_score_macro': 0.6312369289383678},
 {'f1_score_macro': [0.9427942794279429,
   0.9131830732678354,
   0.9213231476674331,
   0.9014936247723133,
   0.8611893815808986,
   0.8394648829431439,
   0.9006720905553591,
   0.7346938775510203,
   1.0,
   0.4926829268292683,
   0.6701902748414377,
   0.8275290215588722,
   0.8317152103559871,
   0.6821731748726656,
   0.4951456310679612,
   0.8768064439706231,
   0.7287742683280208,
   0.8723613156602849,
   0.8733766233766234,
   0.7832831673998611]})

In [12]:
from sklearn.metrics import f1_score
from deepmol.metrics import Metric
from deepmol.pipeline_optimization import PipelineOptimization

po = PipelineOptimization(direction='maximize', study_name='test_pipeline', storage='sqlite:///test_pipeline.db')
metric = Metric(f1_score_macro)

po.optimize(train_dataset=train, test_dataset=test, objective_steps='sklearn',
            metric=metric, n_trials=10, data=train, save_top_n=2)

[32m[I 2023-07-03 17:26:05,848][0m A new study created in RDB with name: test_pipeline[0m
[32m[I 2023-07-03 17:26:05,938][0m Trial 0 finished with value: -inf and parameters: {'1D_featurizer': 'layered', 'fpSize': 2048, 'minPath': 3, 'maxPath': 5, 'feature_selector': 'select_from_model_fs', 'n_estimators': 799}. Best is trial 0 with value: -inf.[0m
[32m[I 2023-07-03 17:26:05,995][0m Trial 1 finished with value: -inf and parameters: {'1D_featurizer': 'morgan', 'radius': 6, 'n_bits': 1024, 'feature_selector': 'pass_through_transformer'}. Best is trial 0 with value: -inf.[0m
[32m[I 2023-07-03 17:26:06,036][0m Trial 2 finished with value: -inf and parameters: {'1D_featurizer': 'maccs', 'feature_selector': 'pass_through_transformer'}. Best is trial 0 with value: -inf.[0m


['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification']
Unknown task type: ['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification']
['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'class

[32m[I 2023-07-03 17:26:06,106][0m Trial 3 finished with value: -inf and parameters: {'1D_featurizer': 'morgan', 'radius': 4, 'n_bits': 2048, 'feature_selector': 'percentil_fs', 'percentile': 40}. Best is trial 0 with value: -inf.[0m
[32m[I 2023-07-03 17:26:06,174][0m Trial 4 finished with value: -inf and parameters: {'1D_featurizer': 'morgan', 'radius': 2, 'n_bits': 2048, 'feature_selector': 'boruta_algorithm', 'support_weak': False}. Best is trial 0 with value: -inf.[0m
[32m[I 2023-07-03 17:26:06,233][0m Trial 5 finished with value: -inf and parameters: {'1D_featurizer': 'mixed', 'f1': '2d_descriptors', 'f2': 'rdk', 'feature_selector': 'pass_through_transformer'}. Best is trial 0 with value: -inf.[0m


Unknown task type: ['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification']
['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification']
Unknown task type: ['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'clas

[32m[I 2023-07-03 17:26:06,311][0m Trial 6 finished with value: -inf and parameters: {'1D_featurizer': 'atom_pair', 'nBits': 1024, 'minLength': 1, 'maxLength': 20, 'feature_selector': 'percentil_fs', 'percentile': 92}. Best is trial 0 with value: -inf.[0m
[32m[I 2023-07-03 17:26:06,396][0m Trial 7 finished with value: -inf and parameters: {'1D_featurizer': 'layered', 'fpSize': 1024, 'minPath': 2, 'maxPath': 9, 'feature_selector': 'percentil_fs', 'percentile': 40}. Best is trial 0 with value: -inf.[0m
[32m[I 2023-07-03 17:26:06,479][0m Trial 8 finished with value: -inf and parameters: {'1D_featurizer': 'atom_pair', 'nBits': 1024, 'minLength': 3, 'maxLength': 20, 'feature_selector': 'boruta_algorithm', 'support_weak': True}. Best is trial 0 with value: -inf.[0m


Unknown task type: ['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification']
['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification']
Unknown task type: ['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'clas

[32m[I 2023-07-03 17:26:06,570][0m Trial 9 finished with value: -inf and parameters: {'1D_featurizer': 'atom_pair', 'nBits': 1024, 'minLength': 3, 'maxLength': 20, 'feature_selector': 'k_best', 'k': 21}. Best is trial 0 with value: -inf.[0m


Unknown task type: ['classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'classification']
