In [1]:
# relevant imports

from optunaz.three_step_opt_build_merge import (
    optimize,
    buildconfig_best,
    build_best,
    build_merged,
)
from optunaz.config import ModelMode, OptimizationDirection
from optunaz.config.optconfig import RandomForestClassifier, KNeighborsClassifier, LogisticRegression, ChemPropClassifier
from optunaz.config.optconfig import OptimizationConfig
from optunaz.datareader import Dataset
from optunaz.descriptors import ECFP, MACCS_keys, ECFP_counts, PathFP
import pickle

# hyperparameter optimisation
#
# The pieces of the optimisation setup are built one at a time and then
# combined into a single OptimizationConfig.

# Data: the "Structure" column holds the input (SMILES) and "Class" is the
# response the model learns to predict. The test file is supplied separately
# from the training file and is kept hidden during optimisation.
TRAINING_CSV = "M:/ML_scripts/notebooks/tobramycin_undersampled_train.csv"
TEST_CSV = "M:/ML_scripts/notebooks/tobramycin_undersampled_test.csv"
dataset = Dataset(
    input_column="Structure",
    response_column="Class",
    training_dataset_file=TRAINING_CSV,
    test_dataset_file=TEST_CSV,
)

# Descriptors the search may choose between (QSARtuna also has internal
# physchem descriptors, which are not used here).
candidate_descriptors = [
    ECFP.new(),
    MACCS_keys.new(),
    PathFP.new(),
]

# Algorithms the search may choose between. Only the random forest has a
# customised range: its number of estimators is searched between 10 and 100.
rf_n_estimators = RandomForestClassifier.Parameters.RandomForestClassifierParametersNEstimators(
    low=10,
    high=100,
)
candidate_algorithms = [
    RandomForestClassifier.new(n_estimators=rf_n_estimators),
    KNeighborsClassifier.new(),
    LogisticRegression.new(),
    ChemPropClassifier.new(),
]

# Run settings for the study.
search_settings = OptimizationConfig.Settings(
    mode=ModelMode.CLASSIFICATION,
    # number of 'mini' train/test splits used for validation
    cross_validation=5,
    # total number of hyperparameter optimisation trials
    n_trials=100,
    # initial exploratory, random trials (no optimisation in those trials)
    n_startup_trials=40,
    # parallel jobs; -1 uses as many CPU cores as are available
    n_jobs=-1,
    direction=OptimizationDirection.MAXIMIZATION,
    # seed for reproducibility
    random_seed=42,
)

config = OptimizationConfig(
    data=dataset,
    descriptors=candidate_descriptors,
    algorithms=candidate_algorithms,
    settings=search_settings,
)

In [2]:
# Run the Optuna optimisation study described by `config`, under the name
# 'run1'. One log line is emitted per trial while the search runs.
study = optimize(
    config,
    study_name='run1',
)

[I 2025-01-27 09:55:17,153] A new study created in memory with name: run1
[I 2025-01-27 09:55:17,308] A new study created in memory with name: study_name_0
[I 2025-01-27 09:55:18,923] Trial 0 finished with value: 0.6465952800543215 and parameters: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 6, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 0 with value: 0.6465952800543215.
[I 2025-01-27 09:55:19,502] Trial 1 finished with value: 0.6326620314734288 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 

Duplicated trial: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 10, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "MACCS_keys", "parameters": {}}'}, return [0.5663154155350739]


[I 2025-01-27 09:55:23,711] Trial 7 finished with value: 0.5407329684953791 and parameters: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 2, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "PathFP", "parameters": {"maxPath": 3, "fpSize": 2048}}'}. Best is trial 3 with value: 0.6595327164541944.
[I 2025-01-27 09:55:24,289] Trial 8 finished with value: 0.556198274332163 and parameters: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 3, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'd

Duplicated trial: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 4, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "MACCS_keys", "parameters": {}}'}, return [0.5504615683802662]


[I 2025-01-27 09:55:25,318] Trial 11 finished with value: 0.6022605403260524 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 12, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 76, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "MACCS_keys", "parameters": {}}'}. Best is trial 3 with value: 0.6595327164541944.
[I 2025-01-27 09:55:25,535] Trial 12 finished with value: 0.6397594928697388 and parameters: {'algorithm_name': 'LogisticRegression', 'LogisticRegression_algorithm_hash': '8908f22fafa8a855aeb58e3dc9f9ce8e', 'solver__8908f22fafa8a855aeb58e3dc9f9ce8e': 'lbfgs', 'C__8908f22fafa8a855aeb58e3dc9f9ce8e': 1.0, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 3 with value: 0.6595327164541944.
[I 2025-01-27 09:55:25,541] Tr

Duplicated trial: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 6, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}, return [0.6465952800543215]


[I 2025-01-27 09:55:25,935] Trial 14 finished with value: 0.6670249135127893 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 10, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 24, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 14 with value: 0.6670249135127893.
[I 2025-01-27 09:55:26,276] Trial 15 finished with value: 0.6023698968393871 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 29, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 59, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "MACCS

Duplicated trial: {'algorithm_name': 'LogisticRegression', 'LogisticRegression_algorithm_hash': '8908f22fafa8a855aeb58e3dc9f9ce8e', 'solver__8908f22fafa8a855aeb58e3dc9f9ce8e': 'lbfgs', 'C__8908f22fafa8a855aeb58e3dc9f9ce8e': 1.0, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}, return [0.6397594928697388]


[I 2025-01-27 09:55:26,787] Trial 17 finished with value: 0.512960361594594 and parameters: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 6, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "PathFP", "parameters": {"maxPath": 3, "fpSize": 2048}}'}. Best is trial 14 with value: 0.6670249135127893.
[I 2025-01-27 09:55:27,100] Trial 18 finished with value: 0.6152854102054451 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 11, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 35, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": 

Duplicated trial: {'algorithm_name': 'LogisticRegression', 'LogisticRegression_algorithm_hash': '8908f22fafa8a855aeb58e3dc9f9ce8e', 'solver__8908f22fafa8a855aeb58e3dc9f9ce8e': 'saga', 'C__8908f22fafa8a855aeb58e3dc9f9ce8e': 1.0, 'descriptor': '{"name": "PathFP", "parameters": {"maxPath": 3, "fpSize": 2048}}'}, return [0.4973096645831836]


[I 2025-01-27 09:55:30,079] Trial 27 finished with value: 0.5596798859694294 and parameters: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 6, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "MACCS_keys", "parameters": {}}'}. Best is trial 14 with value: 0.6670249135127893.
[I 2025-01-27 09:55:30,327] Trial 28 finished with value: 0.5773975086214922 and parameters: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 9, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name"

Duplicated trial: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 4, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "MACCS_keys", "parameters": {}}'}, return [0.5504615683802662]


[I 2025-01-27 09:55:31,089] Trial 31 finished with value: 0.6607281133976857 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 11, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 45, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 14 with value: 0.6670249135127893.
[I 2025-01-27 09:55:31,100] Trial 32 pruned. Duplicate parameter set


Duplicated trial: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 6, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "PathFP", "parameters": {"maxPath": 3, "fpSize": 2048}}'}, return [0.512960361594594]


[I 2025-01-27 09:55:31,386] Trial 33 finished with value: 0.4937040691599487 and parameters: {'algorithm_name': 'LogisticRegression', 'LogisticRegression_algorithm_hash': '8908f22fafa8a855aeb58e3dc9f9ce8e', 'solver__8908f22fafa8a855aeb58e3dc9f9ce8e': 'newton-cg', 'C__8908f22fafa8a855aeb58e3dc9f9ce8e': 1.0, 'descriptor': '{"name": "PathFP", "parameters": {"maxPath": 3, "fpSize": 2048}}'}. Best is trial 14 with value: 0.6670249135127893.
[I 2025-01-27 09:55:31,395] Trial 34 pruned. Duplicate parameter set
[I 2025-01-27 09:55:31,405] Trial 35 pruned. Duplicate parameter set
[I 2025-01-27 09:55:31,415] Trial 36 pruned. Duplicate parameter set
[I 2025-01-27 09:55:31,425] Trial 37 pruned. Duplicate parameter set


Duplicated trial: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 6, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}, return [0.6465952800543215]
Duplicated trial: {'algorithm_name': 'KNeighborsClassifier', 'KNeighborsClassifier_algorithm_hash': 'e51ca55089f389fc37a736adb2aa0e42', 'metric__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsMetric.MINKOWSKI: 'minkowski'>, 'n_neighbors__e51ca55089f389fc37a736adb2aa0e42': 4, 'weights__e51ca55089f389fc37a736adb2aa0e42': <KNeighborsWeights.UNIFORM: 'uniform'>, 'descriptor': '{"name": "MACCS_keys", "parameters": {}}'}, return [0.5504615683802662]
Duplicated trial: {'algorithm_name': 'KNeighborsClassifier', 'KN

[I 2025-01-27 09:55:31,850] Trial 38 finished with value: 0.6414880741469204 and parameters: {'algorithm_name': 'LogisticRegression', 'LogisticRegression_algorithm_hash': '8908f22fafa8a855aeb58e3dc9f9ce8e', 'solver__8908f22fafa8a855aeb58e3dc9f9ce8e': 'sag', 'C__8908f22fafa8a855aeb58e3dc9f9ce8e': 1.0, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 14 with value: 0.6670249135127893.
[I 2025-01-27 09:55:32,256] Trial 39 finished with value: 0.5087808952355071 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 14, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 25, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "PathFP", "parameters": {"maxPath": 3, "fpSize": 2048}}'}. Best is trial 14 with value: 0.6670249135127893.
[I 202

Duplicated trial: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 6, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 20, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}, return [0.6544957865207367]


[I 2025-01-27 09:55:37,208] Trial 55 finished with value: 0.658077597678162 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 6, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 42, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 52 with value: 0.6739880025512673.
[I 2025-01-27 09:55:37,538] Trial 56 finished with value: 0.6738241342840838 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 9, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 20, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "

Duplicated trial: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 7, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 23, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}, return [0.6733896554270584]


[I 2025-01-27 09:55:40,341] Trial 64 finished with value: 0.666148133324753 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 10, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 100, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 61 with value: 0.681029655104861.
[I 2025-01-27 09:55:40,934] Trial 65 finished with value: 0.667859611918311 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 10, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 92, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", 

Duplicated trial: {'algorithm_name': 'LogisticRegression', 'LogisticRegression_algorithm_hash': '8908f22fafa8a855aeb58e3dc9f9ce8e', 'solver__8908f22fafa8a855aeb58e3dc9f9ce8e': 'lbfgs', 'C__8908f22fafa8a855aeb58e3dc9f9ce8e': 1.0, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}, return [0.6397594928697388]


[I 2025-01-27 09:55:51,645] Trial 89 finished with value: 0.6746597972036243 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 11, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 18, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 61 with value: 0.681029655104861.
[I 2025-01-27 09:55:51,911] Trial 90 finished with value: 0.6070579289959466 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 13, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 19, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "MACCS_

Duplicated trial: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 13, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 18, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}, return [0.6743357736162032]


[I 2025-01-27 09:55:52,941] Trial 94 finished with value: 0.6595718292881894 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 10, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 30, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}. Best is trial 61 with value: 0.681029655104861.
[I 2025-01-27 09:55:53,296] Trial 95 finished with value: 0.6741689603054075 and parameters: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 12, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 22, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP",

Duplicated trial: {'algorithm_name': 'RandomForestClassifier', 'RandomForestClassifier_algorithm_hash': '81728f5b173c58aa046e6327faa7c4b5', 'max_depth__81728f5b173c58aa046e6327faa7c4b5': 9, 'n_estimators__81728f5b173c58aa046e6327faa7c4b5': 13, 'max_features__81728f5b173c58aa046e6327faa7c4b5': <RandomForestMaxFeatures.AUTO: 'auto'>, 'descriptor': '{"name": "ECFP", "parameters": {"radius": 3, "nBits": 2048, "returnRdkit": false}}'}, return [0.6674103921040592]


In [3]:
# Derive the build configuration of the best trial in the study, then build a
# model with those optimised parameters and save it to "./test_classifier.pkl".
best_buildconfig = buildconfig_best(study)
build_best(best_buildconfig, "./test_classifier.pkl")

BuildConfig(data=Dataset(training_dataset_file='M:/ML_scripts/notebooks/tobramycin_undersampled_train.csv', input_column='Structure', response_column='Class', response_type='classification', aux_column=None, aux_transform=None, deduplication_strategy=KeepMedian(name='KeepMedian'), split_strategy=NoSplitting(name='NoSplitting'), test_dataset_file='M:/ML_scripts/notebooks/tobramycin_undersampled_test.csv', save_intermediate_files=False, intermediate_training_dataset_file=None, intermediate_test_dataset_file=None, log_transform=False, log_transform_base=None, log_transform_negative=None, log_transform_unit_conversion=None, probabilistic_threshold_representation=False, probabilistic_threshold_representation_threshold=None, probabilistic_threshold_representation_std=None, _sets_initialized=True), metadata=BuildConfig.Metadata(name='', cross_validation=5, shuffle=False, best_trial=61, best_value=0.681029655104861, n_trials=100, visualization=None), descriptor=ECFP(name='ECFP', parameters=ECF

In [4]:
# Load the model that the build step saved to "./test_classifier.pkl".
# The same relative path is used for reading as was used for writing, so the
# file loaded is always the one just built, regardless of which directory the
# kernel was started in (the previous absolute path only matched when the
# working directory happened to be that folder).
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# pickles produced by this notebook or another trusted source.
with open("./test_classifier.pkl", "rb") as model_file:
    run1_model = pickle.load(model_file)

# Use the model on an unseen SMILES string.
# The call returns an array with one continuous score per input (for example
# array([0.89112007])), not a hard 0/1 label.
# NOTE(review): presumably this is the predicted probability of class 1 -
# confirm, and apply a threshold if a class label is needed.
run1_model.predict_from_smiles('Cc1cc(c(o1)C)C(=O)[N@]2CCCCNC(=O)[C@@H]3C[C@@H](C[N@]3C(=O)Cc4c[n@](c5c4cccc5)C)[N@@](CCC2)C(=O)c6ccccc6')

array([0.89112007])

In [5]:
# Print the loaded model, just to see its metrics.
print(run1_model)


# Scores noted for this model:
#   train: accuracy=0.89, f1=0.9,    neg_brier_score=-0.12, roc_auc=0.96
#   test:  accuracy=0.57, f1=0.5333, neg_brier_score=-0.25, roc_auc=0.62
# For neg_brier_score, closer to 0 is better.
# NOTE(review): the train scores are far higher than the test scores, which
# suggests the model overfits the training set - confirm before relying on it.

QSARtunaModel(predictor=RandomForestClassifier(class_weight='balanced', max_depth=7, max_features=1.0,
                       n_estimators=19, n_jobs=-1, random_state=42), descriptor=ECFP(name='ECFP', parameters=ECFP.Parameters(radius=3, nBits=2048, returnRdkit=False)), mode=<ModelMode.CLASSIFICATION: 'classification'>, transform=None, aux_transform=None, metadata={'name': '', 'buildconfig': {'data': {'training_dataset_file': 'M:/ML_scripts/notebooks/tobramycin_undersampled_train.csv', 'input_column': 'Structure', 'response_column': 'Class', 'response_type': 'classification', 'deduplication_strategy': {'name': 'KeepMedian'}, 'split_strategy': {'name': 'NoSplitting'}, 'test_dataset_file': 'M:/ML_scripts/notebooks/tobramycin_undersampled_test.csv', 'save_intermediate_files': False, 'log_transform': False, 'log_transform_base': None, 'log_transform_negative': None, 'log_transform_unit_conversion': None, 'probabilistic_threshold_representation': False, 'probabilistic_threshold_representati