In [1]:
# Print the versions of the CUDA compiler (nvcc) and of GCC available in this
# environment. The install cell below states that GraphDot relies on both.
!nvcc -V
!gcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [2]:
# First, install all the required packages.
# we recommend using Python version 3.10. Additionally, please ensure that you have CUDA (version >10.1) and GCC (version 7.*) installed, as GraphDot relies on these components.
!pip install numpy==1.22.3 git+https://gitlab.com/Xiangyan93/graphdot.git@feature/xy git+https://github.com/bp-kelley/descriptastorus git+https://github.com/Xiangyan93/chemprop.git@molalkit
!pip install mgktools molalkit

Collecting git+https://gitlab.com/Xiangyan93/graphdot.git@feature/xy
  Cloning https://gitlab.com/Xiangyan93/graphdot.git (to revision feature/xy) to /tmp/pip-req-build-17c80q40
  Running command git clone --filter=blob:none --quiet https://gitlab.com/Xiangyan93/graphdot.git /tmp/pip-req-build-17c80q40
  Running command git checkout -b feature/xy --track origin/feature/xy
  Switched to a new branch 'feature/xy'
  Branch 'feature/xy' set up to track remote branch 'feature/xy' from 'origin'.
  Resolved https://gitlab.com/Xiangyan93/graphdot.git to commit a092ee06670620f2ebce73530d7d16a2cef1cde3
  Preparing metadata (setup.py) ... done
Collecting git+https://github.com/bp-kelley/descriptastorus
  Cloning https://github.com/bp-kelley/descriptastorus to /tmp/pip-req-build-7nkr4o8p
  Running command git clone --filter=blob:none --quiet https://github.com/bp-kelley/descriptastorus /tmp/pip-req-build-7nkr4o8p
  Resolved https://github.com/bp-kelley/descriptastorus to commit da97609

Collecting mgktools
  Using cached mgktools-0.1.0-py3-none-any.whl (73 kB)
Collecting molalkit
  Using cached molalkit-0.0.1-py3-none-any.whl (2.5 MB)
Collecting rxntools>=0.0.2 (from mgktools)
  Using cached rxntools-0.0.2-py3-none-any.whl (14 kB)
Installing collected packages: rxntools, molalkit, mgktools
Successfully installed mgktools-0.1.0 molalkit-0.0.1 rxntools-0.0.2


In [3]:
# MolALKit ships with an executable, "molalkit_run", for running active learning from a Linux terminal.
# To see its help message and the available arguments, run:
!molalkit_run --help

usage: molalkit_run --save_dir SAVE_DIR [--n_jobs N_JOBS] [--data_path DATA_PATH] --metrics
                    [{roc-auc,accuracy,precision,recall,f1_score,mcc,rmse,mae,mse,r2,max} ...]
                    [--evaluate_stride EVALUATE_STRIDE] [--top_k TOP_K] [--yoked_learning_only]
                    --learning_type {passive,explorative,exploitive}
                    [--exploitive_target EXPLOITIVE_TARGET] [--init_size INIT_SIZE]
                    [--batch_size BATCH_SIZE] [--batch_mode {naive,clustering}]
                    [--stop_ratio STOP_RATIO] [--stop_size STOP_SIZE] [--max_iter MAX_ITER]
                    [--save_cpt_stride SAVE_CPT_STRIDE] [--load_checkpoint]
                    [--forget_protocol {forget_first,forget_random,min_oob_uncertainty,max_oob_uncertainty,min_oob_error,min_loo_error}]
                    [--forget_cutoff FORGET_CUTOFF] [--forget_size FORGET_SIZE]
                    [--forget_ratio FORGET_RATIO] [--data_public DATA_PUBLIC]
                    [

In [4]:
# Run explorative active learning on the BACE classification dataset.
#   * The data are divided into two sets with a scaffold split (50:50 ratio):
#     one for active learning data selection and the other for validation.
#   * A random forest is the surrogate model, with RDKit features as the
#     molecular descriptors.
#   * Performance is evaluated every 10 iterations of active learning, using
#     ROC-AUC as the metric.
#   * The random seed is set to 0 and the results are stored in the 'bace'
#     directory.
# Here is the command to achieve this:
!molalkit_run --data_public bace --metrics roc-auc --learning_type explorative --model_config_selector RandomForest_RDKitNorm_Config --split_type scaffold_order --split_sizes 0.5 0.5 --evaluate_stride 10 --seed 0 --save_dir bace --n_jobs 4

100% 1513/1513 [00:01<00:00, 775.49it/s]
Total scaffolds = 671 | 
split 0 scaffolds = 75 | 
split 1 scaffolds = 596 | 
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   22.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   32.7s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   48.6s
[Parallel(n_jobs=4)]: Done 755 out of 755 | elapsed:  1.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   2 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   29.4s
[Parallel(n_jobs=4)]: Done 756 out of 756 | elapsed:   48.0s finished
Start a new active learning run.
[Parallel(n_jobs=4)]: Using backend LokyBackend

In [5]:
from molalkit.args import ActiveLearningArgs
from molalkit.al.learner import ActiveLearner


# Run the same experiment through the Python API instead of the executable.
# The options below mirror the flags passed to `molalkit_run` earlier in this
# notebook; each tuple holds one flag followed by its value(s).
cli_options = (
    ('--data_public', 'bace'),
    ('--metrics', 'roc-auc'),
    ('--learning_type', 'explorative'),
    ('--model_config_selector', 'RandomForest_RDKitNorm_Config'),
    ('--split_type', 'scaffold_order'),
    ('--split_sizes', '0.5', '0.5'),
    ('--evaluate_stride', '10'),
    ('--seed', '0'),
    ('--save_dir', 'bace'),
    ('--n_jobs', '4'),
)
# Flatten into the token list that the argument parser consumes.
arguments = [token for option in cli_options for token in option]
args = ActiveLearningArgs().parse_args(arguments)

# Every constructor argument is read from the parsed `args` object.
learner_settings = dict(
    save_dir=args.save_dir,
    selection_method=args.selection_method,
    forgetter=args.forgetter,
    model_selector=args.model_selector,
    dataset_train_selector=args.data_train_selector,
    dataset_pool_selector=args.data_pool_selector,
    dataset_val_selector=args.data_val_selector,
    metrics=args.metrics,
    top_k_id=args.top_k_id,
    model_evaluators=args.model_evaluators,
    dataset_train_evaluators=args.data_train_evaluators,
    dataset_pool_evaluators=args.data_pool_evaluators,
    dataset_val_evaluators=args.data_val_evaluators,
    yoked_learning_only=args.yoked_learning_only,
    stop_size=args.stop_size,
    evaluate_stride=args.evaluate_stride,
    kernel=args.kernel_selector,
    save_cpt_stride=args.save_cpt_stride,
    seed=args.seed,
    logger=args.logger,
)
active_learner = ActiveLearner(**learner_settings)

# Start the active learning loop, bounded by the parsed `max_iter` value.
active_learner.run(max_iter=args.max_iter)

100%|██████████| 1513/1513 [00:00<00:00, 1663.94it/s]
Total scaffolds = 671 | 
split 0 scaffolds = 75 | 
split 1 scaffolds = 596 | 
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   21.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   31.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   46.9s
[Parallel(n_jobs=4)]: Done 755 out of 755 | elapsed:  1.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   2 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:   24.7s
[Parallel(n_jobs=4)]: Done 749 out of 756 | elapsed:   46.2s remaining:    0.4s
[Parallel(n_jobs=4)]: Done 756 out of 756 | elapsed:   46.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend 