In [1]:
# First, install all the required packages.
# we recommend using Python version 3.10. Additionally, please ensure that you have CUDA (version >10.1) and GCC (version 7.*) installed, as GraphDot relies on these components.
!pip install numpy==1.22.3 git+https://gitlab.com/Xiangyan93/graphdot.git@feature/xy git+https://github.com/bp-kelley/descriptastorus git+https://github.com/Xiangyan93/chemprop.git@molalkit
!pip install mgktools
!pip install -e ...

Collecting git+https://gitlab.com/Xiangyan93/graphdot.git@feature/xy
  Cloning https://gitlab.com/Xiangyan93/graphdot.git (to revision feature/xy) to /tmp/pip-req-build-swl4yrgw
  Running command git clone -q https://gitlab.com/Xiangyan93/graphdot.git /tmp/pip-req-build-swl4yrgw
  Running command git checkout -b feature/xy --track origin/feature/xy
  Switched to a new branch 'feature/xy'
  Branch 'feature/xy' set up to track remote branch 'feature/xy' from 'origin'.
  Resolved https://gitlab.com/Xiangyan93/graphdot.git to commit 45a7e9ad53fd4748852b2ce78cad8927cc62e454
Collecting git+https://github.com/bp-kelley/descriptastorus
  Cloning https://github.com/bp-kelley/descriptastorus to /tmp/pip-req-build-d3flgngc
  Running command git clone -q https://github.com/bp-kelley/descriptastorus /tmp/pip-req-build-d3flgngc
  Resolved https://github.com/bp-kelley/descriptastorus to commit da9760932ab9a78b116bc697795dd9e1798f087a
Collecting git+https://github.com/Xiangyan93/chemprop.gi

In [2]:
# MolALKit ships with an executable, "molalkit_run", which makes it easy to run active learning directly from a Linux terminal.
# The command below prints the help message of "molalkit_run", listing the arguments it accepts.
# The leading "!" tells Jupyter to hand the line to the system shell instead of the Python interpreter.
!molalkit_run --help

usage: molalkit_run --save_dir SAVE_DIR [--n_jobs N_JOBS]
                    [--data_path DATA_PATH] --metrics
                    [{roc-auc,accuracy,precision,recall,f1_score,mcc,rmse,mae,mse,r2,max} ...]
                    [--evaluate_stride EVALUATE_STRIDE] [--top_k TOP_K]
                    [--yoked_learning_only] --learning_type
                    {passive,explorative,exploitive,PI,EI,UCB}
                    [--exploitive_target EXPLOITIVE_TARGET]
                    [--init_size INIT_SIZE] [--batch_size BATCH_SIZE]
                    [--batch_mode {naive,clustering}]
                    [--stop_ratio STOP_RATIO] [--stop_size STOP_SIZE]
                    [--max_iter MAX_ITER] [--save_cpt_stride SAVE_CPT_STRIDE]
                    [--load_checkpoint]
                    [--forget_protocol {forget_first,forget_random,min_oob_uncertainty,max_oob_uncertainty,min_oob_error,min_loo_error}]
                    [--forget_cutoff FORGET_CUTOFF]
                    [--

In [10]:
# Goal: run explorative active learning on the BACE classification dataset.
# - The dataset is divided by a scaffold split (50:50 ratio) into two sets: one for active learning data selection and the other for validation.
# - A random forest is the surrogate model, and RDKit features serve as the molecular descriptors.
# - Performance is evaluated every 10 iterations of active learning, using ROC-AUC as the metric.
# - The random seed is set to 0, and the results are stored in the 'bace' directory.
# - `--n_jobs 4` requests 4 parallel workers.
# Here's the command to achieve this:
!molalkit_run --data_public bace --metrics roc-auc --learning_type explorative --model_config_selector RandomForest_RDKitNorm_Config --split_type scaffold_order --split_sizes 0.5 0.5 --evaluate_stride 10 --seed 0 --save_dir bace --n_jobs 4

100%|█████████████████████████████████████| 1513/1513 [00:00<00:00, 2414.83it/s]
Total scaffolds = 671 | 
split 0 scaffolds = 75 | 
split 1 scaffolds = 596 | 
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  68 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done 668 tasks      | elapsed:   10.0s
[Parallel(n_jobs=4)]: Done 748 out of 755 | elapsed:   11.0s remaining:    0.1s
[Parallel(n_jobs=4)]: Done 755 out of 755 | elapsed:   11.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   2 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 728 tasks      | elapsed:    9.3s
[Parallel(n_jobs=4)]: Done 756 out of 756 | elapsed:    9.6s finished
Start a new active learning run.
[Parallel(n_jobs=4)]: Using backe

In [1]:
from molalkit.args import ActiveLearningArgs
from molalkit.al.learner import ActiveLearner


# Each tuple is one command-line flag followed by its value(s). These are the
# same flags one would pass to the `molalkit_run` executable; presumably this
# cell is the in-Python equivalent of that command -- confirm against the CLI.
flag_groups = [
    ('--data_public', 'bace'),
    ('--metrics', 'roc-auc'),
    ('--learning_type', 'explorative'),
    ('--model_config_selector', 'RandomForest_RDKitNorm_Config'),
    ('--split_type', 'scaffold_order'),
    ('--split_sizes', '0.5', '0.5'),
    ('--evaluate_stride', '10'),
    ('--seed', '0'),
    ('--save_dir', 'bace'),
    ('--n_jobs', '4'),
]
# Flatten the tuples into the flat argv-style list that the parser consumes.
arguments = [token for group in flag_groups for token in group]

# Parse the flag list into an argument object whose attributes are read below.
args = ActiveLearningArgs().parse_args(arguments)

# Collect every constructor keyword in one place; each value is taken
# straight from the parsed arguments (note that some keyword names differ
# from the attribute names, e.g. `dataset_*` vs. `data_*`, `kernel` vs.
# `kernel_selector`).
learner_kwargs = dict(
    save_dir=args.save_dir,
    selection_method=args.selection_method,
    forgetter=args.forgetter,
    model_selector=args.model_selector,
    dataset_train_selector=args.data_train_selector,
    dataset_pool_selector=args.data_pool_selector,
    dataset_val_selector=args.data_val_selector,
    metrics=args.metrics,
    top_k_id=args.top_k_id,
    model_evaluators=args.model_evaluators,
    dataset_train_evaluators=args.data_train_evaluators,
    dataset_pool_evaluators=args.data_pool_evaluators,
    dataset_val_evaluators=args.data_val_evaluators,
    yoked_learning_only=args.yoked_learning_only,
    stop_size=args.stop_size,
    evaluate_stride=args.evaluate_stride,
    kernel=args.kernel_selector,
    save_cpt_stride=args.save_cpt_stride,
    seed=args.seed,
    logger=args.logger,
)
active_learner = ActiveLearner(**learner_kwargs)

# Start the active learning loop, bounded by the parsed `max_iter` value.
active_learner.run(max_iter=args.max_iter)

100%|██████████| 1513/1513 [00:00<00:00, 2199.25it/s]
Total scaffolds = 671 | 
split 0 scaffolds = 75 | 
split 1 scaffolds = 596 | 
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 