# Active Learning

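This notebook demonstrates a simple active-learning loop for virtual screening. A virtual library is enumerated from three sets of building blocks, a random sample of it is scored with a slow (here simulated) scoring function, and a random forest regressor trained on Morgan fingerprints predicts scores for the whole library. In each round, the top-predicted compounds that have not yet been scored are sent to the slow function and the model is retrained.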

## Imports

In [1]:
import time
from pathlib import Path
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import math
from tqdm import tqdm
from itertools import product
import logging
import pandas as pd
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor

# Configure the root logger: INFO and above, timestamped, through a stream handler.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()],
)

# Logger used by the functions in this notebook.
log = logging.getLogger(__name__)

# Only let WARNING and above through from the "deepchem" logger.
logging.getLogger("deepchem").setLevel(logging.WARNING)

## Defining some useful functions

In [2]:
def build_virtual_library() -> pd.DataFrame:
    """Constructs a virtual library by coupling building blocks from the input smi files.

    Reads ``data/{type}_100.smi`` for each of the three building-block types,
    runs the three-component reaction on every combination of one building
    block per type, and keeps the first molecule of the first product set for
    each combination that reacts.

    Returns:
        pd.DataFrame: One row per product, indexed by the product SMILES, with
            the RDKit molecule in ``mol`` and two NaN-initialised columns,
            ``slow_scores`` and ``model_scores``.
    """
    log.info("Building virtual library")
    reaction_smarts = (
        "N[c:4][c:3]C(O)=O.[#6:1][NH2].[#6:2]C(=O)[OH]>>[C:2]c1n[c:4][c:3]c(=O)n1[C:1]"
    )
    bb_types = ["aminobenzoic", "carboxylic_acids", "primary_amines"]
    rxn = AllChem.ReactionFromSmarts(reaction_smarts)

    # Parse every building block exactly once. Parsing inside the combination
    # loop would repeat the same SMILES parse for every combination.
    building_blocks = []
    for bb in bb_types:
        mols = []
        with open(Path(f"data/{bb}_100.smi"), "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                # First field is the SMILES; anything after it is ignored.
                mol = Chem.MolFromSmiles(fields[0])
                if mol is None:
                    log.warning("Skipping unparsable SMILES in %s: %s", bb, fields[0])
                    continue
                mols.append(mol)
        building_blocks.append(mols)

    total_prods = math.prod(len(x) for x in building_blocks)

    product_list = []
    for reagents in tqdm(product(*building_blocks), total=total_prods):
        products = rxn.RunReactants(reagents)
        if products:
            Chem.SanitizeMol(products[0][0])
            product_list.append(products[0][0])
    library = pd.DataFrame(
        product_list, index=[Chem.MolToSmiles(m) for m in product_list], columns=["mol"]
    )
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    library["slow_scores"] = np.nan
    library["model_scores"] = np.nan
    log.info("Virtual library built with %s products", len(product_list))
    return library


def slow_function(smiles: pd.Series) -> np.ndarray:
    """The slow scoring function. This function takes too long to score the entire library.

    This is a stand-in: it sleeps for 10 seconds and returns random scores.
    Only the length of the input is used, so any sized collection of SMILES
    strings is accepted.

    Args:
        smiles (pd.Series): SMILES strings to be scored.

    Returns:
        np.ndarray: One random score in [0, 1) per input SMILES string.
    """
    log.info("Scoring %s compounds with slow scoring function", len(smiles))
    # Simulate the cost of an expensive scoring call.
    time.sleep(10)
    return np.random.rand(len(smiles))


def create_morgan_fingerprints(
    library: pd.DataFrame, radius: int = 2, n_bits: int = 2048
) -> pd.DataFrame:
    """Creates Morgan fingerprints for the input library.

    Args:
        library (pd.DataFrame): The input library; molecules are read from its
            ``mol`` column.
        radius (int): Morgan fingerprint radius. Defaults to 2.
        n_bits (int): Length of the bit vector. Defaults to 2048, which is
            RDKit's default for this fingerprint.

    Returns:
        pd.DataFrame: One row per molecule, sharing the index of ``library``,
            with one column per bit named ``fp_0`` .. ``fp_{n_bits - 1}``.
    """
    log.info("Creating Morgan fingerprints")
    fps = [
        list(AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits))
        for mol in tqdm(library.mol)
    ]
    # Column names come from n_bits rather than fps[0], so an empty library
    # yields an empty frame instead of raising IndexError.
    return pd.DataFrame(
        fps, columns=[f"fp_{x}" for x in range(n_bits)], index=library.index
    )


def train_ml_model(
    library: pd.DataFrame, features: "pd.DataFrame | None" = None
) -> RandomForestRegressor:
    """Trains a random forest regressor model on slow scores for the input library.

    Args:
        library (pd.DataFrame): The input library. Rows whose ``slow_scores``
            value is not NaN form the training set.
        features (pd.DataFrame | None): Feature matrix indexed like
            ``library``. When omitted, the module-level ``fingerprints``
            frame is used, which keeps existing one-argument calls working.

    Returns:
        RandomForestRegressor: The trained random forest regressor model.

    Raises:
        ValueError: If no row of ``library`` has a slow score.
    """
    log.info("Training model with slow scores")
    if features is None:
        # Backward-compatible fallback to the notebook-level fingerprint table.
        features = fingerprints
    scored = library[library["slow_scores"].notna()]
    if scored.empty:
        raise ValueError("No compounds with slow scores to train on")
    # Select the feature rows of the scored compounds by index label.
    X = features.loc[scored.index]
    y = scored["slow_scores"]
    regressor = RandomForestRegressor()
    regressor.fit(X, y)
    return regressor


def score_library(
    library: pd.DataFrame,
    regressor: RandomForestRegressor,
    features: "pd.DataFrame | None" = None,
) -> pd.DataFrame:
    """Scores the entire library with the trained model.

    The predictions are written into the ``model_scores`` column of
    ``library`` in place; the same object is returned for convenience.

    Args:
        library (pd.DataFrame): The input library.
        regressor (RandomForestRegressor): The trained random forest regressor model.
        features (pd.DataFrame | None): Feature matrix with one row per row of
            ``library``, in the same order. When omitted, the module-level
            ``fingerprints`` frame is used, which keeps existing two-argument
            calls working.

    Returns:
        pd.DataFrame: The input library with the model scores.

    Raises:
        ValueError: If ``features`` and ``library`` have different row counts.
    """
    log.info("Scoring entire library with model")
    if features is None:
        # Backward-compatible fallback to the notebook-level fingerprint table.
        features = fingerprints
    # Predictions are assigned by position, so the row counts must agree.
    # Checking first avoids running the prediction only to fail on assignment.
    if len(features) != len(library):
        raise ValueError(
            f"Feature matrix has {len(features)} rows but library has {len(library)}"
        )
    library["model_scores"] = regressor.predict(features)
    return library

## Active learning pipeline

In [3]:
# Number of compounds sent to the slow scoring function per batch, and the
# number of active learning rounds run after the initial random batch.
BATCH_SIZE = 1000
N_ROUNDS = 5

# Create the virtual library
library = build_virtual_library()

# Create morgan fingerprints for the library
fingerprints = create_morgan_fingerprints(library)

# Select initial random sample (capped at the library size, because sampling
# more rows than exist without replacement raises)
initial_sample = library.sample(min(BATCH_SIZE, len(library)))

# Score the initial sample
initial_scores = slow_function(initial_sample.index)

# Save the slow scores
library.loc[initial_sample.index, "slow_scores"] = initial_scores

# Train the initial ML model
initial_model = train_ml_model(library)

# Use the model to score the entire virtual library
library = score_library(library, initial_model)

# Run active learning
for al_round in range(N_ROUNDS):
    log.info("Active learning round %s", al_round + 1)
    log.info("Current top score: %s", library["slow_scores"].max())
    # Select the top scoring BATCH_SIZE molecules with no slow scores
    top_1000 = (
        library[library["slow_scores"].isna()]
        .sort_values("model_scores", ascending=False)
        .head(BATCH_SIZE)
    )
    if top_1000.empty:
        # Nothing left to score: another round would only repeat the last fit.
        log.info("All compounds have slow scores; stopping early")
        break
    # Score the selected molecules with the slow function
    slow_scores = slow_function(top_1000.index)
    # Save the slow scores
    library.loc[top_1000.index, "slow_scores"] = slow_scores
    # Retrain the ML model
    model = train_ml_model(library)
    # Score the entire library with the new model
    library = score_library(library, model)

# How to evaluate the performance?

[2024-02-13 10:35:52,350] INFO: Building virtual library
100%|██████████| 1000000/1000000 [01:40<00:00, 9984.00it/s]
[2024-02-13 10:37:40,947] INFO: Virtual library built with 132500 products
[2024-02-13 10:37:40,948] INFO: Creating Morgan fingerprints
100%|██████████| 132500/132500 [01:43<00:00, 1274.46it/s]
[2024-02-13 10:40:21,788] INFO: Scoring 1000 compounds with slow function
[2024-02-13 10:40:31,819] INFO: Training model with slow scores
[2024-02-13 10:40:35,007] INFO: Scoring entire library with model
[2024-02-13 10:40:36,048] INFO: Active learning round 0
[2024-02-13 10:40:36,069] INFO: Scoring 1000 compounds with slow function
[2024-02-13 10:40:46,077] INFO: Training model with slow scores
[2024-02-13 10:40:52,400] INFO: Scoring entire library with model
[2024-02-13 10:40:53,594] INFO: Active learning round 1
[2024-02-13 10:40:53,616] INFO: Scoring 1000 compounds with slow function
[2024-02-13 10:41:03,624] INFO: Training model with slow scores
[2024-02-13 10:41:14,171] INFO:

In [5]:
# Display the ten rows with the highest slow scores.
library.sort_values(by="slow_scores", ascending=False).iloc[:10]

Unnamed: 0,mol,slow_scores,model_scores
C[C@H](NC(=O)CN)c1nc2cc(=O)[nH]cc2c(=O)n1C(=N)NC[C@H](N)C(=O)O,<rdkit.Chem.rdchem.Mol object at 0x175dda3b0>,0.999838,0.606798
COC(=O)[C@H](N)Cc1nc2c(C(=O)O)cccc2c(=O)n1C(=O)NC[C@H](N)C(=O)O,<rdkit.Chem.rdchem.Mol object at 0x2ba0512a0>,0.999586,0.901857
N[C@H](CO)c1nc2ccc3cn[nH]c3c2c(=O)n1[C@H]1CN[C@H](C(=O)O)C1,<rdkit.Chem.rdchem.Mol object at 0x29f5e6810>,0.99954,0.717455
COc1ccc2nc([C@@H](N)CO)n(C(=O)[C@H](N)CC(=O)O)c(=O)c2c1,<rdkit.Chem.rdchem.Mol object at 0x2a843e880>,0.999469,0.603959
N=C(NC[C@H](N)C(=O)O)n1c(CNC(=O)CCN)nc2nc[nH]c2c1=O,<rdkit.Chem.rdchem.Mol object at 0x286670ac0>,0.999022,0.68515
CCn1c(=O)ncc2c(=O)n([C@@](C)(CO)C(=O)O)c(Cn3cnnc3N)nc21,<rdkit.Chem.rdchem.Mol object at 0x17e0e25e0>,0.998761,0.813038
CN(C)C[C@@H](C(=O)O)n1c([C@H](N)c2nnn[nH]2)nc2[nH]c(=O)c(C#N)cc2c1=O,<rdkit.Chem.rdchem.Mol object at 0x29f543840>,0.998732,0.850484
C[C@@H](O)[C@@H](C(=O)O)n1c([C@@H](N)CCO)nc2cnncc2c1=O,<rdkit.Chem.rdchem.Mol object at 0x153293b50>,0.998539,0.573075
NC[C@@H](CO)c1nc2nccnc2c(=O)n1CC(=O)NCCC(=O)O,<rdkit.Chem.rdchem.Mol object at 0x17e074200>,0.998528,0.546959
NCCC(=O)NCc1nc2ncncc2c(=O)n1[C@@H]1CN[C@H](C(=O)O)C1,<rdkit.Chem.rdchem.Mol object at 0x1653b2180>,0.9985,0.623078
