# Binding Affinity Prediction with ML-Based Docking

Download the data and create the environment

In [None]:
import os
os.makedirs("data", exist_ok=True)
os.makedirs("figures", exist_ok=True)

In [None]:
!pip install -q condacolab
import condacolab
condacolab.install() # kernel will die and restart. This is expected.

In [None]:
import condacolab
condacolab.check()

In [None]:
# Updating the base environment, this will take a bit (~2min)
!mamba env update -n base -f env.yml

In [None]:
!pip install py3Dmol #For rendering

In [None]:
from google.colab import output
output.enable_custom_widget_manager() # for output rendering


## Implementation
1) Prepare the protein and the compound database for docking, and _demonstrate the docking of a single compound_.

2) use this pipeline to dock batches of compounds and _actively learn a Gaussian Process_ surrogate model of the docking score for efficient screening.

### Input preparation

Here we consider a case of rigid docking with a known binding site. The information of the binding site will be provided by the crystallized ligand in the PDB entry.

In a real-case scenario where the binding site is unknown, it can be searched for with various techniques or found through blind docking. Recently, ML methods (e.g. DiffDock) have shown great potential for this task.



In [None]:
import mdtraj
import numpy as np
import os
from utils import *


os.makedirs("sdf_inputs", exist_ok=True)
os.makedirs("smina_inputs", exist_ok=True)



In [None]:
traj = mdtraj.load("6vhn.pdb")

def get_protein_ligand_idxs(traj, resname=None):
    """Select the protein atoms and a second group of atoms from a trajectory.

    Args:
        traj: object exposing ``top.select(query)``.
        resname: selection query for the second group; despite its name it is
            passed verbatim to ``top.select``. Any falsy value falls back to
            ``"not protein"``.

    Returns:
        tuple: ``(protein, ligand)``, the results of the two ``select`` calls.
    """
    topology = traj.top
    protein_idxs = topology.select("protein")
    ligand_query = resname or "not protein"
    ligand_idxs = topology.select(ligand_query)
    return protein_idxs, ligand_idxs

def save_trimmed_pdb(path, traj, idxs):
    """Write the atoms of ``traj`` listed in ``idxs`` to a PDB file at ``path``."""
    trimmed = traj.atom_slice(idxs)
    trimmed.save_pdb(path)


# Split the loaded structure into protein atoms and everything that is neither
# protein nor water (presumably the co-crystallized ligand - verify for this entry).
receptor, ligand = get_protein_ligand_idxs(traj, "not protein and not water")

# Only the ligand is written out here; the receptor indices are unused
# unless the line below is re-enabled.
save_trimmed_pdb("ligand.pdb", traj,ligand)
#save_trimmed_pdb("receptor.pdb", traj,receptor)



### Preparing a pdb

For the docking, we need to prepare a pdb file of the protein and the ligand.
The protein pdb can be prepared by performing the following steps:

- Remove the ligand from the pdb file
- Delete all the water molecules/solvent from the pdb file
- Convert residues to standard residues
- Complete the sidechains
- Add hydrogens to the protein at the correct protonation state (pH 7.4)
- Add charges to the protein (Gasteiger model)
- Change the names of the residues to AMBER ff14SB names

Multiple programs can be used to complete these steps, e.g. med-chem programs such as Maestro or Chimera, or Python libraries such as OpenMM and PDBFixer.

In [None]:
import py3Dmol
# Overlay the prepared structure and the original PDB entry in one viewer.
view = py3Dmol.view()
# Read each file inside a context manager so the handle is closed right away
# instead of being left open until garbage collection.
for pdb_path in ('6vhn_prepared.pdb', '6vhn.pdb'):
    with open(pdb_path, 'r') as pdb_file:
        view.addModel(pdb_file.read(), 'pdb')
# Zoom so that all loaded structures are visible
view.zoomTo()
# White background and cartoon style coloured by spectrum.
# NOTE(review): the {'model': -1} selector appears to target only the model
# added last (6vhn.pdb) - confirm that leaving the first model unstyled is intended.
view.setBackgroundColor('white')
view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
# Render the viewer in the notebook output
view.show()

In [None]:
def read_pdb_with_ob(file):
    """Read every molecule of a PDB file with Open Babel.

    Args:
        file (Union[str, os.PathLike]): pdb input file

    Returns:
        mols (list): list of molecules found in the input file

    Raises:
        ImportError: if ``openbabel`` cannot be imported; the original import
            error is attached as the cause.
    """
    import os

    try:
        from openbabel import pybel
    except ImportError as err:
        # Chain the underlying error so the real reason (package missing,
        # broken shared library, ...) stays visible in the traceback.
        raise ImportError("Pybel is required for reading openbabel molecules") from err
    # Normalise path-like objects to a plain string before handing them to
    # pybel, so that the PathLike input promised above is accepted.
    return list(pybel.readfile(format="pdb", filename=os.fspath(file)))

def prepare_ob_mols(ligand, outpath, overwrite=False):
    """Prepare a pybel molecule for docking and write it as a pdbqt file.

    Hydrogens are added, 3D coordinates are generated when the molecule has
    no non-zero coordinates, and Gasteiger charges are computed. These calls
    are made on ``ligand`` itself.

    Args:
        ligand: pybel molecule to prepare.
        outpath: destination of the pdbqt file.
        overwrite (bool): forwarded to ``pybel.Outputfile``.
    """
    from openbabel import pybel

    out = pybel.Outputfile(format="pdbqt", filename=outpath, overwrite=overwrite)
    try:
        ligand.addh()
        if not ligand.OBMol.HasNonZeroCoords():
            ligand.make3D()
        ligand.calccharges(model="gasteiger")
        out.write(ligand)
    finally:
        # Always release the output handle, even when a preparation step fails.
        out.close()

ligand_mol= read_pdb_with_ob("ligand.pdb")


In [None]:
ligand_mol[0]

In [None]:
prepare_ob_mols(ligand_mol[0], "smina_inputs/ligand.pdbqt", overwrite=True)

In [None]:
prep=Preprocessor()
prep.prepare_receptor("6vhn_prepared.pdb", "smina_inputs/receptor.pdbqt")
#prep.prepare_ligand("data/ligand.pdb", "smina_inputs/ligand.pdbqt", in_format="pdb")

## Binding box creation


In [None]:
ligand=mdtraj.load("ligand.pdb")
def create_box_from_ligand(ligand, padding=5.0):
    """Build a docking box around the first frame of a ligand structure.

    Args:
        ligand: object whose ``xyz[0]`` holds the atom coordinates of the
            first frame (converted below from nm to Angstrom).
        padding (float): total length added to the ligand extent along every
            axis, i.e. ``padding / 2`` on each side. Defaults to 5.

    Returns:
        The result of ``Box.from_array(center, size)``.
    """
    xyz = ligand.xyz[0] * 10  # convert to Angstrom from nm
    # Compute the bounding corners once and reuse them for centre and size.
    lower = xyz.min(axis=0)
    upper = xyz.max(axis=0)
    pocket_center = (upper + lower) / 2
    pocket_size = upper - lower + padding
    return Box.from_array(pocket_center, pocket_size)

box=create_box_from_ligand(ligand)
box

In [None]:
from utils import Docking

docker=Docking("smina_inputs/receptor.pdbqt", box)

In [None]:
os.makedirs("outputs", exist_ok=True)
text=docker.dock_one("smina_inputs/ligand.pdbqt", "outputs/ligand_out.sdf")
docker.parse_output(text)

In [None]:
# Show the prepared receptor together with the docked ligand pose.
view = py3Dmol.view()
# Context managers close the file handles right after reading; the format is
# passed explicitly so the viewer does not have to guess it from the contents.
with open('6vhn_prepared.pdb', 'r') as receptor_file:
    view.addModel(receptor_file.read(), 'pdb')
view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
with open('outputs/ligand_out.sdf', 'r') as poses_file:
    view.addModel(poses_file.read(), 'sdf')
view.setStyle({'model': -1}, {"stick" :  {'color': "yellow"}})
view.zoomTo()
view.show()

In [None]:
# datamol is only imported in a later cell of this notebook; import it here
# as well so the cell also works when the notebook is run top to bottom.
import datamol as dm

# Load the docked poses as a table with the molecules in the "mols" column.
poses = dm.read_sdf("outputs/ligand_out.sdf", as_df=True, mol_column="mols", n_jobs=-1)
poses

In [None]:
dm.viz.to_image(poses["mols"])

### Using molecules from the Enamine Hinge Binders Library

In [None]:
import datamol as dm
df_mols = dm.read_sdf("Enamine_Hinge_Binders_Library_plated_24000cmds_20210316 (1).sdf", as_df=True, mol_column="mols", n_jobs=-1)
docker.parse_mol_to_pbdqt(df_mols["mols"][0]) # write mol as pdbqt



In [None]:
text=docker.dock_one("smina_inputs/mol_0.pdbqt", "outputs/poses_0.sdf")
docker.parse_output(text)
poses=dm.read_sdf("outputs/poses_0.sdf", as_df=True, mol_column="mols", n_jobs=-1)
poses

In [None]:
dm.viz.to_image(poses["mols"])

In [None]:
df_mols.head()

In [None]:
df_mols["fp"]=df_mols["mols"].apply(lambda x : dm.to_fp(x))
df_mols

In [None]:
docker=Docking("smina_inputs/receptor.pdbqt", box)

In [None]:
docker.dock_multiple_mols(
        df_mols["mols"].tolist()[:5], list(range(5))
)

In [None]:
poses = dm.read_sdf("smina_outputs/poses.sdf", as_df=True, mol_column="mols", n_jobs=-1, sanitize=False)
poses.sort_values("minimizedAffinity",inplace=True)
poses

In [None]:
dm.viz.to_image(poses["mols"].tolist()[:10])

In [None]:
from ipywidgets import interact, Dropdown

def view_mol(molecule):
  """Display a single molecule as sticks with small spheres on the atoms.

  Args:
      molecule: molecule accepted by ``Chem.MolToMolBlock``.

  Returns:
      Whatever ``view.show()`` returns.
  """
  mol_block = Chem.MolToMolBlock(molecule)
  viewer = py3Dmol.view(data=mol_block)
  viewer.setStyle({"stick" : {}})
  viewer.addStyle({"sphere": {"scale" : 0.21}})
  viewer.zoomTo()
  return viewer.show()

# Parallel lists taken from the poses table: molecule objects, docking
# affinities and SMILES strings.
mols=poses["mols"].tolist()
affs=poses["minimizedAffinity"].tolist()
smiles=poses["smiles"].tolist()

# Each option is a (label, value) pair: the label shows SMILES and affinity,
# the value is the molecule handed to view_mol. The first molecule is preselected.
dropdown = Dropdown(
    options=[(f"{smile}:{aff} kcal/mol",mol) for aff,smile,mol in zip(affs,smiles,mols)],
    value = mols[0], description="Selection"
)
# Wire the dropdown to view_mol through its `molecule` argument.
interact(
    view_mol,
    molecule=dropdown
)

In [None]:
def create_py3d_model(sdf_file):
  """Build a viewer with the prepared receptor and all molecules of an SDF file.

  The receptor is read from ``6vhn_prepared.pdb`` and drawn as a cartoon;
  every molecule from ``sdf_file`` is added as a separate stick model.

  Args:
      sdf_file: path of the SDF file holding the molecules to display.

  Returns:
      The populated ``py3Dmol`` view (not yet shown).
  """
  molecules = dm.read_sdf(sdf_file, remove_hs=False)
  view = py3Dmol.view()
  # Close the receptor file right after reading and state its format
  # explicitly, as is already done for the ligand models below.
  with open('6vhn_prepared.pdb', 'r') as receptor_file:
    view.addModel(receptor_file.read(), "pdb")
  view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
  for mol in molecules:
    view.addModel(Chem.MolToMolBlock(mol, confId=0), "sdf")
    view.setStyle({'model': -1}, {"stick" :  {}})
  view.zoomTo()
  return view

view=create_py3d_model("smina_outputs/poses.sdf")
view.show()

## Active Learning

To train ML models in a supervised manner we need:

- Labeled data
- A model

In the drug-discovery pipeline, data and their labels tend to be scarce. Hence, structure-based drug discovery is often conducted in a low-data regime, as **generating new data is expensive** and time-consuming.

To tackle this problem, **active learning** is often used to choose which samples to annotate next, so that the expensive labels go to the samples that improve the model the most.

In this paradigm, the model actively selects the data that it will learn from. Instead of feeding it a predefined set of training data, the model has the ability to choose the most informative samples for its training, resulting in a more efficient and effective learning process.

An **active learning** workflow usually involves the following components (Fig. 4):

- A ML surrogate **model**
- An **oracle** function to label unlabeled datapoints
- An **objective** function to select the new samples to be labeled (maximizing the uncertainty, maximizing some score, or other, more complex functions)

<img src="http://drive.google.com/uc?export=view&id=1FMDzm7pOt238ByYJwhiSHQkoqsA1EKqb" alt="drawing" width="500"/>

__Fig. 4:__ Generic active learning workflow



Here we will construct a simple active learning loop to make the most out of a given budget of function calls to the (_somewhat_) expensive docking program _SMINA_.

In the previous parts, we constructed the *oracle* function that will label the molecules. To complete the *active learning workflow*, we need a *model* and an *objective* function.

- Model: We will use a simple Gaussian Process on ECFP fingerprints
- Objective function: We will select the molecules for which the GP prediction of the binding free energy is most uncertain



In [None]:
def get_random_idxs(df, n=10, seed=42):
    """Draw ``n`` distinct random row positions to seed the AL dataset.

    Positions are sampled without replacement, so no molecule is selected
    (and later docked) twice. A local generator is used, which leaves NumPy's
    global random state untouched.

    Args:
        df: any sized collection; only ``len(df)`` is used.
        n (int): number of positions to draw.
        seed (int): seed of the local random generator.

    Returns:
        numpy.ndarray: ``n`` unique integers in ``[0, len(df))``.

    Raises:
        ValueError: raised by NumPy when ``n`` exceeds ``len(df)``.
    """
    rng = np.random.default_rng(seed)
    return rng.choice(len(df), size=n, replace=False)


df = init_df_fields(df_mols)
df.head()

In [None]:
from sklearn.gaussian_process.kernels import RBF

def train_gp(df) -> GaussianProcessRegressor:
    """Fit a Gaussian Process on all molecules that already carry a label.

    Rows with ``sampled >= 1`` are treated as labelled; their ``fp`` entries
    are stacked into the feature matrix and their ``true_affinity`` values
    into the target array.

    Args:
        df: table with ``fp``, ``true_affinity`` and ``sampled`` columns.

    Returns:
        The fitted regressor (RBF kernel, ``random_state=0``).
    """
    labeled = df["sampled"] >= 1
    features = np.vstack(df["fp"][labeled].tolist())
    targets = np.vstack(df["true_affinity"][labeled].tolist())
    kernel = RBF(length_scale=2.0, length_scale_bounds=(1e-1, 20.0))
    model = GaussianProcessRegressor(kernel=kernel, random_state=0)
    return model.fit(features, targets)

def predict_with_gp(df, gp):
    """Store the GP prediction for every row of ``df``.

    Args:
        df: table with an ``fp`` column; modified in place.
        gp: model whose ``predict(X, return_std=True)`` returns a
            ``(mean, std)`` pair.

    Returns:
        The same ``df`` with ``pred_affinity`` (mean) and ``uncertainty``
        (standard deviation) columns written.
    """
    fingerprints = df["fp"].tolist()
    pred_mean, pred_std = gp.predict(np.vstack(fingerprints), return_std=True)
    df["pred_affinity"] = pred_mean
    df["uncertainty"] = pred_std
    return df

def samples_next(df, n: int = 10, sort_by_uncertainty = True) -> List[int]:
    """Pick the next molecules to label.

    Candidates are ranked either by the largest ``uncertainty`` or by the
    lowest ``pred_affinity``. When ``df`` has a ``sampled`` column, rows with
    ``sampled >= 1`` (the same "already labelled" test used for training)
    are left out, so the labelling budget is not spent on them again.

    Args:
        df: table with ``idxs`` and the ranking column; not modified.
        n: maximum number of indices to return.
        sort_by_uncertainty: rank by uncertainty if True, else by predicted
            affinity.

    Returns:
        Up to ``n`` values of the ``idxs`` column, best candidates first.
    """
    if sort_by_uncertainty:
        # largest uncertainty on top (acquisition function)
        name, ascending = "uncertainty", False
    else:
        # best binders on top (most negative binding free energy)
        name, ascending = "pred_affinity", True
    candidates = df
    if "sampled" in df.columns:
        # Drop labelled rows; a table without the column is ranked as a whole.
        candidates = df[~(df["sampled"] >= 1)]
    ranked = candidates.sort_values(name, ascending=ascending)
    return ranked["idxs"].iloc[:n].tolist()



In [None]:
def get_results(output_dir, idxs):
    """Collect the best docking score of every labelled molecule.

    Args:
        output_dir: directory containing one ``poses_{idx}.sdf`` per molecule.
        idxs: molecule indices whose pose files are read.

    Returns:
        list: for each index, the lowest ``minimizedAffinity`` among its poses.
    """
    key = "minimizedAffinity"
    values = []
    for idx in idxs:
        poses = dm.read_sdf(os.path.join(output_dir, f"poses_{idx}.sdf"),
                            as_df=True, mol_column="mols", n_jobs=-1,
                            sanitize=False)
        # Sorting keeps the original row labels, so label-based `[0]` would
        # return the first pose of the file rather than the best one.
        # Take the first row by position instead.
        values.append(poses.sort_values(key)[key].iloc[0])
    return values

def format_df(df, affinities, sampled_idxs, iteration):
    """Store the oracle labels of one active-learning round in ``df``.

    Args:
        df: table with ``true_affinity`` and ``sampled`` columns; modified in
            place.
        affinities: labels, aligned with ``sampled_idxs``.
        sampled_idxs: row labels of the molecules that were labelled.
        iteration: value written to ``sampled`` for these rows.

    Returns:
        The same ``df``.
    """
    # One `.loc` call per column writes into the frame itself. The chained
    # form `df[col][idxs] = ...` may write to a temporary copy and is a
    # no-op under pandas copy-on-write.
    df.loc[sampled_idxs, "true_affinity"] = affinities
    df.loc[sampled_idxs, "sampled"] = iteration
    return df


In [None]:
from copy import deepcopy

# Active-learning driver: each round docks a batch of molecules, stores their
# scores, refits the GP on everything labelled so far and lets the GP choose
# the batch for the following round.
N_OF_AL_ITERATIONS = 5
FIRST_LOOP=True  # get random idxs at the first loop
N_OF_ORACLE_CALLS=3  # AL batch size
SELECT_BY_UNCERTAINTY=True  # forwarded to samples_next as sort_by_uncertainty
SEED = 42  # seed for the random initial batch

docker=Docking("smina_inputs/receptor.pdbqt", box, num_poses=3)

# Work on a copy so the table built in the earlier cell stays untouched.
ultimate_df = deepcopy(df)
for iteration in range(N_OF_AL_ITERATIONS):
    # Only the first round draws random molecules; every later round reuses
    # the sampled_idxs chosen at the end of the previous round.
    if FIRST_LOOP:
        FIRST_LOOP = False
        sampled_idxs= get_random_idxs(ultimate_df, n=N_OF_ORACLE_CALLS,
                                      seed=SEED).tolist()

    print(f"Selected idxs: {sampled_idxs}")

    # Create iteration directory
    output_dir = f"al_loop_{iteration}"
    os.makedirs(output_dir,exist_ok=True)

    # Select molecules to dock and dock them
    # (really slow on colab)
    # NOTE: to_numpy()[...] indexes by position, while the indices written
    # back by format_df are row labels - assumes a default 0..n-1 index.
    mols_to_dock=ultimate_df["mols"].to_numpy()[sampled_idxs]
    docker.dock_multiple_mols(mols_to_dock, sampled_idxs, output_dir)

    # Get and store results
    affinities = get_results(output_dir, sampled_idxs)
    print(f"Obtained affinities in AL interation {iteration}: {affinities}")
    # iteration+1 is stored so that labelled rows satisfy `sampled >= 1`.
    ultimate_df = format_df(ultimate_df, affinities, sampled_idxs, iteration+1)

    # re-train and use surrogate model
    GP = train_gp(ultimate_df)
    ultimate_df = predict_with_gp(ultimate_df, GP)
    # Batch for the next round (left unused after the final round).
    sampled_idxs = samples_next(ultimate_df, N_OF_ORACLE_CALLS, SELECT_BY_UNCERTAINTY)


