# Step 3. Predict docking poses

In [1]:
!pip install pandas numpy matplotlib loguru py3dmol rdkit ipywidgets

Collecting loguru
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting py3dmol
  Downloading py3Dmol-2.4.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting rdkit
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading py3Dmol-2.4.2-py2.py3-none-any.whl (7.0 kB)
Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstal

In [2]:
import time
from loguru import logger
import os, shutil
from google.colab import files
import zipfile
import requests
from google.colab import userdata
import json
import py3Dmol
from rdkit import Chem
import ipywidgets as widgets
from IPython.display import display
import glob
import random

In [3]:
def prepare_output_directory(output):
    """
    Prepare the output directory
    output: str, the output directory
    return: None
    """
    # overwrite the output directory
    # delete the output directory if it exists
    if os.path.exists(output):
        shutil.rmtree(output)
    os.makedirs(output)

## Upload target protein file

In [4]:
# overwrite the output directory
protein_dir = "/content/output/esmfold_result"
prepare_output_directory(protein_dir)

In [5]:
uploaded = files.upload()

Saving predicted_protein.pdb to predicted_protein.pdb


In [6]:
# Move the uploaded file to the target folder
for filename in uploaded.keys():
    !mv "{filename}" "{protein_dir}/{filename}"

In [7]:
# predicted target protein file path
protein_file_path = os.path.join(protein_dir, list(uploaded.keys())[0])
print(protein_file_path)

/content/output/esmfold_result/predicted_protein.pdb


## Upload ligand files

In [8]:
# overwrite the output directory
ligand_dir = "/content/output/molmim_result"
prepare_output_directory(ligand_dir)

In [9]:
# upload clean_mol.zip downloaded at the end of Step 2
uploaded = files.upload()

Saving clean_mols.zip to clean_mols.zip


In [10]:
zip_filename = list(uploaded.keys())[0]
print(zip_filename)

clean_mols.zip


In [11]:
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(ligand_dir)

In [12]:
!ls {ligand_dir}

molecule_0.sdf	molecule_2.sdf	molecule_4.sdf	molecule_6.sdf	molecule_8.sdf
molecule_1.sdf	molecule_3.sdf	molecule_5.sdf	molecule_7.sdf


## Setup output directory for docking

In [13]:
# overwrite the output directory
docking_dir = "/content/output/diffdock_result"
prepare_output_directory(docking_dir)

## 3.4 Predict

In [14]:
# Load all SDF files from the specified directory
sdf_files = [f for f in os.listdir(ligand_dir) if f.endswith(".sdf")]

# Sort ligand files based on the numeric part in the filename (molecule_0, molecule_1, molecule_2 ....)
sdf_files.sort(key=lambda x: int(x.split("_")[1].split(".")[0]))

# Add a prefix directory path to each file in sdf_files
sdf_files = [os.path.join(ligand_dir, f) for f in sdf_files]

# get name of the sdf files
ligand_names = [os.path.basename(f).split(".")[0] for f in sdf_files]

print(sdf_files)
print(ligand_names)

['/content/output/molmim_result/molecule_0.sdf', '/content/output/molmim_result/molecule_1.sdf', '/content/output/molmim_result/molecule_2.sdf', '/content/output/molmim_result/molecule_3.sdf', '/content/output/molmim_result/molecule_4.sdf', '/content/output/molmim_result/molecule_5.sdf', '/content/output/molmim_result/molecule_6.sdf', '/content/output/molmim_result/molecule_7.sdf', '/content/output/molmim_result/molecule_8.sdf']
['molecule_0', 'molecule_1', 'molecule_2', 'molecule_3', 'molecule_4', 'molecule_5', 'molecule_6', 'molecule_7', 'molecule_8']


In [15]:
API_KEY = userdata.get('API_KEY')
print(API_KEY)
header_auth = f"Bearer {API_KEY}"
print(header_auth)

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
Bearer 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


In [16]:
def _upload_asset(input):
    assets_url = "https://api.nvcf.nvidia.com/v2/nvcf/assets"

    headers = {
        "Authorization": header_auth,
        "Content-Type": "application/json",
        "accept": "application/json",
    }

    s3_headers = {
        "x-amz-meta-nvcf-asset-description": "diffdock-file",
        "content-type": "text/plain",
    }

    payload = {
        "contentType": "text/plain",
        "description": "diffdock-file"
    }

    response = requests.post(
        assets_url, headers=headers, json=payload, timeout=30
    )

    response.raise_for_status()

    asset_url = response.json()["uploadUrl"]
    asset_id = response.json()["assetId"]

    response = requests.put(
        asset_url,
        data=input,
        headers=s3_headers,
        timeout=300,
    )

    response.raise_for_status()
    return asset_id

In [17]:
invoke_url = "https://health.api.nvidia.com/v1/biology/mit/diffdock"

In [18]:
# get asset-uploading URL & upload assets for target protein
with open(protein_file_path, "r") as file:
    pdb_content = file.read()
    protein_id = _upload_asset(pdb_content)
print(protein_id)

ebf9ffcf-bd51-4b61-968f-5308f876e8da


In [19]:
sdf_files = sdf_files[:2]
ligand_names = ligand_names[:2]
print(sdf_files)
print(ligand_names)

['/content/output/molmim_result/molecule_0.sdf', '/content/output/molmim_result/molecule_1.sdf']
['molecule_0', 'molecule_1']


In [20]:
# iterating over input files for DiffDock inference request submissions
for ligand_file_path, ligand_name in zip(sdf_files, ligand_names):
    print(f"************ {ligand_name} ****************")
    # get asset-uploading URL & upload assets for ligand
    with open(ligand_file_path, "r") as file:
        sdf_content = file.read()
        ligand_id = _upload_asset(sdf_content)
    print(f"ligand_id: {ligand_id}")

    # DiffDock inference
    headers = {
        "Content-Type": "application/json",
        "NVCF-INPUT-ASSET-REFERENCES": ",".join([protein_id, ligand_id]),
        "Authorization": header_auth
    }

    payload = {
        "ligand": ligand_id,
        "ligand_file_type": "sdf",
        "protein": protein_id,
        "num_poses": 3,
        "time_divisions": 20,
        "steps": 18,
        "save_trajectory": False,
        "is_staged": True
    }

    start = time.time()
    response = requests.post(invoke_url, headers=headers, json=payload)
    end = time.time()
    logger.debug(f"{ligand_name} took {end - start:.2f} seconds")

    response.raise_for_status()

    result = response.json()

    # save result to output.json
    docking_ligand_dir = os.path.join(docking_dir, ligand_name)
    prepare_output_directory(docking_ligand_dir)
    with open(f"{docking_ligand_dir}/output.json", "w") as f:
        json.dump(result, f)

    # save ligand positions
    for i, ligand_geometry in enumerate(result["ligand_positions"]):
        with open("{}/pose_{}_confidence_{:.2f}.sdf".format(docking_ligand_dir, i, result["position_confidence"][i]), "w") as f:
            f.write(ligand_geometry)

************ molecule_0 ****************
ligand_id: a2450c80-4a3d-4f05-932c-a6bb154f2b54


[32m2024-11-10 00:16:52.166[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<cell line: 2>[0m:[36m31[0m - [34m[1mmolecule_0 took 2.84 seconds[0m


************ molecule_1 ****************
ligand_id: e7044b05-1270-4700-9b90-e90a8944da75


[32m2024-11-10 00:16:54.961[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<cell line: 2>[0m:[36m31[0m - [34m[1mmolecule_1 took 2.54 seconds[0m


## 3.5 Visualize the docking poses

In [21]:
# assume we select molecule_0
ligand_name = "molecule_0"

docking_ligand_dir = os.path.join(docking_dir, ligand_name)
# take a look at the JSON output file
with open(f"{docking_ligand_dir}/output.json", "r") as f:
    result = json.load(f)
result.keys()

dict_keys(['trajectory', 'ligand_positions', 'position_confidence', 'status', 'protein', 'ligand'])

- `trajectory`: diffusion trajectory (empty unless `save_trajectory` is set to `True`)
- `ligand_positions`: a list of docking poses
- `ligand_scores`: a list of confidence scores for each docking pose
- `protein`: input protein
- `ligand`: input ligand

Confidence score the logits of the probability that the docked pose has a RMSD < 2A compared to ground truth. Interpretation of confidence score (c) is based on the guideline provided by [github authors](https://github.com/gcorso/DiffDock?tab=readme-ov-file#faq--).
```
c > 0 : high confidence
-1.5 < c < 0: moderate confidence
c < -1.5: low confidence
```

Visusalize docked poses and [confidence score](https://github.com/gcorso/DiffDock#faq--)

In [22]:
# defining a function for color definitions for visualization
def ansi_color(text, color):
    """Color text for console output"""
    colors = {
        "red": "\033[31m",
        "green": "\033[32m",
        "yellow": "\033[33m",
        "blue": "\033[34m",
        "magenta": "\033[35m",
        "cyan": "\033[36m",
        "white": "\033[37m",
        "reset": "\033[0m"
    }
    return f"{colors[color]}{text}{colors['reset']}"

In [23]:
# loading dock poses from the output SDF files extracted from the output.json 'positions' field
def load_poses_from_sdf(directory):
    sdf_files = glob.glob(f"{directory}/*.sdf")
    poses = []

    for sdf_file in sdf_files:
        supplier = Chem.SDMolSupplier(sdf_file)
        for mol in supplier:
            if mol is not None:
                poses.append(mol)
    return poses

In [24]:
# visualising the docking poses in an interactive manner, browsing docked poses using an embedded slider
def update_viewer(pose_index):

    view = py3Dmol.view(width=1200, height=900)

    # Add the protein model
    view.addModel(protein_pdb, 'pdb')
    view.setStyle({'model': 0}, {'cartoon': {'color': 'white', 'opacity': 0.7}})
    view.setViewStyle({'style':'outline','color':'black','width':0.03})
    Prot=view.getModel()
    Prot.setStyle({'cartoon':{'arrows':True, 'tubes':True, 'style':'oval', 'color':'white'}})
    view.addSurface(py3Dmol.VDW,{'opacity':0.4,'color':'white'})

    # Add the selected docking pose
    pose = poses[pose_index]
    pose_block = Chem.MolToMolBlock(pose)
    # color = "#"+''.join([random.choice('0123456789ABCDEF') for _ in range(6)])
    view.addModel(pose_block, 'mol')
    view.setStyle({'model': 1}, {'stick': {'radius': 0.3, 'colorscheme': 'magentaCarbon'}})
    view.addSurface(py3Dmol.VDW, {'opacity': 0.7, 'colorscheme': 'magentaCarbon'}, {'model': 1})
    score = round(confidence_scores[pose_index], 3)
    score_color = "green" if score > 0 else "blue" if score >= -1.5 else "red"
    print(f"Loaded {ansi_color(ligand_name, 'magenta')} with confidence score: {ansi_color(confidence_scores[pose_index], score_color)}")
    view.zoomTo()
    return view.update()


In [25]:
# Load the protein model
with open(protein_file_path, 'r') as f:
    protein_pdb = f.read()

# Specify the directory containing the dock poses in SDF format for a specific ligand
poses = load_poses_from_sdf(docking_ligand_dir)

# Verify the number of poses loaded
print(f"Number of poses loaded: {len(poses)}")

Number of poses loaded: 3


In [26]:
# Load confidence scores from output.json
output_json_path = os.path.join(docking_ligand_dir, 'output.json')
with open(output_json_path, 'r') as file:
    data = json.load(file)
    confidence_scores = data['position_confidence']  # list of floats
print(confidence_scores)

[-1.8653544187545776, -2.9984545707702637, -3.446190118789673]


In [27]:
# Create a slider widget
pose_slider = widgets.IntSlider(
    value=0,
    min=0,
    max=len(poses) - 1,
    step=1,
    description='Pose:',
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

# Link the slider to the viewer update function
widgets.interact(update_viewer, pose_index=pose_slider)

interactive(children=(IntSlider(value=0, continuous_update=False, description='Pose:', max=2), Output()), _dom…