In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("14_Run_Drug_Docking")

# NOTE(review): star-import; presumably this is where `input_dir` and
# `output_dir` (used throughout the notebook) come from -- confirm in config.py
from config import *
# Project-local helpers. Only `unzip_res_range` and `flatten` are used in the
# cells visible in this notebook.
from helper_functions import zip_res_range, unzip_res_range, flatten, pdb2df, df2pdb

import pandas as pd
# Show (effectively) all columns when a DataFrame is displayed
pd.options.display.max_columns = 999

import numpy as np

import glob
import os

# Progress bars. `tqdm_notebook` is imported twice; the second import
# (from the private `tqdm._tqdm_notebook` module) rebinds the same name.
from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

# Register the pandas `progress_apply` integration
tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

# subprocess / time are used to launch and monitor the docking jobs;
# scipy's distance module is used to compare pose centroids.
import subprocess as sp
import time
from scipy.spatial import distance

This notebook first parses a provided list of candidate drugs that target one or more human interactors of SARS-CoV-2 (this is a static input, but the setup could in principle be generalized to any list of protein-drug pairs). Protein-ligand docking is then run for each pair with smina, and the top-ranked docked conformations for each pair are retained.


- Inputs:
  - Krogan_Drug_Candidates.txt
  - COVID19_Interactome.txt
  - Models.txt
  - Proteins.txt
  - [Prot]\_[Source].pdb (Undocked Structures)


- Outputs:
  - [Drug].pdb (Undocked Drug Structures)
  - [Drug].svg (2D Image of Drug Structure)
  - [P1]\_[Drug].pdb (top-ranked docked poses for each protein-drug pair; up to 100 spatially distinct poses, best score first)


- Dependencies:
  - Must be run after 04_Select_Models
  - Must have smina installed locally (available through conda)
  - Must have openbabel installed locally (available through conda)

# Parse and Clean Drug List

In [3]:
# Candidate drug table, downloaded from the Gordon et al. (Nature 2020) Supplemental Data
# https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2286-9/MediaObjects/41586_2020_2286_MOESM9_ESM.xlsx
drug_candidates_path = "{0}/Krogan_Drug_Candidates.txt".format(input_dir)
drugs = pd.read_csv(drug_candidates_path, sep="\t")

In [4]:
# Clean entries with multiple gene names in one row
# (rules below are based on manual inspection of all cases)
def _expand_targets(raw_target):
    """Split one raw target entry into the set of individual gene names.

    raw_target is the value found in the second column of the drug table.
    Returns a set of stripped gene-name strings.
    """
    if("/" in raw_target):
        # Simple case, multiple genes separated by slash
        genes = raw_target.split("/")
        if(len(genes[1]) <= 2):
            # Short second part is treated as a suffix: swap it in for the
            # last character of the first gene name
            genes = (genes[0], genes[0][:-1] + genes[-1])
            genes = [g if g != "EIF4EH" else "EIF4H" for g in genes]
    elif(" " in raw_target):
        # Special case for entries containing a space
        if("NUPs" in raw_target):
            # "NUPs RAE1" entry stands for all NUP proteins + RAE1
            genes = ['NUP210', 'NUP214', 'NUP62', 'NUP54', 'NUP88', 'NUP58', 'NUP98', 'RAE1']
        else:
            # These are mostly groups of genes described by function
            # (e.g. "Cell Entry"), which are kept as a single entry, plus a
            # handful of genes with extra spaces in the gene name
            genes = [raw_target]
    else:
        genes = [raw_target.strip()]

    # Special case to interpret NDUFs as all NDUF Proteins
    if(genes == ["NDUFs"]):
        genes = ["NDUFAF2", "NDUFAF1", "NDUFB9"]

    # Entries may also hold several names separated by newlines
    return set(flatten([g.strip().split("\n") for g in genes]))


# Flatten the table so that every (drug, target) pair gets its own row
rows = []
for record in drugs.values:
    record = list(record)
    for gene in _expand_targets(record[1]):
        rows.append([record[0], gene] + record[2:])
drugs = pd.DataFrame(rows, columns=list(drugs))

In [7]:
# Match drugs to the interactome by human interacting gene name
interactions = pd.read_csv("{0}/COVID19_Interactome.txt".format(input_dir), sep="\t")
interactions_by_gene = interactions.set_index("PreyGene")

m = drugs.join(interactions_by_gene, on="Human Gene", how="inner")

# (source column, output column) pairs, in output order
column_map = [
    ("Compound Name", "Compound Name"),
    ("Human Gene", "Human Gene"),
    ("Preys", "Human ID"),
    ("Bait", "Viral ID"),
    ("Structures (PDB)", "Human PDBs"),
    ("Drug Status", "Drug Status"),
    ("Activity Description", "Activity Type"),
    ("Activity Description.1", "Activity"),
    ("Reference", "Reference"),
    ("Smiles", "Smiles"),
    ("ZINC_ID", "ZINC_ID"),
    ("Purchase notes", "Purchase Notes"),
    ("Source", "Source"),
]

# Select / rename columns and save
m = m[[source for source, _ in column_map]]
m.columns = [renamed for _, renamed in column_map]

m.to_csv("{0}/Krogan_Drug_Candidates.txt".format(output_dir), sep="\t", index=None)

# Generate PDB files for each ligand

In [9]:
# Create the ligand output directories (parent first) if they are missing
for ligand_dir in ("{0}/Ligands".format(output_dir), "{0}/Ligands/Images".format(output_dir)):
    if(not os.path.exists(ligand_dir)):
        os.mkdir(ligand_dir)

In [10]:
# Generate a 3D PDB file and an SVG image for every distinct compound.
#
# obabel is invoked with an argument list (no shell), so characters in the
# SMILES string or compound name are never interpreted by a shell.
# The exit code of each obabel call is printed (0 == success).
for name, smiles in tqdm_notebook(drugs.drop_duplicates(["Compound Name"])[["Compound Name", "Smiles"]].values):
    # This SMILES String is improperly formatted. I looked up the "correct" string here...
    # https://www.medchemexpress.com/dBET6.html?src=google-product&gclid=EAIaIQobChMIqYf5m5ul6QIVT8DICh20VQh-EAAYASAAEgLmkPD_BwE
    if(name == "dBET6"):
        smiles = "O=C(NCCCCCCCCNC(COC1=CC=CC(C(N2C(CC3)C(NC3=O)=O)=O)=C1C2=O)=O)C[C@H]4C5=NN=C(C)N5C6=C(C(C)=C(C)S6)C(C7=CC=C(Cl)C=C7)=N4"

    # File-system friendly ligand name: "-" and " " become "_", and anything
    # from the first "(" onwards is dropped
    ligand_name = name.replace("-", "_").replace(" ", "_").split("(")[0]
    smiles_arg = "-:{0}".format(smiles)

    # Use OpenBabel to create PDB File (3D coordinates, hydrogens deleted)
    pdb_f = "{0}/Ligands/{1}.pdb".format(output_dir, ligand_name)
    print(sp.call(["obabel", smiles_arg, "--gen3d", "-opdb", "-O", pdb_f, "-d"]))

    # Use OpenBabel to create svg image of ligand
    # NOTE(review): --gen3d is kept from the original command; it is probably
    #               unnecessary for a 2D depiction -- confirm before removing
    svg_f = "{0}/Ligands/Images/{1}.svg".format(output_dir, ligand_name)
    print(sp.call(["obabel", smiles_arg, "--gen3d", "-osvg", "-O", svg_f, "-d"]))

HBox(children=(IntProgress(value=0, max=69), HTML(value=u'')))

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0



In [11]:
# Total number of potentially dockable examples, i.e. rows with a specific
# (viral-prot, human-prot, drug) combination.
# Rows where Viral ID is NaN only name a non-specific human drug target
# (e.g. "Viral Transcription") and are not counted.
len(m.dropna(subset=["Viral ID"]))

12

# Clean Undocked Structures for Docking

In [12]:
# Make sure there are no hydrogens in the protein structures
# used for docking.
# Each structure is rewritten in place by obabel (-d deletes hydrogens).
# obabel is called with an argument list so paths are never shell-parsed,
# and a non-zero exit code is reported instead of being silently dropped.
for f in tqdm_notebook(glob.glob("{0}/Undocked_Structures/*".format(output_dir))):
    exit_code = sp.call(["obabel", "-i", "pdb", f, "-o", "pdb", "-O", f, "-d"])
    if(exit_code != 0):
        print("obabel failed with exit code {0} for {1}".format(exit_code, f))

HBox(children=(IntProgress(value=0, max=15), HTML(value=u'')))




# Run Ligand Docking

In [13]:
# Create the docking output directories (parent first) if they are missing
docking_dirs = (
    "{0}/Docked_Ligands".format(output_dir),
    "{0}/Docked_Ligands/sub_batches/".format(output_dir),
    "{0}/Docked_Ligands/ranked_poses/".format(output_dir),
)
for docking_dir in docking_dirs:
    if(not os.path.exists(docking_dir)):
        os.mkdir(docking_dir)

In [14]:
# Drug-target pairs written earlier in this notebook
drug_pairs = pd.read_csv("{0}/Krogan_Drug_Candidates.txt".format(output_dir), sep="\t")

# Model info (one row per selected structure)
models = pd.read_csv("{0}/Models.txt".format(output_dir), sep="\t")

# Protein info; the sequence is only used for its total length
proteins = pd.read_csv("{0}/Proteins.txt".format(output_dir), sep="\t")
uni2seq = dict(zip(proteins["ID"], proteins["Sequence"]))

In [19]:
# Test loop for running docking with one process at a time
# NOTE: For now this is only used to summarise how many docks will be run
#       (only the first dockable pair is actually docked here); the main
#       batch is run in parallel further down.

undockable = []
low_coverage = []
docked = []

has_human_id = ~pd.isnull(drug_pairs["Human ID"])
for drug, uni in tqdm_notebook(drug_pairs[has_human_id][["Compound Name", "Human ID"]].values):
    drug = drug.replace("-", "_").replace(" ", "_").split("(")[0]

    # Grab Drug PDB (first file whose name starts with the drug name)
    drug_f = glob.glob("{0}/Ligands/{1}*".format(output_dir, drug))[0]

    # Grab Human Protein PDB (skip if no PDB available)
    structure_hits = glob.glob("{0}/Undocked_Structures/{1}*".format(output_dir, uni))
    if(len(structure_hits) == 0):
        print("{0} {1}".format(uni, drug))
        undockable.append((drug, uni))
        continue
    uni_f = structure_hits[0]

    # Fraction of the full sequence covered by the selected Human PDB
    # NOTE: Assumes only one Human Structure available for
    #       each protein
    total_len = len(uni2seq[uni])
    covered_per_model = models[models["ID"] == uni]["Resi_Covered"].map(lambda x: len(unzip_res_range(x)))
    resi_covered = covered_per_model.values[0]
    coverage = resi_covered / float(total_len)

    # If coverage is low, do not attempt docking
    # (would be inaccurate / bias predicted binding site)
    if(not coverage >= 0.33):
        low_coverage.append((drug, uni))
        continue
    docked.append((drug, uni))

    # Only the first dockable pair is actually run; the rest are just counted
    if(len(docked) > 1):
        continue

    cmd = "smina -r {0} -l {1} --autobox_ligand {0} --autobox_add 10 -o {2}/Docked_Ligands/{3}_{4}.pdb --exhaustiveness 8 --num_modes 10 --cpu 1".format(uni_f, drug_f, output_dir, uni, drug)
    print(cmd)
    print(os.system(cmd))

HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))

smina -r /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/Undocked_Structures/P12268_6UAJ_A.pdb -l /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/Ligands/Sanglifehrin_A.pdb --autobox_ligand /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/Undocked_Structures/P12268_6UAJ_A.pdb --autobox_add 10 -o /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/Docked_Ligands/P12268_Sanglifehrin_A.pdb --exhaustiveness 8 --num_modes 10 --cpu 1

0


In [20]:
# Number of pairs without a structure, with too little coverage, and dockable
for pair_list in (undockable, low_coverage, docked):
    print(len(pair_list))

0
0
12


In [28]:
# Run Ligand Docking in Loop
# For the setup here we want to run N different docking trials for each drug-target pair
# producing up to 1000 docked conformations for each trial. We will then merge the results
# from all trials and retain the top N poses overall.
#
# Conceptually there should be no difference between repeating the same job N times, vs.
# just changing the exhaustiveness parameter to N*exhaustiveness (and possibly upping the
# num_modes parameter as well). We just do it in separate jobs here so that the results
# for each drug-target pair come in evenly.
#
# Each job writes to "<uni>_<drug>_<trial>_in_progress.pdb" and the file is renamed to
# "<uni>_<drug>_<trial>.pdb" only once smina exits with code 0, so a finished file can be
# used to skip that trial on a re-run.
#
# NOTE: I think the setup here might not be 100% safe for the server. While it's running
#       I start getting too many files open errors. (Mainly seems to break the running
#       Jupyter server)

def _reap_finished_jobs(active_jobs, finished_jobs):
    """Poll every tracked docking job once.

    Jobs that have exited get their "end_time" set, are reported, and are
    appended to finished_jobs; successful jobs also have their in-progress
    output renamed to its final name.

    Returns the list of jobs that are still running.
    """
    still_running = []
    for job in active_jobs:
        exit_code = job["p"].poll()

        # Process hasn't exited yet, keep tracking it
        if(exit_code is None):
            still_running.append(job)
            continue

        job["end_time"] = time.time()
        run_time = job["end_time"] - job["start_time"]
        if(exit_code != 0):
            # Exit code indicates an error
            print("Error {0}".format(exit_code))
            print("cmd: {0}".format(job["cmd"]))
            print("RunTime: {0}".format(run_time))
            print("")
        else:
            # Process finished successfully
            print("Finished Docking {0} in {1}".format(job["in"], run_time))
            if(os.path.exists(job["out_f"])):
                os.rename(job["out_f"], job["final_f"])
            else:
                print("Warning: expected output not found: {0}".format(job["out_f"]))
        finished_jobs.append(job)
    return still_running


finished_processes = []
processes = []

# Maximum number of docking jobs running at the same time
max_processes = 5

# For this setup we will do 10 trials per drug-target pair. This could be set arbitrarily
# high or low. From past experience, a single trial should be sufficient to obtain good docking
# results. We only increase the number of trials here for added robustness.
for i_num in range(10):
    print("Docking Iteration {0}".format(i_num))

    # Iterate over all Docking Inputs
    for drug, uni in tqdm_notebook(drug_pairs[~pd.isnull(drug_pairs["Human ID"])][["Compound Name", "Human ID"]].values):
        # Output files are named with the file-system friendly drug name, so
        # the name must be cleaned BEFORE checking for existing results
        drug = drug.replace("-", "_").replace(" ", "_").split("(")[0]
        final_f = "{0}/Docked_Ligands/sub_batches/{1}_{2}_{3}.pdb".format(output_dir, uni, drug, i_num)
        in_progress_f = "{0}/Docked_Ligands/sub_batches/{1}_{2}_{3}_in_progress.pdb".format(output_dir, uni, drug, i_num)

        # If this drug-target pair has already been docked on this iteration in the past, just skip it
        if(os.path.exists(final_f)):
            continue

        # Grab Drug PDB
        drug_f = glob.glob("{0}/Ligands/{1}*".format(output_dir, drug))[0]

        # Try to Grab Human PDB
        try:
            uni_f = glob.glob("{0}/Undocked_Structures/{1}*".format(output_dir, uni))[0]
        except IndexError:
            print(uni)
            continue

        # Check coverage of selected Human PDB
        # NOTE: Assumes only one Human Structure available for
        #       each protein
        total_len = len(uni2seq[uni])
        resi_covered = models[models["ID"] == uni]["Resi_Covered"].map(lambda x: len(unzip_res_range(x))).values[0]
        coverage = resi_covered / float(total_len)

        # If coverage is low, do not attempt docking
        # (would be inaccurate / bias predicted binding site)
        if(not coverage >= 0.33):
            continue

        # Block new jobs while max_processes jobs are already running.
        # Sleep for 30 seconds between polls to give jobs a chance to finish.
        while(len(processes) >= max_processes):
            time.sleep(30)
            processes = _reap_finished_jobs(processes, finished_processes)

        # Submit next docking job. The trial number doubles as the random
        # seed so every trial explores a different search trajectory.
        # The command is passed as an argument list (no shell).
        args = ["nice", "smina",
                "-r", uni_f,
                "-l", drug_f,
                "--autobox_ligand", uni_f,
                "--autobox_add", "10",
                "-o", in_progress_f,
                "--exhaustiveness", "40",
                "--num_modes", "1000",
                "--cpu", "5",
                "--seed", str(i_num)]
        cmd = " ".join(args)
        p = sp.Popen(args)
        processes.append({"p":p, "cmd":cmd, "start_time":time.time(), "end_time":None, "in":(drug, uni), "out_f":in_progress_f, "final_f":final_f})

Docking Iteration 0


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 1


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 2


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 3


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 4


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 5


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 6


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 7


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 8


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


Docking Iteration 9


HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))




In [29]:
# Wait for the docking jobs that are still running.
# NOTE: The main reason this is necessary is that the output file is only
#       renamed (dropping "_in_progress") after its process is fully done.
while(processes):
    # Sleep for 30 seconds to give processes a chance to finish
    time.sleep(30)

    # Jobs found to be still running on this pass
    still_running = []

    for job in processes:
        exit_code = job["p"].poll()

        # Not exited yet: keep tracking it
        if(exit_code is None):
            still_running.append(job)
            continue

        job["end_time"] = time.time()
        elapsed = job["end_time"] - job["start_time"]
        if(exit_code != 0):
            # Exit code indicates an error, print out error message
            print("Error {0}".format(exit_code))
            print("cmd: {0}".format(job["cmd"]))
            print("RunTime: {0}".format(elapsed))
            print("")
        else:
            # Finished successfully: print total run-time and give the
            # output file its final name
            print("Finished Docking {0} in {1}".format(job["in"], elapsed))
            os.system("mv {0} {1}".format(job["out_f"], job["out_f"].replace("_in_progress", "")))
        finished_processes.append(job)

    # Update active process list
    processes = still_running

In [30]:
# Kill everything left just in case.
# Covers both the active list and the finished list, but only signals jobs
# whose poll() still reports them as running: a job that has already exited
# has nothing to terminate, and signalling its old PID could hit an
# unrelated process.
for p in processes + finished_processes:
    if(p["p"].poll() is not None):
        continue
    try:
        p["p"].terminate()
    except OSError:
        # Process exited between the poll and the signal
        pass

In [31]:
# Parse Ligand Docking Results
#
# Reads in all docking results from N independent trials and, for each
# drug-target pair, writes the top scoring poses across all trials:
#   - Docked_Ligands/ranked_poses/<uni>_<drug>_<rank>.pdb  (one pose per file)
#   - Docked_Ligands/<uni>_<drug>.pdb                      (all kept poses)
# Poses are taken in ascending score order, and a pose is only kept if its
# centroid is at least MIN_CENTROID_DISTANCE away from every pose kept so far.
# At most MAX_RANKED_POSES poses are kept per pair.
#
# NOTE(review): the score is taken from the last token of each REMARK line
#               (last REMARK wins) -- presumably smina's minimizedAffinity,
#               where lower is better; confirm against the smina output.
MAX_RANKED_POSES = 100
MIN_CENTROID_DISTANCE = 1

for drug, uni in tqdm_notebook(drug_pairs[~pd.isnull(drug_pairs["Human ID"])][["Compound Name", "Human ID"]].values):
    # Parse Inputs
    drug = drug.replace("-", "_").replace(" ", "_").split("(")[0]

    # Pairs without a human structure were never docked
    if(len(glob.glob("{0}/Undocked_Structures/{1}*".format(output_dir, uni))) == 0):
        continue

    # Load all finished docking sub-batches. Files still named
    # "*_in_progress.pdb" belong to running or failed jobs and are skipped.
    outs = [x for x in glob.glob("{0}/Docked_Ligands/sub_batches/{1}_{2}_*.pdb".format(output_dir, uni, drug)) if not x.endswith("_in_progress.pdb")]

    if(len(outs) == 0):
        continue

    # Read in all docked poses
    # (sub-batch file, model number) -> (pdb lines, score, centroid)
    model2data = dict()
    for sub_batch_f in outs:
        with open(sub_batch_f, "r") as f:
            for l in f:
                record = l[:6]
                if(record == "MODEL "):
                    # Start of a new pose: reset all per-pose state
                    model = (sub_batch_f, int(l.strip().split()[-1]))
                    lines = ["MODELSTART\n"]
                    score = 0
                    centroid = np.zeros(3)
                    n_atoms = 0
                elif(record == "REMARK"):
                    score = float(l.strip().split()[-1])
                    lines.append(l)
                elif(record == "COMPND"):
                    lines.append(l.replace("UNNAMED", drug.upper()))
                elif(record == "HETATM"):
                    lines.append(l)
                    # Fixed-width x / y / z coordinate columns
                    centroid += np.array([float(l[30:38]), float(l[38:46]), float(l[46:54])])
                    n_atoms += 1
                elif(record == "END   "):
                    lines.append(l)
                elif(record == "ENDMDL"):
                    lines.append(l)
                    # A pose without atoms has no centroid; skip it
                    if(n_atoms > 0):
                        model2data[model] = (lines, score, centroid / float(n_atoms))

    # Select top poses from "distinct" binding sites
    model_num = 1
    cur_centroids = []
    ranked_models = sorted(model2data.items(), key=lambda item: item[1][1])
    with open("{0}/Docked_Ligands/{1}_{2}.pdb".format(output_dir, uni, drug), "w") as combined_out:
        for _, (pose_lines, pose_score, pose_centroid) in ranked_models:
            if(model_num > MAX_RANKED_POSES):
                break

            # Skip poses too close to one that has already been kept
            if(cur_centroids and not min([distance.euclidean(pose_centroid, x) for x in cur_centroids]) >= MIN_CENTROID_DISTANCE):
                continue

            # Renumber the pose by its overall rank
            pose_text = "".join(pose_lines).replace("MODELSTART", "MODEL{0:>9}".format(model_num))
            combined_out.write(pose_text)
            with open("{0}/Docked_Ligands/ranked_poses/{1}_{2}_{3}.pdb".format(output_dir, uni, drug, model_num), "w") as pose_out:
                pose_out.write(pose_text)

            model_num += 1
            cur_centroids.append(pose_centroid)

HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))




In [35]:
# Parse Ligand Docking Results
#
# Read in all scores across all trials for each drug-target pair.
# pair2scores maps (drug, uni) -> ascending list of every minimizedAffinity
# value found in that pair's finished sub-batch files.
#
# The files are scanned directly in Python: a file without any
# "minimizedAffinity" line simply contributes no scores, and file names are
# never passed through a shell.
pair2scores = dict()
for drug, uni in tqdm_notebook(drug_pairs[~pd.isnull(drug_pairs["Human ID"])][["Compound Name", "Human ID"]].values):
    # Parse Inputs
    drug = drug.replace("-", "_").replace(" ", "_").split("(")[0]

    # Pairs without a human structure were never docked
    if(len(glob.glob("{0}/Undocked_Structures/{1}*".format(output_dir, uni))) == 0):
        continue

    # Load all finished docking sub-batches. Files still named
    # "*_in_progress.pdb" belong to running or failed jobs and are skipped.
    outs = [x for x in glob.glob("{0}/Docked_Ligands/sub_batches/{1}_{2}_*.pdb".format(output_dir, uni, drug)) if not x.endswith("_in_progress.pdb")]

    if(len(outs) == 0):
        continue

    scores = []
    for sub_batch_f in outs:
        with open(sub_batch_f, "r") as f:
            for l in f:
                # The score is the last whitespace-separated token of the line
                if("minimizedAffinity" in l):
                    scores.append(float(l.split()[-1]))

    pair2scores[(drug, uni)] = sorted(scores)

HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))


