In [17]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("08_Fetch_Top_Docks")

# NOTE(review): star import. docking_dir, raw_docking_dir, script_dir and
#               output_dir are used below without being defined in this
#               notebook, so presumably they come from config -- confirm.
from config import *
from helper_functions import zip_res_range, unzip_res_range

import pandas as pd
# Show (practically) all columns when a DataFrame is displayed
pd.options.display.max_columns = 999

import numpy as np
import helper as my
import subprocess as sp

import glob
import os

from tqdm import tqdm, tqdm_notebook
# NOTE(review): this re-imports tqdm_notebook from a private tqdm module and
#               rebinds the name imported on the previous line -- confirm it
#               is still needed for the installed tqdm version.
from tqdm._tqdm_notebook import tqdm_notebook

# Enable tqdm's pandas integration (the cells visible in this notebook only
# use tqdm_notebook directly as a loop wrapper / manual progress bar)
tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

This notebook selects the top-scored docking output from each of the HADDOCK runs to use in downstream analysis. It additionally calculates the interface residues from these top-scored docking outputs, compares them against the original ECLAIR predictions, and produces the Docking_Summary.txt. It also updates the Interface_Summary.txt to add interface annotations from the docking results.

**NOTE:** At this stage, AFTER docking has already been run, the docked outputs are filtered so that downstream analysis only retains docked interactions whose structures either had sufficient coverage or included a high-confidence interface prediction that was used to guide docking. Practically speaking, this filtering step should instead be applied before docking is run at all, to save time.

- Inputs:
  - [P1]\_[P2] (Interaction Haddock Run Directory created under "Docking_Runs")
  - Interface_Summary.txt
  - Proteins.txt
  - Models.txt


- Outputs:
  - [P1]\_[P2]_top_dock.pdb
  - Docking_Summary
  - Interface_Summary.txt (updated)


- Dependencies:
  - Must be run after 07_Run_PPI_Docking and 03_Generate_Proteins
  - Calls irescalc.py
    - **NOTE:** irescalc.py *may not* currently be properly extracted from the Yu Lab's server and may not run successfully in this repository. The raw code is provided, but it calls several separate dependencies itself, and I have not been able to thoroughly confirm that nothing specific to our machine is still linked to it.
    - I *believe* it should be functional, but if any end user encounters errors running irescalc.py from this repository, please contact the authors.
    - Requires NACCESS installed locally

# Select Top Docked Poses from HADDOCK Trials

In [4]:
# Create output directory for docked structures
# (os.makedirs also creates any missing parent directories, which os.mkdir
#  would refuse to do; the isdir guard keeps re-running this cell harmless)
if(not os.path.isdir(docking_dir)):
    os.makedirs(docking_dir)

In [11]:
# Read in final output file summary for all docking attempts
docking_trials = glob.glob("{0}/*/run1/structures/it1/water/file.list".format(raw_docking_dir))
print len(docking_trials)

7


In [12]:
# List every entry directly under raw_docking_dir, i.e. every interaction
# for which docking was attempted (compared against finished runs below)
attempted_pattern = "{0}/*".format(raw_docking_dir)
attempted = glob.glob(attempted_pattern)

In [13]:
# NOTE: From my experience a small number of protein structures will fail during topology generation
#       so its worth checking the difference between what was attempted and what actually produced
#       a final output to troubleshoot any issues.
s1 = set(["/".join(x.split("/")[:-5]) for x in docking_trials])
s2 = set(attempted)
print len(s1)
print len(s2)

# Print any attempts with no final output
for x in s2.difference(s1):
    print x

7
7


In [22]:
# Select top-ranked structure from each finished docking run
summary = []
best_structures = dict()
for f in tqdm_notebook(docking_trials):
    # (Low quality homology models retroactively removed)
    # (This manual step would need to be updated if real
    #  PDB structures or better homology models become
    #  available for these proteins)
    if("nsp2" in f or "nsp4" in f or "orf6" in f or "orf9c" in f):
        print "Skipping", f, "manually filterd homology model"
        continue
    
    p1 = f.split("/")[-6].split("_")[0]
    p2 = f.split("/")[-6].split("_")[1]
    
    # Identify the top-ranked file from the docking trial summary file
    # This file contains list of ranked docking outputs + scores...
    #
    # "PREVIT:COVID19nsp8_O00566_1w.pdb"  { 28.8738 }
    # "PREVIT:COVID19nsp8_O00566_18w.pdb"  { 28.9561 }
    for i, l in enumerate(open(f).readlines()):
        name = l.split(":")[1].split("\"")[0]
        score = float(l.split(":")[1].split("{")[1].split("}")[0].strip())
        
        if(i == 0):
            best_structures[f.split("/")[-6]] = os.path.dirname(f) + "/" + name
        
        attempt = int(name.split("_")[-1].split("w")[0])
        summary.append([p1, p2, attempt, score, os.path.abspath(os.path.dirname(f) + "/" + name), i+1])
        
summary = pd.DataFrame(summary, columns=["P1", "P2", "Attempt", "Score", "File", "Rank"])

HBox(children=(IntProgress(value=0, max=7), HTML(value=u'')))

In [23]:
# Map (P1, P2, attempt number) -> docking score for every docked output
# listed in each trial's file.list.
# NOTE(review): name2score is not referenced by any other cell visible in
#               this notebook -- confirm whether it is still needed.
name2score = dict()
for f in tqdm_notebook(docking_trials):
    # (Low quality homology models retroactively removed)
    if("nsp2" in f or "nsp4" in f or "orf6" in f or "orf9c" in f):
        #print "Skipping", f
        continue

    # The interaction directory ("[P1]_[P2]") sits six path components from
    # the end: [P1]_[P2]/run1/structures/it1/water/file.list
    inter = f.split("/")[-6]
    p1 = inter.split("_")[0]
    p2 = inter.split("_")[1]

    # Identify the top-ranked file from the docking trial summary file
    # This file contains list of ranked docking outputs + scores...
    #
    # "PREVIT:COVID19nsp8_O00566_1w.pdb"  { 28.8738 }
    # "PREVIT:COVID19nsp8_O00566_18w.pdb"  { 28.9561 }
    #
    # Use a context manager so the handle is always closed, and skip blank
    # lines (e.g. a trailing newline) that would otherwise raise IndexError.
    with open(f) as handle:
        ranked_lines = [l for l in handle if l.strip()]

    for l in ranked_lines:
        entry = l.split(":")[1]
        name = entry.split("\"")[0]
        score = float(entry.split("{")[1].split("}")[0].strip())

        # The attempt number is the file name suffix, e.g. "_18w.pdb" -> 18
        attempt = int(name.split("_")[-1].split("w")[0])

        name2score[(p1, p2, attempt)] = score

HBox(children=(IntProgress(value=0, max=7), HTML(value=u'')))

In [16]:
# Copy files to output directory
# The arguments are passed as a list (no shell), so paths containing spaces
# or shell metacharacters are copied verbatim. check_call raises
# CalledProcessError when a copy fails instead of silently ignoring it.
for k, v in best_structures.iteritems():
    sp.check_call(["cp", v, "{0}/{1}_top_dock.pdb".format(docking_dir, k)])

In [30]:
# Calculate Interface Residues
# NOTE: Interface residues are computed for EVERY docking output listed
#       in the summary (all 200 per trial by default), not only for the
#       top-ranked output used in downstream analysis. If the interfaces
#       of the lower-ranked outputs are not needed this adds significant,
#       avoidable run-time.
#
#       e.g. The provided example input would run in ~10 seconds
#       using only the top-ranked output compared to ~30 minutes
#       using ALL outputs.

# Location of the interface residue calculator script
irescalc_path = "{0}/irescalc.py".format(script_dir)

# One progress tick per docked structure in the summary
pbar = tqdm_notebook(total=len(summary))
def calc_ires(f, c1="A", c2="B"):
    """
    Calculate the interface residues of one docked structure with irescalc.py.

    Also advances the module-level progress bar `pbar` by one step.

    Args:
        f:  Path to the docked PDB file.
        c1: Chain ID passed to irescalc.py as -c1 (default "A").
        c2: Chain ID passed to irescalc.py as -c2 (default "B").

    Returns:
        (ires1, ires2) - the first two lines of irescalc.py's standard
        output. Downstream cells parse each as a comma-separated list of
        residue positions (presumably for chain c1 and chain c2
        respectively -- confirm against irescalc.py).

    Raises:
        subprocess.CalledProcessError: irescalc.py exited with a non-zero
            status.
        ValueError: irescalc.py printed fewer than two lines.
    """
    pbar.update()

    # The chain arguments are now honoured (they were previously hard-coded
    # to "A" / "B"). The command is passed as an argument list rather than a
    # shell string so that paths containing spaces are handled correctly.
    cmd = [irescalc_path, f, "-c1", c1, "-c2", c2]
    ires1, ires2 = sp.check_output(cmd).split("\n")[:2]

    # Any failure propagates to the caller. The original try/except only
    # re-raised, and its NaN fallback was unreachable.
    return ires1, ires2
# FUNCTION END
# Run the interface calculation on every docked file; each result is an
# (ires1, ires2) pair which is split into its own column
tmp = summary["File"].map(calc_ires)
summary["P1_Ires"] = tmp.map(lambda pair: pair[0])
summary["P2_Ires"] = tmp.map(lambda pair: pair[1])

HBox(children=(IntProgress(value=0, max=7), HTML(value=u'')))

In [40]:
# Add comparison against ECLAIR Ires
# Load the interface summary and keep only the ECLAIR predictions, then
# index them as (P1, P2) -> {"P1_Ires": ..., "P2_Ires": ...}
ires_summary = pd.read_csv("{0}/Interface_Summary.txt".format(output_dir), sep="\t")
is_eclair = ires_summary["Source"] == "ECLAIR"
ires_summary = ires_summary[is_eclair]
eclair_by_inter = ires_summary.set_index(["P1", "P2"])
interaction2ires = eclair_by_inter[["P1_Ires", "P2_Ires"]].to_dict(orient="index")

# Calculates interface similarity with ECLAIR results
# and calculates the fraction of ECLAIR predicted
# interfaces retained in the docked interface
def _parse_ires(ires):
    """Turn a comma-separated residue string (or NaN / None) into a set of residue tokens."""
    if(pd.isnull(ires)):
        return set()
    # A column holding a single residue per row may be parsed by pandas as a
    # float (e.g. 12.0); normalize so it compares equal to the token "12"
    if(isinstance(ires, float) and ires == int(ires)):
        ires = int(ires)
    # Drop empty tokens so that "" or a trailing comma does not add a
    # phantom "" residue (which would inflate the Jaccard union)
    return set(tok.strip() for tok in str(ires).split(",") if tok.strip())

def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    if(denominator == 0):
        return np.nan
    return numerator / float(denominator)

def calc_stats(args, eclair_ires=None):
    """
    Compare the docked interface of one interaction against its ECLAIR prediction.

    Args:
        args: Sequence (p1, p2, ires1, ires2) where ires1 / ires2 are the
              docked interface residues of P1 / P2 as comma-separated
              strings (or NaN).
        eclair_ires: Optional dict (p1, p2) -> {"P1_Ires": ..., "P2_Ires": ...}
              holding the ECLAIR predictions. Defaults to the module-level
              `interaction2ires`.

    Returns:
        (j1, j2, r1, r2) where
          j1 / j2 - Jaccard similarity between docked and ECLAIR interface
                    residues for P1 / P2 (NaN if both sets are empty)
          r1 / r2 - fraction of ECLAIR predicted residues recovered in the
                    docked interface for P1 / P2 (NaN if ECLAIR predicted
                    no residues)

    Raises:
        KeyError: (p1, p2) has no entry in the ECLAIR predictions.
    """
    p1, p2, ires1, ires2 = args
    if(eclair_ires is None):
        eclair_ires = interaction2ires

    # Docked interface residues
    ires1 = _parse_ires(ires1)
    ires2 = _parse_ires(ires2)

    # ECLAIR predicted interface residues
    predicted = eclair_ires[(p1, p2)]
    real_ires1 = _parse_ires(predicted["P1_Ires"])
    real_ires2 = _parse_ires(predicted["P2_Ires"])

    # Calculate Jaccard Similarity
    j1 = _ratio(len(ires1 & real_ires1), len(ires1 | real_ires1))
    j2 = _ratio(len(ires2 & real_ires2), len(ires2 | real_ires2))

    # Calculate Recall
    r1 = _ratio(len(ires1 & real_ires1), len(real_ires1))
    r2 = _ratio(len(ires2 & real_ires2), len(real_ires2))

    return j1, j2, r1, r2
# FUNCTION END
# Compute (P1_Jaccard, P2_Jaccard, P1_Recall, P2_Recall) for every docked
# output and unpack the resulting tuples into one column each
tmp = summary[["P1", "P2", "P1_Ires", "P2_Ires"]].apply(calc_stats, axis=1)
for pos, col in enumerate(["P1_Jaccard", "P2_Jaccard", "P1_Recall", "P2_Recall"]):
    summary[col] = [stats[pos] for stats in tmp]

In [42]:
# Save Summary (Raw version with all docks included + no coverage limits)
# NOTE: Separate Raw / Filtered versions are theoretically only
#       necessary here because the coverage thresholds were applied
#       after initial docking using all available structures.
#       (i.e. in the future this coverage filter should instead be
#       applied during the select models step to avoid even running
#       the docking trials in cases where they will be filtered anyway.)
raw_columns = ["P1", "P2", "Attempt", "File", "Rank", "Score", "P1_Jaccard", "P1_Recall", "P1_Ires", "P2_Jaccard", "P2_Recall", "P2_Ires"]
raw_summary_path = "{0}/Docking_Summary_Raw.txt".format(output_dir)
summary[raw_columns].to_csv(raw_summary_path, sep="\t", index=None)

# Add Docking Source to Interface Summary

In [43]:
# Read Docking Summary
docking_summary = pd.read_csv("{0}/Docking_Summary_Raw.txt".format(output_dir), sep="\t")

# Read Interface Summary (remove any docking entries in case already been added)
interface_summary = pd.read_csv("{0}/Interface_Summary.txt".format(output_dir), sep="\t")
print len(interface_summary)
interface_summary = interface_summary[~(interface_summary["Source"] == "Docking")]
print len(interface_summary)

10
10


In [44]:
# Only keep the entries from the top-ranked dock
# (.copy() yields an independent frame, so the later assignment to its
#  P2_Ires column does not act on a slice of the full summary and trigger
#  pandas' SettingWithCopyWarning)
docking_summary = docking_summary[docking_summary["Rank"] == 1].copy()

In [45]:
# Remove "Unmapped" Residues From SIFTS Mapping (reported as negative values)
def _drop_unmapped(ires):
    """
    Remove residues containing "-" from a comma-separated residue string.

    A missing value (NaN, e.g. an empty interface read back from the CSV)
    becomes "" so that later string operations on the column keep working.
    """
    if(pd.isnull(ires)):
        return ""
    # A column holding a single residue per row may be parsed as a float
    if(isinstance(ires, float) and ires == int(ires)):
        ires = int(ires)
    return ",".join([a for a in str(ires).split(",") if a and "-" not in a])

# assign() builds a new frame rather than writing into what may be a slice
# of the full docking summary (avoids SettingWithCopyWarning)
docking_summary = docking_summary.assign(P2_Ires=docking_summary["P2_Ires"].map(_drop_unmapped))

# Check for Proteins that used "Alternate" column in original Human PDB
# I only know of one of these and have created a map to manually correct it
# (empty tokens are ignored so an empty interface is not reported here)
docking_summary[docking_summary["P2_Ires"].map(lambda x: not all([a.isdigit() for a in x.split(",") if a]))]

Unnamed: 0,P1,P2,Attempt,File,Rank,Score,P1_Jaccard,P1_Recall,P1_Ires,P2_Jaccard,P2_Recall,P2_Ires


In [46]:
# Prep Docking Summary data in Interface Summary Format
id2seq = pd.read_csv("{0}/Proteins.txt".format(output_dir), sep="\t").set_index("ID")["Sequence"].to_dict()

def _n_ires(ires):
    """Count the residues in a comma-separated string; NaN / "" count as 0."""
    if(pd.isnull(ires)):
        return 0
    # "".split(",") is [""], so empty tokens must be skipped explicitly;
    # otherwise an empty interface would be reported as one residue
    return len([r for r in str(ires).split(",") if r])

# One row per docked interaction:
# P1, P2, source label, P1 length, P1 residue count, P1 residues,
# P2 length, P2 residue count, P2 residues
tmp = []
for p1, p2, ires1, ires2 in docking_summary[["P1", "P2", "P1_Ires", "P2_Ires"]].values:
    tmp.append([p1, p2, "Docking", len(id2seq[p1]), _n_ires(ires1), ires1, len(id2seq[p2]), _n_ires(ires2), ires2])

# NOTE(review): assumes interface_summary has exactly these nine columns in
#               this order -- confirm against Interface_Summary.txt
tmp = pd.DataFrame(tmp, columns=list(interface_summary))

In [47]:
# Append the docking rows to the existing interface summary, order the
# combined table by interaction and source, and write it back to disk
combined = pd.concat([interface_summary, tmp])
interface_summary = combined.sort_values(["P1", "P2", "Source"], ascending=True)
interface_summary_path = "{0}/Interface_Summary.txt".format(output_dir)
interface_summary.to_csv(interface_summary_path, sep="\t", index=None)

# Filter Docking Results Based on Structural Coverage

In [48]:
# NOTE: This filtering step should really be done before docking
#       to save time.

In [49]:
# Only report docking results from structures where either...
# 1. Available structure covers at least 33% of the protein
# 2. A high-confidence ECLAIR prediction in the structure could
#    be used as a guide during docking.

In [50]:
# WARNING: This partially relies on the assumption that each human protein
#          only interacts with one viral protein (current data) and therefore
#          only one model exists for each protein.
#          (If an ID occurs more than once in Models.txt, only one of its
#           rows survives in the id2coverage dictionary built below.)

# Read in models used Summary
models = pd.read_csv("{0}/Models.txt".format(output_dir), sep="\t")

# Read in dictionary of protein ID to Protein Sequences
id2seq = pd.read_csv("{0}/Proteins.txt".format(output_dir), sep="\t").set_index("ID")["Sequence"].to_dict()

# Add coverage Info
models["Len"] = models["ID"].map(lambda prot_id: len(id2seq[prot_id]))
models["N_Covered"] = models["Resi_Covered"].map(lambda resi: len(unzip_res_range(resi)))
models["Coverage"] = models["N_Covered"] / models["Len"]

# Summarize Protein to coverage info
# Store coverage percentage, N residues covered, and list of residues covered
def _summarize_coverage(row):
    """Return (coverage fraction, N residues covered, set of covered residue positions) for one model row."""
    # Columns are read by label, and the comprehension variable no longer
    # shares a name with the row. Previously both were called `x`, so under
    # Python 2 the comprehension rebound the row variable, and the row was
    # indexed by position.
    covered = set([int(resi) for resi in unzip_res_range(row["Resi_Covered"])])
    return (row["Coverage"], row["N_Covered"], covered)

id2coverage = models.set_index("ID")[["Coverage", "N_Covered", "Resi_Covered"]].apply(_summarize_coverage, axis=1).to_dict()

In [51]:
# Load the interface summary and keep only the original ECLAIR predictions
all_interfaces = pd.read_csv("{0}/Interface_Summary.txt".format(output_dir), sep="\t")
preds = all_interfaces[all_interfaces["Source"] == "ECLAIR"]

# Parse as dictionary
def _to_resi_set(ires):
    """Parse a comma-separated residue value (or NaN / None) into a set of ints."""
    if(pd.isnull(ires)):
        return set([])

    resi = set([])
    for tok in str(ires).split(","):
        tok = tok.strip()
        # Skip empty tokens ("" or a trailing comma)
        if(not tok):
            continue
        try:
            resi.add(int(tok))
        except ValueError:
            # A column holding a single residue per row may be parsed by
            # pandas as a float, giving tokens such as "12.0". Accept these
            # only when they are whole numbers.
            as_float = float(tok)
            if(as_float != int(as_float)):
                raise
            resi.add(int(as_float))
    return resi

def do(iresA, iresB):
    """
    Convert a pair of interface residue values into a pair of integer sets.

    Args:
        iresA: Comma-separated residue positions (string or number), or
               NaN / None when there are no residues.
        iresB: Same as iresA, for the second protein.

    Returns:
        (set of int, set of int) - parsed residue positions for A and B;
        a missing value gives an empty set.

    Raises:
        ValueError: a token is not a whole number.
    """
    return (_to_resi_set(iresA), _to_resi_set(iresB))
# FUNCTION END
# Build (P1, P2) -> (set of P1 residues, set of P2 residues) from the
# ECLAIR rows
eclair_ires = preds[preds["Source"] == "ECLAIR"].set_index(["P1", "P2"])[["P1_Ires", "P2_Ires"]]
inter2ECLAIR_preds = eclair_ires.apply(lambda row: do(row["P1_Ires"], row["P2_Ires"]), axis=1).to_dict()

In [52]:
# Decide, interaction by interaction, whether its docking results should
# be reported. Both partners must pass at least one of the two conditions.
coverage_thresh = 0.33
n_covered_thresh = 50
n_ires_thresh = 1

dockable_inters = set()
for (p1, p2), (preds1, preds2) in inter2ECLAIR_preds.items():
    # Condition 0 - both proteins need a structure
    if(p1 not in id2coverage or p2 not in id2coverage):
        continue

    partner_usable = []
    for prot, predicted in ((p1, preds1), (p2, preds2)):
        # id2coverage stores (coverage fraction, N covered, covered residues)
        frac_covered, n_covered, covered_resi = id2coverage[prot]

        # Condition 1 - Sufficient Structural Coverage
        well_covered = frac_covered >= coverage_thresh

        # Condition 2 - High Confidence Ires: enough predicted interface
        # residues fall inside the structure and the structure itself
        # covers enough residues
        covered_preds = predicted.intersection(covered_resi)
        guided = (len(covered_preds) >= n_ires_thresh and n_covered >= n_covered_thresh)

        partner_usable.append(well_covered or guided)

    if(all(partner_usable)):
        dockable_inters.add((p1, p2))

In [53]:
# Filter and re-save interface summary
keep_summary = interface_summary[(interface_summary["Source"] == "ECLAIR") | (interface_summary[["P1", "P2"]].apply(lambda x: tuple(x) in dockable_inters, axis=1))]
print keep_summary["Source"].value_counts()
keep_summary.to_csv("{0}/Interface_Summary.txt".format(output_dir), sep="\t", index=None)

ECLAIR     10
Docking     7
Name: Source, dtype: int64


In [54]:
# Save the filtered docking summary: all docking attempts, restricted to
# the interactions that passed the coverage filter
dockable_mask = summary[["P1", "P2"]].apply(lambda pair: tuple(pair) in dockable_inters, axis=1)
final_columns = ["P1", "P2", "Attempt", "File", "Rank", "Score", "P1_Jaccard", "P1_Recall", "P1_Ires", "P2_Jaccard", "P2_Recall", "P2_Ires"]
summary[dockable_mask][final_columns].to_csv("{0}/Docking_Summary.txt".format(output_dir), sep="\t", index=None)