In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("Fetch_Top_Docks")

import pandas as pd
pd.options.display.max_columns = 999

import numpy as np
import helper as my

import glob
import os

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from mjm_tools import zip_res_range, unzip_res_range

# Paramaters for file locations

In [2]:
# Base directory for whole project
base_dir = "/home/sdw95/3D_SARS2"

# Original HADDOCK Docking Output Location
orig_dir = "{0}/COVID19_Docking/Haddock_Output".format(base_dir)

# Destination directory to store top-ranked docked conformations
docked_dir = "{0}/Data/Docked_Structures".format(base_dir)

# Select Top Docked Poses from HADDOCK Trials

In [3]:
# Create output directory for docked structures
if(not os.path.exists(docked_dir)):
    os.mkdir(docked_dir)

In [4]:
# Read in final output file summary for all docking attempts
docking_trials = glob.glob("{0}/*/run1/structures/it1/water/file.list".format(orig_dir))
print len(docking_trials)

265


In [5]:
# Compare against list of all interactions for which docking was attempted
attempted = glob.glob("{0}/*".format(orig_dir))

In [6]:
# NSP9 seems to fail during topology generation
# And two orf8 attempts did not run
s1 = set(["/".join(x.split("/")[:-5]) for x in docking_trials])
s2 = set(attempted)
print len(s1)
print len(s2)
for x in s2.difference(s1):
    print x

265
283
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q99567
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q9BVL2
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_P61962
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q7Z3B4
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q8N0X7
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19orf8_P00750
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q9NZL9
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q15056
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_P13984
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q8TD19
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q96F45
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_P37198
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_P35658
/home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp9_Q86YT

In [14]:
# Select top-ranked structure from each finished docking run
summary = []
best_structures = dict()
for f in tqdm_notebook(docking_trials):
    # (Low quality homology models retroactively removed)
    if("nsp2" in f or "nsp4" in f or "orf6" in f or "orf9c" in f):
        #print "Skipping", f
        continue
    
    p1 = f.split("/")[-6].split("_")[0]
    p2 = f.split("/")[-6].split("_")[1]
    
    # Identify the top-ranked file from the docking trial summary file
    # This file contains list of ranked docking outputs + scores...
    #
    # "PREVIT:COVID19nsp8_O00566_1w.pdb"  { 28.8738 }
    # "PREVIT:COVID19nsp8_O00566_18w.pdb"  { 28.9561 }
    for i, l in enumerate(open(f).readlines()):
        name = l.split(":")[1].split("\"")[0]
        score = float(l.split(":")[1].split("{")[1].split("}")[0].strip())
        
        if(i == 0):
            best_structures[f.split("/")[-6]] = os.path.dirname(f) + "/" + name
        
        attempt = int(name.split("_")[-1].split("w")[0])
        summary.append([p1, p2, attempt, score, os.path.abspath(os.path.dirname(f) + "/" + name), i+1])
        
summary = pd.DataFrame(summary, columns=["P1", "P2", "Attempt", "Score", "File", "Rank"])

HBox(children=(IntProgress(value=0, max=265), HTML(value=u'')))




In [23]:
name2score = dict()
for f in tqdm_notebook(docking_trials):
    # (Low quality homology models retroactively removed)
    if("nsp2" in f or "nsp4" in f or "orf6" in f or "orf9c" in f):
        #print "Skipping", f
        continue
    
    p1 = f.split("/")[-6].split("_")[0]
    p2 = f.split("/")[-6].split("_")[1]
    
    # Identify the top-ranked file from the docking trial summary file
    # This file contains list of ranked docking outputs + scores...
    #
    # "PREVIT:COVID19nsp8_O00566_1w.pdb"  { 28.8738 }
    # "PREVIT:COVID19nsp8_O00566_18w.pdb"  { 28.9561 }
    for i, l in enumerate(open(f).readlines()):
        name = l.split(":")[1].split("\"")[0]
        score = float(l.split(":")[1].split("{")[1].split("}")[0].strip())
        
        attempt = int(name.split("_")[-1].split("w")[0])
        
        name2score[(p1, p2, attempt)] = score

HBox(children=(IntProgress(value=0, max=265), HTML(value=u'')))

In [15]:
# Copy files to output directory
for k, v in best_structures.iteritems():
    os.system("cp {0} {1}/{2}_top_dock.pdb".format(v, docked_dir, k))

In [16]:
# Calculate Interface Residues
pbar = tqdm_notebook(total=len(summary))
def calc_ires(f, c1="A", c2="B"):
    try:
        pbar.update()
        ires1, ires2 = my.call("python /home/resources/mjm_tools/irescalc.py {0} -c1 {1} -c2 {2}".format(f, "A", "B"))

        return ires1, ires2
    except KeyboardInterrupt:
        raise
    except:
        return np.nan, np.nan
# FUNCTION END
tmp = summary["File"].map(calc_ires)
summary["P1_Ires"] = [x[0] for x in tmp]
summary["P2_Ires"] = [x[1] for x in tmp]

HBox(children=(IntProgress(value=0, max=44200), HTML(value=u'')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [20]:
# Add Comparison Against ECLIAR Ires
ires_summary = pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")
ires_summary = ires_summary[ires_summary["Source"] == "ECLAIR"]
interaction2ires = ires_summary.set_index(["P1", "P2"])[["P1_Ires", "P2_Ires"]].to_dict(orient="index")

# Calculates interface similarity with ECLAIR results
# and calculates the fraciton of ECLAIR predicted
# interfaces retained in the docked interface
def calc_stats(args):
    #print len(args)
    #print args
    p1, p2, ires1, ires2 = args

    # Format Sets
    if(pd.isnull(ires1)):
        ires1 = set()
    else:
        ires1 = set(ires1.split(","))
    if(pd.isnull(ires2)):
        ires2 = set()
    else:
        ires2 = set(ires2.split(","))

    # Fetch Eclair Ires / Format Sets
    real_ires1 = interaction2ires[(p1, p2)]["P1_Ires"]
    real_ires2 = interaction2ires[(p1, p2)]["P2_Ires"]

    if(pd.isnull(real_ires1)):
        real_ires1 = set()
    else:
        real_ires1 = set(real_ires1.split(","))
    if(pd.isnull(real_ires2)):
        real_ires2 = set()
    else:
        real_ires2 = set(real_ires2.split(","))

    # Calculate Jaccard Similarity
    j1 = np.nan
    try:
        j1 = len(ires1.intersection(real_ires1)) / float(len(ires1.union(real_ires1)))
    except ZeroDivisionError:
        pass

    j2 = np.nan
    try:
        j2 = len(ires2.intersection(real_ires2)) / float(len(ires2.union(real_ires2)))
    except ZeroDivisionError:
        pass

    # Calculate Recall
    r1 = np.nan
    try:
        r1 = len(ires1.intersection(real_ires1)) / float(len(real_ires1))
    except ZeroDivisionError:
        pass

    r2 = np.nan
    try:
        r2 = len(ires2.intersection(real_ires2)) / float(len(real_ires2))
    except ZeroDivisionError:
        pass


    return j1, j2, r1, r2
# FUNCTION END
tmp = summary[["P1", "P2", "P1_Ires", "P2_Ires"]].apply(calc_stats, axis=1)
summary["P1_Jaccard"] = [x[0] for x in tmp]
summary["P2_Jaccard"] = [x[1] for x in tmp]
summary["P1_Recall"] = [x[2] for x in tmp]
summary["P2_Recall"] = [x[3] for x in tmp]

In [106]:
# Save Summary (Raw version with all docks)
# NOTE: Separate Raw / Filtered versions are theoretically
#       only necessary here because coverage thresholds were
#       applied after initial docking using all available
#       structures. (i.e. in the future this should be done
#       earlier)
summary[["P1", "P2", "Attempt", "File", "Rank", "Score", "P1_Jaccard", "P1_Recall", "P1_Ires", "P2_Jaccard", "P2_Recall", "P2_Ires"]].to_csv("{0}/Data/Docking_Summary_Raw.txt".format(base_dir), sep="\t", index=None)

# Add Docking Source to Interface Summary

In [37]:
# Read Docking Summary
docking_summary = pd.read_csv("{0}/Data/Docking_Summary.txt".format(base_dir), sep="\t")

# Read Interface Summary (remove any docking entries in case already been added)
interface_summary = pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")
print len(interface_summary)
interface_summary = interface_summary[~(interface_summary["Source"] == "Docking")]
print len(interface_summary)

332
332


In [38]:
# Only keep the entries from the top-ranked dock
docking_summary = docking_summary[docking_summary["Rank"] == 1]

In [39]:
# Remove "Unmapped" Residues From SIFTS Mapping (reported as negative values)
docking_summary["P2_Ires"] = docking_summary["P2_Ires"].map(lambda x: ",".join([a for a in x.split(",") if not "-" in a]))

# Check for Proteins that used "Alternate" column in original Human PDB
# I only know of one of these and have created a map to manually correct it
docking_summary[docking_summary["P2_Ires"].map(lambda x: not all([str.isdigit(a) for a in x.split(",")]))]

Unnamed: 0,P1,P2,Attempt,File,Rank,Score,P1_Jaccard,P1_Recall,P1_Ires,P2_Jaccard,P2_Recall,P2_Ires


In [44]:
# Prep Docking Summary data in Interface Summary Format
id2seq = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t").set_index("ID")["Sequence"].to_dict()
tmp = []
for p1, p2, ires1, ires2 in docking_summary[["P1", "P2", "P1_Ires", "P2_Ires"]].values:
    tmp.append([p1, p2, "Docking", len(id2seq[p1]), len(ires1.split(",")), ires1, len(id2seq[p2]), len(ires2.split(",")), ires2])
tmp = pd.DataFrame(tmp, columns=list(interface_summary))

In [45]:
# Concatenate results / save new summary
interface_summary = pd.concat([interface_summary, tmp]).sort_values(["P1", "P2", "Source"], ascending=True)
interface_summary.to_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t", index=None)

# Filter Docking Resuts Based on Structural Coverage

In [107]:
# NOTE: This filtering step should really be done before docking
#       to save time.

In [46]:
# Only report docking results from structures where either...
# 1. Available structure covers at least 33% of the protein
# 2. A high-confidence ECLAIR prediction in the structure could
#    be used as a guide during docking.

In [69]:
# WARNING: This partially relies on the assumption that each human protein
#          only interacts with one viral protein (current data) and therefore
#          only one model exists for each protein.

# Read in models used Summary
models = pd.read_csv("{0}/Data/Models.txt".format(base_dir), sep="\t")

# Read in dictionary of protein ID to Protein Sequences
id2seq = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t").set_index("ID")["Sequence"].to_dict()

# Add coverage Info
models["Len"] = models["ID"].map(lambda x: len(id2seq[x]))
models["N_Covered"] = models["Resi_Covered"].map(lambda x: len(unzip_res_range(x)))
models["Coverage"] = models["N_Covered"] / models["Len"]

# Summarize Protein to coverage info
# Store coverage percentage, N residues covered, and list of residues covered
id2coverage = models.set_index("ID")[["Coverage", "N_Covered", "Resi_Covered"]].apply(lambda x: (x[0], x[1], set([int(x) for x in unzip_res_range(x[2])])), axis=1).to_dict()

In [70]:
# Read in original ECLAIR preds
preds = pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")
preds = preds[preds["Source"] == "ECLAIR"]

# Parse as dictionary
def do(iresA, iresB):
    if(pd.isnull(iresA)):
        iresA = set([])
    else:
        iresA = set([int(x) for x in str(iresA).split(",")])
    
    if(pd.isnull(iresB)):
        iresB = set([])
    else:
        iresB = set([int(x) for x in str(iresB).split(",")])
    
    return (iresA, iresB)
# FUNCTION END
inter2ECLAIR_preds = preds[preds["Source"] == "ECLAIR"].set_index(["P1", "P2"])[["P1_Ires", "P2_Ires"]].apply(lambda x: do(*x), axis=1).to_dict()

In [100]:
# Iterate over each interaction to figure out if the docking results
# should be used
coverage_thresh = 0.33
n_covered_thresh = 50
n_ires_thresh = 1

dockable_inters = set()
for p1, p2 in inter2ECLAIR_preds.keys():
    if(p2 == "O00203"):
        print "HERE"
    usable1 = False
    usable2 = False
    
    # Condition 0 - Has structures
    if(not p1 in id2coverage):
        continue
    if(not p2 in id2coverage):
        continue
    
    # Condition 1 - Sufficient Structural Coverage
    usable1 = usable1 or id2coverage[p1][0] >= coverage_thresh
    usable2 = usable2 or id2coverage[p2][0] >= coverage_thresh
    
    if(p2 == "O00203"):
        print usable2
    
    # Condition 2 - High Confidence Ires
    covered_preds1 = inter2ECLAIR_preds[(p1, p2)][0].intersection(id2coverage[p1][2])
    n_covered1 = id2coverage[p1][1]
    usable1 = usable1 or (len(covered_preds1) >= n_ires_thresh and n_covered1 >= n_covered_thresh)
    
    covered_preds2 = inter2ECLAIR_preds[(p1, p2)][1].intersection(id2coverage[p2][2])
    n_covered2 = id2coverage[p2][1]
    usable2 = usable2 or (len(covered_preds2) >= n_ires_thresh and n_covered2 >= n_covered_thresh)
    
    if(p2 == "O00203"):
        print usable2
    
    if(usable1 == False):
        print p1, p2, usable2
    if(usable1 and usable2):
        dockable_inters.add((p1, p2))

HERE
True
True
COVID19N Q9HCE1 False


In [103]:
# Filter and re-save interface summary
keep_summary = interface_summary[(interface_summary["Source"] == "ECLAIR") | (interface_summary[["P1", "P2"]].apply(lambda x: tuple(x) in dockable_inters, axis=1))]
print keep_summary["Source"].value_counts()
keep_summary.to_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t", index=None)

ECLAIR     332
Docking    138
Name: Source, dtype: int64


In [112]:
# Save filterd docking summary (with all docking attempts)
summary[summary[["P1", "P2"]].apply(lambda x: tuple(x) in dockable_inters, axis=1)][["P1", "P2", "Attempt", "File", "Rank", "Score", "P1_Jaccard", "P1_Recall", "P1_Ires", "P2_Jaccard", "P2_Recall", "P2_Ires"]].to_csv("{0}/Data/Docking_Summary.txt".format(base_dir), sep="\t", index=None)

In [185]:
# Save modified summary for use in website
# Only retain top rank and reformat docked file names
# to point to their location in the Data folder with no
# base dir
summary_web = summary[summary["Rank"] == 1].copy()
summary_web["File"] = summary_web[["P1", "P2"]].apply(lambda x: glob.glob("{0}/Data/Docked_Structures/*{1}*{2}*".format(base_dir, x[0], x[1]))[0].replace(base_dir + "/", ""), axis=1)
summary_web.to_csv("{0}/Data/Docking_Summary_Charles.txt".format(base_dir), sep="\t", index=None)