In [4]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("15_Calculate_Drug_Interfaces")

from config import *
from helper_functions import pdb2df, odds_ratio

import pandas as pd
pd.options.display.max_columns = 999

import numpy as np

import glob
import os

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

import subprocess as sp

This notebook calculates the interface residues from all drug-protein docked outputs (summarized in Drug_Docking_Ires_Summary.txt). It then calculates the log odds enrichment for co-occurence of ligand binding site residues for the drug-protein pair with interface residues for the human-viral protein-protein interaction (summarized in Drug_Interface_Enrichment.txt).


- Inputs:
  - Krogan_Drug_Candidates.txt
  - [P1]\_[Drug].pdb (top ranked drug docking for each protein-drug pair)
  - [Prot]\_[Source].pdb (Undocked Structures)
  - Proteins.txt


- Outputs:
  - Drug_Docking_Ires_Summary.txt
  - Drug_Interface_Enrichment.txt


- Dependencies:
  - Must be run after 14_Run_Drug_Docking
  - Calls irescalc_ligand.py
    - **NOTE:** irescalc_ligand.py *may not* be currently properly extraced from the Yu Lab's server and may not run successfully in this repository. The raw code is provided, but it itself calls several separate dependencies, and I have not been able to thoroughly confirm there are no specifics to our machine still linked to it.
    - I *believe* it should be functional, but if any end user encounters errors runngin irescalc_ligand.py from this repository please contact the authors.
    - Requires NACCESS installed locally

# Calculate Ligand Interface Residues

In [2]:
drug_pairs = pd.read_csv("{0}/Krogan_Drug_Candidates.txt".format(output_dir), sep="\t")

In [5]:
summary = []
pbar = tqdm_notebook(total=len(glob.glob("{0}/Docked_Ligands/ranked_poses/*".format(output_dir))))
for drug, uni in tqdm_notebook(drug_pairs[~pd.isnull(drug_pairs["Human ID"])][["Compound Name", "Human ID"]].values):
    # Parse Inputs
    drug_orig = drug
    drug = drug.replace("-", "_").replace(" ", "_").split("(")[0]
    drug_f = glob.glob("{0}/Ligands/{1}*".format(output_dir, drug))[0]
    
    try:
        uni_f = glob.glob("{0}/Undocked_Structures/{1}*".format(output_dir, uni))[0]
    except IndexError:
        continue
    
    # Load all ranked docking poses
    outs = glob.glob("{0}/Docked_Ligands/ranked_poses/{1}_{2}_*.pdb".format(output_dir, uni, drug))
    
    for f in outs:
        rank = int(f.split("_")[-1].split(".")[0])
        #print drug, uni, rank
        #continue
        if(not rank <= 10):
            continue
        score = float(open(f, "r").readlines()[1].split()[-1])
        ires = sp.check_output("python irescalc_ligand.py {0} {1}".format(uni_f, f), shell=True).strip()
        
        summary.append([uni, drug_orig, rank, score, ires])
        pbar.update()
summary = pd.DataFrame(summary, columns=["Human_Protein", "Compound_Name", "Rank", "Score", "Ires"])

HBox(children=(IntProgress(value=0, max=1034), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=12), HTML(value=u'')))




In [15]:
# Sort Summary By Drug Target Pair / Rank
summary = summary.sort_values(["Human_Protein", "Compound_Name", "Rank"])

In [22]:
# Save Summary
summary.to_csv("{0}/Drug_Docking_Ires_Summary.txt".format(output_dir), sep="\t", index=None)

In [23]:
summary = pd.read_csv("{0}/Drug_Docking_Ires_Summary.txt".format(output_dir), sep="\t")

# Calculate Enrichment For Protein / Ligand IRES Overlap

In [24]:
# Read in Protein Interfaces (from docked results)
p_ires = pd.read_csv("{0}/Interface_Summary.txt".format(output_dir), sep="\t")
p_ires = p_ires[p_ires["Source"] == "Docking"]

# Read in Ligand Interfaces
l_ires = pd.read_csv("{0}/Drug_Docking_Ires_Summary.txt".format(output_dir), sep="\t")

In [25]:
# Merge the data (use left join to retain all ligand docks performed)
merged = l_ires.join(p_ires[["P1", "P2", "Source", "P2_Ires"]].set_index("P2"), on="Human_Protein", how="left")

In [26]:
# Read in Protein --> Sequence Map
uni2seq = pd.read_csv("{0}/Proteins.txt".format(output_dir), sep="\t").set_index("ID")["Sequence"].to_dict()

In [27]:
# Calculate Binding Site Similarity and Recall
# Recall is the fraction of docked PPI interfaces
# that also appear in the docked drug binding site
def do(x):
    iresA, iresB = x
    if(pd.isnull(iresA)):
        iresA = set()
    else:
        iresA = set([int(x)-1 for x in iresA.split(",")])
    if(pd.isnull(iresB)):
        iresB = set()
    else:
        iresB = set([int(x)-1 for x in iresB.split(",")])
    
    return len(iresA.intersection(iresB)) / float(len(iresA.union(iresB))), len(iresA.intersection(iresB)) / float(len(iresA))
# FUNCTION END
tmp = merged[["Ires", "P2_Ires"]].apply(do, axis=1)

merged["Jaccard"] = [x[0] for x in tmp]
merged["Ligand Recall"] = [x[1] for x in tmp]

In [29]:
# Construct Coverage Masks for all of the
# human structures used in the docking
uni2coverage_mask = dict()
for uni in uni2seq:
    uni2coverage_mask[uni] = np.zeros(len(uni2seq[uni]))
    try:
        uni_f = glob.glob("{0}/Undocked_Structures/{1}_*".format(output_dir, uni))[0]
    except IndexError:
        continue
    
    pdb_df = pdb2df(uni_f)
    resis = list(pdb_df["Residue ID"].unique() - 1)
    resis = [x for x in resis if x >= 0]
    
    uni2coverage_mask[uni][resis] = 1

In [30]:
# Calculate Log Odds Enrichment for Overlap between
# Drug / Protein Interfaces
uni2piresmask = dict()
uni2liresmask = dict()
def do(x):
    uni, iresA, iresB, rank = x
    mask1 = np.zeros(len(uni2seq[uni]))
    mask2 = np.zeros(len(uni2seq[uni]))
    
    if(pd.isnull(iresA)):
        iresA = set()
    else:
        iresA = set([int(x)-1 for x in iresA.split(",")])
    if(pd.isnull(iresB)):
        iresB = set()
    else:
        iresB = set([int(x)-1 for x in iresB.split(",")])
    
    mask1[sorted(iresA)] = 1
    mask2[sorted(iresB)] = 1
    
    coverage_mask = uni2coverage_mask[uni] == 1
    
    #print mask1[coverage_mask]
    #print type(mask1[coverage_mask])
    #print set(mask1[coverage_mask])
    #print set(mask2[coverage_mask])
    #1/0
    
    OR, up, low, p = odds_ratio(mask1[coverage_mask], mask2[coverage_mask], two_sided=False, error="CI")
    OR, up, low = np.log2([OR, up, low])
    
    if(rank == 1 and sum(mask2) != 0):
        if(not uni in uni2piresmask.keys()):
            uni2piresmask[uni] = mask1[coverage_mask] == 1
            uni2liresmask[uni] = mask2[coverage_mask] == 1
        else:
            print "HERE"
            uni2piresmask[uni] = uni2piresmask[uni] | (mask1[coverage_mask] == 1)
            uni2liresmask[uni] = uni2liresmask[uni] | (mask2[coverage_mask] == 1)
    elif(rank == 1):
        print uni
    return OR, up, low, p
# FUNCTION END
tmp = merged[["Human_Protein", "Ires", "P2_Ires", "Rank"]].progress_apply(do, axis=1)
merged["Log2 Odds Ratio"] = [x[0] for x in tmp]
merged["Lower CI"] = [x[2] for x in tmp]
merged["Upper CI"] = [x[1] for x in tmp]
merged["p-value"] = [x[3] for x in tmp]

HBox(children=(IntProgress(value=0, max=4), HTML(value=u'')))

HERE
HERE
HERE
HERE
HERE
HERE


In [32]:
# Overall Enrichment Across all Genes
pires_mask = np.concatenate([uni2piresmask[uni] for uni in uni2piresmask.keys()])
lires_mask = np.concatenate([uni2liresmask[uni] for uni in uni2piresmask.keys()])

OR, up, low, p = odds_ratio(pires_mask, lires_mask, two_sided=False, error="SE")
OR, up, low = np.log2([OR, up, low])
    
print OR, up, low, p

2.16682159602 2.50534147082 1.82830172121 7.72472086297e-11


In [33]:
# Check for cases with perfect overlap at the interface
merged[(pd.isnull(merged["Log2 Odds Ratio"]))*(merged["Ligand Recall"] != 0)]

  .format(op=op_str, alt_op=unsupported[op_str]))


Unnamed: 0,Human_Protein,Compound_Name,Rank,Score,Ires,P1,Source,P2_Ires,Jaccard,Ligand Recall,Log2 Odds Ratio,Lower CI,Upper CI,p-value
63,P21964,Entacapone,4,-6.258568,8890191193194223224,COVID19nsp7,Docking,"55,56,58,84,85,86,87,88,90,191,193,194,223,224...",0.411765,1.0,,,,


In [34]:
# Reformat / Save
to_save = merged.copy()
to_save.columns = ["Human_Protein", "Compound_Name", "Docking_Rank", "Score", "Drug_Ires", "Viral_Interactor", "Source", "Protein_Ires", "Jaccard_Similarity", "Drug_Ires_Coverage", "Log2OR", "LowerCI", "UpperCI", "p-value"]
to_save = to_save[["Human_Protein", "Compound_Name", "Viral_Interactor", "Docking_Rank", "Score", "Drug_Ires", "Protein_Ires", "Source", "Jaccard_Similarity", "Drug_Ires_Coverage", "Log2OR", "LowerCI", "UpperCI", "p-value"]]

to_save.to_csv("{0}/Drug_Interface_Enrichment.txt".format(output_dir), sep="\t", index=None)

In [35]:
a = pd.read_csv("{0}/Drug_Interface_Enrichment.txt".format(output_dir), sep="\t")
a[a["Docking_Rank"] == 1]

Unnamed: 0,Human_Protein,Compound_Name,Viral_Interactor,Docking_Rank,Score,Drug_Ires,Protein_Ires,Source,Jaccard_Similarity,Drug_Ires_Coverage,Log2OR,LowerCI,UpperCI,p-value
0,P12268,Merimepodib,COVID19nsp14,1,-7.910938,"70,71,93,94,95,251,255,274,275,276,325,326,327...","36,37,38,39,40,41,42,45,238,242,371,392,394,39...",Docking,0.0,0.0,,,,
10,P12268,Mycophenolic acid,COVID19nsp14,1,-7.458368,70274275276325326327365366387388,"36,37,38,39,40,41,42,45,238,242,371,392,394,39...",Docking,0.0,0.0,,,,
20,P12268,Ribavirin,COVID19nsp14,1,-6.812905,5254355358380382477484,"36,37,38,39,40,41,42,45,238,242,371,392,394,39...",Docking,0.0,0.0,,,,
30,P12268,Sanglifehrin A,COVID19nsp14,1,-11.20606,"52,57,58,60,62,236,238,239,240,241,242,243,264...","36,37,38,39,40,41,42,45,238,242,371,392,394,39...",Docking,0.036364,0.1,0.462343,-1.354888,2.279575,0.3377957
40,P19784,Silmitasertib,COVID19N,1,-10.96942,4647484951525467115159161176,"47,48,49,50,51,121,123,124,127,159,161,176,179...",Docking,0.194444,0.583333,4.119633,2.641715,5.59755,2.270367e-06
50,P19784,TMCB,COVID19N,1,-7.31831,46474849525467115116119159161176,"47,48,49,50,51,121,123,124,127,159,161,176,179...",Docking,0.157895,0.461538,3.343205,1.933497,4.752912,4.792419e-05
60,P21964,Entacapone,COVID19nsp7,1,-7.196985,737781849495247250251252,"55,56,58,84,85,86,87,88,90,191,193,194,223,224...",Docking,0.125,0.3,2.56271,0.798355,4.327065,0.008444046
70,P27448,Ruxolitinib,COVID19orf9b,1,-6.998371,170171173175232233293296297299,"60,61,62,63,64,65,70,81,134,135,136,137,138,13...",Docking,0.0,0.0,,,,
80,P27448,ZINC95559591,COVID19orf9b,1,-7.280588,"60,61,62,63,70,85,134,135,136,138,139,182,195,...","60,61,62,63,64,65,70,81,134,135,136,137,138,13...",Docking,0.27907,0.8,5.290677,3.688174,6.89318,2.809835e-08
90,Q92769,Apicidin,COVID19nsp5,1,-7.705235,2899100150151205206271272302,"24,26,27,28,30,32,82,85,86,89,90,92,93,94,96,9...",Docking,0.133333,0.6,3.79107,2.202148,5.379992,4.34517e-05
