In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("12_Run_SARS1_SARS2_ddG_Predictions")

from config import *
from helper_functions import pdb2df

import pandas as pd
pd.options.display.max_columns = 999

import numpy as np

import glob
import os

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

from collections import defaultdict
import scipy

This notebook is a wrapper that calls a PyRosetta protocol to predict the SARS1 --> SARS2 ddG value for all docked interactions. The prediction is based on using the Rosetta energy function to model the change in energy between the bound and unbound forms of the SARS-CoV and SARS-CoV-2 interactions. SARS-CoV docked structures are generated by applying all of the SARS1 --> SARS2 sequence deviations (in reverse) to the SARS-CoV-2 docked structure.


- Inputs:
  - Docking_Summary.txt
  - Viral_Muts.txt
  - [P1]\_[P2] (Interaction Haddock Run Directory created under "Docking_Runs")


- Outputs:
  - [P1]\_[P2]  (directory and summary logs per interaction created under ddG_Mutated_Structures)
  - ddG_All.txt
  - ddG_Summary.txt


- Dependencies:
  - Must be run after 06_Compil_Viral_Mutations and 07_Run_PPI_Docking
  - Calls Mutant_ddG_Calc.py

# Run ddG Calculations for all docked interactions

In [2]:
# Load the docking summary table. ddG could in principle be computed for
# every docking attempt, but here only the top-ranked model (Rank == 1)
# of each interaction is kept.
docking_summary_path = "{0}/Docking_Summary.txt".format(output_dir)
docking_all = pd.read_csv(docking_summary_path, sep="\t")

is_top_ranked = docking_all["Rank"] == 1
ppi_docking = docking_all[is_top_ranked]

In [3]:
# Build a map: viral protein ID -> set of (position, SARS1 residue).
# The docked structures contain the SARS2 protein; to model the SARS1
# equivalent we need to know which positions to mutate and to what.
viral_muts = pd.read_csv("{0}/Viral_Muts.txt".format(output_dir), sep="\t")

viral2muts = defaultdict(set)
mut_columns = ["COVID_ID", "COVID_Pos", "COVID_AA", "SARS_AA"]
for prot, pos, sars2_aa, sars1_aa in viral_muts[mut_columns].values:
    # Rows without a SARS1 residue (empty string or null) carry no mutation
    if(sars1_aa == "" or pd.isnull(sars1_aa)):
        continue
    viral2muts[prot].add((pos, sars1_aa))

In [5]:
def run_ddg_calc(p1, p2, pdb, rank, muts, trials, interface_cutoff, ddg_dir=ddg_dir):
    """Launch Mutant_ddG_Calc.py for one docked interaction as a background job.

    All arguments are converted to strings and forwarded, in order, as the
    positional command-line arguments of ``{script_dir}/Mutant_ddG_Calc.py``
    (run through ``nice python``).

    Parameters
    ----------
    p1, p2 : first and second positional arguments of the script.
    pdb : third positional argument (the caller passes the docked PDB path).
    rank : fourth positional argument (the caller passes the docking rank).
    muts : fifth positional argument (the caller passes a comma-separated
        list of ``<pos>_<aa>`` tokens).
    trials, interface_cutoff : sixth and seventh positional arguments.
    ddg_dir : last positional argument; defaults to the ``ddg_dir`` that is
        in scope when this function is defined.

    Returns
    -------
    dict
        ``"p"`` (the ``Popen`` handle), ``"cmd"`` (the command as a single
        space-joined string, for logging), ``"start_time"`` (``time.time()``
        at launch), ``"end_time"`` (``None`` until the caller fills it in)
        and ``"in"`` (the tuple ``(p1, p2, rank)``).
    """
    # Imported here so the function does not depend on a later cell having
    # been executed first.
    import subprocess as sp
    import time

    args = ["nice", "python", "{0}/Mutant_ddG_Calc.py".format(script_dir)]
    args.extend(str(a) for a in (p1, p2, pdb, rank, muts, trials, interface_cutoff, ddg_dir))

    # Human-readable form of the command, kept for log / error messages.
    cmd = " ".join(args)
    #print cmd

    # Pass the argument vector directly instead of a shell string: paths
    # containing spaces or shell metacharacters are forwarded intact, and the
    # Popen handle refers to the worker itself rather than an intermediate
    # shell, so poll() / kill() act on the right process.
    p = sp.Popen(args)
    return {"p":p, "cmd":cmd, "start_time":time.time(), "end_time":None, "in":(p1, p2, rank)}
# FUNCTION END

In [7]:
import subprocess as sp
import time


def _reap_finished(running, finished):
    """Move finished jobs from `running` to `finished` and report on them.

    Each job is a dict as returned by run_ddg_calc. A job whose process has
    exited gets its "end_time" set, a message is printed (error details for a
    non-zero exit code, a short "Finished" line otherwise) and the job is
    appended to `finished`.

    Returns the list of jobs that are still running.
    """
    still_running = []
    for job in running:
        status = job["p"].poll()
        # Process not finished, keep in list
        if(status is None):
            still_running.append(job)
            continue

        job["end_time"] = time.time()
        runtime = job["end_time"] - job["start_time"]
        # Process ended in error
        if(status != 0):
            print("Error {0}".format(status))
            print("cmd: {0}".format(job["cmd"]))
            print("RunTime: {0}".format(runtime))
            print("")
        # Process ended successfully
        else:
            print("Finished ddG {0} in {1}".format(job["in"], runtime))
        finished.append(job)
    return still_running


# Run ddG calculations in a loop, at most `max_processes` at a time
i_num = 1
finished_processes = []
processes = []

max_processes = 25

trials = 50
interface_cutoff = 8.0
# Iterate over all Docking Inputs
for p1, p2, pdb, rank in tqdm_notebook(ppi_docking.sort_values(["Rank", "P1", "P2"])[["P1", "P2", "File", "Rank"]].values):
    # Skip interactions that already have a summary log on disk
    if(os.path.exists("{0}/ddG_Mutated_Structures/Summary_Logs/{1}_{2}_{3}".format(output_dir, p1, p2, rank))):
        continue

    # Obtain coverage info for viral protein in docked structure
    # (Could theoretically be done ahead of time / all in one go
    #  particularly if running for all docked trials. This setup
    #  of reading in the pdb is slower than necessary. e.g. we
    #  have a table describing the coverage of all models used
    #  could read that in instead, and create a uni2coverage map)
    pdb_df = pdb2df(pdb)
    resis = set(pdb_df[pdb_df["Chain"] == "A"]["Residue ID"].unique())

    # Keep only the viral sequence deviations whose position is present
    # in chain A of the docked structure
    muts = [x for x in viral2muts[p1] if x[0] in resis]

    # If we don't have any deviations between SARS1 and SARS2, this calculation
    # is meaningless so we skip it
    if(len(muts) == 0):
        continue

    # Encode the mutations as "<pos>_<aa>,<pos>_<aa>,..." for the
    # sub-process call that runs the actual ddG calculation
    muts = ",".join("_".join([str(y) for y in x]) for x in muts)

    # Block new jobs while we are already at the max allowed number of
    # processes. (The previous `<=` test let max_processes + 1 jobs run.)
    while(len(processes) >= max_processes):
        # Give time for jobs to finish
        time.sleep(5)
        processes = _reap_finished(processes, finished_processes)

    processes.append(run_ddg_calc(p1, p2, pdb, rank, muts, trials, interface_cutoff))

# Wait to completion of all processes
while(len(processes) > 0):
    processes = _reap_finished(processes, finished_processes)
    # Only wait again if something is still running
    if(len(processes) > 0):
        time.sleep(5)

HBox(children=(IntProgress(value=0, max=7), HTML(value=u'')))

  df.header = header
  df.tailer = tailer


nice python /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Scripts_and_Notebooks/Mutant_ddG_Calc.py COVID19N P19784 /home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19N_P19784/run1/structures/it1/water/COVID19N_P19784_3w.pdb 1 345_Q,334_H,349_N,267_Q,290_D 50 8.0 /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/ddG_Mutated_Structures/
nice python /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Scripts_and_Notebooks/Mutant_ddG_Calc.py COVID19N Q9NR30 /home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19N_Q9NR30/run1/structures/it1/water/COVID19N_Q9NR30_53w.pdb 1 345_Q,334_H,349_N,267_Q,290_D 50 8.0 /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/ddG_Mutated_Structures/
nice python /home/sdw95/3D_SARS2/git_hub/3D_SARS2/Scripts_and_Notebooks/Mutant_ddG_Calc.py COVID19nsp14 P12268 /home/sdw95/3D_SARS2/COVID19_Docking/Haddock_Output/COVID19nsp14_P12268/run1/structures/it1/water/COVID19nsp14_P12268_2w.pdb 1 220_S,307_S,319_S,228_N,293_S,305_V,345_E,396_A,510_I,374_H,294_V,304_R,14_I,134_T,212_K,493_Q,26

In [70]:
# Process kill-all block (in case the running jobs need to be
# interrupted)
for job in processes:
    # Only signal jobs that are still alive: on Python 2, kill() on a child
    # that has already exited and been reaped can raise OSError.
    if(job["p"].poll() is None):
        job["p"].kill()

# Parse whole interaction ddG values

In [35]:
# Read every raw ddG summary log and stack them into one table,
# ordered by interaction, docking rank and trial number
summary_log_paths = glob.glob("{0}/ddG_Mutated_Structures/Summary_Logs/*".format(output_dir))
ddG_frames = [pd.read_csv(log_path, sep="\t") for log_path in tqdm_notebook(summary_log_paths)]

ddGs = pd.concat(ddG_frames).sort_values(["P1", "P2", "Docking_Rank", "ddG_Trial"])
ddGs.head()

HBox(children=(IntProgress(value=0, max=6), HTML(value=u'')))




Unnamed: 0,P1,P2,Docking_Rank,ddG_Trial,WT_Score,WT_dG,Mut_Score,Mut_dG,ddG,pdbfile
0,COVID19N,P19784,1,0,830.353909,26.839849,832.546962,27.832565,0.992716,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...
1,COVID19N,P19784,1,1,830.487593,26.350872,832.701522,27.987124,1.636253,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...
2,COVID19N,P19784,1,2,830.487593,26.973533,832.546962,27.12012,0.146587,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...
3,COVID19N,P19784,1,3,830.15796,26.6439,832.399944,26.973102,0.329202,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...
4,COVID19N,P19784,1,4,830.15796,26.6439,832.546962,27.832565,1.188665,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...


In [36]:
# Write the merged per-trial table to ddG_All.txt (tab-separated, no index column)
ddg_all_path = "{0}/ddG_All.txt".format(output_dir)
ddGs.to_csv(ddg_all_path, sep="\t", index=None)

In [37]:
# Combine individual ddG trials per interaction
# to obtain an average ddG / determine if the trials
# are significantly non-zero
import scipy.stats  # `import scipy` alone does not guarantee that scipy.stats is loaded

score_cols = ["WT_Score", "WT_dG", "Mut_Score", "Mut_dG", "ddG"]

summary = []
for (p1, p2), trials in tqdm_notebook(ddGs[ddGs["Docking_Rank"] <= 1].groupby(["P1", "P2"])):
    # ACE2 interactions are excluded from this summary
    if(p2 == "ACE2"):
        continue

    # Mean and standard deviation of every score over the trials
    wt_score, wt_dg, mut_score, mut_dg, raw_ddg = trials[score_cols].mean(axis=0)
    wt_score_std, wt_dg_std, mut_score_std, mut_dg_std, ddg_std = trials[score_cols].std(axis=0)

    # The reported ddG has the opposite sign of the per-trial "ddG" column
    # (presumably so that it reads in the SARS1 --> SARS2 direction; confirm
    # against Mutant_ddG_Calc.py)
    ddg = -raw_ddg

    # Representative structure: the trial whose ddG is closest to the mean.
    # NOTE(review): the previous expression subtracted the sign-flipped mean
    # and took .min() without abs(), which always selected the trial with the
    # lowest ddG. "Closest to the mean" is assumed to be the intent.
    dist_to_mean = (trials["ddG"] - raw_ddg).abs()
    best_file = trials.loc[dist_to_mean == dist_to_mean.min(), "pdbfile"].values[0]

    # One-sided normal tail probability of |mean / std|. NumPy floats do not
    # raise ZeroDivisionError (they yield inf / nan), so the degenerate cases
    # (zero spread, or NaN spread from a single trial) are tested explicitly
    # and reported as non-significant.
    if(ddg_std > 0):
        p = scipy.stats.norm.sf(abs(ddg / ddg_std))
    else:
        p = 1
    summary.append([p1.replace("COVID19", ""), p2, wt_score, wt_score_std, wt_dg, wt_dg_std, mut_score, mut_score_std, mut_dg, mut_dg_std, ddg, ddg_std, p, best_file])
summary = pd.DataFrame(summary, columns=["P1", "P2", "WT_Score", "WT_Score_Std", "WT_dG", "WT_dG_Std", "Mut_Score", "Mut_Score_Std", "Mut_dG", "Mut_dG_Std", "ddG", "ddG_Std", "p-value", "pdbfile"])

HBox(children=(IntProgress(value=0, max=6), HTML(value=u'')))




In [38]:
# Interactions with a significant (p <= 0.05) and non-negligible (|ddG| > 0.1) ddG
has_low_p = summary["p-value"] <= 0.05
has_nonzero_ddg = summary["ddG"].abs() > 0.1
summary[has_low_p & has_nonzero_ddg].sort_values("ddG")

Unnamed: 0,P1,P2,WT_Score,WT_Score_Std,WT_dG,WT_dG_Std,Mut_Score,Mut_Score_Std,Mut_dG,Mut_dG_Std,ddG,ddG_Std,p-value,pdbfile
5,orf9b,P27448,778.863864,0.635778,38.503195,0.760165,780.767278,0.848614,43.929987,0.99226,-5.426791,1.179252,2.093343e-06,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...
4,nsp5,Q92769,834.090284,0.810092,7.23996,1.222415,841.151636,0.491191,10.172278,0.734988,-2.932318,1.435805,0.02056182,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...
1,N,Q9NR30,553.755042,0.221785,170.418358,0.259249,552.88699,0.032203,167.859992,0.032203,2.558366,0.262053,8.132089e-23,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...
2,nsp14,P12268,910.062887,0.685269,51.174839,1.757494,890.094069,0.905133,21.927102,1.628844,29.247737,2.406983,2.826374e-34,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...


In [39]:
# Calculate Z-score (comparing across all interactions).
# Done in two passes: a first z-score over all interactions identifies
# extreme outliers (|z| > 3); the final z-score is then normalised with the
# mean / std of the remaining interactions only.
ddg_values = summary["ddG"]
first_pass_mean = ddg_values.mean()
first_pass_std = ddg_values.std()
print("{0} {1}".format(first_pass_mean, first_pass_std))
summary["z-score"] = (ddg_values - first_pass_mean) / first_pass_std

tmp = summary[summary["z-score"].abs() <= 3]
print(len(tmp))
trimmed_mean = tmp["ddG"].mean()
trimmed_std = tmp["ddG"].std()
print("{0} {1}".format(trimmed_mean, trimmed_std))
summary["z-score"] = (summary["ddG"] - trimmed_mean) / trimmed_std

3.57162699446 12.8492879705
6
3.57162699446 12.8492879705


In [40]:
# Most significant hits: |z-score| >= 1 and p-value <= 0.05, ordered by ddG
is_extreme = summary["z-score"].abs() >= 1
is_significant = summary["p-value"] <= 0.05
summary[is_extreme & is_significant].sort_values("ddG")

Unnamed: 0,P1,P2,WT_Score,WT_Score_Std,WT_dG,WT_dG_Std,Mut_Score,Mut_Score_Std,Mut_dG,Mut_dG_Std,ddG,ddG_Std,p-value,pdbfile,z-score
2,nsp14,P12268,910.062887,0.685269,51.174839,1.757494,890.094069,0.905133,21.927102,1.628844,29.247737,2.406983,2.826374e-34,/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/d...,1.998252


In [42]:
# Write the per-interaction summary to ddG_Summary.txt (tab-separated, no index column)
ddg_summary_path = "{0}/ddG_Summary.txt".format(output_dir)
summary.to_csv(ddg_summary_path, sep="\t", index=None)