In [7]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("13_Run_Interface_ddG_Scanning")

from config import *

import pandas as pd
pd.options.display.max_columns = 999

import glob
import os
import subprocess as sp
import time
import scipy
import scipy.stats

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

from collections import defaultdict

This notebook is a wrapper that calls a PyRosetta script to perform interface scanning mutagenesis across all docked interactions, predicting the ddG impact of every possible mutation along each interface.


- Inputs:
  - Docking_Summary.txt
  - Pop_Vars.txt
  - [P1]\_[P2] (Interaction Haddock Run Directory created under "Docking_Runs")


- Outputs:
  - [P1]\_[P2]  (directory and summary logs per interaction created under ddG_Single_Mutants)
  - Hotspot_Scored_Mutants.txt


- Dependencies:
  - Must be run after 05_Fetch_Population_Variants and 07_Run_PPI_Docking
  - Calls Ires_ddG_Scanning.py

# Run ddG for Interface Scanning

In [4]:
# Load the docking summary table. In principle the ddG scan could be run
# on every docking attempt, but here only the rows with Rank == 1
# (the top-ranked docks) are kept.
docking_summary_path = "{0}/Docking_Summary.txt".format(output_dir)
ppi_docking = pd.read_csv(docking_summary_path, sep="\t")
ppi_docking = ppi_docking.loc[ppi_docking["Rank"] == 1]

In [9]:
# Wrapper to call ires scanning script
# NOTE: A later cell re-defines run_ddg_scanning (locating the script through
#       script_dir and defaulting out_base to ddg_singles_dir). Once that cell
#       has run, this definition is shadowed.
def run_ddg_scanning(p1, p2, pdb, mut, rank, iresA, iresB, trials, interface_cutoff, out_base=out_base):
    """Launch Ires_ddG_Scanning.py for one interaction / mutant residue.

    The script is started through ``nice`` with ``subprocess.Popen`` and is
    NOT waited on; the caller is expected to poll the returned handle.

    Args:
        p1, p2: Interaction identifiers; used to name the per-interaction
            output sub-directory (Raw_Outputs/<p1>_<p2>/).
        pdb: Path of the structure passed as --pdb_filename.
        mut: Value passed as --mutant_aa (also embedded in the output name).
        rank: Only stored in the returned record (not passed to the script).
        iresA, iresB: Values passed as --iresA / --iresB.
        trials: Value passed as --trials.
        interface_cutoff: Value passed as --interface_cutoff.
        out_base: Value passed as --out_base.

    Returns:
        dict with keys "p" (the Popen handle), "cmd" (the command string),
        "start_time" (launch time from time.time()), "end_time" (None until
        the caller fills it in) and "in" (the tuple (p1, p2, rank)).
    """
    #os.system("rm Data/ddG_Single_Mutants/Raw_Outputs/{0}_{1}/*".format(p1, p2))

    # Strip the extension from the file name only. Splitting the whole path on
    # "." (as was done previously) truncates at the first dot anywhere in the
    # path, e.g. a leading "./" or a dotted directory name.
    pdb_name = os.path.basename(pdb).split(".")[0]

    # The per-trial output name must end in the mutant residue (it previously
    # received the trial count by mistake), so that completed jobs can be
    # recognised by their "*_to_<mut>*" file names.
    cmd = ("nice python Ires_ddG_Scanning.py"
           " --pdb_filename {pdb}"
           " --partners A_B"
           " --mutant_aa {mut}"
           " --trials {trials}"
           " --output True"
           " --trial_output Raw_Outputs/{p1}_{p2}/ddG_{pdb_name}_mut_to_{mut}"
           " --iresA {iresA}"
           " --iresB {iresB}"
           " --interface_cutoff {interface_cutoff}"
           " --out_base {out_base}").format(pdb=pdb, mut=mut, trials=trials, pdb_name=pdb_name, iresA=iresA, iresB=iresB, interface_cutoff=interface_cutoff, p1=p1, p2=p2, out_base=out_base)
    p = sp.Popen(cmd, shell=True)
    return {"p":p, "cmd":cmd, "start_time":time.time(), "end_time":None, "in":(p1, p2, rank)}
# FUNCTION END

In [14]:
# Wrapper to call ires scanning script
def run_ddg_scanning(p1, p2, pdb, mut, rank, iresA, iresB, trials, interface_cutoff, out_base=ddg_singles_dir):
    """Launch <script_dir>/Ires_ddG_Scanning.py for one interaction / mutant residue.

    The script is started through ``nice`` with ``subprocess.Popen`` and is
    NOT waited on; the caller is expected to poll the returned handle.

    Args:
        p1, p2: Interaction identifiers; used to name the per-interaction
            output sub-directory (Raw_Outputs/<p1>_<p2>/).
        pdb: Path of the structure passed as --pdb_filename.
        mut: Value passed as --mutant_aa (also embedded in the output name).
        rank: Only stored in the returned record (not passed to the script).
        iresA, iresB: Values passed as --iresA / --iresB.
        trials: Value passed as --trials.
        interface_cutoff: Value passed as --interface_cutoff.
        out_base: Value passed as --out_base (defaults to ddg_singles_dir).

    Returns:
        dict with keys "p" (the Popen handle), "cmd" (the command string),
        "start_time" (launch time from time.time()), "end_time" (None until
        the caller fills it in) and "in" (the tuple (p1, p2, rank)).
    """
    #os.system("rm ddG_Single_Mutants/Raw_Outputs/{0}_{1}/*".format(p1, p2))

    # Strip the extension from the file name only. Splitting the whole path on
    # "." (as was done previously) truncates at the first dot anywhere in the
    # path, e.g. a leading "./" or a dotted directory name.
    pdb_name = os.path.basename(pdb).split(".")[0]

    cmd = ("nice python {script_dir}/Ires_ddG_Scanning.py"
           " --pdb_filename {pdb}"
           " --partners A_B"
           " --mutant_aa {mut}"
           " --trials {trials}"
           " --output True"
           " --trial_output Raw_Outputs/{p1}_{p2}/ddG_{pdb_name}_mut_to_{mut}"
           " --iresA {iresA}"
           " --iresB {iresB}"
           " --interface_cutoff {interface_cutoff}"
           " --out_base {out_base}").format(script_dir=script_dir, pdb=pdb, mut=mut, trials=trials, pdb_name=pdb_name, iresA=iresA, iresB=iresB, interface_cutoff=interface_cutoff, p1=p1, p2=p2, out_base=out_base)
    p = sp.Popen(cmd, shell=True)
    return {"p":p, "cmd":cmd, "start_time":time.time(), "end_time":None, "in":(p1, p2, rank)}
# FUNCTION END

In [12]:
import subprocess as sp

# Run ddG interface scanning for every top-ranked docked interaction.
# One job is launched per (interaction, mutant residue) pair, with at most
# max_processes jobs running at the same time.
i_num = 1
finished_processes = []
processes = []

# Maximum number of scanning jobs allowed to run concurrently
max_processes = 60

# Number of trials requested per job; also the number of output files that
# marks a (interaction, mutant residue) job as already complete
trials = 10
interface_cutoff = 8.0


def _reap_finished_jobs(running, finished):
    """Poll every running job once and report those that have exited.

    Jobs that exited get their "end_time" set, a status line printed (an error
    report when the exit code is non-zero) and are appended to `finished`.

    Args:
        running: list of job records as returned by run_ddg_scanning.
        finished: list that exited job records are appended to (mutated).

    Returns:
        A new list holding only the job records that are still running.
    """
    still_running = []
    for job in running:
        # Poll once and reuse the answer so all branches see the same status
        status = job["p"].poll()
        if status is None:
            still_running.append(job)
            continue

        job["end_time"] = time.time()
        runtime = job["end_time"] - job["start_time"]
        if status != 0:
            print("Error {0}".format(status))
            print("cmd: {0}".format(job["cmd"]))
            print("RunTime: {0}".format(runtime))
            print("")
        else:
            print("Finished ddG {0} in {1}".format(job["in"], runtime))
        finished.append(job)
    return still_running
# FUNCTION END


# Iterate over all Docking Inputs
top_ranked = ppi_docking[ppi_docking["Rank"] == 1].sort_values(["Rank", "P1", "P2"], ascending=[True, True, True])
dock_inputs = top_ranked[["P1", "P2", "File", "Rank", "P1_Ires", "P2_Ires", "Attempt"]].values

for p1, p2, pdb, rank, ires1, ires2, attempt in tqdm_notebook(dock_inputs):
    for mut in tqdm_notebook("ARNDCEQGHILKMFPSTWYV"):
        # If expected output from job already exists (one file per trial), skip
        existing = glob.glob("{0}Raw_Outputs/{1}_{2}/*_to_{3}*".format(ddg_singles_dir, p1, p2, mut))
        if(len(existing) == trials):
            continue

        # Block new jobs while the pool is full. The comparison is >= so that
        # no more than max_processes jobs ever run at once (the previous <=
        # check allowed max_processes + 1).
        while(len(processes) >= max_processes):
            processes = _reap_finished_jobs(processes, finished_processes)
            time.sleep(5)

        p = run_ddg_scanning(p1, p2, pdb, mut, rank, ires1, ires2, trials, interface_cutoff)
        processes.append(p)

# Wait to completion
while(len(processes) != 0):
    processes = _reap_finished_jobs(processes, finished_processes)
    time.sleep(5)

HBox(children=(IntProgress(value=0, max=7), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

Finished ddG ('COVID19N', 'Q9NR30', 1) in 1066.42793393
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1071.48714495
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1076.54881716
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1076.54247189
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1081.61584306
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1081.59877992
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1086.64345098
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1101.59791088
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1111.67959189
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1116.71767807
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1116.62893891
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1126.68233204
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1131.71160507
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1136.77966022
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1146.87349105
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1181.91901898
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1226.99431109
Finished ddG ('COVID19N', 'Q9NR30', 1) in 1236.9

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

Finished ddG ('COVID19N', 'Q9NR30', 1) in 1252.02229095
Finished ddG ('COVID19N', 'P19784', 1) in 1602.55913115
Finished ddG ('COVID19N', 'P19784', 1) in 1607.69696188
Finished ddG ('COVID19N', 'P19784', 1) in 1607.67146778
Finished ddG ('COVID19N', 'P19784', 1) in 1607.66710091
Finished ddG ('COVID19N', 'P19784', 1) in 1612.75581717
Finished ddG ('COVID19N', 'P19784', 1) in 1617.73885703
Finished ddG ('COVID19N', 'P19784', 1) in 1622.79052591
Finished ddG ('COVID19N', 'P19784', 1) in 1658.89338183
Finished ddG ('COVID19N', 'P19784', 1) in 1668.88255692
Finished ddG ('COVID19N', 'P19784', 1) in 1694.01750612
Finished ddG ('COVID19N', 'P19784', 1) in 1693.99665904
Finished ddG ('COVID19N', 'P19784', 1) in 1839.09440088
Finished ddG ('COVID19N', 'P19784', 1) in 1844.22632909
Finished ddG ('COVID19N', 'P19784', 1) in 1844.20859408
Finished ddG ('COVID19N', 'P19784', 1) in 1854.24301696
Finished ddG ('COVID19N', 'P19784', 1) in 1879.2366879
Finished ddG ('COVID19N', 'P19784', 1) in 1884.30

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

Finished ddG ('COVID19N', 'P19784', 1) in 1924.49358797
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1954.07955098
Finished ddG ('COVID19nsp7', 'P21964', 1) in 700.804628134
Finished ddG ('COVID19nsp7', 'P21964', 1) in 705.806303978
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1778.92568707
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1763.84014392
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1780.58200598
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1770.47480893
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1816.87423921
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1786.80262089
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1801.88630486
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1766.73682499
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1791.80738688
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1822.19855404
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1807.13781905
Finished ddG ('COVID19nsp15', 'P62330', 1) in 1842.26920605
Finished ddG ('COVID19nsp15', 'P62330', 1) in 

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

Finished ddG ('COVID19nsp15', 'P62330', 1) in 2027.70244098
Finished ddG ('COVID19nsp15', 'P62330', 1) in 2102.859694
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3204.31625009
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3204.24129891
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3249.35791183
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3259.3270781
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3269.99172997
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3269.94251299
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3274.96003103
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3274.90143704
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3279.96993804
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3299.96328092
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3312.12617016
Finished ddG ('COVID19nsp15', 'P62330', 1) in 2080.41544604
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3327.25727391
Finished ddG ('COVID19nsp14', 'P12268', 1) in 3372.26529479
Finished ddG ('COVID19nsp14', 'P12268', 1) 

In [16]:
import subprocess as sp

# Debug variant of the launch loop: the two trailing `break`s cut both loops
# short, so at most ONE job is started (the first mutant residue of the first
# interaction that does not already have 10 output files) and it is not
# waited on.
i_num = 1
finished_processes = []
processes = []

max_processes = 60

trials = 10
interface_cutoff = 8.0

# Top-ranked docking inputs, ordered by interaction
top_ranked = ppi_docking[ppi_docking["Rank"] == 1].sort_values(["Rank", "P1", "P2"], ascending=[True, True, True])
dock_inputs = top_ranked[["P1", "P2", "File", "Rank", "P1_Ires", "P2_Ires", "Attempt"]].values

for p1, p2, pdb, rank, ires1, ires2, attempt in tqdm_notebook(dock_inputs):
    for mut in tqdm_notebook("ARNDCEQGHILKMFPSTWYV"):
        # Skip mutant residues whose expected output already exists
        existing = glob.glob("{0}Raw_Outputs/{1}_{2}/*_to_{3}*".format(ddg_singles_dir, p1, p2, mut))
        if(len(existing) == 10):
            continue

        # Only launch while the pool has room. Nothing is polled in this cell,
        # but `processes` starts empty so this never actually waits.
        while(len(processes) > max_processes):
            pass

        p = run_ddg_scanning(p1, p2, pdb, mut, rank, ires1, ires2, trials, interface_cutoff)
        processes.append(p)

        # Stop after the first launched job
        break
    # Only the first interaction is ever considered
    break

HBox(children=(IntProgress(value=0, max=7), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))




In [29]:
# Copy the ddG scanning results of every top-ranked interaction from the
# working data directory into the repository Output directory.
# TODO(review): these absolute, user-specific locations should come from the
#               shared configuration rather than being hard-coded here.
ddg_src_root = "/home/sdw95/3D_SARS2/Data/ddG_Single_Mutants"
ddg_dest_root = "/home/sdw95/3D_SARS2/git_hub/3D_SARS2/Output/ddG_Single_Mutants"

interaction_pairs = ppi_docking[ppi_docking["Rank"] == 1].sort_values(["Rank", "P1", "P2"], ascending=[True, True, True])[["P1", "P2"]].drop_duplicates().values

for p1, p2 in tqdm_notebook(interaction_pairs):
    # Per-interaction directories are copied recursively
    for subdir in ["Raw_Outputs", "Structures"]:
        orig = "{0}/{1}/{2}_{3}/".format(ddg_src_root, subdir, p1, p2)
        dest = "{0}/{1}/{2}_{3}/".format(ddg_dest_root, subdir, p1, p2)

        # os.system returns the exit status of cp; printing it shows failures
        print(os.system("cp {0} {1} -r".format(orig, dest)))

    # Summary files matching <p1>_<p2>_1* go into the shared Summaries
    # directory (the glob is expanded by the shell)
    orig = "{0}/Summaries/{1}_{2}_1*".format(ddg_src_root, p1, p2)
    dest = "{0}/Summaries/".format(ddg_dest_root)

    print(os.system("cp {0} {1}".format(orig, dest)))

HBox(children=(IntProgress(value=0, max=7), HTML(value=u'')))

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0



array([['COVID19N', 'P19784'],
       ['COVID19N', 'Q9NR30'],
       ['COVID19nsp14', 'P12268'],
       ['COVID19nsp15', 'P62330'],
       ['COVID19nsp5', 'Q92769'],
       ['COVID19nsp7', 'P21964'],
       ['COVID19orf9b', 'P27448']], dtype=object)

# Parse Interface Scanning ddG Results

In [13]:
# Parse the raw per-trial scanning outputs of every top-ranked interaction and
# write two summary tables per interaction under <ddg_singles_dir>Summaries/:
#   <p1>_<p2>_<rank>_Full.txt - one ddG column per (mutant AA, replicate)
#   <p1>_<p2>_<rank>_Mean.txt - per mutant AA average / std / z-score columns
#
# NOTE: inter2ddG is created here but never used within this cell.
inter2ddG = defaultdict()
# (p1, p2) pairs for which the number of raw output files was not as expected
fail = set()

# Iterate over all docks
for p1, p2, pdb, rank, ires1, ires2, attempt in tqdm_notebook(ppi_docking[ppi_docking["Rank"] == 1].sort_values(["Rank", "P1", "P2"])[["P1", "P2", "File", "Rank", "P1_Ires", "P2_Ires", "Attempt"]].values):
    #if(os.path.exists("ddG_Single_Mutants/Summaries/{0}_{1}_{2}_Mean.txt".format(p1, p2, rank))):
    #    print "Exists"
    #    continue
    
    # Grab all of the Raw Outputs
    # These should be 20 mut_X cases describing all possible mutagenesis
    # each with 10 trials (to account for noise in the side chain optimization)
    fs = glob.glob("{0}Raw_Outputs/{1}_{2}/*".format(ddg_singles_dir, p1, p2))
    # Keep only the files whose name matches this dock's Attempt: the 4th
    # "_"-separated field of the file name, minus its last character, is
    # compared against `attempt`.
    # NOTE(review): presumably this field is the docking attempt encoded in the
    #               PDB file name - confirm against the output naming used by
    #               Ires_ddG_Scanning.py
    fs = [x for x in fs if int(os.path.basename(x).split("_")[3][:-1]) == attempt]
    
    # We expect 200 output files, if we don't see this many then
    # something is wrong
    # NOTE: This number subject to change if ddG trial number changed
    # (the interaction is recorded in `fail`, but is still processed below)
    if(len(fs) != 200):
        print p1, p2, rank, attempt, len(fs)
        #os.system("rm ddG_Single_Mutants/Summaries/{0}_{1}_{2}_*".format(p1, p2, rank))
        fail.add((p1, p2))
    
    # If there are no output files something went really wrong
    if(len(fs) == 0):
        print "ERROR", p1, p2
        continue
    
    # Create a dictionary to store all of the results per mutation type
    # (mutant AA -> DataFrame with one ddG column per replicate)
    m2tmp = defaultdict(lambda: None)
    # Files are visited in order of the trailing integer in their name (the
    # replicate number)
    for f in sorted(fs, key=lambda x: int(x.split("_")[-1].split(".")[0])):
        # Which AA mutation does this scanning mutagenesis include
        m = f.split("to_")[1].split("_")[0]
        
        # Replicate number (1-10)
        r = int(f.split("_")[-1].split(".")[0])
        
        # Read in the Data as a DataFrame (name ddG column based
        # on mutation / replicate)
        data = pd.read_csv(f, sep="\t", names=["Mut", "ddG_{0}_{1}".format(m, r)])
        
        # Parse out reference and position from mutation name
        # (first character = reference AA, last character dropped,
        #  everything in between = integer position)
        data["Ref"] = data["Mut"].map(lambda x: x[0])
        data["Pos"] = data["Mut"].map(lambda x: int(x[1:-1]))
        
        # Assign the protein identifier (A or B for Viral / Human respectively)
        # The results are always listed in order with all A mutants followed by all
        # B mutants, but are not specifically labeled.
        #
        # We know which interface residues are considered in the mutagenesis, so we
        # can work out how many rows should be A / B based on the number of interface
        # residues on each chain
        #
        # NOTE: this assumes the file has exactly one row per residue listed in
        #       ires1 / ires2 (both comma separated strings)
        data["Chain"] = ["A"]*len(ires1.split(",")) + ["B"]*len(ires2.split(","))
        
        # Subset columns (i.e. Drop the Mut Column)
        data = data[["Chain", "Ref", "Pos", "ddG_{0}_{1}".format(m, r)]]
        
        # If there is no information saved for this mutation type yet,
        # then just store this data
        if(m2tmp[m] is None):
            m2tmp[m] = data
        # Otherwise, join the data for this replicate with the previously
        # saved data
        else:
            m2tmp[m] = m2tmp[m].join(data.set_index(["Ref", "Pos", "Chain"]), on=["Ref", "Pos", "Chain"])
    
    # Combine all results into one final dataset describing
    # scanning mutagenesis attempts for all AA mutations on
    # this interaction
    data = None
    
    # We should have 20 entries here corresponding to 20 AA mutant
    # options
    if(len(m2tmp) != 20):
        print "ERROR:", p1, p2, len(m2tmp)
    
    # Iterate over each AA mutant and join the results together
    # NOTE: the column order of the combined table follows the iteration
    #       order of m2tmp
    for m in m2tmp:
        if(data is None):
            data = m2tmp[m]
        else:
            data = data.join(m2tmp[m].set_index(["Chain", "Ref", "Pos"]), on=["Chain", "Ref", "Pos"])
    
    # Store the raw summary containing data for each individual replicate
    data.to_csv("{0}Summaries/{1}_{2}_{3}_Full.txt".format(ddg_singles_dir, p1, p2, rank), sep="\t", index=None)
    
    # Compile replicate per AA mutant into an average
    # (the set of mutant AAs is read from the "ddG_<AA>_<replicate>" column
    #  names before any summary columns are added)
    for res in set([x.split("_")[1] for x in data if "ddG_" in x]):
        # Subset data with only the columns for the 10 replicates for
        # this AA residue
        tmp = data[[x for x in data if "_{0}_".format(res) in x]]
        
        # Calculate Average / Std across the 10 replicates
        data["ddG_{0}_avg".format(res)] = tmp.mean(axis=1)
        data["ddG_{0}_std".format(res)] = tmp.std(axis=1)
        
        # Calculate the Z-score relative to all other same AA mutations
        # across the entire interface (rows from both chains are pooled)
        data["ddG_{0}_z".format(res)] = (tmp.mean(axis=1) - tmp.mean(axis=1).mean()) / tmp.mean(axis=1).std()
    
    # Select only the averaged columns and save
    # (the first three columns are Chain, Ref, Pos)
    data = data[list(data)[:3] + [x for x in data if "_avg" in x or "_std" in x or "_z" in x]]
    data.to_csv("{0}Summaries/{1}_{2}_{3}_Mean.txt".format(ddg_singles_dir, p1, p2, rank), sep="\t", index=None)

HBox(children=(IntProgress(value=0, max=7), HTML(value=u'')))

COVID19N P19784 1 3 10
ERROR: COVID19N P19784 1
COVID19N Q9NR30 1 53 10
ERROR: COVID19N Q9NR30 1
COVID19nsp14 P12268 1 2 10
ERROR: COVID19nsp14 P12268 1
COVID19nsp15 P62330 1 4 10
ERROR: COVID19nsp15 P62330 1
COVID19nsp5 Q92769 1 10 10
ERROR: COVID19nsp5 Q92769 1
COVID19nsp7 P21964 1 5 10
ERROR: COVID19nsp7 P21964 1
COVID19orf9b P27448 1 31 10
ERROR: COVID19orf9b P27448 1



In [31]:
# Reformat as simple table of muts (1 row = 1 mut)
ddg_singles = []

# All amino acid substitutions scored in the per-interaction summaries
aa_alts = "ACEDGFIHKMLNQPSRTWVY"

# Calculate p-values for if the single mutant ddG value
# is significantly non-zero over the 10 trials
def do(ddg, std):
    """Return the normal survival function of |ddg / std|.

    When std is exactly 0 the ratio is undefined and 1 is returned instead.
    This has to be an explicit check: the values handed over by pandas are
    numpy floats, and dividing those by zero yields inf / nan (with a warning)
    rather than raising ZeroDivisionError, so the former try / except
    fallback never triggered.
    """
    if(std == 0):
        return 1.0
    return scipy.stats.norm.sf(abs(ddg / std))
# FUNCTION END

# Iterate over all the interaction summaries
for f in tqdm_notebook(glob.glob("{0}/Summaries/*Mean*".format(ddg_singles_dir))):
    # Read in Data for this interaction
    tmp = pd.read_csv(f, sep="\t")

    # Identify P1, P2 (first two "_"-separated fields of the file name)
    p1, p2 = os.path.basename(f).split("_")[:2]

    # Create tmp table where each mut is one row
    tmp2 = []
    # Iterate over each row (one interface residue per row)
    for _, row in tmp.iterrows():
        # Iterate over all muts
        for mut in aa_alts:
            # Pull out the columns we care about / add in Alt column
            tmp2.append([p1, p2] + list(row[["Chain", "Ref", "Pos"]]) + [mut, row["ddG_{0}_avg".format(mut)], row["ddG_{0}_std".format(mut)]])
    # Merge into one DataFrame
    tmp2 = pd.DataFrame(tmp2, columns=["P1", "P2", "Chain", "Ref", "Pos", "Alt", "ddG", "std"])

    tmp2["p-value"] = tmp2[["ddG", "std"]].apply(lambda x: do(*x), axis=1)

    # Iterate over each chain separately to normalize the z-scores
    # NOTE: This is not the way I did this for the previous compilation
    #       I should probably go back and change the previous block's
    #       calculation for z-score, but I don't think those z-scores
    #       are used in any downstream work.
    for chain in ["A", "B"]:
        in_chain = tmp2["Chain"] == chain

        # Calculate Z-score of all mutations relative to all mutants
        # on the same chain
        chain_ddg = tmp2.loc[in_chain, "ddG"]
        tmp2.loc[in_chain, "z-score"] = (chain_ddg - chain_ddg.mean()) / chain_ddg.std()

        # Calculate Z-score of all mutations relative only to same
        # AA mutations
        # NOTE: Might also be worth calculating Z-score relative to all other
        #       mutations at that position
        for mut in aa_alts:
            same_alt = in_chain & (tmp2["Alt"] == mut)
            alt_ddg = tmp2.loc[same_alt, "ddG"]
            tmp2.loc[same_alt, "z-score (same AA)"] = (alt_ddg - alt_ddg.mean()) / alt_ddg.std()

    ddg_singles.append(tmp2)

HBox(children=(IntProgress(value=0, max=138), HTML(value=u'')))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated





In [33]:
# Concatenate all the results into one table.
# ignore_index=True gives every row a unique label; without it each
# per-interaction table contributes its own 0..n-1 labels, leaving duplicate
# index values that make later label-based selection / alignment ambiguous.
# NOTE: this rebinds ddg_singles from a list to a DataFrame, so this cell
#       can only be run once after the list has been (re)built.
ddg_singles = pd.concat(ddg_singles, ignore_index=True)

In [34]:
# Load the population variant table and reduce it to a set of
# (UniProt, AA_Ref, AA_Pos, AA_Alt) tuples, used to flag which scored
# mutants are real population variants
pop_var_table = pd.read_csv("{0}/Pop_Vars.txt".format(output_dir), sep="\t")
pop_var_keys = pop_var_table[["UniProt", "AA_Ref", "AA_Pos", "AA_Alt"]]
pop_vars = set(pop_var_keys.apply(tuple, axis=1).values)

  interactivity=interactivity, compiler=compiler, result=result)


In [35]:
# Flag the mutations that are known population variants: the mutation must be
# on chain B and its (P2, Ref, Pos, Alt) key must be present in pop_vars
# NOTE: Unsure why I didn't also add a column to label SARS1-SARS2 deviations
ddg_singles["is_pop_var"] = [
    chain == "B" and (p2, ref, pos, alt) in pop_vars
    for chain, p2, ref, pos, alt in zip(
        ddg_singles["Chain"],
        ddg_singles["P2"],
        ddg_singles["Ref"],
        ddg_singles["Pos"],
        ddg_singles["Alt"],
    )
]

In [37]:
# Number of scored mutations that are flagged as population variants
ddg_singles["is_pop_var"].sum()

2023

In [38]:
# Check if there is any enrichment for significant z-score difference
# mutants to be population variants
# Exposure = |same-AA z-score| of at least 1; Case = flagged population variant
exposure_mask = ddg_singles["z-score (same AA)"].abs() >= 1
case_mask = ddg_singles["is_pop_var"] == True

# NOTE(review): odds_ratio is not defined in this notebook - presumably it is
#               provided by `from config import *`; confirm its return values
#               there before interpreting the printed tuple
odds_ratio(exposure_mask, case_mask, log_odds=True, verbose=True)

               Case  Non-Case
Exposed       170.0   18350.0
Non-Exposed  1853.0  162427.0


  .format(op=op_str, alt_op=unsupported[op_str]))


(-0.30031672855009905,
 -0.072649974086963567,
 -0.5279834830132345,
 0.0097265655690781383)

In [41]:
# Write the table of normalized single mutant ddG values to disk
# (tab separated, without the row index)
hotspot_path = "{0}Hotspot_Scored_Mutants.txt".format(ddg_singles_dir)
ddg_singles.to_csv(hotspot_path, sep="\t", index=False)