In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("13_Run_Interface_ddG_Scanning")

from config import *
from helper_functions import pdb2df

import requests
import pandas as pd
pd.options.display.max_columns = 999

import numpy as np
import helper as my

import glob
import os
import sys

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

from mjm_tools import zip_res_range, unzip_res_range

from collections import defaultdict

In [5]:
import time
import scipy

# Parameters for file locations

In [5]:
# Root directory of the whole project
base_dir = "/home/sdw95/3D_SARS2"

# Root output directory for the single-mutant ddG results.
# NOTE: The trailing "/" is required; other cells in this notebook
#       append sub-directory names directly onto this string.
out_base = base_dir + "/Data/ddG_Single_Mutants/"

In [6]:
# Make necessary paths: the output root plus one sub-directory per
# kind of output written by this notebook.
# os.makedirs is used (rather than os.mkdir) so that missing parent
# directories (e.g. "Data/") are created as well instead of raising.
for sub_dir in ["", "Raw_Outputs", "Summaries", "Structures"]:
    needed_dir = "{0}{1}".format(out_base, sub_dir)
    if(not os.path.exists(needed_dir)):
        os.makedirs(needed_dir)

# Run ddG for Interface Scanning

In [7]:
# Load the docking summary table. In principle ddG scanning could be
# run on every docking attempt, but here only the top-ranked dock
# (Rank == 1) of each interaction is kept.
docking_summary_path = "{0}/Data/Docking_Summary.txt".format(base_dir)
ppi_docking = pd.read_csv(docking_summary_path, sep="\t")

is_top_ranked = ppi_docking["Rank"] == 1
ppi_docking = ppi_docking[is_top_ranked]

In [8]:
# Wrapper to call ires scanning script
def run_ddg_scanning(p1, p2, pdb, mut, rank, iresA, iresB, trials, interface_cutoff, out_base=out_base):
    """
    Launch one Ires_ddG_Scanning.py job as a background shell process.

    Parameters:
        p1, p2:            interaction partner identifiers; used to name the
                           per-interaction Raw_Outputs sub-directory
        pdb:               path to the docked structure passed as --pdb_filename
        mut:               amino acid passed as --mutant_aa
        rank:              docking rank; only recorded in the returned dict
        iresA, iresB:      values passed through as --iresA / --iresB
        trials:            value passed as --trials
        interface_cutoff:  value passed as --interface_cutoff
        out_base:          value passed as --out_base

    Returns:
        dict with the Popen handle ("p"), the command string ("cmd"), the
        launch time ("start_time"), an "end_time" placeholder (None) and the
        (p1, p2, rank) tuple under "in".

    NOTE(review): the command is built by string interpolation and run with
    shell=True, so all arguments must come from trusted input.
    """
    # Imported locally so the wrapper does not depend on an import that
    # only happens in a later cell of the notebook
    import subprocess as sp

    # Take the basename first and only then strip the extension, so a "."
    # in a directory name cannot truncate the structure name
    pdb_name = os.path.basename(pdb).split(".")[0]

    #os.system("rm Data/ddG_Single_Mutants/Raw_Outputs/{0}_{1}/*".format(p1, p2))
    # --out_base now receives out_base ({9}); it previously received p1 ({7}),
    # which left the out_base argument of this wrapper unused
    cmd = "nice python Ires_ddG_Scanning.py --pdb_filename {0} --partners A_B --mutant_aa {1} --trials {2} --output True --trial_output Raw_Outputs/{7}_{8}/ddG_{3}_mut_to_{1} --iresA {4} --iresB {5} --interface_cutoff {6} --out_base {9}".format(pdb, mut, trials, pdb_name, iresA, iresB, interface_cutoff, p1, p2, out_base)
    p = sp.Popen(cmd, shell=True)
    return {"p":p, "cmd":cmd, "start_time":time.time(), "end_time":None, "in":(p1, p2, rank)}
# FUNCTION END

In [10]:
import subprocess as sp

# Run ddG scanning in a loop: one background job per (interaction, mutant AA),
# with at most max_processes jobs running at any time.
i_num = 1  # NOTE(review): never read in this cell; kept in case other cells use it
finished_processes = []
processes = []

max_processes = 60

trials = 10
interface_cutoff = 8.0


def reap_finished_jobs(running, finished):
    """
    Poll every job in `running` once and report those that have exited.

    Jobs that exited are stamped with "end_time", reported on stdout
    (as an error when the exit code is non-zero) and appended to `finished`.

    Returns the list of jobs that are still running.
    """
    still_running = []
    for job in running:
        status = job["p"].poll()
        if(status is None):
            still_running.append(job)
            continue

        job["end_time"] = time.time()
        run_time = job["end_time"] - job["start_time"]
        # Single-argument print calls produce the same text under the
        # Python 2 print statement and the Python 3 print function
        if(status != 0):
            print("Error {0}".format(status))
            print("cmd: {0}".format(job["cmd"]))
            print("RunTime: {0}".format(run_time))
            print("")
        else:
            print("Finished ddG {0} in {1}".format(job["in"], run_time))
        finished.append(job)
    return still_running
# FUNCTION END


# Iterate over all Docking Inputs
docking_inputs = ppi_docking[ppi_docking["Rank"] == 1].sort_values(["Rank", "P1", "P2"], ascending=[True, True, True])[["P1", "P2", "File", "Rank", "P1_Ires", "P2_Ires", "Attempt"]].values
for p1, p2, pdb, rank, ires1, ires2, attempt in tqdm_notebook(docking_inputs):
    for mut in tqdm_notebook("ARNDCEQGHILKMFPSTWYV"):
        # If expected output from job already exists, skip
        # (one output file per trial, so compare against `trials`
        # instead of a hard-coded 10)
        existing = glob.glob("{0}Raw_Outputs/{1}_{2}/*_to_{3}*".format(out_base, p1, p2, mut))
        if(len(existing) == trials):
            continue

        # Block new jobs if too many running already.
        # ">=" keeps the number of live jobs at max_processes; the previous
        # "<=" test let max_processes + 1 jobs run at once.
        while(len(processes) >= max_processes):
            processes = reap_finished_jobs(processes, finished_processes)
            if(len(processes) >= max_processes):
                time.sleep(5)

        processes.append(run_ddg_scanning(p1, p2, pdb, mut, rank, ires1, ires2, trials, interface_cutoff))

# Wait to completion
while(len(processes) > 0):
    processes = reap_finished_jobs(processes, finished_processes)
    if(len(processes) > 0):
        time.sleep(5)

HBox(children=(IntProgress(value=0, max=138), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))

HBox(children=(IntProgress(value=0, max=20), HTML(value=u'')))




# Parse Interface Scanning ddG Results

In [17]:
# Collapse the raw per-trial ddG outputs of every top-ranked dock into two
# tab-separated summaries per interaction, written under {out_base}Summaries/:
#   {p1}_{p2}_{rank}_Full.txt - one ddG column per (mutant AA, replicate)
#   {p1}_{p2}_{rank}_Mean.txt - per mutant AA average / std / z-score columns
# NOTE(review): inter2ddG is created here but never used in this cell.
inter2ddG = defaultdict()
# Interactions whose number of raw output files differed from the expected 200
fail = set()

# Iterate over all docks
for p1, p2, pdb, rank, ires1, ires2, attempt in tqdm_notebook(ppi_docking[ppi_docking["Rank"] == 1].sort_values(["Rank", "P1", "P2"])[["P1", "P2", "File", "Rank", "P1_Ires", "P2_Ires", "Attempt"]].values):
    #if(os.path.exists("Data/ddG_Single_Mutants/Summaries/{0}_{1}_{2}_Mean.txt".format(p1, p2, rank))):
    #    print "Exists"
    #    continue
    
    # Grab all of the Raw Outputs
    # These should be 20 mut_X cases describing all possible mutagenesis
    # each with 10 trials (to account for noise in the side chain optimization)
    fs = glob.glob("{0}Raw_Outputs/{1}_{2}/*".format(out_base, p1, p2))
    # Keep only the files whose 4th "_"-separated basename field (with its
    # last character dropped) equals this dock's attempt number.
    # NOTE(review): presumably that field encodes the docking attempt of the
    #               input structure - confirm against the file naming scheme.
    fs = [x for x in fs if int(os.path.basename(x).split("_")[3][:-1]) == attempt]
    
    # We expect 200 output files, if we don't see this many then
    # something is wrong
    # NOTE: This number subject to change if ddG trial number changed
    # NOTE: The interaction is only recorded in `fail`; it is still
    #       processed below with whatever files are present.
    if(len(fs) != 200):
        print p1, p2, rank, attempt, len(fs)
        #os.system("rm Data/ddG_Single_Mutants/Summaries/{0}_{1}_{2}_*".format(p1, p2, rank))
        fail.add((p1, p2))
    
    # If there are no output files something went really wrong
    if(len(fs) == 0):
        print "ERROR", p1, p2
        continue
    
    # Create a dictionary to store all of the results per mutation type
    # (missing keys default to None so the first replicate can be detected)
    m2tmp = defaultdict(lambda: None)
    # Process files in order of their trailing integer (the replicate number)
    for f in sorted(fs, key=lambda x: int(x.split("_")[-1].split(".")[0])):
        # Which AA mutation does this scanning mutagenesis include
        # NOTE(review): parsed from the full path, so this assumes "to_" does
        #               not occur earlier in the path than in the file name.
        m = f.split("to_")[1].split("_")[0]
        
        # Replicate number (1-10)
        r = int(f.split("_")[-1].split(".")[0])
        
        # Read in the Data as a DataFrame (name ddG column based
        # on mutation / replicate)
        data = pd.read_csv(f, sep="\t", names=["Mut", "ddG_{0}_{1}".format(m, r)])
        
        # Parse out reference and position from mutation name
        # (first character = reference AA, last character dropped,
        #  everything in between = integer position)
        data["Ref"] = data["Mut"].map(lambda x: x[0])
        data["Pos"] = data["Mut"].map(lambda x: int(x[1:-1]))
        
        # Assign the protein identifier (A or B for Viral / Human respectively)
        # The results are always listed in order with all A mutants followed by all
        # B mutants, but are not specifically labeled.
        #
        # We know which interface residues are considered in the mutagenesis, so we
        # can work out how many rows should be A / B based on the number of interface
        # residues on each chain
        # NOTE(review): assumes ires1 / ires2 are comma-separated strings and that
        #               the file has exactly one row per listed interface residue.
        data["Chain"] = ["A"]*len(ires1.split(",")) + ["B"]*len(ires2.split(","))
        
        # Subset columns (i.e. Drop the Mut Column)
        data = data[["Chain", "Ref", "Pos", "ddG_{0}_{1}".format(m, r)]]
        
        # If there is no information saved for this mutation type yet,
        # then just store this data
        if(m2tmp[m] is None):
            m2tmp[m] = data
        # Otherwise, join the data for this replicate with the previously
        # saved data
        else:
            m2tmp[m] = m2tmp[m].join(data.set_index(["Ref", "Pos", "Chain"]), on=["Ref", "Pos", "Chain"])
    
    # Combine all results into one final dataset describing
    # scanning mutagenesis attempts for all AA mutations on
    # this interaction
    data = None
    
    # We should have 20 entries here corresponding to 20 AA mutant
    # options (only reported; processing continues either way)
    if(len(m2tmp) != 20):
        print "ERROR:", p1, p2, len(m2tmp)
    
    # Iterate over each AA mutant and join the results together
    for m in m2tmp:
        if(data is None):
            data = m2tmp[m]
        else:
            data = data.join(m2tmp[m].set_index(["Chain", "Ref", "Pos"]), on=["Chain", "Ref", "Pos"])
    
    # Store the raw summary containing data for each individual replicate
    data.to_csv("{0}Summaries/{1}_{2}_{3}_Full.txt".format(out_base, p1, p2, rank), sep="\t", index=None)
    
    # Compile replicate per AA mutant into an average
    # (the set of mutant AAs is taken from the "ddG_<AA>_<replicate>" column
    #  names before any summary column is added)
    for res in set([x.split("_")[1] for x in data if "ddG_" in x]):
        # Subset data with only the columns for the 10 replicates for
        # this AA residue
        tmp = data[[x for x in data if "_{0}_".format(res) in x]]
        
        # Calculate Average / Std across the 10 replicates
        data["ddG_{0}_avg".format(res)] = tmp.mean(axis=1)
        data["ddG_{0}_std".format(res)] = tmp.std(axis=1)
        
        # Calculate the Z-score relative to all other same AA mutations
        # across the entire interface (both chains pooled together)
        data["ddG_{0}_z".format(res)] = (tmp.mean(axis=1) - tmp.mean(axis=1).mean()) / tmp.mean(axis=1).std()
    
    # Select only the averaged columns and save
    # (the first three columns are the Chain / Ref / Pos identifiers)
    data = data[list(data)[:3] + [x for x in data if "_avg" in x or "_std" in x or "_z" in x]]
    data.to_csv("{0}Summaries/{1}_{2}_{3}_Mean.txt".format(out_base, p1, p2, rank), sep="\t", index=None)

HBox(children=(IntProgress(value=0, max=138), HTML(value=u'')))




In [31]:
# Reformat as simple table of muts (1 row = 1 mut)
#
# For every "*Mean*" interaction summary, build a long-format table with one
# row per (interface residue, alternate AA) holding the mean ddG and its std,
# then add a p-value and two z-score columns. The per-interaction tables are
# collected in the list ddg_singles.

# Imported explicitly: `import scipy` alone does not guarantee that the
# scipy.stats submodule is loaded.
import scipy.stats

# Alternate amino acids, in the order rows are emitted for each residue
alt_aas = "ACEDGFIHKMLNQPSRTWVY"

# Calculate p-values for if the single mutant ddG value
# is significantly non-zero over the 10 trials
def do(ddg, std):
    """
    Return the one-sided normal tail probability of |ddg / std|.

    A std of exactly zero returns 1. The original ZeroDivisionError handler
    was meant to do this, but numpy floats do not raise on division by zero
    (they give inf / nan), so that handler never fired.
    """
    if(std == 0):
        return 1.0
    return scipy.stats.norm.sf(abs(ddg / std))
# FUNCTION END

ddg_singles = []

# Iterate over all the interaction summaries
# (sorted so the row order of the final table is reproducible; out_base
#  already ends in "/", so no extra separator is added)
for f in tqdm_notebook(sorted(glob.glob("{0}Summaries/*Mean*".format(out_base)))):
    # Read in Data for this interaction
    tmp = pd.read_csv(f, sep="\t")

    # Identify P1, P2 (first two "_"-separated fields of the file name)
    p1, p2 = os.path.basename(f).split("_")[:2]

    # Create tmp table where each mut is one row
    tmp2 = []
    # Iterate over each row (.iloc replaces the deprecated .ix)
    for i in range(len(tmp)):
        row = tmp.iloc[i]
        # Columns shared by every mutation at this residue
        site = [p1, p2] + list(row[["Chain", "Ref", "Pos"]])
        # Iterate over all muts
        for mut in alt_aas:
            # Pull out the columns we care about / add in Alt column
            tmp2.append(site + [mut, row["ddG_{0}_avg".format(mut)], row["ddG_{0}_std".format(mut)]])
    # Merge into one DataFrame
    tmp2 = pd.DataFrame(tmp2, columns=["P1", "P2", "Chain", "Ref", "Pos", "Alt", "ddG", "std"])

    tmp2["p-value"] = [do(ddg, std) for ddg, std in zip(tmp2["ddG"], tmp2["std"])]

    # Create the z-score columns up front so that the masked .loc
    # assignments below only ever fill existing columns
    tmp2["z-score"] = float("nan")
    tmp2["z-score (same AA)"] = float("nan")

    # Iterate over each chain separately to normalize the z-scores
    # NOTE: This is not the way I did this for the previous compilation
    #       I should probably go back and change the previous block's
    #       calculation for z-score, but I don't think those z-scores
    #       are used in any downstream work.
    for chain in ["A", "B"]:
        in_chain = tmp2["Chain"] == chain

        # Calculate Z-score of all mutations relative to all mutants
        chain_ddg = tmp2.loc[in_chain, "ddG"]
        tmp2.loc[in_chain, "z-score"] = (chain_ddg - chain_ddg.mean()) / chain_ddg.std()

        # Calculate Z-score of all mutations relative only to same
        # AA mutations
        # NOTE: Might also be worth calculating Z-score relative to all other
        #       mutations at that position
        for mut in alt_aas:
            same_aa = in_chain & (tmp2["Alt"] == mut)
            same_aa_ddg = tmp2.loc[same_aa, "ddG"]
            tmp2.loc[same_aa, "z-score (same AA)"] = (same_aa_ddg - same_aa_ddg.mean()) / same_aa_ddg.std()

    ddg_singles.append(tmp2)

HBox(children=(IntProgress(value=0, max=138), HTML(value=u'')))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated





In [33]:
# Concatenate all the results into a single DataFrame.
# The isinstance guard makes the cell safe to re-run (ddg_singles is already
# a DataFrame after the first run), and ignore_index=True gives every row a
# unique label instead of repeating 0..n-1 once per interaction.
if(isinstance(ddg_singles, list)):
    ddg_singles = pd.concat(ddg_singles, ignore_index=True)

In [34]:
# Load the population variant table and reduce it to a set of
# (UniProt, AA_Ref, AA_Pos, AA_Alt) tuples, used to label which
# scanned mutants are real population variants
pop_var_table = pd.read_csv("{0}/Data/Pop_Vars.txt".format(base_dir), sep="\t")
pop_var_keys = pop_var_table[["UniProt", "AA_Ref", "AA_Pos", "AA_Alt"]].apply(tuple, axis=1)
pop_vars = set(pop_var_keys.values)

  interactivity=interactivity, compiler=compiler, result=result)


In [35]:
# Flag every mutation that is a known population variant: it must sit on
# chain "B" and its (P2, Ref, Pos, Alt) tuple must be present in pop_vars
# NOTE: Unsure why I didn't also add a column to label SARS1-SARS2 deviations
mut_chains = ddg_singles["Chain"].tolist()
mut_keys = zip(ddg_singles["P2"].tolist(), ddg_singles["Ref"].tolist(), ddg_singles["Pos"].tolist(), ddg_singles["Alt"].tolist())
ddg_singles["is_pop_var"] = [mut_chain == "B" and mut_key in pop_vars for mut_chain, mut_key in zip(mut_chains, mut_keys)]

In [37]:
# Display how many scanned mutations were flagged as population variants
# (sum over the boolean is_pop_var column)
ddg_singles["is_pop_var"].sum()

2023

In [38]:
# Test whether mutants with a large same-AA z-score (|z| >= 1) are
# enriched for being population variants.
#   exposure = |z-score (same AA)| >= 1
#   case     = mutation is a population variant
same_aa_z = ddg_singles["z-score (same AA)"]
exposure_mask = same_aa_z.abs().ge(1)
case_mask = ddg_singles["is_pop_var"].eq(True)

odds_ratio(exposure_mask, case_mask, log_odds=True, verbose=True)

               Case  Non-Case
Exposed       170.0   18350.0
Non-Exposed  1853.0  162427.0


  .format(op=op_str, alt_op=unsupported[op_str]))


(-0.30031672855009905,
 -0.072649974086963567,
 -0.5279834830132345,
 0.0097265655690781383)

In [41]:
# Write the table of single mutant normalized ddG values as a
# tab-separated file (without the index) directly under out_base
hotspot_scores_path = "{0}Hotspot_Scored_Mutants.txt".format(out_base)
ddg_singles.to_csv(hotspot_scores_path, sep="\t", index=None)