In [21]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("Prep_Tables")

import requests
import pandas as pd
pd.options.display.max_columns = 999

import numpy as np
import helper as my

import glob
import os
import sys

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

from mjm_tools import zip_res_range, unzip_res_range

from collections import defaultdict

# Table 1 - ECLAIR Interface Annotations

In [22]:
# Read the interface summary and keep only the rows whose Source is "ECLAIR".
ires_df = (
    pd.read_csv("Data/Interface_Summary.txt", sep="\t")
    .loc[lambda frame: frame["Source"] == "ECLAIR"]
)

In [23]:
proteins = pd.read_csv("Data/Proteins.txt", sep="\t")

In [24]:
proteins.head()

Unnamed: 0.1,Unnamed: 0,ID,Is_Viral,Gene_Name,Length,Sequence,UniProt,Gene Name
0,0,Q9Y312,False,AAR2,384,MAAVQMDPELAKRLFFEGATVVILNMPKGTEFGIDYNSWEVGPKFR...,Q9Y312,AAR2
1,1,Q9UDR5,False,AASS,926,MLQVHRTGLGRLGVSLSKGLHHKAVLAVRREDVNAWERRAPLAPKH...,Q9UDR5,AASS
2,2,Q9NY61,False,AATF,560,MAGPQPLALQLEQLLNPRPSEADPEADPEEATAARVIDRFDEGEDG...,Q9NY61,AATF
3,3,P33527,False,ABCC1,1531,MALRGFCSADGSDPLWDWNVTWNTSNPDFTKCFQNTVLVWVPCFYL...,P33527,ABCC1
4,4,Q9H845,False,ACAD9,621,MSGCGLFLRTTAAARACRGLVVSTANRRLLRTSPPVRAFAKELFLG...,Q9H845,ACAD9


In [25]:
# Attach UniProt / gene name for both partners. The first join adds plain
# "UniProt" / "Gene Name" (matched on P1); the second join collides with those
# names, so its columns come out as "UniProt Human" / "Gene Name Human".
protein_lookup = proteins.set_index("ID")[["UniProt", "Gene Name"]]
ires_df = (
    ires_df
    .join(protein_lookup, on="P1", how="left", rsuffix=" Viral")
    .join(protein_lookup, on="P2", how="left", rsuffix=" Human")
)

In [26]:
# Published column order for Supplemental Table 1, mapped from the working
# column names (dict order is the output column order).
table_1_columns = {
    "Gene Name": "Gene Viral",
    "UniProt": "UniProt Viral",
    "Gene Name Human": "Gene Human",
    "UniProt Human": "UniProt Human",
    "P1_Len": "Len Viral",
    "P1_N_Ires": "N Ires Viral",
    "P1_Ires": "Ires Viral",
    "P2_Len": "Len Human",
    "P2_N_Ires": "N Ires Human",
    "P2_Ires": "Ires Human",
}
ires_df = ires_df[list(table_1_columns)].rename(columns=table_1_columns)

In [27]:
# Run zip_res_range over each residue string; non-string entries (e.g. missing
# values) are written out as the literal "[]".
for ires_col in ("Ires Viral", "Ires Human"):
    ires_df[ires_col] = ires_df[ires_col].map(
        lambda residues: zip_res_range(residues) if type(residues) is str else "[]"
    )

In [28]:
ires_df.sort_values(["Gene Viral", "Gene Human"]).to_csv("Tables/Supplemental_Table_1.txt", sep="\t", index=None)
#ires_df.sort_values(["Gene Viral", "Gene Human"]).to_excel("Tables/Supplemental_Table_1.xlsx", index=None)

# Table 2 - Docking Interface Annotations

In [29]:
# Read the interface summary and keep only the rows whose Source is "Docking".
ires_df = (
    pd.read_csv("Data/Interface_Summary.txt", sep="\t")
    .loc[lambda frame: frame["Source"] == "Docking"]
)

In [30]:
proteins = pd.read_csv("Data/Proteins.txt", sep="\t")

In [31]:
proteins.head()

Unnamed: 0.1,Unnamed: 0,ID,Is_Viral,Gene_Name,Length,Sequence,UniProt,Gene Name
0,0,Q9Y312,False,AAR2,384,MAAVQMDPELAKRLFFEGATVVILNMPKGTEFGIDYNSWEVGPKFR...,Q9Y312,AAR2
1,1,Q9UDR5,False,AASS,926,MLQVHRTGLGRLGVSLSKGLHHKAVLAVRREDVNAWERRAPLAPKH...,Q9UDR5,AASS
2,2,Q9NY61,False,AATF,560,MAGPQPLALQLEQLLNPRPSEADPEADPEEATAARVIDRFDEGEDG...,Q9NY61,AATF
3,3,P33527,False,ABCC1,1531,MALRGFCSADGSDPLWDWNVTWNTSNPDFTKCFQNTVLVWVPCFYL...,P33527,ABCC1
4,4,Q9H845,False,ACAD9,621,MSGCGLFLRTTAAARACRGLVVSTANRRLLRTSPPVRAFAKELFLG...,Q9H845,ACAD9


In [32]:
# Attach UniProt / gene name for both partners. The P1 join adds unsuffixed
# "UniProt" / "Gene Name"; the P2 join collides with them and is suffixed
# " Human".
protein_lookup = proteins.set_index("ID")[["UniProt", "Gene Name"]]
ires_df = (
    ires_df
    .join(protein_lookup, on="P1", how="left", rsuffix=" Viral")
    .join(protein_lookup, on="P2", how="left", rsuffix=" Human")
)

In [33]:
# Attach the structure used for the P2 protein. ires_df already has a "Source"
# column, so the model's "Source" arrives as "Source Model".
models = pd.read_csv("Data/Models.txt", sep="\t")
model_lookup = models.set_index("ID")[["Source", "PDB_ID", "PDB_Chain", "ModBase_ID"]]
ires_df = ires_df.join(model_lookup, on="P2", rsuffix=" Model")

In [34]:
# Published column order for Supplemental Table 2, mapped from the working
# column names (dict order is the output column order).
table_2_columns = {
    "Gene Name": "Gene Viral",
    "UniProt": "UniProt Viral",
    "Gene Name Human": "Gene Human",
    "UniProt Human": "UniProt Human",
    "Source Model": "Structure Source Human",
    "PDB_ID": "PDB ID Human",
    "PDB_Chain": "PDB Chain Human",
    "ModBase_ID": "ModBase ID Human",
    "P1_Len": "Len Viral",
    "P1_N_Ires": "N Ires Viral",
    "P1_Ires": "Ires Viral",
    "P2_Len": "Len Human",
    "P2_N_Ires": "N Ires Human",
    "P2_Ires": "Ires Human",
}
ires_df = ires_df[list(table_2_columns)].rename(columns=table_2_columns)

In [35]:
# Run zip_res_range over every residue entry of both partners. Unlike the
# Table 1 cell there is no fallback for non-string values here.
for ires_col in ("Ires Viral", "Ires Human"):
    ires_df[ires_col] = ires_df[ires_col].map(zip_res_range)

In [36]:
ires_df.sort_values(["Gene Viral", "Gene Human"]).to_csv("Tables/Supplemental_Table_2.txt", sep="\t", index=None)
#ires_df.sort_values(["Gene Viral", "Gene Human"]).to_excel("Tables/Supplemental_Table_2.xlsx", sep="\t", index=None)

# Table 3 - GnomAD Pop Vars

In [37]:
pop_vars = pd.read_csv("Data/Pop_Vars.txt", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


In [38]:
ires_df = pd.read_csv("Data/Interface_Summary.txt", sep="\t")
# NOTE(review): this cell used to call ires_df.drop_duplicates(["P1", "P2"])
# without assigning the result, so it never had any effect. The dead statement
# is removed to keep the produced table unchanged; decide whether de-duplication
# was actually intended.

# Map each P2 protein to the set of its interface residue positions.
# Rows are read by column label rather than by position (x[0] / x[1] on a
# label-indexed row is deprecated in recent pandas).
# NOTE(review): as before, a later row for the same P2 *replaces* the earlier
# one (the Table 4 cell unions them instead) -- confirm which is wanted.
uni2ires = defaultdict(set)
for uniprot, residues in zip(ires_df["P2"], ires_df["P2_Ires"]):
    if isinstance(residues, str):
        uni2ires[uniprot] = {int(pos) for pos in residues.split(",")}
    else:
        # Missing residue list: no interface residues recorded for this row.
        uni2ires[uniprot] = set()

# A variant is flagged when its amino-acid position is in the residue set
# stored for its UniProt ID (unknown IDs fall back to the empty set).
pop_vars["Is_Interface"] = [
    aa_pos in uni2ires[uniprot]
    for uniprot, aa_pos in zip(pop_vars["UniProt"], pop_vars["AA_Pos"])
]

In [39]:
pop_vars[["Gene_Symbol", "UniProt", "Gene_ID", "Chrom", "Pos", "Ref", "Alt", "rsID", "AA_Pos", "AA_Ref", "AA_Alt", "gnomAD_AF", "SIFT_Category", "SIFT_Score", "PolyPhen_Category", "PolyPhen_Score",  "Is_Interface"]].sort_values(["Chrom", "Pos"]).to_csv("Tables/Supplemental_Table_3.txt", sep="\t", index=None)
#pop_vars[["Gene_Symbol", "UniProt", "Gene_ID", "Chrom", "Pos", "Ref", "Alt", "rsID", "AA_Pos", "AA_Ref", "AA_Alt", "gnomAD_AF", "SIFT_Category", "SIFT_Score", "PolyPhen_Category", "PolyPhen_Score",  "Is_Interface"]].sort_values(["Chrom", "Pos"]).to_excel("Tables/Supplemental_Table_3.xlsx", sep="\t", index=None)

# Table 4 - SARS --> COVID Vars

In [40]:
viral_muts = pd.read_csv("Data/Viral_Muts.txt", sep="\t")

In [41]:
proteins = pd.read_csv("Data/Proteins.txt", sep="\t")

In [42]:
viral_muts = viral_muts.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="COVID_ID", how="left", rsuffix=" A")

In [43]:
ires_df = pd.read_csv("Data/Interface_Summary.txt", sep="\t")
# NOTE(review): this cell used to call ires_df.drop_duplicates(["P1", "P2"])
# without assigning the result, so it never had any effect. The dead statement
# is removed to keep the produced table unchanged; decide whether de-duplication
# was actually intended.

# Union the interface residues of every row per P1 protein. IDs are normalised
# by stripping "C145A", so rows for that variant are pooled under the base ID.
# Rows are read by column label rather than by position (x[0] / x[1] on a
# label-indexed row is deprecated in recent pandas).
uni2ires = defaultdict(set)
for viral_id, residues in zip(ires_df["P1"], ires_df["P1_Ires"]):
    positions = {int(pos) for pos in residues.split(",")} if isinstance(residues, str) else set()
    uni2ires[viral_id.replace("C145A", "")].update(positions)

# A substitution is flagged when its SARS-CoV-2 position is in the residue set
# stored for its COVID_ID (unknown IDs fall back to the empty set).
viral_muts["Is_Interface"] = [
    covid_pos in uni2ires[covid_id]
    for covid_id, covid_pos in zip(viral_muts["COVID_ID"], viral_muts["COVID_Pos"])
]

In [44]:
# Drop every row whose COVID_ID contains "145"
# (presumably the C145A construct -- TODO confirm).
has_145 = viral_muts["COVID_ID"].map(lambda covid_id: "145" in covid_id)
viral_muts = viral_muts[~has_145]

In [45]:
# Published column order for Supplemental Table 4, mapped from the working
# column names (dict order is the output column order).
table_4_columns = {
    "Gene Name": "Gene",
    "SARS_ID": "SARS_CoV UniProt",
    "UniProt": "SARS_CoV_2 UniProt",
    "SARS_Pos": "SARS_CoV Pos",
    "SARS_AA": "SARS_CoV_AA",
    "COVID_Pos": "SARS_CoV_2 Pos",
    "COVID_AA": "SARS_CoV_2 AA",
    "Is_Interface": "Is_Interface",
}
viral_muts = viral_muts[list(table_4_columns)].rename(columns=table_4_columns)

In [46]:
viral_muts.sort_values(["Gene", "SARS_CoV Pos"]).to_csv("Tables/Supplemental_Table_4.txt", sep="\t", index=None)
#viral_muts.sort_values(["Gene", "SARS_CoV Pos"]).to_excel("Tables/Supplemental_Table_4.xlsx", sep="\t", index=None)

# Table 5 - Mut Enrichment

In [47]:
enrich1 = pd.read_csv("Data/Pop_Var_Enrichments.txt", sep="\t")
enrich1.head()

Unnamed: 0,P1,P2,LogOdds,CI_Low,CI_Up,P-value,Ires_NoVar,Ires_Var,NoIres_Var,NoIres_NoVar
0,COVID19E,O00203,,,,,0.0,0.0,419.0,675.0
1,COVID19E,O60885,-0.955397,-1.485105,-0.425689,0.001505,94.0,28.0,454.0,786.0
2,COVID19E,P25440,-1.078056,-1.623681,-0.532431,0.000577,80.0,29.0,300.0,392.0
3,COVID19E,Q6UX04,-0.073607,-1.264513,1.117298,0.459511,10.0,7.0,193.0,262.0
4,COVID19E,Q86VM9,,,,,0.0,0.0,473.0,480.0


In [48]:
enrich2 = pd.read_csv("Data/Viral_Mut_Enrichment.txt", sep="\t")
enrich2.head()

Unnamed: 0,P1,P2,LogOdds,CI_Low,CI_Up,P-value,Ires_NoVar,Ires_Var,NoIres_Var,NoIres_NoVar
0,COVID19E,O00203,,,,,0.0,0.0,3.0,72.0
1,COVID19E,O60885,,,,,0.0,0.0,3.0,72.0
2,COVID19E,P25440,,,,,0.0,0.0,3.0,72.0
3,COVID19E,Q6UX04,,,,,0.0,0.0,3.0,72.0
4,COVID19E,Q86VM9,,,,,0.0,0.0,3.0,72.0


In [49]:
enrich = enrich2.join(enrich1.set_index(["P1", "P2"]), on=["P1", "P2"], lsuffix=" Viral", rsuffix=" Human")

In [50]:
enrich

Unnamed: 0,P1,P2,LogOdds Viral,CI_Low Viral,CI_Up Viral,P-value Viral,Ires_NoVar Viral,Ires_Var Viral,NoIres_Var Viral,NoIres_NoVar Viral,LogOdds Human,CI_Low Human,CI_Up Human,P-value Human,Ires_NoVar Human,Ires_Var Human,NoIres_Var Human,NoIres_NoVar Human
0,COVID19E,O00203,,,,,0.0,0.0,3.0,72.0,,,,,0.0,0.0,419.0,675.0
1,COVID19E,O60885,,,,,0.0,0.0,3.0,72.0,-0.955397,-1.485105,-0.425689,0.001505,94.0,28.0,454.0,786.0
2,COVID19E,P25440,,,,,0.0,0.0,3.0,72.0,-1.078056,-1.623681,-0.532431,0.000577,80.0,29.0,300.0,392.0
3,COVID19E,Q6UX04,,,,,0.0,0.0,3.0,72.0,-0.073607,-1.264513,1.117298,0.459511,10.0,7.0,193.0,262.0
4,COVID19E,Q86VM9,,,,,0.0,0.0,3.0,72.0,,,,,0.0,0.0,473.0,480.0
5,COVID19E,Q8IWA5,,,,,0.0,0.0,3.0,72.0,,,,,0.0,0.0,285.0,421.0
6,COVID19M,O75439,,,,,0.0,0.0,21.0,201.0,0.699605,-0.516302,1.915512,0.171969,7.0,9.0,209.0,264.0
7,COVID19M,O95070,,,,,0.0,0.0,21.0,201.0,,,,,0.0,0.0,150.0,143.0
8,COVID19M,P05026,,,,,0.0,0.0,21.0,201.0,0.016488,-0.817446,0.850422,0.487028,29.0,14.0,84.0,176.0
9,COVID19M,P11310,,,,,0.0,0.0,21.0,201.0,-0.370807,-1.025323,0.283710,0.175702,44.0,24.0,146.0,207.0


In [51]:
proteins = pd.read_csv("Data/Proteins.txt", sep="\t")

In [52]:
# Append gene name / UniProt for both partners as the last four columns:
# "Gene Name", "UniProt" (matched on P1), then "Gene Name Human",
# "UniProt Human" (matched on P2, suffixed because the names collide).
protein_lookup = proteins.set_index("ID")[["Gene Name", "UniProt"]]
enrich = (
    enrich
    .join(protein_lookup, on="P1", how="left", rsuffix=" Viral")
    .join(protein_lookup, on="P2", how="left", rsuffix=" Human")
)

In [53]:
# Move the four trailing name columns to the front, drop the first two columns,
# and keep everything in between in its existing order.
all_cols = list(enrich.columns)
name_cols = all_cols[-4:]
stat_cols = all_cols[2:-4]
enrich = enrich[name_cols + stat_cols]
enrich.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human"] + stat_cols

In [54]:
enrich.sort_values(["Gene Viral", "Gene Human"]).to_csv("Tables/Supplemental_Table_5.txt", sep="\t", index=None)
#enrich.sort_values(["Gene Viral", "Gene Human"]).to_excel("Tables/Supplemental_Table_5.xlsx", sep="\t", index=None)

# Table 6 - Disease Enrichment

In [55]:
summary = pd.read_csv("MedGen_Term_Enrichments.txt", sep="\t")

In [56]:
summary

Unnamed: 0,ID,A,B,C,D,Children,Parents,LogOR,Low,Up,p-value,Name,Significant Parents,Significant Children,Significant Ancestors,Significant Descendants
0,C1837406,0,332,2,20016,C1855299,"C0239399,C4025035",,,,,Upper limb undergrowth,,,"C4023165,C4021790,C4551464,C4020900,C4073181,C...",
1,C1837404,1,331,0,20018,,"C1398312,C0240635",,,,,"High, narrow palate",C0240635,,"C0026633,C0243057,C4021812,C4021817,C4021815,C...",
2,C4024743,0,332,1,20017,"C4021084,C1298695,C0338502","C0029131,C4024744",,,,,Aplasia/Hypoplasia of the optic nerve,C0029131,,C0029131,
3,C4024744,0,332,1,20017,"C4024740,C4024743","C4025804,C4024745",,,,,Aplasia/Hypoplasia affecting the fundus,,,,
4,C4024745,0,332,9,20009,"C4024744,C0026010,C4024739,C4024746,C0003119",C4022925,,,,,Aplasia/Hypoplasia affecting the eye,,,,
5,C4024746,0,332,6,20012,"C4024748,C1836890","C4024745,C4025842",,,,,Aplasia/Hypoplasia affecting the uvea,,,,
6,C4024748,0,332,6,20012,"C0003076,C0344539","C4025845,C4024739,C4024746",,,,,Aplasia/Hypoplasia of the iris,,,,
7,C0265677,0,332,1,20017,"C0432152,C0432149,C0432155","C0432163,C0000768,C1839326",,,,,Hemivertebrae,,,"C4024586,C4023165,C4021790",
8,C3554540,0,332,1,20017,,,,,,,Lymphoproliferative syndrome 2,,,,
9,C4022597,1,331,3,20015,"C0151611,C0522216,C4021577,C4022687,C3552825",C4021781,4.333144,1.589860,7.076428,0.004687,Abnormality of central nervous system electrop...,C4021781,C0151611,C4021781,C0151611


# Keep terms where A or C is non-zero. The .copy() makes the result an
# independent frame, so the column assignments further down in this cell do not
# write into a filtered slice (which triggers pandas' SettingWithCopyWarning).
summary = summary[(summary["A"] != 0)|(summary["C"] != 0)].copy()
def do(a, b, c, d):
    """Compute the log odds ratio for one 2x2 table of counts.

    The counts are expanded into two 0/1 vectors of length a+b+c+d and passed
    to ``my.odds_ratio``: the first a+b entries are marked as exposed, and the
    a entries plus the c entries following the exposed block are marked as
    cases.

    Returns a 4-tuple ``(OR, low, up, p)``. If any count is zero the odds ratio
    is undefined and four NaNs are returned without calling the helper (the
    progress bar is not advanced for those rows).
    """
    # Bug fix: this used to be `[0] in [a, b, c, d]`, which looks for the *list*
    # [0] among the counts and therefore never matches plain integer counts.
    if 0 in (a, b, c, d):
        return np.nan, np.nan, np.nan, np.nan
    total = a + b + c + d
    exposure_mask = np.zeros(total)
    case_mask = np.zeros(total)
    exposure_mask[:a+b] = 1
    case_mask[:a] = 1
    case_mask[a+b:a+b+c] = 1

    # NOTE(review): the helper's result is unpacked as (OR, up, low, p) and
    # re-ordered on return -- confirm this matches helper.odds_ratio.
    OR, up, low, p = my.odds_ratio(exposure_mask, case_mask, log_odds=True)
    pbar.update()
    return OR, low, up, p
# FUNCTION END
# Progress bar that do() advances once per processed row.
pbar = tqdm_notebook(total=len(summary))
# One (LogOR, Low, Up, p-value) tuple per term, computed from its A/B/C/D counts.
odds_rows = summary[["A", "B", "C", "D"]].apply(lambda counts: do(*counts), axis=1)
for position, column in enumerate(("LogOR", "Low", "Up", "p-value")):
    summary[column] = [row[position] for row in odds_rows]

medgen_names = pd.read_csv("MGCONSO.RRF", sep="|")

# Keep one STR per #CUI among rows with TS == "P" and STT == "PF"
# (presumably the preferred name of each concept -- TODO confirm against the
# MGCONSO column documentation). Unknown CUIs map to the empty string.
preferred_rows = medgen_names[(medgen_names["TS"] == "P") & (medgen_names["STT"] == "PF")]
preferred_rows = preferred_rows.drop_duplicates("#CUI")
term2name = defaultdict(str)
term2name.update(preferred_rows.set_index("#CUI")["STR"].to_dict())

summary["Name"] = [term2name[cui] for cui in summary["ID"]]

#summary = summary[summary["A"] != 0]

# Collect the terms with p <= 0.05 and the direct parent / child links of every
# term. Loop variables are named so that they no longer clobber the builtin
# `id` at notebook scope or reuse `p` for both the p-value and a parent ID.
sig_nodes = set()
node2parents = defaultdict(set)
node2children = defaultdict(set)
for term_id, children, parents, p_value in summary[["ID", "Children", "Parents", "p-value"]].values:
    if p_value <= 0.05:
        sig_nodes.add(term_id)
    # Children / Parents are comma-separated ID lists, or null when absent.
    if not pd.isnull(children):
        node2children[term_id].update(children.split(","))
    if not pd.isnull(parents):
        node2parents[term_id].update(parents.split(","))

# Comma-joined direct parents / children that are themselves significant.
summary["Significant Parents"] = summary["ID"].map(
    lambda term_id: ",".join(t for t in node2parents[term_id] if t in sig_nodes)
)
summary["Significant Children"] = summary["ID"].map(
    lambda term_id: ",".join(t for t in node2children[term_id] if t in sig_nodes)
)

set([1, 3]).difference(set([1, 2, 3]))

In [57]:
summary[["ID", "Name", "A", "B", "C", "D", "Children", "Parents", "LogOR", "Low", "Up", "p-value", "Significant Ancestors", "Significant Descendants"]]


Unnamed: 0,ID,Name,A,B,C,D,Children,Parents,LogOR,Low,Up,p-value,Significant Ancestors,Significant Descendants
0,C1837406,Upper limb undergrowth,0,332,2,20016,C1855299,"C0239399,C4025035",,,,,"C4023165,C4021790,C4551464,C4020900,C4073181,C...",
1,C1837404,"High, narrow palate",1,331,0,20018,,"C1398312,C0240635",,,,,"C0026633,C0243057,C4021812,C4021817,C4021815,C...",
2,C4024743,Aplasia/Hypoplasia of the optic nerve,0,332,1,20017,"C4021084,C1298695,C0338502","C0029131,C4024744",,,,,C0029131,
3,C4024744,Aplasia/Hypoplasia affecting the fundus,0,332,1,20017,"C4024740,C4024743","C4025804,C4024745",,,,,,
4,C4024745,Aplasia/Hypoplasia affecting the eye,0,332,9,20009,"C4024744,C0026010,C4024739,C4024746,C0003119",C4022925,,,,,,
5,C4024746,Aplasia/Hypoplasia affecting the uvea,0,332,6,20012,"C4024748,C1836890","C4024745,C4025842",,,,,,
6,C4024748,Aplasia/Hypoplasia of the iris,0,332,6,20012,"C0003076,C0344539","C4025845,C4024739,C4024746",,,,,,
7,C0265677,Hemivertebrae,0,332,1,20017,"C0432152,C0432149,C0432155","C0432163,C0000768,C1839326",,,,,"C4024586,C4023165,C4021790",
8,C3554540,Lymphoproliferative syndrome 2,0,332,1,20017,,,,,,,,
9,C4022597,Abnormality of central nervous system electrop...,1,331,3,20015,"C0151611,C0522216,C4021577,C4022687,C3552825",C4021781,4.333144,1.589860,7.076428,0.004687,C4021781,C0151611


In [None]:
# Published columns for Supplemental Table 6. Renaming is done by name rather
# than by position: the previous positional list labelled the
# "Significant Ancestors" data as "Significant Descendant Terms" and the
# "Significant Descendants" data as "Significant Ancestor Terms".
table_6_columns = {
    "ID": "MedGenCUI",
    "Name": "Term Name",
    "A": "Interactors_With_Term",
    "B": "Interactors_Without_Term",
    "C": "NonInteractors_With_Term",
    "D": "NonInteractors_Without_Term",
    "Children": "Children Terms",
    "Parents": "Parent Terms",
    "LogOR": "Log2OddsRatio",
    "Low": "Lower 95% CI",
    "Up": "Upper 95% CI",
    "p-value": "p-value",
    "Significant Ancestors": "Significant Ancestor Terms",
    "Significant Descendants": "Significant Descendant Terms",
}
summary = summary[list(table_6_columns)].rename(columns=table_6_columns)

In [None]:
summary = summary.sort_values(["p-value"])

In [None]:
# Significant terms whose "Significant Ancestor Terms" entry is empty.
# Blank cells read back from a text file are NaN, not "", so a bare `== ""`
# comparison never matched; treat NaN and "" alike.
no_sig_ancestor = summary["Significant Ancestor Terms"].fillna("") == ""
summary[(summary["p-value"] <= 0.05) & no_sig_ancestor].sort_values("p-value")

In [None]:
summary.to_csv("Tables/Supplemental_Table_6.txt", sep="\t", index=None)
#summary.to_excel("Tables/Supplemental_Table_6.xlsx", sep="\t", index=None)

# Table 7 - SARS --> COVID

In [10]:
ddg = pd.read_csv("Data/ddG_Summary.txt", sep="\t")

In [11]:
proteins = pd.read_csv("Data/Proteins.txt", sep="\t")

In [12]:
# Prefix P1 with "COVID19" (the join on proteins' "ID" in the next cell uses P1).
# The startswith guard makes the cell safe to re-run: an already-prefixed
# value is left alone instead of becoming "COVID19COVID19...".
ddg["P1"] = ddg["P1"].map(lambda x: x if x.startswith("COVID19") else "COVID19" + x)

In [13]:
# Append gene name / UniProt for both partners; the P2 join collides with the
# columns added by the P1 join and is therefore suffixed " Human".
protein_lookup = proteins.set_index("ID")[["Gene Name", "UniProt"]]
ddg = (
    ddg
    .join(protein_lookup, on="P1", how="left", rsuffix=" Viral")
    .join(protein_lookup, on="P2", how="left", rsuffix=" Human")
)

In [14]:
ddg = ddg[list(ddg)[-4:] + list(ddg)[2:-5]]

In [15]:
# The first four columns get fixed names; in the remaining ones the tokens
# "WT", "Mut" and "Score" are rewritten (in that order).
relabelled = []
for col in list(ddg)[4:]:
    col = col.replace("WT", "SARS_CoV")
    col = col.replace("Mut", "SARS_CoV_2")
    col = col.replace("Score", "Complex_G")
    relabelled.append(col)
ddg.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human"] + relabelled

In [16]:
# Drop every column whose name contains "std" (case-insensitive) except
# "ddG_Std", then order rows by ascending ddG.
kept_cols = [col for col in ddg.columns if col == "ddG_Std" or "std" not in col.lower()]
ddg = ddg[kept_cols].sort_values("ddG")

In [17]:
# Standardise ddG against the mean and standard deviation of the whole column.
ddg_mean = ddg["ddG"].mean()
ddg_sd = ddg["ddG"].std()
ddg["z-score"] = (ddg["ddG"] - ddg_mean) / ddg_sd

In [18]:
ddg.to_csv("Tables/Supplemental_Table_7.txt", sep="\t", index=None)
#ddg.to_excel("Tables/Supplemental_Table_7.xlsx", sep="\t", index=None)

# Table 8 - Pop Var ddG

In [None]:
ddg = pd.read_csv("Table_8_Intermediate.txt", sep="\t")

In [None]:
proteins = pd.read_csv("Data/Proteins.txt", sep="\t")

In [None]:
# Append gene name / UniProt for both partners; the P2 join collides with the
# columns added by the P1 join and is therefore suffixed " Human".
protein_lookup = proteins.set_index("ID")[["Gene Name", "UniProt"]]
ddg = (
    ddg
    .join(protein_lookup, on="P1", how="left", rsuffix=" Viral")
    .join(protein_lookup, on="P2", how="left", rsuffix=" Human")
)

In [None]:
ddg = ddg[list(ddg)[-4:] + list(ddg)[2:-4]]

In [None]:
# Published column order for Supplemental Table 8, mapped from the working
# column names (dict order is the output column order).
table_8_columns = {
    "Gene Name": "Gene Viral",
    "UniProt": "UniProt Viral",
    "Gene Name Human": "Gene Human",
    "UniProt Human": "UniProt Human",
    "Ref": "Ref",
    "Pos": "Pos",
    "Alt": "Alt",
    "ddG": "ddG",
    "std": "std",
    "p-value": "p-value",
    "z-score (same AA)": "z-score (Same AA)",
    "z-score": "z-score (Any AA)",
}
ddg = ddg[list(table_8_columns)].rename(columns=table_8_columns)

In [None]:
def do(x):
    """Turn a (z-score, p-value) pair into a hotspot label.

    Parameters
    ----------
    x : pair-like
        ``(z, p)``; unpacked positionally.

    Returns
    -------
    str or None
        "Non-Hotspot" when p > 0.05 or -1 < z < 1; otherwise a (strong)
        disruptive label for z >= 1 (>= 2) or a (strong) stabilizing label for
        z <= -1 (<= -2). None when z is NaN and p does not exceed 0.05, since
        no threshold comparison holds.
    """
    z, p = x
    if p > 0.05:
        return "Non-Hotspot"
    if z >= 2:
        return "Strong Disruptive Hotspot"
    if z >= 1:
        return "Disruptive Hotspot"
    if z > -1:
        return "Non-Hotspot"
    if z <= -2:
        # Label typo fixed: this used to be emitted as "... Hostpot".
        return "Strong Stabilizing Hotspot"
    if z <= -1:
        return "Stabilizing Hotspot"
    # Only reachable when z is NaN; kept as an explicit "no label".
    return None
# FUNCTION END
ddg["Interpretation"] = ddg[["z-score (Same AA)", "p-value"]].apply(do, axis=1)

In [None]:
ddg["Interpretation"].value_counts()

In [None]:
ddg.sort_values("z-score (Same AA)", ascending=False).to_csv("Tables/Supplemental_Table_8.txt", sep="\t", index=None)
#ddg.sort_values("z-score (Same AA)", ascending=False).to_excel("Tables/Supplemental_Table_8.xlsx", sep="\t", index=None)

# Table 9 - Drug Binding

In [None]:
# Read the drug/interface enrichment table and keep only rows with
# Docking_Rank == 1.
# NOTE(review): the file name is spelled "Enrichmet" -- verify it matches the
# file on disk before "correcting" it.
drugs = (
    pd.read_csv("Data/Drug_Interface_Enrichmet.txt", sep="\t")
    .loc[lambda frame: frame["Docking_Rank"] == 1]
)

In [None]:
proteins = pd.read_csv("Data/Proteins.txt", sep="\t")

In [None]:
# Here the human protein is joined first, so the unsuffixed "Gene Name" /
# "UniProt" describe Human_Protein and the " Viral"-suffixed pair describes
# Viral_Interactor.
protein_lookup = proteins.set_index("ID")[["Gene Name", "UniProt"]]
drugs = (
    drugs
    .join(protein_lookup, on="Human_Protein", rsuffix=" Human")
    .join(protein_lookup, on="Viral_Interactor", rsuffix=" Viral")
)

In [None]:
all_drugs = pd.read_csv("Data/Krogan_Drug_Candidates.txt", sep="\t")
all_drugs

In [None]:
drugs = drugs.join(all_drugs.drop_duplicates("Compound Name").set_index("Compound Name")[["Smiles", "ZINC_ID"]], on="Compound_Name", how="left")

In [None]:
len(drugs)

In [None]:
# Published column order for Supplemental Table 9, mapped from the working
# column names (dict order is the output column order).
table_9_columns = {
    "Gene Name": "Gene Human",
    "UniProt": "UniProt Human",
    "Gene Name Viral": "Gene Viral",
    "UniProt Viral": "UniProt Viral",
    "Compound_Name": "Compound Name",
    "Smiles": "Smiles",
    "ZINC_ID": "ZINC_ID",
    "Score": "Drug Docking Score",
    "Drug_Ires": "Drug Ires",
    "Protein_Ires": "Protein Ires",
    "Log2OR": "Log2OddsRatio",
    "LowerCI": "Lower 95% CI",
    "UpperCI": "Upper 95% CI",
    "p-value": "p-value",
}
drugs = drugs[list(table_9_columns)].rename(columns=table_9_columns)

In [None]:
# Run zip_res_range over the residue entries. Every "Drug Ires" value is
# passed through; a non-string "Protein Ires" value (e.g. missing) is left
# as it is.
drugs["Drug Ires"] = drugs["Drug Ires"].map(zip_res_range)
drugs["Protein Ires"] = drugs["Protein Ires"].map(
    lambda residues: zip_res_range(residues) if type(residues) is str else residues
)

In [None]:
drugs = drugs.sort_values(["Log2OddsRatio", "Protein Ires"], ascending=[False, True])

In [None]:
drugs.to_csv("Tables/Supplemental_Table_9.txt", sep="\t", index=None)
#drugs.to_excel("Tables/Supplemental_Table_9.xlsx", sep="\t", index=None)