In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("Prep_Tables")

import requests
import pandas as pd
pd.options.display.max_columns = 999

import numpy as np
import helper as my

import glob
import os
import sys

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

from mjm_tools import zip_res_range, unzip_res_range

from collections import defaultdict

# Parameters for file locations

In [32]:
# Base directory for whole project.
# Defaults to the original location; set the SARS2_BASE_DIR environment
# variable to run the notebook against a different checkout.
base_dir = os.environ.get("SARS2_BASE_DIR", "/home/sdw95/3D_SARS2")

# Output directory for finished tables
table_dir = "{0}/Tables".format(base_dir)

# makedirs(..., exist_ok=True) creates missing parent directories as well and
# is a no-op when the directory already exists.
os.makedirs(table_dir, exist_ok=True)

# Table 1 - ECLAIR Interface Annotations

In [4]:
# Load the interface summary and keep only the rows whose Source is ECLAIR
ires_df = (
    pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")
    .loc[lambda df: df["Source"] == "ECLAIR"]
)

In [5]:
# Protein annotation table (tab-separated)
proteins = pd.read_csv(
    "{0}/Data/Proteins.txt".format(base_dir),
    sep="\t",
)

In [6]:
# Preview the first rows of the protein annotation table
proteins.head()

Unnamed: 0,ID,Is_Viral,Gene_Name,Length,Sequence,UniProt,Gene Name
0,Q9Y312,False,AAR2,384,MAAVQMDPELAKRLFFEGATVVILNMPKGTEFGIDYNSWEVGPKFR...,Q9Y312,AAR2
1,Q9UDR5,False,AASS,926,MLQVHRTGLGRLGVSLSKGLHHKAVLAVRREDVNAWERRAPLAPKH...,Q9UDR5,AASS
2,Q9NY61,False,AATF,560,MAGPQPLALQLEQLLNPRPSEADPEADPEEATAARVIDRFDEGEDG...,Q9NY61,AATF
3,P33527,False,ABCC1,1531,MALRGFCSADGSDPLWDWNVTWNTSNPDFTKCFQNTVLVWVPCFYL...,P33527,ABCC1
4,Q9H845,False,ACAD9,621,MSGCGLFLRTTAAARACRGLVVSTANRRLLRTSPPVRAFAKELFLG...,Q9H845,ACAD9


In [7]:
# Attach UniProt accession and gene name for P1, then for P2.
# rsuffix only applies when column names collide, so the second join is the
# one that produces the "UniProt Human" / "Gene Name Human" columns.
for partner_col, suffix in (("P1", " Viral"), ("P2", " Human")):
    ires_df = ires_df.join(
        proteins.set_index("ID")[["UniProt", "Gene Name"]],
        on=partner_col,
        how="left",
        rsuffix=suffix,
    )

In [8]:
# Keep the table columns in output order, then give them reader-facing names
ires_df = ires_df[[
    "Gene Name", "UniProt",
    "Gene Name Human", "UniProt Human",
    "P1_Len", "P1_N_Ires", "P1_Ires",
    "P2_Len", "P2_N_Ires", "P2_Ires",
]]
ires_df = ires_df.rename(columns={
    "Gene Name": "Gene Viral",
    "UniProt": "UniProt Viral",
    "Gene Name Human": "Gene Human",
    "P1_Len": "Len Viral",
    "P1_N_Ires": "N Ires Viral",
    "P1_Ires": "Ires Viral",
    "P2_Len": "Len Human",
    "P2_N_Ires": "N Ires Human",
    "P2_Ires": "Ires Human",
})

In [9]:
# Compress the residue lists with zip_res_range; anything that is not a
# string (e.g. a missing value) is written as the literal "[]"
for ires_col in ("Ires Viral", "Ires Human"):
    ires_df[ires_col] = ires_df[ires_col].map(
        lambda residues: zip_res_range(residues) if type(residues) is str else "[]"
    )

In [15]:
# Sort once and write the same frame in both formats.
# index=False on both writers keeps the .xlsx column-for-column identical to
# the .txt (to_excel writes the row index by default).
table_1 = ires_df.sort_values(["Gene Viral", "Gene Human"])
table_1.to_csv("{0}/Supplemental_Table_1.txt".format(table_dir), sep="\t", index=False)
table_1.to_excel("{0}/Supplemental_Table_1.xlsx".format(table_dir), engine="xlsxwriter", index=False)

# Table 2 - Docking Interface Annotations

In [16]:
# Load the interface summary and keep only the rows whose Source is Docking
ires_df = (
    pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")
    .loc[lambda df: df["Source"] == "Docking"]
)

In [18]:
# Protein annotation table (tab-separated)
proteins = pd.read_csv(
    "{0}/Data/Proteins.txt".format(base_dir),
    sep="\t",
)

In [19]:
# Preview the first rows of the protein annotation table
proteins.head()

Unnamed: 0,ID,Is_Viral,Gene_Name,Length,Sequence,UniProt,Gene Name
0,Q9Y312,False,AAR2,384,MAAVQMDPELAKRLFFEGATVVILNMPKGTEFGIDYNSWEVGPKFR...,Q9Y312,AAR2
1,Q9UDR5,False,AASS,926,MLQVHRTGLGRLGVSLSKGLHHKAVLAVRREDVNAWERRAPLAPKH...,Q9UDR5,AASS
2,Q9NY61,False,AATF,560,MAGPQPLALQLEQLLNPRPSEADPEADPEEATAARVIDRFDEGEDG...,Q9NY61,AATF
3,P33527,False,ABCC1,1531,MALRGFCSADGSDPLWDWNVTWNTSNPDFTKCFQNTVLVWVPCFYL...,P33527,ABCC1
4,Q9H845,False,ACAD9,621,MSGCGLFLRTTAAARACRGLVVSTANRRLLRTSPPVRAFAKELFLG...,Q9H845,ACAD9


In [20]:
# UniProt accession and gene name, indexed by protein ID
protein_annotations = proteins.set_index("ID")[["UniProt", "Gene Name"]]

# Annotate P1 first, then P2; the P2 columns collide with the P1 ones and
# therefore receive the " Human" suffix
ires_df = ires_df.join(protein_annotations, on="P1", how="left", rsuffix=" Viral")
ires_df = ires_df.join(protein_annotations, on="P2", how="left", rsuffix=" Human")

In [22]:
# Structure information from Models.txt, looked up by P2.
# Columns that collide with existing ones get the " Model" suffix.
models = pd.read_csv("{0}/Data/Models.txt".format(base_dir), sep="\t")
model_info = models.set_index("ID")[["Source", "PDB_ID", "PDB_Chain", "ModBase_ID"]]
ires_df = ires_df.join(model_info, on="P2", rsuffix=" Model")

In [23]:
# Keep the table columns in output order, then give them reader-facing names
ires_df = ires_df[[
    "Gene Name", "UniProt",
    "Gene Name Human", "UniProt Human",
    "Source Model", "PDB_ID", "PDB_Chain", "ModBase_ID",
    "P1_Len", "P1_N_Ires", "P1_Ires",
    "P2_Len", "P2_N_Ires", "P2_Ires",
]]
ires_df = ires_df.rename(columns={
    "Gene Name": "Gene Viral",
    "UniProt": "UniProt Viral",
    "Gene Name Human": "Gene Human",
    "Source Model": "Structure Source Human",
    "PDB_ID": "PDB ID Human",
    "PDB_Chain": "PDB Chain Human",
    "ModBase_ID": "ModBase ID Human",
    "P1_Len": "Len Viral",
    "P1_N_Ires": "N Ires Viral",
    "P1_Ires": "Ires Viral",
    "P2_Len": "Len Human",
    "P2_N_Ires": "N Ires Human",
    "P2_Ires": "Ires Human",
})

In [24]:
# Compress the residue lists with zip_res_range.
# NOTE(review): unlike the Table 1 cell there is no guard for non-string
# (missing) values here - confirm Docking rows always carry residues.
for ires_col in ("Ires Viral", "Ires Human"):
    ires_df[ires_col] = ires_df[ires_col].map(zip_res_range)

In [26]:
# Sort once and write the same frame in both formats.
# index=False on both writers keeps the .xlsx column-for-column identical to
# the .txt (to_excel writes the row index by default).
table_2 = ires_df.sort_values(["Gene Viral", "Gene Human"])
table_2.to_csv("{0}/Supplemental_Table_2.txt".format(table_dir), sep="\t", index=False)
table_2.to_excel("{0}/Supplemental_Table_2.xlsx".format(table_dir), engine="xlsxwriter", index=False)

# Table 3 - Human-Pathogen PDB Benchmark

In [7]:
# Read in the benchmark interfaces and keep only the columns used for the table
ires = pd.read_csv(
    "../PDB_Interspecies_Benchmark/Interspecies_Benchmark_ires.txt",
    sep="\t",
)[[
    "UniProtA", "UniProtB",
    "TaxIDA", "DomainA",
    "TaxIDB", "DomainB",
    "NumIresA", "NumIresB",
    "UniProtIresA", "UniProtIresB",
    "PDBSources",
]]

In [8]:
# Read in predicted interfaces (tab-separated)
preds = pd.read_csv(
    "../PDB_Interspecies_Benchmark/Interspecies_Benchmark_ires_preds.txt",
    sep="\t",
)

In [12]:
# Left-join the predictions onto the benchmark rows, matching
# (UniProtA, UniProtB) against the predictions' (P1, P2) pair
ires = ires.join(
    preds.set_index(["P1", "P2"]),
    on=["UniProtA", "UniProtB"],
    how="left",
)

In [26]:
# Reformat columns so human is always first
column_names = list(ires)
reordered_rows = []
for values in ires.values:
    record = dict(zip(column_names, values))

    # Entry A is not TaxID 9606: exchange every "...A" column with its "...B" partner
    if record["TaxIDA"] != 9606.0:
        for name in column_names:
            if name[-1] == "A":
                partner = name[:-1] + "B"
                record[name], record[partner] = record[partner], record[name]

        # Each source is parsed as "<pdb>-<chainA>:<chainB>"; swap the two
        # chains so they follow the new A/B order
        swapped_sources = []
        for source in record["PDBSources"].split(";"):
            pdb_id, chain_a, chain_b = source.replace(":", "-").split("-")
            swapped_sources.append("{0}-{1}:{2}".format(pdb_id, chain_b, chain_a))
        record["PDBSources"] = ";".join(swapped_sources)

    reordered_rows.append([record[name] for name in column_names])
ires2 = pd.DataFrame(reordered_rows, columns=column_names)

In [30]:
# Positional rename: the names below must stay in the same order as the
# columns of ires2
ires2.columns = [
    "UniProtA", "UniProtB",
    "TaxIDA", "DomainA",
    "TaxIDB", "DomainB",
    "NumIresA", "NumIresB",
    "PDB_IresA", "PDB_IresB",
    "PDBSources",
    "ECLAIR_IresA", "ECLAIR_IresB",
    "Guided_Dock_IresA", "Guided_Dock_IresB",
    "Raw_Dock_IresA", "Raw_Dock_IresB",
]

In [34]:
# Save: sort once and write the same frame in both formats.
# index=False on both writers keeps the .xlsx column-for-column identical to
# the .txt (to_excel writes the row index by default).
table_3 = ires2.sort_values(["UniProtA", "UniProtB"])
table_3.to_csv("{0}/Supplemental_Table_3.txt".format(table_dir), sep="\t", index=False)
table_3.to_excel("{0}/Supplemental_Table_3.xlsx".format(table_dir), engine="xlsxwriter", index=False)

# Table 4 - GnomAD Pop Vars

In [27]:
# Population variant table (tab-separated)
pop_vars = pd.read_csv(
    "{0}/Data/Pop_Vars.txt".format(base_dir),
    sep="\t",
)

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
# Flag every population variant that falls on a predicted interface residue
ires_df = pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")

# drop_duplicates returns a new frame, so the result must be assigned.
# NOTE(review): the un-assigned call previously had no effect; this keeps the
# first row per (P1, P2) pair in file order - confirm that is the intended row.
ires_df = ires_df.drop_duplicates(["P1", "P2"])

# Map each P2 protein to the union of its interface residues over all of its
# interactions (building a plain dict would keep only the last row per protein)
uni2ires = defaultdict(set)
for human_id, residue_str in zip(ires_df["P2"], ires_df["P2_Ires"]):
    residues = uni2ires[human_id]  # creates an empty set for proteins without residues
    if isinstance(residue_str, str):
        residues.update(int(pos) for pos in residue_str.split(","))

pop_vars["Is_Interface"] = [
    aa_pos in uni2ires[uniprot]
    for uniprot, aa_pos in zip(pop_vars["UniProt"], pop_vars["AA_Pos"])
]

In [29]:
# Columns of the population-variant table, in output order
table_4_columns = [
    "Gene_Symbol", "UniProt", "Gene_ID", "Chrom", "Pos", "Ref", "Alt", "rsID",
    "AA_Pos", "AA_Ref", "AA_Alt", "gnomAD_AF",
    "SIFT_Category", "SIFT_Score", "PolyPhen_Category", "PolyPhen_Score",
    "Is_Interface",
]

# This is Table 4: it must be written to Supplemental_Table_4.*, not to the
# Supplemental_Table_3.* files that hold the PDB benchmark table.
# index=False on both writers keeps the .xlsx identical to the .txt.
table_4 = pop_vars[table_4_columns].sort_values(["Chrom", "Pos"])
table_4.to_csv("{0}/Supplemental_Table_4.txt".format(table_dir), sep="\t", index=False)
table_4.to_excel("{0}/Supplemental_Table_4.xlsx".format(table_dir), engine="xlsxwriter", index=False)

In [27]:
# Population variant table (tab-separated)
pop_vars = pd.read_csv(
    "{0}/Data/Pop_Vars.txt".format(base_dir),
    sep="\t",
)

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
# Flag every population variant that falls on a predicted interface residue
ires_df = pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")

# drop_duplicates returns a new frame, so the result must be assigned.
# NOTE(review): the un-assigned call previously had no effect; this keeps the
# first row per (P1, P2) pair in file order - confirm that is the intended row.
ires_df = ires_df.drop_duplicates(["P1", "P2"])

# Map each P2 protein to the union of its interface residues over all of its
# interactions (building a plain dict would keep only the last row per protein)
uni2ires = defaultdict(set)
for human_id, residue_str in zip(ires_df["P2"], ires_df["P2_Ires"]):
    residues = uni2ires[human_id]  # creates an empty set for proteins without residues
    if isinstance(residue_str, str):
        residues.update(int(pos) for pos in residue_str.split(","))

pop_vars["Is_Interface"] = [
    aa_pos in uni2ires[uniprot]
    for uniprot, aa_pos in zip(pop_vars["UniProt"], pop_vars["AA_Pos"])
]

In [29]:
# Columns of the population-variant table, in output order
table_4_columns = [
    "Gene_Symbol", "UniProt", "Gene_ID", "Chrom", "Pos", "Ref", "Alt", "rsID",
    "AA_Pos", "AA_Ref", "AA_Alt", "gnomAD_AF",
    "SIFT_Category", "SIFT_Score", "PolyPhen_Category", "PolyPhen_Score",
    "Is_Interface",
]

# Select and sort once, then write the same frame in both formats.
# index=False on both writers keeps the .xlsx identical to the .txt
# (to_excel writes the row index by default).
table_4 = pop_vars[table_4_columns].sort_values(["Chrom", "Pos"])
table_4.to_csv("{0}/Supplemental_Table_4.txt".format(table_dir), sep="\t", index=False)
table_4.to_excel("{0}/Supplemental_Table_4.xlsx".format(table_dir), engine="xlsxwriter", index=False)

# Table 5 - SARS --> COVID Vars

In [30]:
# Viral mutation table (tab-separated)
viral_muts = pd.read_csv(
    "{0}/Data/Viral_Muts.txt".format(base_dir),
    sep="\t",
)

In [31]:
# Protein annotation table (tab-separated)
proteins = pd.read_csv(
    "{0}/Data/Proteins.txt".format(base_dir),
    sep="\t",
)

In [32]:
# Look up UniProt accession and gene name by COVID_ID
protein_annotations = proteins.set_index("ID")[["UniProt", "Gene Name"]]
viral_muts = viral_muts.join(protein_annotations, on="COVID_ID", how="left", rsuffix=" A")

In [33]:
# Flag every viral mutation that falls on a predicted interface residue
ires_df = pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")

# drop_duplicates returns a new frame, so the result must be assigned.
# NOTE(review): the un-assigned call previously had no effect; this keeps the
# first row per (P1, P2) pair in file order - confirm that is the intended row.
ires_df = ires_df.drop_duplicates(["P1", "P2"])

# Map each P1 protein to the union of its interface residues over all of its
# interactions. "C145A" is stripped from the ID so those rows are pooled under
# the ID without that tag.
uni2ires = defaultdict(set)
for viral_id, residue_str in zip(ires_df["P1"], ires_df["P1_Ires"]):
    residues = uni2ires[viral_id.replace("C145A", "")]
    if isinstance(residue_str, str):
        residues.update(int(pos) for pos in residue_str.split(","))

viral_muts["Is_Interface"] = [
    covid_pos in uni2ires[covid_id]
    for covid_id, covid_pos in zip(viral_muts["COVID_ID"], viral_muts["COVID_Pos"])
]

In [34]:
# Drop every row whose COVID_ID contains "145"
# (presumably the C145A construct - TODO confirm no other ID contains "145")
viral_muts = viral_muts.loc[
    lambda df: ~df["COVID_ID"].map(lambda covid_id: "145" in covid_id)
]

In [35]:
# Keep the table columns in output order, then give them reader-facing names
viral_muts = viral_muts[[
    "Gene Name", "SARS_ID", "UniProt",
    "SARS_Pos", "SARS_AA",
    "COVID_Pos", "COVID_AA",
    "Is_Interface",
]]
viral_muts = viral_muts.rename(columns={
    "Gene Name": "Gene",
    "SARS_ID": "SARS_CoV UniProt",
    "UniProt": "SARS_CoV_2 UniProt",
    "SARS_Pos": "SARS_CoV Pos",
    "SARS_AA": "SARS_CoV_AA",
    "COVID_Pos": "SARS_CoV_2 Pos",
    "COVID_AA": "SARS_CoV_2 AA",
})

In [36]:
# Sort once and write the same frame in both formats.
# index=False on both writers keeps the .xlsx column-for-column identical to
# the .txt (to_excel writes the row index by default).
table_5 = viral_muts.sort_values(["Gene", "SARS_CoV Pos"])
table_5.to_csv("{0}/Supplemental_Table_5.txt".format(table_dir), sep="\t", index=False)
table_5.to_excel("{0}/Supplemental_Table_5.xlsx".format(table_dir), engine="xlsxwriter", index=False)

# Table 6 - Mut Enrichment

In [37]:
# Enrichment results read from Pop_Var_Enrichments.txt; preview the first rows
enrich1 = pd.read_csv(
    "{0}/Data/Pop_Var_Enrichments.txt".format(base_dir),
    sep="\t",
)
enrich1.head()

Unnamed: 0,P1,P2,LogOdds,CI_Low,CI_Up,P-value,Ires_NoVar,Ires_Var,NoIres_Var,NoIres_NoVar
0,COVID19E,O00203,0.600101,-0.531695,1.731897,0.191567,9.0,9.0,316.0,479.0
1,COVID19E,O60885,-0.485427,-1.815771,0.844918,0.274191,20.0,5.0,21.0,60.0
2,COVID19E,P25440,-0.186219,-1.311495,0.939057,0.392733,16.0,9.0,32.0,50.0
3,COVID19E,Q6UX04,-0.093109,-1.212636,1.026418,0.445594,12.0,9.0,64.0,80.0
4,COVID19E,Q86VM9,-0.845963,-2.06054,0.368614,0.125969,11.0,6.0,451.0,460.0


In [38]:
# Enrichment results read from Viral_Mut_Enrichment.txt; preview the first rows
enrich2 = pd.read_csv(
    "{0}/Data/Viral_Mut_Enrichment.txt".format(base_dir),
    sep="\t",
)
enrich2.head()

Unnamed: 0,P1,P2,LogOdds,CI_Low,CI_Up,P-value,Ires_NoVar,Ires_Var,NoIres_Var,NoIres_NoVar
0,COVID19E,O00203,,,,,11.0,0.0,0.0,20.0
1,COVID19E,O60885,,,,,21.0,0.0,0.0,10.0
2,COVID19E,P25440,,,,,20.0,0.0,0.0,11.0
3,COVID19E,Q6UX04,,,,,21.0,0.0,0.0,10.0
4,COVID19E,Q86VM9,,,,,12.0,0.0,0.0,19.0


In [39]:
# Put both enrichment tables side by side, one row per (P1, P2) of enrich2.
# Colliding column names get " Viral" (enrich2) or " Human" (enrich1) appended.
enrich = enrich2.join(
    enrich1.set_index(["P1", "P2"]),
    on=["P1", "P2"],
    lsuffix=" Viral",
    rsuffix=" Human",
)

In [40]:
# Display the joined enrichment table
enrich

Unnamed: 0,P1,P2,LogOdds Viral,CI_Low Viral,CI_Up Viral,P-value Viral,Ires_NoVar Viral,Ires_Var Viral,NoIres_Var Viral,NoIres_NoVar Viral,LogOdds Human,CI_Low Human,CI_Up Human,P-value Human,Ires_NoVar Human,Ires_Var Human,NoIres_Var Human,NoIres_NoVar Human
0,COVID19E,O00203,,,,,11.0,0.0,0.0,20.0,0.600101,-0.531695,1.731897,0.191567,9.0,9.0,316.0,479.0
1,COVID19E,O60885,,,,,21.0,0.0,0.0,10.0,-0.485427,-1.815771,0.844918,0.274191,20.0,5.0,21.0,60.0
2,COVID19E,P25440,,,,,20.0,0.0,0.0,11.0,-0.186219,-1.311495,0.939057,0.392733,16.0,9.0,32.0,50.0
3,COVID19E,Q6UX04,,,,,21.0,0.0,0.0,10.0,-0.093109,-1.212636,1.026418,0.445594,12.0,9.0,64.0,80.0
4,COVID19E,Q86VM9,,,,,12.0,0.0,0.0,19.0,-0.845963,-2.060540,0.368614,0.125969,11.0,6.0,451.0,460.0
5,COVID19E,Q8IWA5,,,,,17.0,0.0,0.0,14.0,-0.560265,-1.662115,0.541584,0.201473,15.0,7.0,278.0,404.0
6,COVID19M,O75439,,,,,0.0,0.0,21.0,201.0,0.699605,-0.516302,1.915512,0.171969,7.0,9.0,209.0,264.0
7,COVID19M,O95070,,,,,0.0,0.0,21.0,201.0,,,,,0.0,0.0,150.0,143.0
8,COVID19M,P05026,,,,,0.0,0.0,21.0,201.0,0.016488,-0.817446,0.850422,0.487028,29.0,14.0,84.0,176.0
9,COVID19M,P11310,,,,,0.0,0.0,21.0,201.0,-0.370807,-1.025323,0.283710,0.175702,44.0,24.0,146.0,207.0


In [41]:
# Protein annotation table (tab-separated)
proteins = pd.read_csv(
    "{0}/Data/Proteins.txt".format(base_dir),
    sep="\t",
)

In [42]:
# Gene name and UniProt accession, indexed by protein ID
protein_annotations = proteins.set_index("ID")[["Gene Name", "UniProt"]]

# Annotate P1 first, then P2; the P2 columns collide with the P1 ones and
# therefore receive the " Human" suffix
enrich = enrich.join(protein_annotations, on="P1", how="left", rsuffix=" Viral")
enrich = enrich.join(protein_annotations, on="P2", how="left", rsuffix=" Human")

In [43]:
# Move the four joined annotation columns (the last four) to the front and
# drop the first two columns; then rename the annotation columns by position
annotation_cols = enrich.columns[-4:].tolist()
stat_cols = enrich.columns[2:-4].tolist()
enrich = enrich[annotation_cols + stat_cols]
enrich.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human"] + stat_cols

In [44]:
# Sort once and write the same frame in both formats.
# index=False on both writers keeps the .xlsx column-for-column identical to
# the .txt (to_excel writes the row index by default).
table_6 = enrich.sort_values(["Gene Viral", "Gene Human"])
table_6.to_csv("{0}/Supplemental_Table_6.txt".format(table_dir), sep="\t", index=False)
table_6.to_excel("{0}/Supplemental_Table_6.xlsx".format(table_dir), engine="xlsxwriter", index=False)

# Table 7 - Disease Enrichment

In [35]:
# MedGen term enrichment results (tab-separated)
summary = pd.read_csv(
    "{0}/MedGen_Term_Enrichments.txt".format(base_dir),
    sep="\t",
)

In [36]:
# Display the term enrichment table
summary

Unnamed: 0,ID,A,B,C,D,Children,Parents,LogOR,Low,Up,p-value,Name,Significant Parents,Significant Children,Significant Ancestors,Significant Descendants
0,C1837406,0,332,2,20016,C1855299,"C0239399,C4025035",,,,,Upper limb undergrowth,,,"C4023165,C4021790,C4023163,C4551464,C4024592,C...",
1,C1837404,1,331,0,20018,,"C1398312,C0240635",,,,,"High, narrow palate",C0240635,,"C0026633,C0243057,C0240635,C4021817,C4021815,C...",
2,C4024743,0,332,1,20017,"C4021084,C1298695,C0338502","C0029131,C4024744",,,,,Aplasia/Hypoplasia of the optic nerve,C0029131,,C0029131,
3,C4024744,0,332,1,20017,"C4024740,C4024743","C4025804,C4024745",,,,,Aplasia/Hypoplasia affecting the fundus,,,,
4,C4024745,0,332,9,20009,"C4024744,C0026010,C4024739,C4024746,C0003119",C4022925,,,,,Aplasia/Hypoplasia affecting the eye,,,,
5,C4024746,0,332,6,20012,"C4024748,C1836890","C4024745,C4025842",,,,,Aplasia/Hypoplasia affecting the uvea,,,,
6,C4024748,0,332,6,20012,"C0003076,C0344539","C4025845,C4024739,C4024746",,,,,Aplasia/Hypoplasia of the iris,,,,
7,C0265677,0,332,1,20017,"C0432152,C0432149,C0432155","C0432163,C0000768,C1839326",,,,,Hemivertebrae,,,"C4024586,C4023165,C4021790",
8,C3554540,0,332,1,20017,,,,,,,Lymphoproliferative syndrome 2,,,,
9,C4022597,1,331,3,20015,"C0151611,C0522216,C4021577,C4022687,C3552825",C4021781,4.333144,1.589860,7.076428,0.004687,Abnormality of central nervous system electrop...,C4021781,C0151611,"C4021781,C0027765",C0151611


# Discard terms where both A and C are zero
summary = summary.loc[~((summary["A"] == 0) & (summary["C"] == 0))]
def do(a, b, c, d):
    """Compute the log odds ratio for one 2x2 contingency table.

    The counts are expanded into two 0/1 vectors of length a + b + c + d:
    the first a + b entries are marked as exposed, and the first a entries
    plus the c entries following the exposed ones are marked as cases.
    These are passed to my.odds_ratio(..., log_odds=True).

    Parameters
    ----------
    a, b, c, d : int
        Cell counts of the contingency table.

    Returns
    -------
    tuple
        (OR, low, up, p) taken from my.odds_ratio, whose result is unpacked
        as (OR, up, low, p); four NaNs when any cell count is zero.
    """
    # Membership test on the counts themselves. The previous "[0] in [...]"
    # asked whether the *list* [0] was one of the counts, which is never true
    # for plain ints and only matched numpy scalars by accident.
    if 0 in (a, b, c, d):
        # Rows skipped here do not advance the progress bar.
        return np.nan, np.nan, np.nan, np.nan

    total = a + b + c + d
    exposure_mask = np.zeros(total)
    case_mask = np.zeros(total)
    exposure_mask[:a + b] = 1
    case_mask[:a] = 1
    case_mask[a + b:a + b + c] = 1

    OR, up, low, p = my.odds_ratio(exposure_mask, case_mask, log_odds=True)
    pbar.update()
    return OR, low, up, p
# FUNCTION END
# Progress bar sized to the number of terms; do() advances it
pbar = tqdm_notebook(total=len(summary))

# One (LogOR, Low, Up, p-value) tuple per row, spread over four columns
tmp = summary[["A", "B", "C", "D"]].apply(lambda counts: do(*counts), axis=1)
for position, stat_col in enumerate(["LogOR", "Low", "Up", "p-value"]):
    summary[stat_col] = [stats[position] for stats in tmp]

# Term names from MGCONSO.RRF ("|"-separated)
medgen_names = pd.read_csv("MGCONSO.RRF", sep="|")

# Keep rows with TS == "P" and STT == "PF", first row per #CUI
preferred_names = medgen_names[
    (medgen_names["TS"] == "P") & (medgen_names["STT"] == "PF")
].drop_duplicates("#CUI")

# defaultdict(str): IDs without a name map to the empty string
term2name = defaultdict(str)
term2name.update(zip(preferred_names["#CUI"], preferred_names["STR"]))

summary["Name"] = summary["ID"].map(lambda cui: term2name[cui])

#summary = summary[summary["A"] != 0]

# Collect the significant terms (p <= 0.05) and each term's direct parents
# and children, which are stored as comma-separated ID strings
sig_nodes = set()
node2parents = defaultdict(set)
node2children = defaultdict(set)
for term_id, children, parents, p_value in summary[["ID", "Children", "Parents", "p-value"]].values:
    if p_value <= 0.05:
        sig_nodes.add(term_id)
    if not pd.isnull(children):
        node2children[term_id].update(children.split(","))
    if not pd.isnull(parents):
        node2parents[term_id].update(parents.split(","))

# For every term, list those direct parents / children that are significant
summary["Significant Parents"] = summary["ID"].map(
    lambda term: ",".join([rel for rel in node2parents[term] if rel in sig_nodes])
)
summary["Significant Children"] = summary["ID"].map(
    lambda term: ",".join([rel for rel in node2children[term] if rel in sig_nodes])
)

# Scratch check of set.difference (evaluates to an empty set); the result is
# not assigned to anything
set([1, 3]).difference(set([1, 2, 3]))

In [37]:
# Display the columns that will make up the supplemental table
summary[["ID", "Name", "A", "B", "C", "D", "Children", "Parents", "LogOR", "Low", "Up", "p-value", "Significant Ancestors", "Significant Descendants"]]

Unnamed: 0,ID,Name,A,B,C,D,Children,Parents,LogOR,Low,Up,p-value,Significant Ancestors,Significant Descendants
0,C1837406,Upper limb undergrowth,0,332,2,20016,C1855299,"C0239399,C4025035",,,,,"C4023165,C4021790,C4023163,C4551464,C4024592,C...",
1,C1837404,"High, narrow palate",1,331,0,20018,,"C1398312,C0240635",,,,,"C0026633,C0243057,C0240635,C4021817,C4021815,C...",
2,C4024743,Aplasia/Hypoplasia of the optic nerve,0,332,1,20017,"C4021084,C1298695,C0338502","C0029131,C4024744",,,,,C0029131,
3,C4024744,Aplasia/Hypoplasia affecting the fundus,0,332,1,20017,"C4024740,C4024743","C4025804,C4024745",,,,,,
4,C4024745,Aplasia/Hypoplasia affecting the eye,0,332,9,20009,"C4024744,C0026010,C4024739,C4024746,C0003119",C4022925,,,,,,
5,C4024746,Aplasia/Hypoplasia affecting the uvea,0,332,6,20012,"C4024748,C1836890","C4024745,C4025842",,,,,,
6,C4024748,Aplasia/Hypoplasia of the iris,0,332,6,20012,"C0003076,C0344539","C4025845,C4024739,C4024746",,,,,,
7,C0265677,Hemivertebrae,0,332,1,20017,"C0432152,C0432149,C0432155","C0432163,C0000768,C1839326",,,,,"C4024586,C4023165,C4021790",
8,C3554540,Lymphoproliferative syndrome 2,0,332,1,20017,,,,,,,,
9,C4022597,Abnormality of central nervous system electrop...,1,331,3,20015,"C0151611,C0522216,C4021577,C4022687,C3552825",C4021781,4.333144,1.589860,7.076428,0.004687,"C4021781,C0027765",C0151611


In [38]:
# Keep the table columns in output order
summary = summary[[
    "ID", "Name",
    "A", "B", "C", "D",
    "Children", "Parents",
    "LogOR", "Low", "Up", "p-value",
    "Significant Ancestors", "Significant Descendants",
]]

# Rename through an explicit mapping so each label is tied to its source
# column. A positional rename is easy to get out of order; in particular
# "Significant Ancestors" must become "Significant Ancestor Terms" and
# "Significant Descendants" must become "Significant Descendant Terms".
summary = summary.rename(columns={
    "ID": "MedGenCUI",
    "Name": "Term Name",
    "A": "Interactors_With_Term",
    "B": "Interactors_Without_Term",
    "C": "NonInteractors_With_Term",
    "D": "NonInteractors_Without_Term",
    "Children": "Children Terms",
    "Parents": "Parent Terms",
    "LogOR": "Log2OddsRatio",
    "Low": "Lower 95% CI",
    "Up": "Upper 95% CI",
    "Significant Ancestors": "Significant Ancestor Terms",
    "Significant Descendants": "Significant Descendant Terms",
})

In [39]:
# Order the terms by ascending p-value
summary = summary.sort_values(by="p-value")

In [41]:
# Unsure if I had originally wanted to filter to only show these most general, but still significant
# term or not?
# I think I had decided to retain everything.
#
# Table 7A: rows with p-value <= 0.05 and no "Significant Ancestor Terms" entry
is_significant = summary["p-value"].le(0.05)
lacks_ancestor_entry = summary["Significant Ancestor Terms"].isnull()
summary[is_significant & lacks_ancestor_entry].sort_values("p-value").to_csv(
    "{0}/Supplemental_Table_7A.txt".format(table_dir),
    sep="\t",
    index=None,
)

In [42]:
# Table 7B: the complete term table, without the row index
summary.to_csv(
    "{0}/Supplemental_Table_7B.txt".format(table_dir),
    sep="\t",
    index=None,
)

# There are some odd characters here, so this one doesn't convert
# successfully. Need to do manually by copy / pasting into excel
#summary.to_excel("{0}/Supplemental_Table_7B.xlsx".format(table_dir), engine="xlsxwriter")

# Table 8 - SARS --> COVID

In [54]:
# ddG summary table (tab-separated)
ddg = pd.read_csv(
    "{0}/Data/ddG_Summary.txt".format(base_dir),
    sep="\t",
)

In [55]:
# Protein annotation table (tab-separated)
proteins = pd.read_csv(
    "{0}/Data/Proteins.txt".format(base_dir),
    sep="\t",
)

In [56]:
# Prefix the P1 identifiers with "COVID19" before they are used as lookup keys.
# Not idempotent: re-running this cell without reloading ddg prefixes again.
ddg["P1"] = ddg["P1"].map(lambda viral_name: "COVID19" + viral_name)

In [57]:
# Gene name and UniProt accession, indexed by protein ID
protein_annotations = proteins.set_index("ID")[["Gene Name", "UniProt"]]

# Annotate P1 first, then P2; the P2 columns collide with the P1 ones and
# therefore receive the " Human" suffix
ddg = ddg.join(protein_annotations, on="P1", how="left", rsuffix=" Viral")
ddg = ddg.join(protein_annotations, on="P2", how="left", rsuffix=" Human")

In [58]:
# Move the four joined annotation columns (the last four) to the front.
# Of the remaining columns the first two and the last one are dropped.
ddg_cols = ddg.columns.tolist()
ddg = ddg[ddg_cols[-4:] + ddg_cols[2:-5]]

In [59]:
# Rename the four annotation columns by position; in every other column name
# substitute WT -> SARS_CoV, Mut -> SARS_CoV_2 and Score -> Complex_G, in that order
renamed_cols = []
for col_name in list(ddg)[4:]:
    for old, new in (("WT", "SARS_CoV"), ("Mut", "SARS_CoV_2"), ("Score", "Complex_G")):
        col_name = col_name.replace(old, new)
    renamed_cols.append(col_name)
ddg.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human"] + renamed_cols

In [60]:
# Drop every column whose name contains "std" (case-insensitive) except
# "ddG_Std", then order the rows by ddG
kept_cols = [col for col in list(ddg) if "std" not in col.lower() or col == "ddG_Std"]
ddg = ddg[kept_cols].sort_values("ddG")

In [61]:
# Standardise ddG against the mean and standard deviation of the whole column
ddg_values = ddg["ddG"]
ddg["z-score"] = (ddg_values - ddg_values.mean()) / ddg_values.std()

In [62]:
# Write the table in both formats.
# index=False on both writers keeps the .xlsx column-for-column identical to
# the .txt (to_excel writes the row index by default).
ddg.to_csv("{0}/Supplemental_Table_8.txt".format(table_dir), sep="\t", index=False)
ddg.to_excel("{0}/Supplemental_Table_8.xlsx".format(table_dir), engine="xlsxwriter", index=False)

# Table 9 - Pop Var ddG

In [63]:
# Read in z-score normalized mutant summary
ddg = pd.read_csv(
    "{0}/Data/ddG_Single_Mutants/Hotspot_Scored_Mutants.txt".format(base_dir),
    sep="\t",
)

# Read in Popvar Data and reduce it to a set of
# (UniProt, AA_Ref, AA_Pos, AA_Alt) tuples.
# NOTE(review): this set is not referenced again in the cells shown for this
# table; the selection below relies on the "is_pop_var" column instead.
pop_vars = pd.read_csv(
    "{0}/Data/Pop_Vars.txt".format(base_dir),
    sep="\t",
)
variant_keys = pop_vars[["UniProt", "AA_Ref", "AA_Pos", "AA_Alt"]]
pop_vars = set(variant_keys.apply(tuple, axis=1).values)

# Select only the population variants
ddg = ddg.loc[ddg["is_pop_var"]]

In [64]:
# Protein annotation table (tab-separated)
proteins = pd.read_csv(
    "{0}/Data/Proteins.txt".format(base_dir),
    sep="\t",
)

In [65]:
# Gene name and UniProt accession, indexed by protein ID
protein_annotations = proteins.set_index("ID")[["Gene Name", "UniProt"]]

# Annotate P1 first, then P2; the P2 columns collide with the P1 ones and
# therefore receive the " Human" suffix
ddg = ddg.join(protein_annotations, on="P1", how="left", rsuffix=" Viral")
ddg = ddg.join(protein_annotations, on="P2", how="left", rsuffix=" Human")

In [66]:
# Move the four joined annotation columns (the last four) to the front and
# drop the first two columns
ddg_cols = ddg.columns.tolist()
ddg = ddg[ddg_cols[-4:] + ddg_cols[2:-4]]

In [67]:
# Keep the table columns in output order, then give them reader-facing names
ddg = ddg[[
    "Gene Name", "UniProt",
    "Gene Name Human", "UniProt Human",
    "Ref", "Pos", "Alt",
    "ddG", "std", "p-value",
    "z-score (same AA)", "z-score",
]]
ddg = ddg.rename(columns={
    "Gene Name": "Gene Viral",
    "UniProt": "UniProt Viral",
    "Gene Name Human": "Gene Human",
    "z-score (same AA)": "z-score (Same AA)",
    "z-score": "z-score (Any AA)",
})

In [68]:
def do(x):
    """Turn a (z-score, p-value) pair into a hotspot label.

    Parameters
    ----------
    x : sequence of two numbers
        The z-score followed by the p-value.

    Returns
    -------
    str or None
        "Non-Hotspot" when p > 0.05 or -1 < z < 1; otherwise a
        disruptive label for z >= 1 ("Strong" from 2) or a stabilizing
        label for z <= -1 ("Strong" from -2). None when the z-score
        cannot be compared (NaN).
    """
    z, p = x
    if p > 0.05:
        return "Non-Hotspot"
    if z >= 2:
        return "Strong Disruptive Hotspot"
    if z >= 1:
        return "Disruptive Hotspot"
    if z > -1:
        return "Non-Hotspot"
    if z <= -2:
        # Label spelling corrected from "Hostpot"
        return "Strong Stabilizing Hotspot"
    if z <= -1:
        return "Stabilizing Hotspot"
    # Only reached when every comparison above is False, i.e. z is NaN
    return None
# FUNCTION END
# Label every row from its same-AA z-score and p-value
ddg["Interpretation"] = ddg[["z-score (Same AA)", "p-value"]].apply(do, axis="columns")

In [69]:
# Count how many rows received each interpretation label
ddg["Interpretation"].value_counts()

Non-Hotspot                   1882
Strong Disruptive Hotspot       48
Disruptive Hotspot              42
Strong Stabilizing Hostpot      26
Stabilizing Hotspot             25
Name: Interpretation, dtype: int64

In [70]:
# Sort once (largest same-AA z-score first) and write the same frame in both
# formats. index=False on both writers keeps the .xlsx column-for-column
# identical to the .txt (to_excel writes the row index by default).
table_9 = ddg.sort_values("z-score (Same AA)", ascending=False)
table_9.to_csv("{0}/Supplemental_Table_9.txt".format(table_dir), sep="\t", index=False)
table_9.to_excel("{0}/Supplemental_Table_9.xlsx".format(table_dir), engine="xlsxwriter", index=False)

# Table 10 - Drug Binding

In [71]:
# Load the drug / interface enrichment results and keep the rows with Docking_Rank 1
drugs = (
    pd.read_csv("{0}/Data/Drug_Interface_Enrichmet.txt".format(base_dir), sep="\t")
    .loc[lambda df: df["Docking_Rank"] == 1]
)

In [72]:
# Protein annotation table (tab-separated)
proteins = pd.read_csv(
    "{0}/Data/Proteins.txt".format(base_dir),
    sep="\t",
)

In [73]:
# Gene name and UniProt accession, indexed by protein ID
protein_annotations = proteins.set_index("ID")[["Gene Name", "UniProt"]]

# Here the human protein is annotated first, so the columns of the second
# join (Viral_Interactor) are the ones that receive a suffix
drugs = drugs.join(protein_annotations, on="Human_Protein", rsuffix=" Human")
drugs = drugs.join(protein_annotations, on="Viral_Interactor", rsuffix=" Viral")

In [74]:
# Drug candidate list (tab-separated); display it for reference
all_drugs = pd.read_csv(
    "{0}/Data/Krogan_Drug_Candidates.txt".format(base_dir),
    sep="\t",
)
all_drugs

Unnamed: 0,Compound Name,Human Gene,Human ID,Viral ID,Human PDBs,Drug Status,Activity Type,Activity,Reference,Smiles,ZINC_ID,Purchase Notes,Source
0,4E2RCat,Translation,,,,Pre-clinical,eIF4E/G PPI inhibitor,IC50 = 13500,21507972,O=C(O)C1=CC(C2=CC=C(/C=C(SC(N3CC4=CC=CC=C4)=S)...,ZINC7018722,MedChemExpress 432499-63-3,Expert
1,ABBV-744,BRD4,O60885,SARS-CoV2 E,2I8N;2LSP;2MJV;2N3K;2NCZ;2ND0;2ND1;2NNU;2OSS;2...,Clinical Trial,BRD inhibitor,Kd = 2.1,31969702,CCNC(=O)C1=CC2=C([NH]1)C(=O)N(C)C=C2C3=C(OC4=C...,ZINC1250228389,Sellekchem S8723,Expert
2,ABBV-744,BRD2,P25440,SARS-CoV2 E,1X0J;2DVQ;2DVR;2DVS;2DVV;2E3K;2G4A;2YDW;2YEK;3...,Clinical Trial,BRD inhibitor,Kd = 2.1,31969702,CCNC(=O)C1=CC2=C([NH]1)C(=O)N(C)C=C2C3=C(OC4=C...,ZINC1250228389,Sellekchem S8723,Expert
3,Camostat,Cell Entry,,,,Approved (Pancreatitis),Serine protease 1 inhibitor,IC50 < 1000,10.1101/2020.01.31.929042 22496216,CN(C)C(=O)COC(=O)Cc1ccc(OC(=O)c2ccc(N=C(N)N)cc...,ZINC3871842,Cayman Chemicals 16018,Expert
4,Captopril,Cell Entry,,,,Approved (Hypertension),ACE inhibitor,Ki = 3,9187274,O=C(O)[C@H]1N(C(=O)[C@H](C)CS)CCC1,ZINC57001,Cayman Chemicals 15313,Expert
5,CB5083,VCP,,,,Clinical Trial,p97 inhibitor,IC50 = 11,26565666,CC1=CC2=C(C=CC=C2C(N)=O)[N]1C3=NC4=C(COCC4)C(=...,ZINC208076131,Sellekchem S8101,Expert
6,Chloramphenicol,Mitochondrial Ribosome,,,,Approved (Bacterial infection),mitochondrial ribosome inhibitor,IC50 = 7400,23148581,C1=CC(=CC=C1[C@H]([C@@H](CO)NC(=O)C(Cl)Cl)O)[N...,ZINC113382,Sellekchem S1677,Expert
7,Chloroquine,SIGMAR1,Q99720,SARS-CoV2 nsp6,5HK1;5HK2;6DJZ;6DK0;6DK1,Approved (Malaria),Sigma 1 binder,Ki = 100,30042674,CCN(CC)CCC[C@H](C)Nc1ccnc2cc(Cl)ccc12,ZINC19144231,,Expert
8,Compound 10,Viral Transcription,,,,Pre-clinical,PI4K-IIIß inhibitor,IC50 = 3.4,26885694,CC1=C(SC(NC(C(C)(C)C)=O)=N1)C2=CC=C(OC)C(S(NC3...,,custom,Expert
9,Compound 2,Viral Transcription,,,,Pre-clinical,Cyclophilin inhibitor,Kd = 24,30074795,O=C([C@@H]1CCCN(C([C@H](C)NC([C@H](C(C)C)N2)=O...,,custom,Expert


In [75]:
# Smiles and ZINC_ID per compound, taking the first row for each compound name
compound_info = (
    all_drugs
    .drop_duplicates("Compound Name")
    .set_index("Compound Name")[["Smiles", "ZINC_ID"]]
)
drugs = drugs.join(compound_info, on="Compound_Name", how="left")

In [76]:
# Number of rows in the drug table after the joins
len(drugs)

30

In [77]:
# Keep the table columns in output order, then give them reader-facing names
drugs = drugs[[
    "Gene Name", "UniProt",
    "Gene Name Viral", "UniProt Viral",
    "Compound_Name", "Smiles", "ZINC_ID",
    "Score", "Drug_Ires", "Protein_Ires",
    "Log2OR", "LowerCI", "UpperCI", "p-value",
]]
drugs = drugs.rename(columns={
    "Gene Name": "Gene Human",
    "UniProt": "UniProt Human",
    "Gene Name Viral": "Gene Viral",
    "Compound_Name": "Compound Name",
    "Score": "Drug Docking Score",
    "Drug_Ires": "Drug Ires",
    "Protein_Ires": "Protein Ires",
    "Log2OR": "Log2OddsRatio",
    "LowerCI": "Lower 95% CI",
    "UpperCI": "Upper 95% CI",
})

In [78]:
# Compress the residue lists with zip_res_range.
# "Protein Ires" values that are not strings are passed through unchanged.
drugs["Drug Ires"] = drugs["Drug Ires"].map(zip_res_range)
drugs["Protein Ires"] = drugs["Protein Ires"].map(
    lambda residues: zip_res_range(residues) if type(residues) is str else residues
)

In [79]:
# Largest odds ratio first; ties are ordered by "Protein Ires" ascending
drugs = drugs.sort_values(
    by=["Log2OddsRatio", "Protein Ires"],
    ascending=[False, True],
)

In [81]:
# Table 10 as tab-separated text, without the row index
drugs.to_csv(
    "{0}/Supplemental_Table_10.txt".format(table_dir),
    sep="\t",
    index=None,
)

# There are some odd characters here, so this one doesn't convert
# successfully. Need to do manually by copy / pasting into excel
#drugs.to_excel("{0}/Supplemental_Table_10.xlsx".format(table_dir), engine="xlsxwriter")