In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("Prep_Downloads")

import requests
import pandas as pd
pd.options.display.max_columns = 999

import numpy as np
import helper as my

import glob
import os
import sys

from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from collections import defaultdict
import math

In [2]:
from mjm_tools import zip_res_range, unzip_res_range

# Paramaters for file locations

In [4]:
# Base directory for whole project
base_dir = "/home/sdw95/3D_SARS2"

# Output directory for finished tables
downloads_dir = "{0}/Downloads".format(base_dir)

if(not os.path.exists(downloads_dir)):
    os.mkdir(downloads_dir)

# Interface

In [6]:
os.system("mkdir -p {0}/Interface".format(downloads_dir))

0

# 1.1 Interface Annotations

In [8]:
ires_df = pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")
ires_df = ires_df.sort_values(["P1", "P2"]).drop_duplicates(["P1", "P2"])

proteins = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t")

ires_df = ires_df.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P1", how="left", rsuffix=" Viral")
ires_df = ires_df.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P2", how="left", rsuffix=" Human")

ires_df = ires_df[["Gene Name", "UniProt", "Gene Name Human", "UniProt Human", "Source", "P1_Len", "P1_N_Ires", "P1_Ires", "P2_Len", "P2_N_Ires", "P2_Ires"]]
ires_df.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "Source", "Len Viral", "N Ires Viral", "Ires Viral", "Len Human", "N Ires Human", "Ires Human"]

ires_df["Ires Viral"] = ires_df["Ires Viral"].map(lambda x: zip_res_range(x) if type(x) == str else "[]")
ires_df["Ires Human"] = ires_df["Ires Human"].map(lambda x: zip_res_range(x) if type(x) == str else "[]")

ires_df.sort_values(["Gene Viral", "Gene Human"]).to_csv("{0}/Interface/Interface_annotation_2021_05.txt".format(downloads_dir), sep="\t", index=None)

# 2.1 ECLAIR Summary

In [9]:
ires_df = pd.read_csv("{0}/Data/Interface_Summary.txt".format(base_dir), sep="\t")
ires_df = ires_df[ires_df["Source"] == "ECLAIR"]

proteins = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t")

ires_df = ires_df.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P1", how="left", rsuffix=" Viral")
ires_df = ires_df.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P2", how="left", rsuffix=" Human")

ires_df = ires_df[["Gene Name", "UniProt", "Gene Name Human", "UniProt Human", "P1_Len", "P1_N_Ires", "P1_Ires", "P2_Len", "P2_N_Ires", "P2_Ires"]]
ires_df.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "Len Viral", "N Ires Viral", "Ires Viral", "Len Human", "N Ires Human", "Ires Human"]

ires_df["Ires Viral"] = ires_df["Ires Viral"].map(lambda x: zip_res_range(x) if type(x) == str else "[]")
ires_df["Ires Human"] = ires_df["Ires Human"].map(lambda x: zip_res_range(x) if type(x) == str else "[]")

ires_df.sort_values(["Gene Viral", "Gene Human"]).to_csv("{0}/Interface/ECLAIR_summary_2021_05.txt".format(downloads_dir), sep="\t", index=None)

# 2.2 ECLAIR Predictions

In [11]:
os.system("mkdir -p {0}/Interface/Eclair_Predictions_2021_05".format(downloads_dir))

0

In [15]:
for f in tqdm_notebook(glob.glob("{0}/Data/Eclair_Predictions/*".format(base_dir))):
    data = pd.read_csv(f, sep="\t")
    
    data = data.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P1", how="left", rsuffix=" Viral")
    data = data.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P2", how="left", rsuffix=" Human")
    
    data = data[["Gene Name", "UniProt", "Gene Name Human", "UniProt Human"] + list(data)[2:-4]]
    data.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "Prot", "Pos", "Res", "Clf1", "Clf2", "Clf3", "Clf4", "Clf5", "Clf6", "Clf7", "Clf8", "TopClf", "Pred", "Tier"]
    
    gene1 = data["Gene Viral"].values[0]
    if(pd.isnull(gene1)):
        gene1 = data["UniProt Viral"].values[0]
    gene2 = data["Gene Human"].values[0]
    if(pd.isnull(gene2)):
        gene2 = data["UniProt Human"].values[0]
    data.to_csv("{0}/Interface/Eclair_Predictions_2021_05/{1}_{2}_Eclair_Preds.txt".format(downloads_dir, gene1, gene2), sep="\t", index=None)

HBox(children=(IntProgress(value=0, max=332), HTML(value=u'')))




In [16]:
start_dir = os.getcwd()
os.chdir("{0}/Interface/".format(downloads_dir))
my.call("zip -r Eclair_Predictions_2021_05.zip Eclair_Predictions_2021_05")
os.chdir(start_dir)

# 3.1 Docking Summary

In [17]:
docking = pd.read_csv("{0}/Data/Docking_Summary.txt".format(base_dir), sep="\t")
docking = docking[docking["Rank"] == 1]

proteins = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t")

docking = docking.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P1", how="left", rsuffix=" Viral")
docking = docking.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P2", how="left", rsuffix=" Human")

docking = docking[["Gene Name", "UniProt", "Gene Name Human", "UniProt Human", "Attempt", "Score", "P1_Ires", "P2_Ires"]]
docking.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "Attempt", "Score", "Ires Viral", "Ires Human"]

docking["Ires Viral"] = docking["Ires Viral"].map(lambda x: zip_res_range(x) if type(x) == str else "[]")
docking["Ires Human"] = docking["Ires Human"].map(lambda x: zip_res_range(x) if type(x) == str else "[]")

docking.sort_values(["Gene Viral", "Gene Human"]).to_csv("{0}/Interface/Protein_docking_summary_20202_07.txt".format(downloads_dir), sep="\t", index=None)

# 3.2 / 3.3 Top Docks / Other Docks

In [18]:
os.system("mkdir -p {0}/Interface/Top_ranked_docks_2021_05".format(downloads_dir))
os.system("mkdir -p {0}/Interface/Other_docks_2021_05".format(downloads_dir))

0

In [30]:
docking = pd.read_csv("{0}/Data/Docking_Summary.txt".format(base_dir), sep="\t")

proteins = pd.read_csv("{0}/Data/Proteins.txt".format(base_dir), sep="\t")

docking = docking.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P1", how="left", rsuffix=" Viral")
docking = docking.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="P2", how="left", rsuffix=" Human")

docking = docking[["Gene Name", "UniProt", "Gene Name Human", "UniProt Human", "Attempt", "File", "Rank", "Score", "P1_Ires", "P2_Ires"]]
docking.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "Attempt", "File", "Rank", "Score", "Ires Viral", "Ires Human"]

docking["Ires Viral"] = docking["Ires Viral"].map(lambda x: zip_res_range(x) if type(x) == str else "[]")
docking["Ires Human"] = docking["Ires Human"].map(lambda x: zip_res_range(x) if type(x) == str else "[]")

docking.sort_values(["Gene Viral", "Gene Human", "Rank"]).to_csv("{0}/Interface/Other_docks_2021_05/Protein_docking_summary_full_2021_05.txt".format(downloads_dir), sep="\t", index=None)

In [31]:
files = []
for g1, uni1, g2, uni2, attempt, f, rank in tqdm_notebook(docking[["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "Attempt", "File", "Rank"]].values):
    if(pd.isnull(g1)):
        g1 = uni1
    if(pd.isnull(g2)):
        g2 = uni2
    files.append("{1}_{2}/{1}_{2}_Dock_{3:02}.pdb".format(f, g1, g2, attempt))
    
    
    my.call("mkdir -p {0}/Interface/Other_docks_2021_05/{1}_{2}".format(downloads_dir, g1, g2))
    o = os.system("cp {0} {1}/Interface/Other_docks_2021_05/{2}_{3}/{2}_{3}_Dock_{4:02}.pdb".format(f, downloads_dir, g1, g2, attempt))
    if(o != 0):
        print "ERROR"
        break
    
    if(rank == 1):
        o = os.system("cp {0} {1}/Interface/Top_ranked_docks_2021_05/{2}_{3}_top_dock.pdb".format(f, downloads_dir, g1, g2))

HBox(children=(IntProgress(value=0, max=27600), HTML(value=u'')))




In [32]:
docking["File"] = files
docking.sort_values(["Gene Viral", "Gene Human", "Rank"]).to_csv("{0}/Interface/Other_docks_2021_05/Protein_docking_summary_full_2021_05.txt".format(downloads_dir), sep="\t", index=None)

In [33]:
for g in docking.groupby(["UniProt Human"]):
    base = g[1]["File"].map(lambda x: x.split("/")[0]).values[0]
    g[1]["File"] = g[1]["File"].map(lambda x: x.split("/")[1])
    
    g[1].sort_values(["Gene Viral", "Gene Human", "Rank"]).to_csv("{0}/Interface/Other_docks_2021_05/{1}/{1}_docking_summary.txt".format(downloads_dir, base), sep="\t", index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [34]:
start_dir = os.getcwd()
os.chdir("{0}/Interface/".format(downloads_dir))
my.call("zip -r Top_ranked_docks_2021_05.zip Top_ranked_docks_2021_05")
my.call("zip -r Other_docks_2021_05.zip Other_docks_2021_05")
os.chdir(start_dir)

# Structures

In [35]:
os.system("mkdir -p {0}/Structures".format(downloads_dir))

0

# 1.1 Viral Homology Models Summary

In [None]:
# Now Obsolete?

In [50]:
# 1.1 Viral Structures Summary

In [51]:
uni2seq = proteins.set_index("ID")["Sequence"].to_dict()

In [55]:
structures = pd.read_csv("{0}/Data/Models.txt".format(base_dir), sep="\t")
structures["Coverage"] = structures[["ID", "Resi_Covered"]].apply(lambda x: len(unzip_res_range(x[1])) / float(len(uni2seq[x[0]])), axis=1)
structures["Residues Covered"] = structures["Resi_Covered"]
structures = structures[structures["ID"].map(lambda x: "COVID" in x)]
structures = structures[[x for x in structures if not x == "Resi_Covered" and not x == "PDB_File"]]

structures = structures.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="ID")
structures = structures[["Gene Name", "UniProt"] + list(structures)[1:-2]]

structures = structures.drop("ModBase_ID", axis=1)

In [57]:
structures.sort_values("Gene Name").to_csv("{0}/Structures/SARS_CoV_2_structure_summary_2021_05.txt".format(downloads_dir), sep="\t", index=None)

# 1.2 Human Structures Summary

In [58]:
uni2seq = proteins.set_index("ID")["Sequence"].to_dict()

In [59]:
structures = pd.read_csv("{0}/Data/Models.txt".format(base_dir), sep="\t")
structures["Coverage"] = structures[["ID", "Resi_Covered"]].apply(lambda x: len(unzip_res_range(x[1])) / float(len(uni2seq[x[0]])), axis=1)
structures["Residues Covered"] = structures["Resi_Covered"]
structures = structures[structures["ID"].map(lambda x: not "COVID" in x)]
structures = structures[[x for x in structures if not x == "Resi_Covered" and not x == "PDB_File"]]

structures = structures.join(proteins.set_index("ID")[["UniProt", "Gene Name"]], on="ID")
structures = structures[["Gene Name", "UniProt"] + list(structures)[1:-2]]


In [60]:
structures.sort_values("Gene Name").to_csv("{0}/Structures/Human_structure_summary_2021_05.txt".format(downloads_dir), sep="\t", index=None)

# 1.3 Viral Structures

In [61]:
os.system("mkdir -p {0}/Structures/SARS_CoV_2_proteins_2021_05".format(downloads_dir))

0

In [62]:
for f in glob.glob("{0}/Data/Undocked_Structures/*".format(base_dir)):
    if("COVID19" in f):
        if("5C145A" in f):
            continue
        os.system("cp {0} {1}/Structures/SARS_CoV_2_proteins_2021_05/{2}.pdb".format(f, downloads_dir, f.split("COVID19")[1].upper().split(".")[0].replace("Spike", "S")))

In [63]:
start_dir = os.getcwd()
os.chdir("{0}/Structures/".format(downloads_dir))
my.call("zip -r SARS_CoV_2_proteins_2021_05.zip SARS_CoV_2_proteins_2021_05")
os.chdir(start_dir)

# 1.4 Human Structures

In [47]:
os.system("mkdir -p {0}/Structures/Human_proteins_2021_05".format(downloads_dir))

0

In [49]:
for f in glob.glob("{0}/Data/Undocked_Structures/*".format(base_dir)):
    if(not "COVID19" in f):
        g1 = os.path.basename(f).split("_")[0]
        g2 = proteins[proteins["ID"] == g1]["Gene Name"].values[0]
        if(not str(g2).lower() == "nan"):
            g1 = g2
        o = os.system("cp {0} {1}/Structures/Human_proteins_2021_05/{2}.pdb".format(f, downloads_dir, g1))
        if(o != 0):
            print "Error"

In [64]:
start_dir = os.getcwd()
os.chdir("{0}/Structures/".format(downloads_dir))
my.call("zip -r Human_proteins_2021_05.zip Human_proteins_2021_05")
os.chdir(start_dir)

# 2.1 Drug Top-Docks

In [65]:
os.system("mkdir -p {0}/Structures/Top_ranked_docked_ligands_2021_05".format(downloads_dir))

0

In [66]:
for f in glob.glob("{0}/Data/Docked_Ligands/Ligand_plus_Structures/*".format(base_dir)):
    g1, drug = os.path.basename(f).split("_")[:2]
    g2 = proteins[proteins["ID"] == g1]["Gene Name"].values[0]
    if(not str(g2).lower() == "nan"):
        g1 = g2
    os.system("cp {0} {1}/Structures/Top_ranked_docked_ligands_2021_05/{2}_{3}_top_dock.pdb".format(f, downloads_dir, g1, drug))

In [67]:
start_dir = os.getcwd()
os.chdir("{0}/Structures/".format(downloads_dir))
my.call("zip -r Top_ranked_docked_ligands_2021_05.zip Top_ranked_docked_ligands_2021_05")
os.chdir(start_dir)

# 2.2 Drug Other Docks

In [68]:
os.system("mkdir -p {0}/Structures/Top_10_docked_ligands_2021_05".format(downloads_dir))

0

In [69]:
for f in glob.glob("{0}/Data/Docked_Ligands/*.pdb".format(base_dir)):
    g1, drug = os.path.basename(f).split("_")[:2]
    g2 = proteins[proteins["ID"] == g1]["Gene Name"].values[0]
    if(not str(g2).lower() == "nan"):
        g1 = g2
    
    lines = my.easyReadLines(f)
    try:
        stop_index = [i for i, l in enumerate(lines) if "ENDMDL" in l][9]
    except:
        continue
    my.easyWriteLines("{0}/Structures/Top_10_docked_ligands_2021_05/{1}_{2}_top_10_dock.pdb".format(downloads_dir, g1, drug), lines[:stop_index + 1])

In [70]:
start_dir = os.getcwd()
os.chdir("{0}/Structures/".format(downloads_dir))
my.call("zip -r Top_10_docked_ligands_2021_05.zip Top_10_docked_ligands_2021_05")
os.chdir(start_dir)

# 2.3 Drug Ires Summary

In [71]:
ires_df = pd.read_csv("{0}/Data/Drug_Docking_Ires_Summary.txt".format(base_dir), sep="\t")
ires_df = ires_df.join(proteins.set_index("ID")[["Gene Name", "UniProt"]], on="Human_Protein")
ires_df = ires_df[["Gene Name", "UniProt", "Compound_Name", "Rank", "Score", "Ires"]]
ires_df.columns = ["Gene Name", "UniProt", "Compound Name", "Rank", "Score", "Ires"]
ires_df.sort_values(["Gene Name", "Compound Name", "Rank"]).to_csv("{0}/Structures/Drug_docking_ires_summary_2021_05.txt".format(downloads_dir), sep="\t", index=None)

# 2.4 Drug Interface Enrichment Summary

In [72]:
ires_df = pd.read_csv("{0}/Data/Drug_Interface_Enrichmet.txt".format(base_dir), sep="\t")
ires_df = ires_df.join(proteins.set_index("ID")[["Gene Name", "UniProt"]], on="Viral_Interactor")
ires_df = ires_df.join(proteins.set_index("ID")[["Gene Name", "UniProt"]], on="Human_Protein", rsuffix=" Human")
ires_df = ires_df[["Gene Name Human", "UniProt Human", "Compound_Name", "Gene Name", "UniProt", "Docking_Rank", "Score", "Drug_Ires", "Protein_Ires", "Log2OR", "LowerCI", "UpperCI", "p-value"]]
ires_df.columns = ["Gene Human", "UniProt Human", "Compound Name", "Gene Viral", "UniProt Viral", "Rank", "Score", "Drug Ires", "Protein Ires", "Log2OR", "LowerCI", "UpperCI", "p-value"]
ires_df.sort_values(["Gene Human", "Compound Name", "Rank"]).to_csv("{0}/Structures/Drug_docking_interface_enrichment_2021_05.txt".format(downloads_dir), sep="\t", index=None)

# Mutations

In [73]:
os.system("mkdir -p {0}/Mutations".format(downloads_dir))

0

# 1.1 SARS Muts

In [74]:
sars_muts = pd.read_csv("{0}/Tables/Supplemental_Table_4.txt".format(base_dir), sep="\t")
sars_muts.to_csv("{0}/Mutations/SARS_CoV_2_mutations_2021_05.txt".format(downloads_dir), sep="\t", index=None)

# 1.2 SARS Enrichment

In [75]:
sars_enrich = pd.read_csv("{0}/Tables/Supplemental_Table_5.txt".format(base_dir), sep="\t")
sars_enrich = sars_enrich[["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "LogOdds Viral", "CI_Low Viral", "CI_Up Viral", "P-value Viral", "Ires_NoVar Viral", "Ires_Var Viral", "NoIres_Var Viral", "NoIres_NoVar Viral"]]
sars_enrich.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "Log2OR", "CI_Low", "CI_Up", "p-value", "Ires_NoVar", "Ires_Var", "NoIres_Var", "NoIres_NoVar"]

sars_enrich.to_csv("{0}/Mutations/SARS_CoV_2_mutation_enrichment_2021_05".format(downloads_dir))

# 2.1 Human Muts

In [76]:
sars_muts = pd.read_csv("{0}/Tables/Supplemental_Table_3.txt".format(base_dir), sep="\t")
sars_muts.to_csv("{0}/Mutations/Human_population_variants_2021_05.txt".format(downloads_dir), sep="\t", index=None)

  interactivity=interactivity, compiler=compiler, result=result)


# 2.2 Human Enrichment

In [77]:
sars_enrich = pd.read_csv("{0}/Tables/Supplemental_Table_5.txt".format(base_dir), sep="\t")
sars_enrich = sars_enrich[["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "LogOdds Human", "CI_Low Human", "CI_Up Human", "P-value Human", "Ires_NoVar Human", "Ires_Var Human", "NoIres_Var Human", "NoIres_NoVar Human"]]
sars_enrich.columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human", "Log2OR", "CI_Low", "CI_Up", "p-value", "Ires_NoVar", "Ires_Var", "NoIres_Var", "NoIres_NoVar"]

sars_enrich.to_csv("{0}/Mutations/Human_population_variants_enrichment_2021_05".format(downloads_dir))

# ddG

In [78]:
os.system("mkdir -p {0}/ddG".format(downloads_dir))

0

# 1.1 ddG Scanning Mutagenesis

In [79]:
os.system("mkdir -p {0}/ddG/ddG_scanning_mutagenesis_2021_05".format(downloads_dir))

0

In [80]:
for f in glob.glob("{0}/Data/ddG_Single_Mutants/Summaries/*Mean.txt".format(base_dir)):
    idA, uniB = os.path.basename(f).split("_")[:2]
    uniA = proteins[proteins["ID"] == idA]["UniProt"].values[0]
    geneA = proteins[proteins["ID"] == idA]["Gene Name"].values[0]
    geneB = proteins[proteins["ID"] == uniB]["Gene Name"].values[0]

    tmp = pd.read_csv(f, sep="\t")
    tmp2 = []
    for row in tmp.values:
        row = dict(zip(list(tmp), row))
        for col in row:
            if("ddG_{0}".format(row["Ref"]) in col):
                row[col] = np.nan
        if(row["Chain"] == "A"):
            row["Chain"] = 0
        else:
            row["Chain"] = 1
        tmp2.append([geneA, uniA, geneB, uniB] + [row[c] for c in tmp])
    tmp2 = pd.DataFrame(tmp2, columns = ["Gene Viral", "UniProt Viral", "Gene Human", "UniProt Human"] + list(tmp))
    
    a = geneA
    b = geneB
    if(str(b).lower() == "nan"):
        b = uniB
    tmp2.to_csv("{0}/ddG/ddG_scanning_mutagenesis_2021_05/{1}_{2}_scanning_mutagenesis.txt".format(downloads_dir, a, b), sep="\t", index=None)

In [81]:
start_dir = os.getcwd()
os.chdir("{0}/ddG/".format(downloads_dir))
my.call("zip -r ddG_scanning_mutagenesis_2021_05.zip ddG_scanning_mutagenesis_2021_05")
os.chdir(start_dir)

# 1.2 SARS-CoV SARS-CoV-2 ddG

In [82]:
tmp = pd.read_csv("{0}/Tables/Supplemental_Table_7.txt".format(base_dir), sep="\t")
tmp.to_csv("{0}/ddG/ddG_summary_SARS_CoV_vs_SARS_CoV_2_2021_05".format(downloads_dir), sep="\t", index=None)