In [1]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("03_Generate_Proteins_Summary")

# NOTE(review): the star import hides where names come from; resource_dir,
# input_dir and output_dir used by later cells are presumably defined in
# config -- confirm
from config import *
from helper_functions import fasta2dict, NWSeqAlignment, alignPrint, batchUniProtAPI

import pandas as pd
# Raise the column display limit so wide frames are not truncated when shown
pd.options.display.max_columns = 999

# NOTE(review): tqdm_notebook is imported twice; the second import (from the
# private tqdm._tqdm_notebook module) rebinds the name bound by the first
from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

# Presumably registers tqdm's progress_* methods on pandas objects -- TODO
# confirm against the installed tqdm version
tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

from collections import defaultdict

This notebook generates a summary of all of the proteins included in the input interaction set and outputs it as Proteins.txt. The primary purpose of this is just to have convenient access to gene names / sequences / lengths for all human UniProt / SARS-CoV-2 protein entries for later analyses and result formatting.

- Inputs:
  - Interactions.txt
  - COVID19_Interactome.txt


- Static Resource Dependencies:
  - uniprot_info.txt
  - pfam_domains.txt
  - uniprot_covid_19.fasta


- Outputs:
  - Proteins.txt
  - Protein_Domains.txt


- Dependencies:
  - Should be run AFTER all interactions have been fed through the ECLAIR pipeline for interface prediction
    - **NOTE:** The ECLAIR pipeline is not included in this repository; this notebook treats any output from that pipeline as a static result that is already available

# Fetch Info for Human / Viral Proteins

In [2]:
# NOTE: This script is based on some hardcoded local resources
#       that are not updated by the code provided here. Specifically
#       the "uniprot_info.txt" is generated during the steps of our
#       ECLAIR pipeline.
#       
#       Any such resources / pre-requisites that cannot be reconstructed
#       from scratch from this repository are included in the "static_resources"
#       directory. Inquiries for reconstructing or updating any of these files
#       can be addressed to Shayne Wierbowski (sdw95@cornell.edu). If there is
#       sufficient interest, this code can be made available as a separate repository,
#       but is currently highly integrated into our local servers and difficult to
#       compartmentalize for public use.

In [3]:
# UniProt info generated in ECLAIR
uniprot_info = pd.read_csv("{0}/uniprot_info.txt".format(resource_dir), sep="\t")

# Original interaction list from the Krogan paper (Gordon et al. Nature 2020, Supplementary Table 2)
# Available at - https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2286-9/MediaObjects/41586_2020_2286_MOESM6_ESM.xlsx
#
# NOTE: This table is really just used to conveniently grab human gene symbols, and (this format) is not
#       technically necessary if this code were modified for other interaction sets
interactions = pd.read_csv("{0}/COVID19_Interactome.txt".format(input_dir), sep="\t")
id2gene = interactions.set_index("Preys")["PreyGene"].to_dict() # Map human UniProt ID to preferred gene name

# Generate the full set of identifiers submitted to ECLAIR
#
# NOTE: This is the primary input that matters (tab separated list of interactions based on UniProt ID)
#       (Although since UniProt IDs were not available for SARS-CoV-2 proteins at the onset of this project
#        we've used a custom set of names throughout)
interactions2 = pd.read_csv("{0}/Interactions.txt".format(input_dir), sep="\t")
all_ids = set(interactions2["P1"].to_list() + interactions2["P2"].to_list())

# Pull out / reformat the rows and columns of the UniProt info we care about.
# A single .loc selection followed by .copy() gives an independent frame, so the
# column assignments below do not raise SettingWithCopyWarning.
summary_cols = ["id", "reviewed", "genes", "protein names", "length", "sequence"]
protein_summary = uniprot_info.loc[uniprot_info["id"].isin(all_ids), summary_cols].copy()

# Our custom SARS-CoV-2 identifiers are recognised by the "COVID" substring
protein_summary["Is_Viral"] = protein_summary["id"].map(lambda x: "COVID" in x).astype(bool)

# An entry counts as reviewed only if UniProt marks it "reviewed" AND it is not viral
protein_summary["reviewed"] = (protein_summary["reviewed"] == "reviewed") & ~protein_summary["Is_Viral"]

# Prefer the gene symbol from the interactome table; fall back to the UniProt
# "genes" value for IDs that table does not cover
protein_summary["genes"] = [
    id2gene.get(uid, gene)
    for uid, gene in zip(protein_summary["id"], protein_summary["genes"])
]

In [4]:
# Write the protein summary to Proteins.txt, sorted by Is_Viral and then by
# gene name. Only a subset of the columns is exported ("reviewed" and
# "protein names" are left out), under the header names given below.
proteins_path = "{0}/Proteins.txt".format(output_dir)
export_columns = ["id", "Is_Viral", "genes", "length", "sequence"]
export_header = ["ID", "Is_Viral", "Gene_Name", "Length", "Sequence"]

summary_sorted = protein_summary.sort_values(["Is_Viral", "genes"])
summary_sorted[export_columns].to_csv(proteins_path, sep="\t", header=export_header, index=None)

# Fetch Human UniProt Domain Annotations

In [5]:
# NOTE: pfam_domain data also pulled from ECLAIR pipeline rather than generated from scratch
#       This data is only for internal use and is intended as part of the web display. It should not
#       be relevant to the main pipeline.

In [6]:
# Read ECLAIR domain info (the file has no header row, so column names are supplied here)
pfam_doms = pd.read_csv("{0}/pfam_domains.txt".format(resource_dir), names=["ID", "Is_Domain"], sep="\t")

# Keep only proteins that are part of the interaction set. The explicit .copy()
# makes the filtered frame independent of the one read from disk, so adding a
# column below does not raise SettingWithCopyWarning.
pfam_doms = pfam_doms[pfam_doms["ID"].isin(all_ids)].copy()

# Our custom SARS-CoV-2 identifiers are recognised by the "COVID" substring
pfam_doms["Is_Viral"] = pfam_doms["ID"].map(lambda x: "COVID" in x)

# Save, sorted by Is_Viral and then by ID
pfam_doms.sort_values(["Is_Viral", "ID"])[["ID", "Is_Viral", "Is_Domain"]].to_csv("{0}/Protein_Domains.txt".format(output_dir), sep="\t", index=None)

# Add COVID UniProt IDs to Protein Info where Available

In [7]:
# Load the protein summary back from the Proteins.txt file in the output directory
proteins = pd.read_csv("{0}/Proteins.txt".format(output_dir), sep="\t")

# Parse the local COVID-19 UniProt fasta. This was just a download through the
# UniProt COVID19 resource (https://covid-19.uniprot.org/uniprotkb?query=*)
covid_fasta = fasta2dict("{0}/uniprot_covid_19.fasta".format(resource_dir))

# The fasta includes human / SARS1 entries as well. Keep only the SARS2 entries
# (header contains "OX=2697049") and re-key them by the second "|"-separated
# field of the header.
sars2_entries = {}
for header, seq in covid_fasta.items():
    if "OX=2697049" not in header:
        continue
    sars2_entries[header.split("|")[1]] = seq
covid_fasta = sars2_entries

In [13]:
# For each UniProt SARS-CoV-2 entry, select the best matched SARS-CoV-2
# protein from our own set of IDs.
#
# covid2best maps UniProt ID -> [our ID, alignment]. The default entry has a
# "Pident" of 0, so any alignment with a positive "Pident" replaces it.
covid2best = defaultdict(lambda: ["None", {"Pident":0}])

# Iterate over our IDs
for uniA, seqA in tqdm_notebook(proteins[["ID", "Sequence"]].values):
    # Skip the human proteins
    if "COVID19" not in uniA:
        continue

    # Iterate over the UniProt IDs
    for uniB, seqB in covid_fasta.items():
        # Manually selected which UniProt entry should be used for orf8 (based on
        # visual inspection of alignments).
        # NOTE: our IDs carry the "COVID19" prefix (anything without it was skipped
        #       above), so the comparison must be against "COVID19orf8"; a bare
        #       "orf8" can never match and would leave this override dead.
        if uniA == "COVID19orf8" and uniB != "P0DTC8":
            continue

        # Generate the alignment; keep this entry if it is the best so far.
        # The comparison is strict, so on a tie the first entry seen is kept.
        align = NWSeqAlignment(seqB, seqA)
        if align["Pident"] > covid2best[uniB][1]["Pident"]:
            covid2best[uniB] = [uniA, align]

HBox(children=(IntProgress(value=0, max=359), HTML(value=u'')))

333/|/ 93%|| 333/359 [00:20<00:01, 15.95it/s]


In [17]:
# Now from these select the best UniProt ID to match up with our IDs

# Black list to exclude certain UniProt IDs from the mapping
# - P0DTD8 - orf7b (not included in our set)
# - P0DTC1 and P0DTD1 - Describe the uncleaved replicase proteins (we have the cleaved proteins in our dataset)
blacklist = ["P0DTD8", "P0DTC1", "P0DTD1"]

# Maps our ID -> UniProt ID; our IDs with no match resolve to "" (defaultdict(str))
krogan2uni = defaultdict(str)
for uni, v in covid2best.items():
    # uni is the UniProt ID; uniB is the best matched ID from our set
    uniB, align = v

    # Print the number of non-identical positions in the alignment
    # (characters of the alignment string other than "|").
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print("{0} {1} {2}".format(uni, uniB, len(align["Alignment"].replace("|", ""))))
    if uni not in blacklist:
        krogan2uni[uniB] = uni

        # If the alignment isn't perfect take a look at it
        if align["Pident"] < 1:
            print(uni)
            # Use the alignPrint imported from helper_functions; the previous
            # "my.alignPrint" relied on a "my" module this notebook never imports
            alignPrint(align, name1=uni, name2=uniB)

# Human entries already use UniProt IDs; viral entries are translated through the mapping
proteins["UniProt"] = proteins["ID"].map(lambda x: krogan2uni[x] if "COVID" in x else x)

P0DTC7 COVID19orf7a 0
A0A663DJA2 COVID19orf10 0
P0DTC6 COVID19orf6 0
P0DTC9 COVID19N 0
P0DTD2 COVID19orf9b 0
P0DTD3 COVID19orf9c 0
P0DTC5 COVID19M 0
P0DTC4 COVID19E 0
P0DTC3 COVID19orf3a 0
P0DTC2 COVID19Spike 0
P0DTC1 COVID19nsp1 4225
P0DTD8 COVID19Spike 1252
P0DTD1 COVID19nsp1 6916
P0DTC8 COVID19orf8 1
P0DTC8
P0DTC8:         1 MKFLVFLGII TTVAAFHQEC SLQSCTQHQP YVVDDPCPIH FYSKWYIRVG ARKSAPLIEL 60  
                  |||||||||| |||||||||| |||||||||| |||||||||| |||||||||| ||||||||||     
COVID19orf8:    1 MKFLVFLGII TTVAAFHQEC SLQSCTQHQP YVVDDPCPIH FYSKWYIRVG ARKSAPLIEL 60  

P0DTC8:        61 CVDEAGSKSP IQYIDIGNYT VSCLPFTINC QEPKLGSLVV RCSFYEDFLE YHDVRVVLDF 120 
                  |||||||||| |||||||||| |||-|||||| |||||||||| |||||||||| ||||||||||     
COVID19orf8:   61 CVDEAGSKSP IQYIDIGNYT VSCSPFTINC QEPKLGSLVV RCSFYEDFLE YHDVRVVLDF 120 

P0DTC8:       121 I 121 
                  |     
COVID19orf8:  121 I 121 



In [21]:
# Look up gene names for every value in the UniProt column.
# NOTE(review): zipping the query list with the result assumes batchUniProtAPI
#               returns one result per input, in input order -- confirm
uniprot_ids = proteins["UniProt"].to_list()
uniprot2gene = dict(zip(uniprot_ids, batchUniProtAPI(uniprot_ids, source_id="ACC", target_id="GENENAME")))

def _gene_name_for(row):
    """Return the gene name for one row holding the "ID" and "UniProt" fields.

    Non-viral rows use the name looked up for their UniProt ID. Viral rows
    (ID contains "COVID") derive it from our custom ID instead: the "COVID19"
    prefix and any "C145A" tag are removed, "orf9c" becomes "orf14", "Spike"
    becomes "S", and the result is upper-cased.
    """
    our_id = row["ID"]
    if "COVID" in our_id:
        return our_id.replace("COVID19", "").replace("orf9c", "orf14").replace("Spike", "S").replace("C145A", "").upper()
    return uniprot2gene[row["UniProt"]]

proteins["Gene Name"] = proteins[["ID", "UniProt"]].apply(_gene_name_for, axis=1)

In [26]:
# Write the final table (now including the UniProt and "Gene Name" columns)
# to Proteins.txt in the output directory
final_path = "{0}/Proteins.txt".format(output_dir)
proteins.to_csv(final_path, sep="\t", index=None)