In [3]:
# Set the process name to be human readable in htop
import setproctitle
setproctitle.setproctitle("09_Calculate_Interface_Variation_Enrichment")

# Project-level configuration and helpers.
# NOTE(review): `output_dir` and `resource_dir` are used throughout this notebook
# but never defined in it, so they presumably come from this star import -- confirm.
from config import *
from helper_functions import pdb2df, odds_ratio, zip_res_range, unzip_res_range

import pandas as pd
# Show (effectively) all columns when a DataFrame is displayed
pd.options.display.max_columns = 999

import numpy as np

import glob

# Progress bars. tqdm_notebook is imported twice; the second import (from the
# private tqdm._tqdm_notebook module) rebinds the same name.
from tqdm import tqdm, tqdm_notebook
from tqdm._tqdm_notebook import tqdm_notebook

# Register tqdm with pandas.
# NOTE(review): the two registrations below look redundant -- confirm both are needed.
tqdm.pandas(tqdm_notebook)
tqdm_notebook.pandas()

This notebook calculates the Log Odds enrichment for the occurrence of human population variants or SARS-CoV-1 to SARS-CoV-2 sequence divergences along the ECLAIR-predicted or docked interfaces. Output is summarized in Pop_Var_Enrichments.txt and Viral_Mut_Enrichments.txt.

- Inputs:
  - Interface_Summary.txt
  - Pop_Vars.txt
  - Viral_Muts.txt
  - Proteins.txt


- Static Resource Dependencies:
  - H_sapiens_interfacesALL.txt
  - pdbresiduemapping.txt
  - ires_perpdb_alltax.txt


- Outputs:
  - Pop_Var_Enrichments.txt
  - Viral_Mut_Enrichments.txt


- Dependencies:
  - Must be run after 05_Fetch_Population_Variants, 06_Compile_Viral_Mutations, and 08_Fetch_Top_Docks

# Calculate Enrichment on Human Population Variants

In [4]:
# Load the interface residue (IRES) summary and keep one row per (P1, P2) pair.
# Rows are sorted by Source before de-duplication, so when a pair has several
# entries the one whose Source sorts first is the one retained.
interface_summary_file = "{0}/Interface_Summary.txt".format(output_dir)
ires_df = pd.read_csv(interface_summary_file, sep="\t")
ires_df = ires_df.sort_values(["P1", "P2", "Source"])
ires_df = ires_df.drop_duplicates(["P1", "P2"])
# Optional: un-comment one of these to restrict the analysis to a single source
#ires_df = ires_df[ires_df["Source"] == "ECLAIR"]
#ires_df = ires_df[ires_df["Source"] == "Docking"]

# Load the human population variant summary
pop_vars_file = "{0}/Pop_Vars.txt".format(output_dir)
pop_var_df = pd.read_csv(pop_vars_file, sep="\t")

In [5]:
# Convert the gnomAD allele frequency column to floats; the "-" placeholder
# (no reported frequency) becomes NaN.
def _af_to_float(raw_af):
    """Return raw_af as a float, mapping the "-" placeholder to NaN."""
    if raw_af == "-":
        return float(np.nan)
    return float(raw_af)

pop_var_df["gnomAD_AF"] = pop_var_df["gnomAD_AF"].map(_af_to_float)

In [6]:
# Generate Coverage Masks (only applied to docked interface predictions)
#
# NOTE: Assumes only one structure used for each interaction. This is currently
#       a safe assumption because the viral proteins only have one structure selected
#       and each of the human proteins only interact with one viral protein.
# NOTE: The COVID19 N protein technically had two structures with distinct coverage
#       considered, but only one of them was used for docking. So we still haven't
#       violated the assumption.
#
uni2seq = pd.read_csv("{0}/Proteins.txt".format(output_dir), sep="\t").set_index("ID")["Sequence"].to_dict()
uni2coverage_mask = dict()
for uni in tqdm_notebook(uni2seq):
    uni2coverage_mask[uni] = np.zeros(len(uni2seq[uni]))
    
    # Grab the structure used in docking where available
    #
    # These structures are renamed with these formats...
    #
    # ProtID_PDBID_ChainID.pdb   - For PDB Structures
    #
    # or
    #
    # ProtID_ModbaseID.pdb       - For ModBase Homology Models for Human Proteins
    #
    # or
    #
    # ProtID.pdb                 - For COVID19 Homology Models (only applies to nsp14 currently)
    #
    # In the current setup only one structure is used per protein.
    try:
        # Pull out all structure matches for this Uniprot
        uni_f = glob.glob("{0}/Undocked_Structures/{1}[_\.]*".format(output_dir, uni))
        
        # Make sure there are no cases with more than 1 option
        if(len(uni_f) > 1):
            print uni, uni_f
            
            # Special case where two PDB options with distinct coverage were available
            # However, only one structure was used in docking, so we only just manually
            # make sure the right one get used
            if(uni == "COVID19N"):
                uni_f = [x for x in uni_f if "6WZQ" in x]
            
            # If this happens our assumptions have been violated / code needs to be re-worked for
            # interaciton-level coverage (e.g. inter2coverage_mask instead of uni2coverage_mask)
            else:
                raise
        uni_f = uni_f[0]
        #if(not "COVID19" in uni):
        #    uni_f = glob.glob("{0}/Undocked_Structures/COVID19*_{1}_B.pdb".format(output_dir, uni))[0]
        #else:
        #    uni_f = glob.glob("{0}/Undocked_Structures/{1}_*_A.pdb".format(output_dir, uni))[0]
    except IndexError:
        continue
    
    # Read PDB to identified Residue IDs (and correct for 1-index to 0-index offset)
    pdb_df = pdb2df(uni_f)
    resis = list(pdb_df["Residue ID"].unique() - 1)
    resis = [x for x in resis if x >= 0]
    
    # Update coverage mask accordingly
    uni2coverage_mask[uni][resis] = 1

HBox(children=(IntProgress(value=0, max=19), HTML(value=u'')))

  df.header = header
  df.tailer = tailer





In [6]:
# Calculate Odds Ratios
summary = []
uni2iresmask = dict()
uni2popmask = dict()

# Iterate over all interactions
for p1, p2, uni_len, ires, source in ires_df[["P1", "P2", "P2_Len", "P2_Ires", "Source"]].values:
    # Parse Ires
    if(pd.isnull(ires)):
        ires = []
    else:
        ires = [int(x)-1 for x in ires.split(",")]
    
    # Create mask describing Ires Locations
    ires_mask = np.zeros(uni_len)
    ires_mask[ires] = 1
    
    # Fetch locations of population variants on this gene
    # No gnomAD AF Filter applied (but filter condition left in so this can
    # easily be re-run with an AF filter)
    AF_thresh = -0.001 # Set negative for no filtering
    pop_vars = [x-1 for x in pop_var_df[(pop_var_df["UniProt"] == p2)&(pop_var_df["gnomAD_AF"] >= AF_thresh)]["AA_Pos"].unique()]
    
    # Create mask describing Pop Var locations
    pop_mask = np.zeros(uni_len)
    pop_mask[pop_vars] = 1
    
    # Apply coverage mask if working with interface from Docked predictions
    # (Need to adjust the log odds ratio to only consider the portion of the protein
    # covered. The rest of the protein is "non-interface" by default)
    if(source == "Docking"):
        coverage_mask = uni2coverage_mask[p2]
        ires_mask = ires_mask[coverage_mask == True]
        pop_mask = pop_mask[coverage_mask == True]
    
    # Calculate Odds Ratio for Enrichment
    OR, up, low, p = odds_ratio(ires_mask, pop_mask, error="CI", two_sided=False)
    OR, up, low = np.log2([OR, up, low])
    
    # Store these masks for access later
    if(not p2 in uni2iresmask.keys()):
        uni2iresmask[p2] = ires_mask
        uni2popmask[p2] = pop_mask
    # This case should never happen (retained from similarly structured code for viral side)
    # Also provides a secondary check to flag cases where one human protein interacts
    # with two viral proteins
    else:
        print "HERE"
        uni2iresmask[p2] = uni2iresmask[p2] | ires_mask
        uni2popmask[p2] = uni2popmask[p2] | pop_mask
    
    summary.append([p1, p2, OR, low, up, p, sum(ires_mask*(1-pop_mask)), sum(ires_mask*pop_mask), sum((1-ires_mask)*pop_mask), sum((1-ires_mask)*(1-pop_mask))])
summary = pd.DataFrame(summary, columns=["P1", "P2", "LogOdds", "CI_Low", "CI_Up", "P-value", "Ires_NoVar", "Ires_Var", "NoIres_Var", "NoIres_NoVar"])

In [7]:
# Write the per-interaction population variant enrichment table (tab-delimited)
pop_var_enrichment_file = "{0}/Pop_Var_Enrichments.txt".format(output_dir)
summary.to_csv(pop_var_enrichment_file, sep="\t", index=None)

In [8]:
# Reload the saved enrichment table so later cells can work from the file on disk.
# NOTE(review): the pooled enrichment cell that follows still reads uni2iresmask /
#               uni2popmask, which are only built by the odds-ratio loop, so that
#               loop must have been run in this kernel for that cell to work.
pop_var_enrichment_file = "{0}/Pop_Var_Enrichments.txt".format(output_dir)
summary = pd.read_csv(pop_var_enrichment_file, sep="\t")

In [10]:
# Overall Enrichment Across all Genes
ires_mask = np.concatenate([uni2iresmask[uni] for uni in uni2iresmask.keys()])
pop_mask = np.concatenate([uni2popmask[uni] for uni in uni2iresmask.keys()])

OR, up, low, p = odds_ratio(ires_mask, pop_mask, error="SE", two_sided=False, verbose=True)
OR, up, low = np.log2([OR, up, low])
    
print OR, up, low, p

               Case  Non-Case
Exposed        52.0     175.0
Non-Exposed  1531.0    2927.0
-0.815822930782 -0.583464501098 -1.04818136047 0.000223167089529


In [12]:
# Compare against IRES Pop-var enrichment for human-human interfaces (on the same human protein set)

In [13]:
# OPTION 1: Use PDB interface residues and consider ALL positions in the enrichment
#
# NOTE: This approach is consistent with what I did for ECLAIR predicted viral-human interfaces.
#       since the ECLAIR predictions don't inherently limit the score to one structure, there's no reason
#       not to use the full length protein sequence in the enrichment calculation.
#       
#       We don't specifically expect there to be any bias in the true co-crystal structure
#       so we could argue the coverage restriction is not necessary here?

In [14]:
# Interactome INSIDER human interface table. Available for download here...
# http://interactomeinsider.yulab.org/downloads.html
# http://interactomeinsider.yulab.org/downloads/interfacesALL/H_sapiens_interfacesALL.txt
insider_file = "{0}/H_sapiens_interfacesALL.txt".format(resource_dir)
eclair = pd.read_csv(insider_file, sep="\t")

# Keep only the rows whose interface comes from a PDB structure
is_pdb_source = eclair["Source"] == "PDB"
eclair = eclair[is_pdb_source]

In [15]:
# unzip_res_range is already imported from helper_functions at the top of the
# notebook; this cell rebinds it to the mjm_tools implementation. Fall back to the
# helper_functions version when mjm_tools is not installed so the notebook still
# runs top to bottom.
# NOTE(review): assumes both implementations behave the same -- confirm.
try:
    from mjm_tools import unzip_res_range
except ImportError:
    from helper_functions import unzip_res_range

In [16]:
# Build, for every protein of interest that appears in a PDB-derived interface,
#   uni2realiresmask: per-position count of interfaces that include the residue
#   uni2popmask:      0/1 mask of positions carrying a population variant
uni2realiresmask = dict()
uni2popmask = dict()

# Rows where at least one partner is a protein of interest
involves_target = eclair["P1"].isin(uni2seq.keys()) | eclair["P2"].isin(uni2seq.keys())
for p1, p2, source, ires1, ires2 in eclair[involves_target].values:
    # Handle both sides of the interaction with the same logic
    for uni, zipped_ires in ((p1, ires1), (p2, ires2)):
        if(uni not in uni2seq):
            continue
        seq_len = len(uni2seq[uni])

        # Interface residues on this side (1-indexed --> 0-indexed)
        ires_mask = np.zeros(seq_len)
        ires_mask[[int(x)-1 for x in unzip_res_range(zipped_ires)]] = 1

        # Population variant positions on this protein (1-indexed --> 0-indexed)
        var_rows = pop_var_df[(pop_var_df["UniProt"] == uni)&(pop_var_df["gnomAD_AF"] >= -0.001)]
        pop_mask = np.zeros(seq_len)
        pop_mask[[x-1 for x in var_rows["AA_Pos"].unique()]] = 1

        if(uni in uni2realiresmask):
            # Seen before: accumulate interface counts (variant mask is unchanged)
            uni2realiresmask[uni] = uni2realiresmask[uni] + ires_mask
        else:
            uni2realiresmask[uni] = ires_mask
            uni2popmask[uni] = pop_mask

In [18]:
# Overall Enrichment Across all Genes
ires_mask = np.concatenate([uni2realiresmask[uni] for uni in uni2realiresmask.keys()]) >= 1
pop_mask = np.concatenate([uni2popmask[uni] for uni in uni2realiresmask.keys()]) >= 1

OR, up, low, p = odds_ratio(ires_mask, pop_mask, error="SE", two_sided=False, verbose=True)
OR, up, low = np.log2([OR, up, low])
    
print OR, up, low, p

              Case  Non-Case
Exposed       41.0     179.0
Non-Exposed  623.0    1358.0
-1.00208436143 -0.742726673558 -1.26144204929 5.58376029796e-05


In [19]:
# Option 2: Use PDB interface residues, but ONLY consider positions that are covered in the source PDB structures
# 
# NOTE: This is consistent with the calculation I do for docked human-viral interfaces.
#       I also tend to believe this is the more fair approach because we can't
#       definitively label anything outside the co-crystal structure as either interface
#       or non-interface, so it's best to leave un-covered residues out of the calculation.
#       
#       On average the residues excluded from a co-crystal structure are probably less likely
#       to be interface residues, but there could always be edge cases (e.g. only two interacting
#       domains could be crystalized, but there are other interacting regions)

In [20]:
# Read sifts data / filter to only relevant UniProts
#
# NOTE: SIFTS data originally downloaded and parsed from SIFTS
#       based on code originally written by Michael Meyer
#
#       This version of the SIFTS file is subsetted to only include
#       human interactors of SARS-CoV-2
#
#       Unparsed data can be downloaded here...
#
#       ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/split_xml
sifts_file = "{0}/sifts/pdbresiduemapping.txt".format(resource_dir)
sifts = pd.read_csv(sifts_file, sep="\t")
sifts = sifts[sifts["UniProt"].isin(uni2seq.keys())]

# Generate (PDB, Chain) --> Covered Resis dictionary
# (residue ranges are expanded and shifted from 1-indexed to 0-indexed)
mappable_resis = sifts.set_index(["PDB", "Chain"])["MappableResInPDBChainOnUniprotBasis"]
pdb2pos = mappable_resis.map(lambda zipped: [int(r)-1 for r in unzip_res_range(zipped)]).to_dict()

# Read in interface data from whole PDB
# NOTE: this rebinds ires_df, which previously held the Interface_Summary table
pdb_ires_file = "{0}/ires/ires_perpdb_alltax.txt".format(resource_dir)
ires_df = pd.read_csv(pdb_ires_file, sep="\t")

In [21]:
# Build, for every protein of interest that appears in a PDB interface,
#   uni2realiresmask: per-position count of interfaces that include the residue
#   uni2popmask:      0/1 mask of positions carrying a population variant
#   uni2coveragemask: per-position count of structures that cover the residue
uni2realiresmask = dict()
uni2popmask = dict()
uni2coveragemask = dict()

pdb_ires_cols = ["UniProtA", "UniProtB", "PDB", "ChainA", "ChainB", "UniProtIresA", "UniProtIresB"]
# Rows where at least one partner is a protein of interest
involves_target = ires_df["UniProtA"].isin(uni2seq.keys()) | ires_df["UniProtB"].isin(uni2seq.keys())
for p1, p2, pdb, chain1, chain2, ires1, ires2 in ires_df[involves_target][pdb_ires_cols].values:
    # Handle both sides of the interaction with the same logic
    for uni, chain, zipped_ires in ((p1, chain1, ires1), (p2, chain2, ires2)):
        if(uni not in uni2seq):
            continue
        seq_len = len(uni2seq[uni])

        # Interface residues on this side (1-indexed --> 0-indexed)
        ires_mask = np.zeros(seq_len)
        ires_mask[[int(x)-1 for x in unzip_res_range(zipped_ires)]] = 1

        # Population variant positions on this protein (1-indexed --> 0-indexed)
        var_rows = pop_var_df[(pop_var_df["UniProt"] == uni)&(pop_var_df["gnomAD_AF"] >= -0.001)]
        pop_mask = np.zeros(seq_len)
        pop_mask[[x-1 for x in var_rows["AA_Pos"].unique()]] = 1

        # Positions covered by this PDB chain
        coverage_mask = np.zeros(seq_len)
        coverage_mask[pdb2pos[(pdb, chain)]] = 1

        if(uni in uni2realiresmask):
            # Update the Ires and Coverage Masks to consider the aggregate from
            # all interactions involving these proteins
            uni2realiresmask[uni] = uni2realiresmask[uni] + ires_mask
            uni2coveragemask[uni] = uni2coveragemask[uni] + coverage_mask
        else:
            uni2realiresmask[uni] = ires_mask
            uni2popmask[uni] = pop_mask
            uni2coveragemask[uni] = coverage_mask

In [22]:
# Overall Enrichment Across all Genes
ires_mask = np.concatenate([uni2realiresmask[uni][uni2coveragemask[uni] >= 1] for uni in uni2realiresmask.keys()]) >= 1
pop_mask = np.concatenate([uni2popmask[uni][uni2coveragemask[uni] >= 1] for uni in uni2realiresmask.keys()]) >= 1

OR, up, low, p = odds_ratio(ires_mask, pop_mask, error="SE", two_sided=False, verbose=True)
OR, up, low = np.log2([OR, up, low])
    
print OR, up, low, p

              Case  Non-Case
Exposed      117.0     420.0
Non-Exposed  453.0    1287.0
-0.337451699908 -0.167284200871 -0.507619198946 0.0236805807921


In [23]:
# Interactions with P-value <= 0.05, most significant first
is_significant = summary["P-value"] <= 0.05
summary[is_significant].sort_values("P-value")

Unnamed: 0,P1,P2,LogOdds,CI_Low,CI_Up,P-value,Ires_NoVar,Ires_Var,NoIres_Var,NoIres_NoVar


In [25]:
# Check for perfectly enriched or perfectly depleted interfaces (i.e. every interface
# residue has, or every interface residue lacks, a variant).
# A proper log ratio cannot be calculated in these cases, but if they occur they
# would be of interest. Shown once per P2.
has_interface = (summary["Ires_NoVar"] + summary["Ires_Var"]) > 0
log_odds_missing = pd.isnull(summary["LogOdds"])
summary[has_interface & log_odds_missing].drop_duplicates("P2")

Unnamed: 0,P1,P2,LogOdds,CI_Low,CI_Up,P-value,Ires_NoVar,Ires_Var,NoIres_Var,NoIres_NoVar


# Calculate Enrichment on Viral Mutations

In [26]:
# Load the interface residue (IRES) summary and keep one row per (P1, P2) pair.
# Rows are sorted by Source before de-duplication, so when a pair has several
# entries the one whose Source sorts first is the one retained.
interface_summary_file = "{0}/Interface_Summary.txt".format(output_dir)
ires_df = pd.read_csv(interface_summary_file, sep="\t")
ires_df = ires_df.sort_values(["P1", "P2", "Source"])
ires_df = ires_df.drop_duplicates(["P1", "P2"])
# Optional: un-comment one of these to restrict the analysis to a single source
#ires_df = ires_df[ires_df["Source"] == "ECLAIR"]
#ires_df = ires_df[ires_df["Source"] == "Docking"]

# Load the viral mutation summary
viral_muts_file = "{0}/Viral_Muts.txt".format(output_dir)
viral_mut_df = pd.read_csv(viral_muts_file, sep="\t")

In [27]:
# Generate Coverage Masks
uni2seq = pd.read_csv("{0}/Proteins.txt".format(output_dir), sep="\t").set_index("ID")["Sequence"].to_dict()
uni2coverage_mask = dict()
for uni in tqdm_notebook(uni2seq):
    uni2coverage_mask[uni] = np.zeros(len(uni2seq[uni]))
    try:
        # Pull out all structure matches for this Uniprot
        uni_f = glob.glob("{0}/Undocked_Structures/{1}[_\.]*".format(output_dir, uni))
        
        # Make sure there are no cases with more than 1 option
        if(len(uni_f) > 1):
            print uni, uni_f
            
            # Special case where two PDB options with distinct coverage were available
            # However, only one structure was used in docking, so we only just manually
            # make sure the right one get used
            if(uni == "COVID19N"):
                uni_f = [x for x in uni_f if "6WZQ" in x]
            
            # If this happens our assumptions have been violated / code needs to be re-worked for
            # interaciton-level coverage (e.g. inter2coverage_mask instead of uni2coverage_mask)
            else:
                raise
        uni_f = uni_f[0]
    except IndexError:
        #print uni
        continue
    
    pdb_df = pdb2df(uni_f)
    resis = list(pdb_df["Residue ID"].unique() - 1)
    resis = [x for x in resis if x >= 0]
    
    uni2coverage_mask[uni][resis] = 1
    
    if(sum(uni2coverage_mask[uni][resis]) == len(uni2seq[uni]) - 1):
        print uni
        uni2coverage_mask[uni][:] = 1

HBox(children=(IntProgress(value=0, max=19), HTML(value=u'')))

COVID19nsp15
COVID19nsp14



In [28]:
# Calculate Odds Ratios
summary = []
uni2iresmask = dict()
uni2mutmask = dict()
for p1, p2, uni_len, ires, source in ires_df[["P1", "P2", "P1_Len", "P1_Ires", "Source"]].values:
    if(not p1 in viral_mut_df["COVID_ID"].unique()):
        print p1, "Has no detectable muts"
        summary.append([p1, p2, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
        continue
    if(pd.isnull(ires)):
        ires = []
    else:
        ires = [int(x)-1 for x in ires.split(",")]
    
    ires_mask = np.zeros(uni_len)
    ires_mask[ires] = 1
    
    muts = [x-1 for x in viral_mut_df[(viral_mut_df["COVID_ID"] == p1)]["COVID_Pos"].unique()]
    
    mutmask = np.zeros(uni_len)
    mutmask[muts] = 1
    
    if(source == "Docking"):
        coverage_mask = uni2coverage_mask[p1]
        ires_mask = ires_mask[coverage_mask == True]
        mutmask = mutmask[coverage_mask == True]
    
    OR, up, low, p = odds_ratio(ires_mask, mutmask, error="CI", two_sided=False)
    OR, up, low = np.log2([OR, up, low])
    
    if(not source == "Docking" and any(ires_df[ires_df["P1"] == p1]["Source"] == "Docking")):
        coverage_mask = uni2coverage_mask[p1]
        ires_mask = ires_mask[coverage_mask == True]
        mutmask = mutmask[coverage_mask == True]
    
    
    if(not p1 in uni2iresmask.keys()):
        uni2iresmask[p1] = ires_mask
        uni2mutmask[p1] = mutmask
    else:
        #print "HERE"
        uni2iresmask[p1] = uni2iresmask[p1] + ires_mask
        uni2mutmask[p1] = uni2mutmask[p1] + mutmask
    
    summary.append([p1, p2, OR, low, up, p, sum(ires_mask*(1-mutmask)), sum(ires_mask*mutmask), sum((1-ires_mask)*mutmask), sum((1-ires_mask)*(1-mutmask))])
summary = pd.DataFrame(summary, columns=["P1", "P2", "LogOdds", "CI_Low", "CI_Up", "P-value", "Ires_NoVar", "Ires_Var", "NoIres_Var", "NoIres_NoVar"])

In [29]:
# Write the per-interaction viral mutation enrichment table (tab-delimited)
viral_enrichment_file = "{0}/Viral_Mut_Enrichments.txt".format(output_dir)
summary.to_csv(viral_enrichment_file, sep="\t", index=None)

In [30]:
# Reload the saved viral mutation enrichment table from disk
viral_enrichment_file = "{0}/Viral_Mut_Enrichments.txt".format(output_dir)
summary = pd.read_csv(viral_enrichment_file, sep="\t")

In [31]:
# Overall Enrichment Across all Genes
ires_mask = np.concatenate([uni2iresmask[uni]>0 for uni in uni2iresmask.keys()])
pop_mask = np.concatenate([uni2mutmask[uni]>0 for uni in uni2iresmask.keys()])

OR, up, low, p = odds_ratio(ires_mask, pop_mask, error="SE", two_sided=False, verbose=True)
OR, up, low = np.log2([OR, up, low])
    
print OR, up, low, p

              Case  Non-Case
Exposed       42.0     276.0
Non-Exposed  273.0    1584.0
-0.17960755484 0.0773664789461 -0.436581588625 0.242297045824


In [32]:
# Interactions with P-value <= 0.05, most significant first
is_significant = summary["P-value"] <= 0.05
summary[is_significant].sort_values("P-value")

Unnamed: 0,P1,P2,LogOdds,CI_Low,CI_Up,P-value,Ires_NoVar,Ires_Var,NoIres_Var,NoIres_NoVar
4,COVID19nsp15,P62330,1.671989,0.599505,2.744473,0.005169,23.0,8.0,31.0,284.0
5,COVID19nsp2,O14975,-0.834897,-1.537902,-0.131893,0.025383,58.0,16.0,186.0,378.0


In [33]:
# Check for perfectly enriched or perfectly depleted interfaces (i.e. every interface
# residue has, or every interface residue lacks, a variant).
# A proper log ratio cannot be calculated in these cases, but if they occur they
# would be of interest. Shown once per P2.
has_interface = (summary["Ires_NoVar"] + summary["Ires_Var"]) > 0
log_odds_missing = pd.isnull(summary["LogOdds"])
summary[has_interface & log_odds_missing].drop_duplicates("P2")

Unnamed: 0,P1,P2,LogOdds,CI_Low,CI_Up,P-value,Ires_NoVar,Ires_Var,NoIres_Var,NoIres_NoVar
7,COVID19nsp7,P21964,,,,,17.0,0.0,0.0,46.0
