# Are pathogenic mutations more likely to have modifications nearby?
In studyBias.ipynb we explored the relationship between annotations, PTMs, and mutations.  We found that there is a clear study bias for proteins that contain pathogenic mutations, having more GO terms, PTMs, and mutations.  Therefore, here we will consider only mutations coming from these heavily studied proteins as we explore the relationship between mutations and nearby PTMs.

In [1]:
# Setup the workspace, 
from proteomeScoutAPI import ProteomeScoutAPI
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from pylab import *
import pandas as pd
from scipy import stats 
import pickle
import random


[df_path, protID_path] = pickle.load(open("pathoProteins.p", "r"))

%matplotlib inline
proteomeScoutFile = '../../data/proteomescout_everything_20151118.tsv' #download from proteomescout.wustl.edu/downloads (current)
PTM_API = ProteomeScoutAPI(proteomeScoutFile)

modWindow = 8 #change this to 
pValCutoff = 0.05

df_path.describe()
df_path.sum()


GO            15094
Mutations     21223
PTMs          17786
Sequence     612521
dtype: int64

In [2]:
#### Create an object of all mutations that exist on proteins that have at least one pathogenic mutation.
# Also, add the number of nearby modifications of the major types, where nearby is ste on +/-7 amino acids
d = pd.DataFrame(columns=['ID', 'mod_pos', 'amino acid', 'pathoBit', 'Phosphoserine', 'Phosphothreonine', 'Phosphotyrosine',
'N6-acetyllysine', 'Ubiquitination', 'N-Glycosylation', 'O-Glycosylation'])

IDsWithErrors = []
for ID in protID_path:
    mutations = PTM_API.get_mutations(ID)
    #print ID
    try:
        for mut in mutations:
            pos, from_res, to_res, patho_status, evidence = mut
        
            mods = PTM_API.get_nearbyPTMs(ID, int(pos), modWindow)
        
            pathoBit = 0
            if patho_status == 'Pathogenic' or patho_status=='pathogenic':
                pathoBit = 1
            #vec =   ([ID, pos, from_res, pathoBit], [np.zeros((1,(len(d.columns)-4)))])  
            temp = pd.Series({'ID': ID, 'mod_pos':pos, 'amino acid':from_res, 'pathoBit':pathoBit, 
                            'Phosphoserine':0, 'Phosphothreonine':0, 'Phosphotyrosine':0, 'N6-acetyllysine':0, 
                            'Ubiquitination':0, 'N-Glycosylation':0, 'O-Glycosylation':0})
        #temp = pd.Series([ID, pos, from_res, pathoBit], [np.zeros((1,(len(d.columns)-4)))], index=d.columns)    
        #temp = pd.DataFrame(np.zeros((1,len(mods_d.columns))), columns = mods_d.columns)
                    
            for mod in mods:
                mod_pos, aa, mod_type = mod          
                try:
                    temp[mod_type] +=1
                except:
                    val = 1
                    #print "ignoring %s"%(mod_type)
            d = d.append(temp, ignore_index='True') 
        #print len(d)
    except:
        IDsWithErrors.append(ID)
        
d.sum()
#IDsWithErrors

ID                  Q8NHX9Q8NHX9Q8NHX9Q8NHX9NP_001701.2NP_001701.2...
mod_pos             3764845647349282832321662032422522863233234585...
amino acid          KMLGLWWRRSRIGFKKMKKDANSSAMTGCLMDDVSPVVVVVFFFFR...
pathoBit                                                         1972
Phosphoserine                                                    3359
Phosphothreonine                                                 1705
Phosphotyrosine                                                  1682
N6-acetyllysine                                                  1534
Ubiquitination                                                   1365
N-Glycosylation                                                   979
O-Glycosylation                                                   245
dtype: object

## Checking for enrichment of mods near pathogenic mutations
Here is a description of the mutations and nearby modifications that make up the dataset we will consider, based on those annotations that come from proteins that have at least one pathogenic mutation.  We will use the Fisher Exact test to determine if there are enrichment differences. 

In [3]:
# Select only unique mutations based on protein and position.  Prioritize pathogenicity and binarize nearby mods 
dCollapse = pd.DataFrame(columns=d.columns)
for uniqueKey, row in d.groupby(['ID', 'mod_pos', 'amino acid']):
    temp = pd.DataFrame([[uniqueKey[0], uniqueKey[1], uniqueKey[2], int(row['pathoBit'].sum() > 0), 
                         int(row['Phosphoserine'].sum() > 0), int(row['Phosphothreonine'].sum() > 0), 
                        int(row['Phosphotyrosine'].sum() > 0), int(row['N6-acetyllysine'].sum() > 0), 
                       int(row['Ubiquitination'].sum() > 0), int(row['N-Glycosylation'].sum() > 0), 
                          int(row['O-Glycosylation'].sum() > 0)]], columns=dCollapse.columns)
    dCollapse = dCollapse.append(temp)
dCollapse.describe()

Unnamed: 0,pathoBit,Phosphoserine,Phosphothreonine,Phosphotyrosine,N6-acetyllysine,Ubiquitination,N-Glycosylation,O-Glycosylation
count,18656.0,18656.0,18656.0,18656.0,18656.0,18656.0,18656.0,18656.0
mean,0.102487,0.109723,0.065019,0.067592,0.049421,0.050654,0.045401,0.007075
std,0.303296,0.312553,0.246567,0.251052,0.216751,0.219296,0.208187,0.08382
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
 d.columns[-7:]

Index([u'Phosphoserine', u'Phosphothreonine', u'Phosphotyrosine', u'N6-acetyllysine', u'Ubiquitination', u'N-Glycosylation', u'O-Glycosylation'], dtype='object')

In [5]:
def FDR(pArr, alpha):
    """
    False Discovery Rate (FDR)
    Returns a Q value for significant values in pArr for a target FDR of alpha
    
    pArr  : Array of p-values
    alpha : Confidence threshold
    """
    
    # number of p-values
    m = len(pArr)
    
    # sort the p-values (smallest first)
    pValSort = sorted(pArr, key=lambda x: float(x))
        
    # assuming there are no rejected hypotheses at all
    pAdj = 0; 
    
    # cycle through each p-value
    for i in range (0, len(pValSort)):
        pi = pValSort[i]
        j = i+1;
        # print "Comparing %0.E to %0.2f"%(pi, (j*alpha/m))
        if(pi <= (j*alpha/m)):
            pAdj = pi
        else:
            # if the p-value is greater use the previously set
            # adjusted p-value
            return pAdj

In [6]:
def printPathoModStats(d):
    """
    Check for enrichment of pathogenic mutations near sites of modifications
    
    d :  binarized data frame defined above
    """

    # get number of mutation sites    
    N = len(d['pathoBit'])
    
    # get the number of mutation sites associated with disease    
    K = d['pathoBit'].sum()
    
    pvalueArr = []
    strSig = ''
    alpha = 0.05
    
    # for each of the modification types
    for mods in d.columns[-7:]:
        
        # number of mutation sites which have a modification 
        # of type $mods nearby
        n = len(d[d[mods]==1])

        # number of disease-related mutation which have a modification 
        # of type $mods nearby
        k = len(d[(d['pathoBit']==1) & (d[mods]==1)])
        
        # use the Fisher's exact test to take the following 
        # contigency table 
        # 
        #  
        #                        PATHOGENIC   NON-PATHOGENIC
        #  MUTATIONS NEAR SITE |   k              n-k
        #  MUTATIONS ANYWHERE  |   K              N-K
        #
        # And ask if the difference between the observations might occur by chance - the p
        # value reports on the likelyhood of finding pathogenic mutations near a site and non
        # pathogenic mutants away from a site occuring by chance vs. what we see - lower means
        # more sure it's a real effect
        oddsratio, pvalue = stats.fisher_exact([[k, n-k], [K, N-K]], alternative='greater')
        pvalueArr.append(pvalue)
        strSig = ''
        
        # for examples where the p-value is less than the defined alpha append a *
        if pvalue <= alpha:
            strSig = '*' 
        print "%2s %17s:\t %2.E \t(N=%d, n=%d, K=%d, k=%d)"%(strSig, mods, pvalue, N, n, K, k)
        
    # use the false discovery rate calculate the adjusted p-value
    p_adjust = FDR(pvalueArr, alpha)
    print "adjusted p-value is %0.3f"%(p_adjust)

        
    


In [7]:
printPathoModStats(dCollapse)

 *     Phosphoserine:	 2E-02 	(N=18656, n=2047, K=1912, k=242)
 *  Phosphothreonine:	 2E-02 	(N=18656, n=1213, K=1912, k=148)
 *   Phosphotyrosine:	 3E-02 	(N=18656, n=1261, K=1912, k=151)
     N6-acetyllysine:	 3E-01 	(N=18656, n=922, K=1912, k=100)
 *    Ubiquitination:	 2E-07 	(N=18656, n=945, K=1912, k=149)
     N-Glycosylation:	 1E+00 	(N=18656, n=847, K=1912, k=65)
     O-Glycosylation:	 1E+00 	(N=18656, n=132, K=1912, k=7)
adjusted p-value is 0.000


# What is the distribution of amino acid types
Is there anything significant about the type of amino acids mutated and associated with pathogenicity?

In [8]:
# for each unique amino acid calculate whether there is over or under-representation (two-tailed test)

# Get the number of mutation sites and the number of pathogenic mutations (N and K respectively)
N = len(dCollapse['pathoBit'])
K = dCollapse['pathoBit'].sum()

runningSumCheck = 0
aaDict = {}

# cycle through each amino acid, collecting all the sites where
# that amino acid is mutated (row)
for uniqueKey, row in dCollapse.groupby(['amino acid']):
    
    # get current amino acids and number of mutation sites on that amino acid
    aa = uniqueKey[0]
    n = len(dCollapse[dCollapse['amino acid']==aa]) # aka len(row)
    
    # save the number of times that amino acid is observed on a mutation site
    aaDict[aa] = n
    
    # increment for book keeping
    runningSumCheck += n
    
    # get the number of occurences where a mutation to the $aa amino acid represents
    # a pathogenic mutation
    k = len(dCollapse[(dCollapse['amino acid']==aa) & (dCollapse['pathoBit']==1)])
    
    # use the Fisher's exact test to take the following 
    # contigency table 
    # 
    #  
    #                        PATHOGENIC   NON-PATHOGENIC
    #  MUTATIONS ON AA $aa |   k              n-k
    #  MUTATIONS ANYWHERE  |   K              N-K
    #
    # Smaller numbers mean we preferentially see pathogenic mutations on
    # some amino acid and less frequently see mutations on that amino
    # acid which are not pathogenic
    #    
    oddsratio, pvalue = stats.fisher_exact([[k, n-k], [K, N-K]])

    # Bonferroni correction, alpha at 0.05
    if pvalue <= pValCutoff/20: 
        print "%3s:\t %0.E \t(N=%d, n=%d, K=%d, k=%d)"%(aa, pvalue, N, n, K, k)
        
if runningSumCheck != N:
    print "Error in Parity: Sum is %d instead of %d"%(runningSumCheck, N)
    



  F:	 3E-04 	(N=18656, n=501, K=1912, k=28)
  H:	 2E-03 	(N=18656, n=437, K=1912, k=26)
  I:	 1E-04 	(N=18656, n=746, K=1912, k=46)
  R:	 6E-16 	(N=18656, n=2700, K=1912, k=424)


## Examine random site-mutation enrichment
Since there is a misbalance for arginines as having pathogenicity, we checked to see if there was enrichment in nearby PTMs for any randomly selected mutation set based on selecting for the same distribution of amino acids. In other words, taking the same data set by randomly reshuffling which mutations were pathogenic and which were not, do we find a specific enrichment of pathogenic mutations near sites?

To do this, we create a new set based on selecting random permutations of the lsits that define particular amino acid types rebuild a fake patho set that is defined based on distribution of amino acids then check for enrichment of nearby mods


In [10]:
numRepeats = 10
print "Printing observed enrichments for random sets. Remember expected rate of seeing a p-value <=%0.2f is %1.2f times"%(pValCutoff, pValCutoff*5*numRepeats)
for i in range(1,numRepeats):
    
    # create an empty dataframe using the original column names
    dS = pd.DataFrame(columns=dCollapse.columns)
    
    # for each different amino acid
    for uniqueKey, row in dCollapse.groupby(['amino acid']):
        aa = uniqueKey[0]
        
        # this is just row (..)
        aaMuts = dCollapse[dCollapse['amino acid']==aa]

        # totally randomize the order of each of the mutation sites
        # associated with mutations of residue aa
        aaShuffled = aaMuts.iloc[np.random.permutation(aaDict[aa])]
        
        # have to explicity set index values so we can assign a subsection
        aaShuffled.index = range(0,len(aaShuffled))

        # Determine the number of pathogenic mutations associated with sites
        num = len(dCollapse[(dCollapse['amino acid']==aa) & (dCollapse['pathoBit']==1)])
        
        # set the first $num sites to be pathogenic and the rest to be not
        aaShuffled.loc[0:num, 'pathoBit'] = 1
        aaShuffled.loc[num:len(aaMuts), 'pathoBit'] = 0
        
        # sanity check!
        if num != aaShuffled['pathoBit'].sum():
            print "Problem for %s: wanted %d and retrieved %d"%(aa, num, aaShuffled['pathoBit'].sum())
            print "\t number of amino acids in set is %d and I have length of set %d"%(aaDict[aa], len(aaShuffled))
            
        # finally, append this set of data to the synthetic dataframe being created!
        dS = dS.append(aaShuffled)
    # now print enrichment
    print "Run %d"%(i)
    printPathoModStats(dS)

Printing observed enrichments for random sets. Remember expected rate of seeing a p-value <=0.05 is 2.50 times
Run 1
       Phosphoserine:	 8E-01 	(N=18656, n=2047, K=1912, k=199)
    Phosphothreonine:	 7E-01 	(N=18656, n=1213, K=1912, k=119)
     Phosphotyrosine:	 8E-01 	(N=18656, n=1261, K=1912, k=120)
     N6-acetyllysine:	 1E+00 	(N=18656, n=922, K=1912, k=75)
      Ubiquitination:	 9E-01 	(N=18656, n=945, K=1912, k=86)
     N-Glycosylation:	 9E-01 	(N=18656, n=847, K=1912, k=76)
     O-Glycosylation:	 9E-01 	(N=18656, n=132, K=1912, k=10)
adjusted p-value is 0.000
Run 2
       Phosphoserine:	 5E-01 	(N=18656, n=2047, K=1912, k=212)
    Phosphothreonine:	 1E-01 	(N=18656, n=1213, K=1912, k=137)
     Phosphotyrosine:	 2E-01 	(N=18656, n=1261, K=1912, k=139)
     N6-acetyllysine:	 5E-01 	(N=18656, n=922, K=1912, k=95)
      Ubiquitination:	 9E-01 	(N=18656, n=945, K=1912, k=82)
     N-Glycosylation:	 2E-01 	(N=18656, n=847, K=1912, k=95)
     O-Glycosylation:	 9E-01 	(N=18656, n=132,