# Are pathogenic mutations more likely to have modifications nearby?
In studyBias.ipynb we explored the relationship between annotations, PTMs, and mutations.  We found that there is a clear study bias for proteins that contain pathogenic mutations, having more GO terms, PTMs, and mutations.  Therefore, here we will consider only mutations coming from these heavily studied proteins as we explore the relationship between mutations and nearby PTMs.

In [1]:
# Setup the workspace, 
from proteomeScoutAPI import ProteomeScoutAPI
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from pylab import *
import pandas as pd
from scipy import stats 
import pickle
import random


[df_path, protID_path] = pickle.load(open("pathoProteins.p", "r")) #generate this by running studyBias.ipynb first
%matplotlib inline
proteomeScoutFile = 'proteomescout_mammalia_20150712.tsv'
PTM_API = ProteomeScoutAPI(proteomeScoutFile)

modWindow = 8 #change this to 
pValCutoff = 0.05

df_path.describe()
df_path.sum()


GO            14961
Mutations     21085
PTMs          16701
Sequence     609126
dtype: int64

In [2]:
#### Create an object of all mutations that exist on proteins that have at least one pathogenic mutation.
# Also, add the number of nearby modifications of the major types, where nearby is ste on +/-7 amino acids
d = pd.DataFrame(columns=['ID', 'mod_pos', 'amino acid', 'pathoBit', 'Phosphoserine', 'Phosphothreonine', 'Phosphotyrosine',
'N6-acetyllysine', 'Ubiquitination'])

for ID in protID_path:
    mutations = PTM_API.get_mutations(ID)
   # print ID
    for mut in mutations:
        pos, from_res, to_res, patho_status, evidence = mut
        
        mods = PTM_API.get_nearbyPTMs(ID, int(pos), modWindow)
        
        pathoBit = 0
        if patho_status == 'Pathogenic' or patho_status=='pathogenic':
            pathoBit = 1
        #vec =   ([ID, pos, from_res, pathoBit], [np.zeros((1,(len(d.columns)-4)))])  
        temp = pd.Series({'ID': ID, 'mod_pos':pos, 'amino acid':from_res, 'pathoBit':pathoBit, 
                            'Phosphoserine':0, 'Phosphothreonine':0, 'Phosphotyrosine':0, 'N6-acetyllysine':0, 
                            'Ubiquitination':0})
        #temp = pd.Series([ID, pos, from_res, pathoBit], [np.zeros((1,(len(d.columns)-4)))], index=d.columns)    
        #temp = pd.DataFrame(np.zeros((1,len(mods_d.columns))), columns = mods_d.columns)
                    
        for mod in mods:
            mod_pos, aa, mod_type = mod          
            try:
                temp[mod_type] +=1
            except:
                val = 1
                #print "ignoring %s"%(mod_type)
        d = d.append(temp, ignore_index='True') 
        #print len(d)
        
d.sum()

ID                  Q8NHX9Q8NHX9Q8NHX9Q8NHX9NP_001701.2NP_001701.2...
mod_pos             3764845647349282832321662032422522863233234585...
amino acid          KMLGLWWRRSRIGFKKMKKDANSSAMTGCLMDDVSPVVVVVFFFFR...
pathoBit                                                         1956
Phosphoserine                                                    3359
Phosphothreonine                                                 1705
Phosphotyrosine                                                  1682
N6-acetyllysine                                                  1534
Ubiquitination                                                   1365
dtype: object

## Checking for enrichment of mods near pathogenic mutations
Here is a description of the mutations and nearby modifications that make up the dataset we will consider, based on those annotations that come from proteins that have at least one pathogenic mutation.  We will use the Fisher Exact test to determine if there are enrichment differences. 

In [3]:
# Select only unique mutations based on protein and position.  Prioritize pathogenicity and binarize nearby mods 
dCollapse = pd.DataFrame(columns=d.columns)
for uniqueKey, row in d.groupby(['ID', 'mod_pos', 'amino acid']):
    temp = pd.DataFrame([[uniqueKey[0], uniqueKey[1], uniqueKey[2], int(row['pathoBit'].sum() > 0), 
                         int(row['Phosphoserine'].sum() > 0), int(row['Phosphothreonine'].sum() > 0), 
                        int(row['Phosphotyrosine'].sum() > 0), int(row['N6-acetyllysine'].sum() > 0), 
                       int(row['Ubiquitination'].sum() > 0)]], columns=dCollapse.columns)
    dCollapse = dCollapse.append(temp)
dCollapse.describe()

Unnamed: 0,pathoBit,Phosphoserine,Phosphothreonine,Phosphotyrosine,N6-acetyllysine,Ubiquitination
count,18528.0,18528.0,18528.0,18528.0,18528.0,18528.0
mean,0.102332,0.110481,0.065468,0.068059,0.049763,0.051004
std,0.303092,0.313497,0.247357,0.251854,0.21746,0.220012
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
def FDR(pArr, alpha):
    #return a Q value for significant values in pArr for a target FDR of alpha
    cm = 1
    m = len(pArr)
    pValSort = sorted(pArr, key=lambda x: float(x))
    #print pValSort
    pAdj = 0; #assuming there are no rejected hypotheses at all, unless updated
    for i in range (0, len(pValSort)):
        pi = pValSort[i]
        j = i+1;
        #print "Comparing %0.E to %0.2f"%(pi, (j*alpha/m))
        if(pi <= (j*alpha/m)):
            pAdj = pi
        else:
            return pAdj

In [5]:
def printPathoModStats(d):

# Check for enrichment of pathogenic mutations near sites of modifications
    N = len(d['pathoBit'])
    K = d['pathoBit'].sum()
    pvalueArr = []
    strSig = ''
    alpha = 0.05
    for mods in d.columns[-5:]:
        n = len(d[d[mods]==1])
       
        k = len(d[(d['pathoBit']==1) & (d[mods]==1)])
        oddsratio, pvalue = stats.fisher_exact([[k, n-k], [K, N-K]], alternative='greater')
        pvalueArr.append(pvalue)
        strSig = ''
        if pvalue <= alpha:
            strSig = '*'
        print "%2s %17s:\t %2.E \t(N=%d, n=%d, K=%d, k=%d)"%(strSig, mods, pvalue, N, n, K, k)
        
    p_adjust = FDR(pvalueArr, alpha)
    print "adjusted p-value is %0.3f"%(p_adjust)

        
    


In [6]:
printPathoModStats(dCollapse)

 *     Phosphoserine:	 2E-02 	(N=18528, n=2047, K=1896, k=242)
 *  Phosphothreonine:	 2E-02 	(N=18528, n=1213, K=1896, k=148)
 *   Phosphotyrosine:	 3E-02 	(N=18528, n=1261, K=1896, k=151)
     N6-acetyllysine:	 3E-01 	(N=18528, n=922, K=1896, k=100)
 *    Ubiquitination:	 2E-07 	(N=18528, n=945, K=1896, k=149)
adjusted p-value is 0.029


# What is the distribution of amino acid types
Is there anything significant about the type of amino acids mutated and associated with pathogenicity?

In [7]:
# for each unique amino acid calculate whether there is over or under-representation (two-tailed test)
N = len(dCollapse['pathoBit'])
K = dCollapse['pathoBit'].sum()
runningSumCheck = 0
aaDict = {}
for uniqueKey, row in dCollapse.groupby(['amino acid']):
    aa = uniqueKey[0]
    n = len(dCollapse[dCollapse['amino acid']==aa])
    aaDict[aa] = n
    runningSumCheck += n
    k = len(dCollapse[(dCollapse['amino acid']==aa) & (dCollapse['pathoBit']==1)])
    oddsratio, pvalue = stats.fisher_exact([[k, n-k], [K, N-K]])
    if pvalue <= pValCutoff/20: #Bonferroni correction, alpha at 0.05
        print "%3s:\t %0.E \t(N=%d, n=%d, K=%d, k=%d)"%(aa, pvalue, N, n, K, k)
if runningSumCheck != N:
    print "Error in Parity: Sum is %d instead of %d"%(runningSumCheck, N)
    



  F:	 2E-04 	(N=18528, n=498, K=1896, k=27)
  I:	 2E-04 	(N=18528, n=742, K=1896, k=46)
  R:	 2E-15 	(N=18528, n=2677, K=1896, k=418)


In [9]:
# since there is a misbalance for arginines as having pathogenicity, check to see if enrichment in nearby mods for any 
# randomly selected set based on selecting for the same distribution of amino acids.  

#create a new set based on selecting random permutations of the lsits that define particular amino acid types
# rebuild a fake patho set that is defined based on distribution of amino acids then check for enrichment of nearby mods
numRepeats = 11
print "Printing observed enrichments for random sets. Remember expected rate of seeing a p-value <=%0.2f is %1.2f times"%(pValCutoff, pValCutoff*5*numRepeats)
for i in range(1,numRepeats):
    dS = pd.DataFrame(columns=dCollapse.columns)
    for uniqueKey, row in dCollapse.groupby(['amino acid']):
        aa = uniqueKey[0]
        aaMuts = dCollapse[dCollapse['amino acid']==aa]
    #print aaMuts[:4]
        aaShuffled = aaMuts.iloc[np.random.permutation(aaDict[aa])]

    #set the first "n" as patho and all others as zeros
    #aaShuffled.loc[:aaDict[aa], 'pathoBit'] = 1
        num = len(dCollapse[(dCollapse['amino acid']==aa) & (dCollapse['pathoBit']==1)])
        aaShuffled.loc[:num, 'pathoBit'] = 1
        aaShuffled.loc[-(len(aaMuts)-num):, 'pathoBit'] = 0
        if num != aaShuffled['pathoBit'].sum():
            print "Problem for %s: wanted %d and retrieved %d"%(aa, num, aaShuffled['pathoBit'].sum())
            print "\t number of amino acids in set is %d and I have length of set %d"%(aaDict[aa], len(aaShuffled))
        dS = dS.append(aaShuffled)
    #now print enrichment
    print "Run %d"%(i)
    printPathoModStats(dS)

Printing observed enrichments for random sets. Remember expected rate of seeing a p-value <=0.05 is 2.75 times
Run 1
       Phosphoserine:	 3E-01 	(N=18528, n=2047, K=1896, k=217)
    Phosphothreonine:	 6E-02 	(N=18528, n=1213, K=1896, k=142)
     Phosphotyrosine:	 1E+00 	(N=18528, n=1261, K=1896, k=111)
     N6-acetyllysine:	 8E-01 	(N=18528, n=922, K=1896, k=87)
      Ubiquitination:	 1E-01 	(N=18528, n=945, K=1896, k=109)
adjusted p-value is 0.000
Run 2
       Phosphoserine:	 7E-01 	(N=18528, n=2047, K=1896, k=203)
    Phosphothreonine:	 1E+00 	(N=18528, n=1213, K=1896, k=105)
     Phosphotyrosine:	 1E+00 	(N=18528, n=1261, K=1896, k=110)
     N6-acetyllysine:	 1E+00 	(N=18528, n=922, K=1896, k=75)
      Ubiquitination:	 8E-01 	(N=18528, n=945, K=1896, k=88)
adjusted p-value is 0.000
Run 3
       Phosphoserine:	 2E-01 	(N=18528, n=2047, K=1896, k=222)
    Phosphothreonine:	 6E-01 	(N=18528, n=1213, K=1896, k=123)
     Phosphotyrosine:	 8E-01 	(N=18528, n=1261, K=1896, k=119)
     N6