In [2]:
import os
import sys

import numpy as np
import pandas as pd
import shephard


# Record which SHEPHARD build was used to generate the results in this notebook.
shephard_version = shephard.__version__
print("Running SHEPHARD version: %s" % shephard_version)

Running SHEPHARD version: v0.1.2.1+4.g1b3893a


In [3]:
# Input files, all relative to the notebook's working directory.
# FASTA file that is parsed into the proteome object by uniprot_fasta_to_proteome.
human_proteome = './Input_Data/human_proteome_2019_10_clean_no_comma.fasta'
# SHEPHARD domains file loaded onto the proteome with add_domains_from_file
# (presumably the source of the 'IDR' domains - confirm against the file).
human_idrs  = './Input_Data/DISORDER_R3_human_proteome_2019_10_clean_no_comma_shephard_domains.tsv'
# SHEPHARD domains file loaded onto the proteome with add_domains_from_file
# (presumably the source of the 'Tail_w_PDB' domains and their
# N_Terminal_Status / C_Terminal_Status attributes - confirm against the file).
human_tails_w_pdb = './Input_Data/tails_shephard_metadata.tsv'

In [4]:
from shephard.apis import uniprot
from shephard.interfaces import si_domains

# Build the proteome from the FASTA file, then annotate it with the domains
# from each domains file (tails with PDB metadata first, then the IDRs).
proteome = uniprot.uniprot_fasta_to_proteome(human_proteome)

for domains_file in (human_tails_w_pdb, human_idrs):
    si_domains.add_domains_from_file(proteome, domains_file)


In [5]:
# Sort every annotated domain into named subsets. Each entry is a record
#   [domain sequence, protein length, protein sequence, protein unique ID]
#
#   'all'      - every IDR that is shorter than its protein
#   'N'        - IDRs that start at residue 1 (N-terminal tails)
#   'C'        - IDRs that end at the last residue (C-terminal tails)
#   'non_tail' - the remaining IDRs
#   'N_w_pdb'  - 'Tail_w_PDB' domains whose N_Terminal_Status is 1
#   'C_w_pdb'  - 'Tail_w_PDB' domains whose C_Terminal_Status is 1
#
# IDRs that cover the whole protein are not recorded anywhere.
IDRs = {
    subset: []
    for subset in ('all', 'non_tail', 'N', 'C', 'N_w_pdb', 'C_w_pdb')
}

FDs = {}
FDs['all'] = []

for protein in proteome:
    protein_len = len(protein)
    sequence_len = len(protein.sequence)

    # Deliberately NOT called `uniprot_id`: a later cell binds that name to a
    # dictionary, and re-running this cell must not overwrite it with a string.
    protein_uid = protein.unique_ID

    for domain in protein.domains:

        if domain.domain_type == 'IDR':

            # an IDR as long as the protein is neither a tail nor an internal
            # region, so it is left out of every subset
            if len(domain) >= protein_len:
                continue

            if domain.start == 1:
                # starts at residue 1 and is shorter than the protein:
                # it must be an N-terminal tail
                position = 'N'
            elif domain.end == protein_len:
                # ends at the last residue and is shorter than the protein:
                # it must be a C-terminal tail
                position = 'C'
            else:
                position = 'non_tail'

            # a fresh record per list so the subsets never share list objects
            for subset in (position, 'all'):
                IDRs[subset].append([domain.sequence, sequence_len, protein.sequence, protein_uid])

        elif domain.domain_type == 'Tail_w_PDB':
            tail_nterm_status = int(domain.attribute('N_Terminal_Status'))
            tail_cterm_status = int(domain.attribute('C_Terminal_Status'))

            # the N-terminal flag takes precedence; a tail with neither flag
            # set to 1 is not recorded
            if tail_nterm_status == 1:
                IDRs['N_w_pdb'].append([domain.sequence, sequence_len, protein.sequence, protein_uid])
            elif tail_cterm_status == 1:
                IDRs['C_w_pdb'].append([domain.sequence, sequence_len, protein.sequence, protein_uid])

            
            


            
        

In [6]:
# Report how many records ended up in each subset.
count_reports = (
    ("Total number of IDRs: %i", 'all'),
    ("Total number of N-terminal IDRs: %i", 'N'),
    ("Total number of C-terminal IDRs: %i", 'C'),
    ("Total number of Non-tails: %i", 'non_tail'),
    ("Total number of N-terminal IDRs w PDB: %i", 'N_w_pdb'),
    ("Total number of C-terminal IDRs w PDB: %i", 'C_w_pdb'),
)

for template, subset in count_reports:
    print(template % len(IDRs[subset]))


Total number of IDRs: 34095
Total number of N-terminal IDRs: 5549
Total number of C-terminal IDRs: 5705
Total number of Non-tails: 22841
Total number of N-terminal IDRs w PDB: 326
Total number of C-terminal IDRs w PDB: 293


In [8]:
from localcider.sequenceParameters import SequenceParameters

# names of the IDR subsets we iterate over (the keys of the IDRs dictionary)
IDR_types = IDRs.keys()

# One dictionary per sequence parameter; each maps a subset name to a list
# holding one value per IDR in that subset (same order as IDRs[subset]).
FCR = {subset: [] for subset in IDR_types}
NCPR = {subset: [] for subset in IDR_types}
FP = {subset: [] for subset in IDR_types}
FN = {subset: [] for subset in IDR_types}
kappa = {subset: [] for subset in IDR_types}
phospho = {subset: [] for subset in IDR_types}
IDR_len = {subset: [] for subset in IDR_types}
IDR_Sequence_len_ratio = {subset: [] for subset in IDR_types}
hydropathy = {subset: [] for subset in IDR_types}
hydrophobicity = {subset: [] for subset in IDR_types}
aromaticity = {subset: [] for subset in IDR_types}
# These two are initialised but stay empty unless the commented-out lines at
# the bottom of the loop are re-enabled.
protein_charge = {subset: [] for subset in IDR_types}
uniprot_id = {subset: [] for subset in IDR_types}


for key in IDR_types:
    print('On the %s set'%(key))

    for counter, record in enumerate(IDRs[key]):
        # progress indicator, printed every 1000 IDRs
        if counter % 1000 == 0:
            print(counter)

        # record layout: [IDR sequence, protein length, protein sequence, protein ID]
        idr_sequence = record[0]
        parent_len = record[1]

        params = SequenceParameters(idr_sequence)

        # charge-related parameters
        FCR[key].append(params.get_FCR())
        NCPR[key].append(params.get_NCPR())
        FP[key].append(params.get_fraction_positive())
        FN[key].append(params.get_fraction_negative())
        kappa[key].append(params.get_kappa())

        # fraction of residues that are phosphorylatable sites
        n_phosphosites = len(params.get_all_phosphorylatable_sites())
        n_residues = params.get_length()
        phospho[key].append(float(n_phosphosites)/float(n_residues))

        # absolute IDR length and IDR length relative to the whole protein
        IDR_len[key].append(n_residues)
        IDR_Sequence_len_ratio[key].append(n_residues/parent_len)

        hydropathy[key].append(params.get_mean_hydropathy())

        composition = params.get_amino_acid_fractions()

        # summed fraction of A, V, I, L and M
        hydrophobicity[key].append(
            composition['A'] + composition['V'] + composition['I'] + composition['L'] + composition['M']
        )

        # summed fraction of F, W and Y
        aromaticity[key].append(composition['F'] + composition['W'] + composition['Y'])

        # the following lines aren't used for the paper analysis

        #protein_params = SequenceParameters(record[2])
        #protein_charge[key].append(protein_params.get_countPos() - protein_params.get_countNeg())

        #uniprot_id[key].append(record[3])
        
        

On the all set
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
On the non_tail set
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
On the N set
0
1000
2000
3000
4000
5000
On the C set
0
1000
2000
3000
4000
5000
On the N_w_pdb set
0
On the C_w_pdb set
0


In [11]:
def _subset_columns(values_by_subset):
    """Return a DataFrame with one column per subset name.

    The dictionary is read with one row per key and then transposed, so each
    key becomes a column. The per-subset lists differ in length; building
    row-wise first is what lets pandas accept them (shorter columns are
    expected to be filled with missing values).
    """
    return pd.DataFrame.from_dict(values_by_subset, orient='index').transpose()


fcr_df = _subset_columns(FCR)
ncpr_df = _subset_columns(NCPR)
fp_df = _subset_columns(FP)
fn_df = _subset_columns(FN)
kappa_df = _subset_columns(kappa)
phospho_df = _subset_columns(phospho)
hydropathy_df = _subset_columns(hydropathy)
hydrophobicity_df = _subset_columns(hydrophobicity)
aromaticity_df = _subset_columns(aromaticity)
idr_len_df = _subset_columns(IDR_len)
idr_len_sequence_len_df = _subset_columns(IDR_Sequence_len_ratio)





In [12]:
# Write every per-parameter table to ./Output_Data as a CSV file
# (one column per IDR subset, no index column).
save_path = './Output_Data/%s'

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)

tables_by_filename = {
    'fcr.csv': fcr_df,
    'ncpr.csv': ncpr_df,
    'fp.csv': fp_df,
    'fn.csv': fn_df,
    'kappa.csv': kappa_df,
    'phospho.csv': phospho_df,
    'idr_len.csv': idr_len_df,
    'idr_len_sequence_len.csv': idr_len_sequence_len_df,
    'hydropathy.csv': hydropathy_df,
    'hydrophobicity.csv': hydrophobicity_df,
    'aromaticity.csv': aromaticity_df,
}

for filename, table in tables_by_filename.items():
    table.to_csv(save_path % filename, index=False)

In [None]:
# Note: the following cells are not relevant to the analysis in the paper. They are just some additional statistics I calculated.

In [147]:
# Export the per-protein charge values (not part of the paper analysis).
# protein_charge is only filled when the commented-out lines in the
# parameter loop are re-enabled; otherwise every list in it is empty and the
# table written here has no data rows.

# Define the output location here instead of relying on whichever cell last
# assigned save_path (the sliding-window export rebinds it to a sub-folder).
save_path = './Output_Data/%s'

protein_charge_df = pd.DataFrame.from_dict(protein_charge, orient='index').transpose()
protein_charge_df.to_csv(save_path % 'protein_charge.csv', index=False)

In [10]:
# Export the protein IDs that accompany each IDR (not part of the paper
# analysis). uniprot_id is only filled when the commented-out line in the
# parameter loop is re-enabled; otherwise the table has no data rows.
save_path = './Output_Data/%s'
uniprot_id_csv = save_path % 'uniprot_id.csv'

# one column per IDR subset
uniprot_id_df = pd.DataFrame.from_dict(uniprot_id, orient='index').T
uniprot_id_df.to_csv(uniprot_id_csv, index=False)

In [22]:
## Growing-window CIDER parameters for the N- and C-terminal tails with PDB data ##
#
# For a tail of length L, parameters are computed for fragments of length
# 1, 2, ..., L-1 (the full-length tail itself is not included):
#   * 'C_w_pdb' fragments always begin at the first residue of the tail
#   * 'N_w_pdb' fragments always finish at the last residue of the tail
# (presumably the anchored end is the one adjacent to the rest of the
# protein - confirm against how the tails were defined).
# All fragments of all tails in a set are appended to one flat list per
# quantity; uniprot_id_window and idr_len_window identify each entry.

window_sets = ['N_w_pdb', 'C_w_pdb']

# one dictionary per quantity, mapping set name -> flat list of per-fragment values
FCR_window = {name: [] for name in window_sets}
NCPR_window = {name: [] for name in window_sets}
hydro_window = {name: [] for name in window_sets}
uniprot_id_window = {name: [] for name in window_sets}
idr_len_window = {name: [] for name in window_sets}


for key in window_sets:
    print('On the %s set'%(key))

    # decides which end of the tail every fragment is anchored to
    anchored_at_start = (key == 'C_w_pdb')

    for tail_number, record in enumerate(IDRs[key]):

        # progress indicator, printed every 10 tails
        if tail_number % 10 == 0:
            print(tail_number)

        # record layout: [tail sequence, protein length, protein sequence, protein ID]
        tail = record[0]
        accession = record[3]
        tail_len = len(tail)

        # fragment lengths run from 1 up to tail_len - 1
        for fragment_len in range(1, tail_len):

            if anchored_at_start:
                fragment = tail[:fragment_len]
            else:
                fragment = tail[tail_len - fragment_len:]

            fragment_params = SequenceParameters(fragment)

            FCR_window[key].append(fragment_params.get_FCR())
            NCPR_window[key].append(fragment_params.get_NCPR())
            hydro_window[key].append(fragment_params.get_mean_hydropathy())
            uniprot_id_window[key].append(accession)
            idr_len_window[key].append(len(fragment))
        

On the N_w_pdb set
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
On the C_w_pdb set
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290


In [23]:
# Turn each growing-window dictionary into a table with one column per tail
# set ('N_w_pdb', 'C_w_pdb'). The dictionaries are read row-wise and then
# transposed because the two sets hold different numbers of fragments.
fcr_window_df = pd.DataFrame.from_dict(FCR_window, orient='index').transpose()
ncpr_window_df = pd.DataFrame.from_dict(NCPR_window, orient='index').transpose()
hydro_window_df = pd.DataFrame.from_dict(hydro_window, orient='index').transpose()
uniprot_id_window_df = pd.DataFrame.from_dict(uniprot_id_window, orient='index').transpose()
idr_len_window_df = pd.DataFrame.from_dict(idr_len_window, orient='index').transpose()


# NOTE: this rebinds save_path to the Window sub-folder for any cell that
# reads save_path afterwards.
save_path = './Output_Data/Window/%s'

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder (and its parent) exists before writing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

window_tables = {
    'fcr_window.csv': fcr_window_df,
    'ncpr_window.csv': ncpr_window_df,
    'hydro_window.csv': hydro_window_df,
    'uniprot_id_window.csv': uniprot_id_window_df,
    'idr_len_window.csv': idr_len_window_df,
}

for filename, table in window_tables.items():
    table.to_csv(save_path % filename, index=False)
