In [14]:
import numpy as np
from shephard.apis import uniprot

In [37]:
# Path to the input FASTA file
in_file = '../shprd_data/human_proteome_validated.fasta'

# Build a Proteome object from the FASTA file
new_proteome = uniprot.uniprot_fasta_to_proteome(in_file)

# Number of residues at each end of a protein that is treated as a terminal domain
TERMINUS_LENGTH = 5

# Define the first TERMINUS_LENGTH residues of every protein as its N-terminal
# domain and the last TERMINUS_LENGTH residues as its C-terminal domain.
# Domain boundaries are given as (start, end), so (1, 5) covers the first five
# residues.
# NOTE(review): a protein shorter than TERMINUS_LENGTH would yield a C-terminal
# start position below 1 - confirm how add_domain handles that if such
# sequences can occur in the input.
for p in new_proteome:
    # Use the public sequence rather than the private `_len` attribute
    n_residues = len(p.sequence)
    p.add_domain(1, TERMINUS_LENGTH, 'N-Terminus')
    p.add_domain(n_residues - TERMINUS_LENGTH + 1, n_residues, 'C-Terminus')

In [38]:
# Analyze the N- and C-terminal domains: store the fraction of glycine ('f_G')
# and serine ('f_S') residues as attributes on each terminal domain
terminal_types = ('N-Terminus', 'C-Terminus')
for domain in new_proteome.domains:
    # Leave any non-terminal domain untouched
    if domain.domain_type not in terminal_types:
        continue

    seq = domain.sequence
    for residue in ('G', 'S'):
        domain.add_attribute('f_' + residue, seq.count(residue) / len(seq))

In [39]:
# Calculate the average fraction of Gly and Ser in the N- and C-terminal regions.
# Each list holds one (f_G, f_S) pair per terminal domain of that type.
N_terms = []
C_terms = []
for dom in new_proteome.domains:
    if dom.domain_type == 'N-Terminus':
        N_terms.append((dom.attribute('f_G'), dom.attribute('f_S')))
    elif dom.domain_type == 'C-Terminus':
        C_terms.append((dom.attribute('f_G'), dom.attribute('f_S')))

# zip(*pairs) transposes the pairs into one column of f_G values and one of f_S
n_means = tuple(np.mean(column) for column in zip(*N_terms))
c_means = tuple(np.mean(column) for column in zip(*C_terms))

print('Average N Termainal fractions: G:%.3f  S:%.3f' % n_means)
print('Average C Termainal fractions: G:%.3f  S:%.3f' % c_means)

Average N Termainal fractions: G:0.063  S:0.081
Average C Termainal fractions: G:0.055  S:0.095


In [41]:
# Collect the proteins whose N- or C-terminal domain consists solely of Gly/Ser.
# Proteins are keyed by unique_ID so that a protein with poly-GS at BOTH termini
# is listed (and counted) only once.
poly_GS_by_id = {}
for d in new_proteome.domains:
    # Only the terminal domains carry the 'f_G' / 'f_S' attributes
    if d.domain_type not in ('N-Terminus', 'C-Terminus'):
        continue

    # The fractions are floats, so compare with a tolerance instead of `== 1`
    if np.isclose(d.attribute('f_G') + d.attribute('f_S'), 1.0):
        poly_GS_by_id.setdefault(d.protein.unique_ID, d.protein)

poly_GS_termi_proteins = list(poly_GS_by_id.values())

print('Number of protiens with poly-GS C-Terminal or N-Terminal:', len(poly_GS_termi_proteins))
print('Proteins:', [p.unique_ID for p in poly_GS_termi_proteins])

Number of protiens with poly-GS C-Terminal or N-Terminal: 2
Proteins: ['Q8NCU1', 'Q86UT6']
