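### Read FASTA and map domains
This notebook reads the human proteome from a FASTA file, creates de novo domains at the N- and C-termini of every protein, and then investigates the glycine/serine composition of those terminal domains.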


In [1]:
import numpy as np
from shephard.apis import uniprot

In [16]:
# Input sequence file (path is relative to the notebook's working directory)
in_file = '../shprd_data/human_proteome_validated.fasta'

# Number of residues taken from each end of a protein to form a terminal domain.
# Kept in one place so the N- and C-terminal windows cannot drift apart.
TERMINUS_LENGTH = 25

# Build the proteome from the FASTA file
new_proteome = uniprot.uniprot_fasta_to_proteome(in_file)

# Annotate the first and last TERMINUS_LENGTH residues of every protein as
# 'N-Terminus' and 'C-Terminus' domains. Boundaries are passed 1-indexed and
# inclusive (1 .. len(p)). For proteins shorter than TERMINUS_LENGTH the
# min/max clamps make both domains span the whole protein.
for p in new_proteome:
    n_residues = len(p)
    p.add_domain(1, min(TERMINUS_LENGTH, n_residues), 'N-Terminus')
    p.add_domain(max(n_residues - TERMINUS_LENGTH + 1, 1), n_residues, 'C-Terminus')

In [17]:
# Attach the glycine and serine fractions to every terminal domain.
for d in new_proteome.domains:

    # skip anything that is not one of the N-/C-terminal domains
    if d.domain_type not in ['N-Terminus', 'C-Terminus']:
        continue

    # fraction = residue count in the domain sequence / domain length,
    # stored as 'f_G' (glycine) and then 'f_S' (serine)
    for attribute_name, residue in (('f_G', 'G'), ('f_S', 'S')):
        d.add_attribute(attribute_name, d.sequence.count(residue)/len(d))

In [18]:
# Calculate the combined Gly + Ser fraction in the N- and C-terminal regions

# combined (f_G + f_S) value for every N-terminal domain
N_terms = [
    d.attribute('f_G') + d.attribute('f_S')
    for d in new_proteome.domains
    if d.domain_type == 'N-Terminus'
]

# combined (f_G + f_S) value for every C-terminal domain
C_terms = [
    d.attribute('f_G') + d.attribute('f_S')
    for d in new_proteome.domains
    if d.domain_type == 'C-Terminus'
]

# report the proteome-wide mean for each terminus
print(f'Average N-terminal GS fractions: {np.mean(N_terms)}')
print(f'Average C-terminal GS fractions: {np.mean(C_terms)}')

Average N-terminal GS fractions: 0.16092141661492163
Average C-terminal GS fractions: 0.1475482003152109


In [27]:
# Minimum combined Gly + Ser fraction for a terminal domain to count as GS-rich
GS_RICH_THRESHOLD = 0.75

# Proteins with at least one GS-rich terminus, each protein listed once
poly_GS_termi_proteins = []
seen_protein_ids = set()

# for each domain in the human proteome
for d in new_proteome.domains:

    # Only the N-/C-terminal domains were given 'f_G' / 'f_S' attributes, so
    # skip every other domain type rather than reading attributes it lacks.
    if d.domain_type not in ('N-Terminus', 'C-Terminus'):
        continue

    # if more than GS_RICH_THRESHOLD of the terminal domain is G and/or S...
    if d.attribute('f_G') + d.attribute('f_S') > GS_RICH_THRESHOLD:

        # ...save the parent protein, but only once: a protein whose N- and
        # C-termini both pass would otherwise be counted twice below.
        protein = d.protein
        if protein.unique_ID not in seen_protein_ids:
            seen_protein_ids.add(protein.unique_ID)
            poly_GS_termi_proteins.append(protein)

# report how many proteins have a GS-rich terminus, and which ones
print('Number of proteins with G and/or S rich C-Terminal or N-Terminal:', len(poly_GS_termi_proteins))
print('Proteins:', [p.unique_ID for p in poly_GS_termi_proteins])


Number of proteins with G and/or S rich C-Terminal or N-Terminal: 7
Proteins: ['Q15059', 'Q9H7D7', 'Q9HCS4', 'Q9Y2X9', 'Q2TAP0', 'P25440', 'Q12791']
