In [1]:
from shephard.interfaces import si_domains, si_sites
from shephard.apis import uniprot

In [2]:
# Points to the local directory where you have saved the files from
# https://github.com/holehouse-lab/shephard-data/tree/main/data/proteomes/s_cerevisiae
#
# You will need to change this!!!
#
# The default is the current working directory. An empty string would turn
# every f'{root_data_dir}/<filename>' path built from this value into an
# absolute path at the filesystem root (e.g. '/s_cerevisiae_clean.fasta').
root_data_dir = '.'



In [14]:
# Read the yeast proteome FASTA file into a Proteome object. The "clean" file is the one in
# which sequences that had non-canonical amino acids were removed/fixed.
yeast_proteome = uniprot.uniprot_fasta_to_proteome(f'{root_data_dir}/s_cerevisiae_clean.fasta')

# Annotate the proteome with every PTM listed in the sites file
si_sites.add_sites_from_file(yeast_proteome, f'{root_data_dir}/shprd_sites_ptms_s_cerevisiae.tsv')

# Tally what has been read in so far
n_proteins = len(yeast_proteome)
n_sites = len(yeast_proteome.sites)

# Optional - annotate domains and IDRs. Both ECOD domains (mostly evolutionarily-conserved
# folded/globular domains) and IDRs exist as Shephard Domain objects - for more info check
# out the preprint! The files are loaded in the order listed: ECOD domains first, then IDRs.
for domain_path in (
    f'{root_data_dir}/shprd_domains_ecod_domainmapper_s_cerevisiae.tsv',
    f'{root_data_dir}/shprd_domains_idrs_s_cerevisiae.tsv',
):
    si_domains.add_domains_from_file(yeast_proteome, domain_path)

# Pull out just the IDR domains so they can be counted, because why would anyone care about
# anything except the number of IDRs :)
idr_domains = yeast_proteome.get_domains_by_type('IDR')
n_idrs = len(idr_domains)

print(f"Sanity check - we read in {n_proteins} proteins with {n_sites} ptms and {n_idrs} IDRs")

Sanity check - we read in 6060 proteins with 15761 ptms and 8302 IDRs


In [22]:
# Gather the phosphosites as a list of Site objects. Each Site object knows which
# Protein it came from, so the parent protein does not need to be tracked separately.
#
# Walk every protein in the yeast proteome, then every site on that protein, and keep
# the site only when its type is one of the three types of phosphoresidues.
phosphosites = [
    site
    for protein in yeast_proteome
    for site in protein.sites
    if site.site_type in ('Phosphoserine', 'Phosphothreonine', 'Phosphotyrosine')
]

# total number of phosphosites found
n_phosphosites = len(phosphosites)

In [23]:
# report how many phosphosites were collected
phosphosite_report = f"There are {n_phosphosites} phosphosites in the yeast proteome"
print(phosphosite_report)

There are 14541 phosphosites in the yeast proteome


In [27]:
# Finally, work out which distinct proteins the phosphosites came from.
# Proteins are identified by their unique ID, which for UniProt-derived
# Proteome objects is automatically the UniProt ID.
#
# Building a set collapses repeated IDs, so a protein carrying many
# phosphosites is only counted once.
unique_IDs = {phos.protein.unique_ID for phos in phosphosites}

# number of distinct phosphorylated proteins
n_unique_phosphorylated_proteins = len(unique_IDs)

In [28]:
# report the phosphosite total alongside the number of distinct proteins carrying them
phosphoprotein_report = f"There are {n_phosphosites} phosphosites spread over {n_unique_phosphorylated_proteins} proteins"
print(phosphoprotein_report)

There are 14541 phosphosites spread over 2825 proteins
