### Statistics for table XXX
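This notebook computes basic summary statistics for table XXX: the total number of residues and PTM sites in the human proteome, how many of each fall within predicted IDRs, and the resulting fractions.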

In [14]:
import numpy as np

import shephard
from shephard.apis import uniprot, fasta
from shephard.interfaces import si_domains, si_tracks, si_proteins, si_sites
from shephard.tools import domain_tools, sequence_tools

In [15]:
# Paths to the three input .tsv files, named up front so the load steps below read cleanly
proteome_tsv = '../shprd_data/shprd_proteins_filtered_human_af2_f1acc.tsv'
idr_domains_tsv = '../shprd_data/shprd_domains_idrs_metapredict_v2.tsv'  # metapredict2 IDRs
ptm_sites_tsv = '../shprd_data/shprd_sites_filtered_proteomescout.tsv'

# Step 1: build a Proteome from an empty list, then fill it with the protein records
print('Reading in human proteome...')
af2_prot = shephard.proteome.Proteome([])
si_proteins.add_proteins_from_file(af2_prot, proteome_tsv)

# Step 2: attach the IDR domain annotations (the only domains added to this proteome here)
print('Reading in all IDRs...')
si_domains.add_domains_from_file(af2_prot, idr_domains_tsv)

# Step 3: attach the PTM site annotations (the only sites added to this proteome here)
print('Reading in all PTMs...')
si_sites.add_sites_from_file(af2_prot, ptm_sites_tsv)

Reading in human proteome...
Reading in all IDRs...
Reading in all PTMs...


In [16]:
# Proteome-wide tallies; reset on every run so re-executing this cell never double counts
all_residues = idr_residues = 0
all_ptm_sites = idr_ptm_sites = 0

for protein in af2_prot:
    # Whole-protein contribution. PTMs are the only sites loaded into this
    # proteome, so every site is counted without filtering by site type.
    all_residues += len(protein)
    all_ptm_sites += len(protein.sites)

    # IDR contribution: the only domains loaded are the IDR annotations, so each
    # domain's length and its sites are added to the IDR totals.
    # NOTE(review): len(...sites) counts entries in the sites collection; if one
    # residue can carry several PTM annotations it would contribute more than
    # once to the per-position fractions reported later - confirm this is intended.
    for domain in protein.domains:
        idr_residues += len(domain)
        idr_ptm_sites += len(domain.sites)
    


In [17]:
# Raw residue totals from the counting cell
print(f'Residues in human proteome: {all_residues}')
print(f'Residues in human IDRs: {idr_residues}')

# Share of all residues that sit inside IDRs (reported unrounded)
idr_residue_fraction = idr_residues / all_residues
print(f'Fraction of human proteome in IDRs: {idr_residue_fraction}')

# Raw PTM site totals
print(f'Total number of PTM sites: {all_ptm_sites}')
print(f'Total number of PTM sites in IDRs: {idr_ptm_sites}')

# The remaining ratios are rounded to three decimal places. Each one is computed
# immediately before it is printed so the evaluation order matches the output order.
ptm_fraction_in_idrs = np.round(idr_ptm_sites / all_ptm_sites, 3)
print(f'Fraction of ptms in IDRs: {ptm_fraction_in_idrs}')

idr_modified_fraction = np.round(idr_ptm_sites / idr_residues, 3)
print(f"Fraction of positions in IDRs that are modified: {idr_modified_fraction}")

proteome_modified_fraction = np.round(all_ptm_sites / all_residues, 3)
print(f"Fraction of positions in all proteins that are modified: {proteome_modified_fraction}")

Residues in human proteome: 10483347
Residues in human IDRs: 3439190
Fraction of human proteome in IDRs: 0.32806221142923153
Total number of PTM sites: 326923
Total number of PTM sites in IDRs: 159631
Fraction of ptms in IDRs: 0.488
Fraction of positions in IDRs that are modified: 0.046
Fraction of positions in all proteins that are modified: 0.031
