# Statistics for table S1
##### Last updated 2022-08-24
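This notebook computes the summary statistics reported in Table S1: the number and fraction of human-proteome residues that fall in predicted IDRs, and the number and fraction of PTM sites located in IDRs.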

In [3]:
import numpy as np

import shephard
from shephard.apis import uniprot, fasta
from shephard.interfaces import si_domains, si_tracks, si_proteins, si_sites
from shephard.tools import domain_tools, sequence_tools

In [4]:
# Input tables (relative paths) used to populate the proteome.
proteins_tsv = '../shprd_data/shprd_proteins_filtered_human_af2_f1acc.tsv'
idr_domains_tsv = '../shprd_data/shprd_domains_idrs_metapredict_v2.tsv'  # metapredict2 IDRs
ptm_sites_tsv = '../shprd_data/shprd_sites_filtered_proteomescout.tsv'

# Start from an empty Proteome and add the proteins listed in the file.
print('Reading in human proteome...')
af2_prot = shephard.proteome.Proteome([])
si_proteins.add_proteins_from_file(af2_prot, proteins_tsv)

# Annotate the proteins with IDR domains.
print('Reading in all IDRs...')
si_domains.add_domains_from_file(af2_prot, idr_domains_tsv)

# Annotate the proteins with PTM sites.
print('Reading in all PTMs...')
si_sites.add_sites_from_file(af2_prot, ptm_sites_tsv)

Reading in human proteome...
Reading in all IDRs...
Reading in all PTMs...


In [14]:
# Running totals accumulated over every protein in the proteome.
all_residues = 0
idr_residues = 0
all_ptm_sites = 0
idr_ptm_sites = 0

for protein in af2_prot:
    all_residues += len(protein)

    # NB1: PTMs are the only sites added to this proteome, so every site can
    #      be counted without filtering.
    # NB2: positions are collapsed into a set so that a position carrying
    #      more than one site annotation is counted only once.
    all_ptm_sites += len({site.position for site in protein.sites})

    # The domains are the IDR annotations added when the proteome was loaded;
    # tally their length and their distinct modified positions per domain.
    for domain in protein.domains:
        idr_residues += len(domain)
        idr_ptm_sites += len({site.position for site in domain.sites})
    


In [28]:
# Assemble the report lines first, then emit them one per line.
# The fractions are derived from the four counters accumulated over the proteome.
summary_lines = (
    # Residue counts and the IDR fraction (rounded, then unrounded).
    f'Residues in human proteome: {all_residues}',
    f'Residues in human IDRs: {idr_residues}',
    f'Fraction of Residues in IDRs: {np.round(idr_residues/all_residues,2)}',
    f'Fraction of human proteome in IDRs: {idr_residues/all_residues}',
    # PTM site counts and the share of them that lie in IDRs.
    f'Total number of PTM sites: {all_ptm_sites}',
    f'Total number of PTM sites in IDRs: {idr_ptm_sites}',
    f'Fraction of ptms in IDRs: {np.round(idr_ptm_sites/all_ptm_sites,3)}',
    # Modification density: modified positions per residue.
    f"Fraction of positions in IDRs that are modified: {np.round(idr_ptm_sites/idr_residues,3)}",
    f"Fraction of positions in all proteins that are modified: {np.round(all_ptm_sites/all_residues,3)}",
)
print(*summary_lines, sep='\n')

Residues in human proteome: 10483347
Residues in human IDRs: 3439190
Fraction of Residues in IDRs: 0.33
Fraction of human proteome in IDRs: 0.32806221142923153
Total number of PTM sites: 312745
Total number of PTM sites in IDRs: 154758
Fraction of ptms in IDRs: 0.495
Fraction of positions in IDRs that are modified: 0.045
Fraction of positions in all proteins that are modified: 0.03
