## E. coli proteome-wide disorder analysis
This notebook is analagous to the one in `/human_idr_analysis` directory but for the E. coli proteome.

In [1]:
from shephard import apis, interfaces
import metapredict as meta
from matplotlib import pyplot  as plt
from sparrow import Protein

In [2]:

# read in the ecoli proteome
hp = apis.uniprot.uniprot_fasta_to_proteome('ecoli_proteome_clean.fasta')

# annotate the ecoli proteome with the IDRs
interfaces.si_domains.add_domains_from_file(hp, 'shprd_domains_idrs.tsv')


# variable - set shortest IDR for analysis
shortest_size  = 30

In [3]:
idr = 0
fp = 0
idr_proteins = []


# count find proteins with one or more IDR above length shortest_size
for p in hp:

    found = False

    
    for d in p.domains:
        if len(d) > shortest_size:
            found = True
            break

    if found:
        idr = idr + 1
        idr_proteins.append(p)
    else:
        fp = fp + 1

disordered_residues = 0
all_residues = 0
# count residues in IDRs > shortest
for p in hp:
    all_residues = all_residues + len(p)

    for d in p.domains:
        if len(d) > shortest_size:
            disordered_residues = disordered_residues + len(d)

# count the number of proteins that are bona fide IPDs
idp = 0
for p in hp:
    if np.sum([len(d) for d in p.domains]) == len(p):
        idp = idp + 1

# count the number of proteins that contain no IDRs at all (length 12 or longer)
fully_folded = 0
for p in hp:
    if len(p.domains) == 0:
        fully_folded = fully_folded + 1


# assemble all IDRs above length shortest_size for further analysis
all_idrs = []
for p in idr_proteins:
    for d in p.domains:
        all_idrs.append(d)


print(f"Fraction of proteins with IDRs of length {shortest_size} or longer:    {idr/len(hp):.3f}")
print(f"Fraction of proteins that contain no IDRs longer than {shortest_size}: {fp/len(hp):.3f}\n")
print(f"Fraction of proteins that are completely disordered:      {idp/len(hp):.3f}")
print(f"Fraction of proteins that are completely folded:          {fully_folded/len(hp):.3f}")
print(f"Fraction of residues in IDRs that are longer than {shortest_size}:     {disordered_residues/all_residues:.3f}")


            
            
    
        

Fraction of proteins with IDRs of length 30 or longer:    0.117
Fraction of proteins that contain no IDRs longer than 30: 0.883

Fraction of proteins that are completely disordered:      0.042
Fraction of proteins that are completely folded:          0.678
Fraction of residues in IDRs that are longer than 30:     0.027


In [4]:
fully_disordered_protein = 0
N_tail = 0
C_tail = 0
linker = 0

for idr in all_idrs:
    if idr.start == 1 and idr.end == len(idr.protein):
        fully_disordered_protein = fully_disordered_protein + 1
    elif idr.start == 1:
        N_tail = N_tail + 1
    elif idr.end == len(idr.protein):
        C_tail = C_tail + 1
    else:
        linker = linker + 1


print(f"Fraction of IDRs above {shortest_size} that are found as full proteins: {fully_disordered_protein/len(all_idrs):.3f}")
print(f"Fraction of IDRs above {shortest_size} that are N-terminal tails      : {N_tail/len(all_idrs):.3f}")
print(f"Fraction of IDRs above {shortest_size} that are C-terminal tails      : {C_tail/len(all_idrs):.3f}")
print(f"Fraction of IDRs above {shortest_size} that are linkers               : {linker/len(all_idrs):.3f}")

Fraction of IDRs above 30 that are found as full proteins: 0.188
Fraction of IDRs above 30 that are N-terminal tails      : 0.393
Fraction of IDRs above 30 that are C-terminal tails      : 0.230
Fraction of IDRs above 30 that are linkers               : 0.189
