<a href="https://colab.research.google.com/github/holehouse-lab/supportingdata/blob/master/other/fraction_disordered/fraction_disordered_across_list_of_proteins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## About
This notebook will let you compute what fraction of proteins have disordered regions using a FASTA file of full-length protein sequences as input.

NOTE: The setup cell (Step 1) takes ~1-2 minutes to run (sorry!), but after that the actual disorder prediction should only take a few seconds.

## OUTPUT
This notebook JUST prints out the fraction of proteins with disordered regions (and the associated count), as well as the mean, median, max, and min of both the fraction disordered and the number of disordered residues per protein. More sophisticated analysis probably necessitates some more involved informatics!


## Changelog

* Version 1.0 (July 29th 2024) - initial version

In [None]:
#@title Step 1: Download metapredict
#@markdown Press play to download metapredict.
# Setup cell: installs the required packages, imports everything the
# notebook uses, and reports how long the whole setup took.
# `time` is imported before anything else so the timer can start ahead of
# the (slow) pip installs.
import time
start = time.time()
# install then import metapredict
!pip install metapredict --quiet

# included for good measure, but metapredict should already pull this in
# as a dependency!
!pip install protfasta --quiet
import metapredict as meta
# Colab helper module; Step 2 calls files.upload() to get the FASTA file
from google.colab import files

# import other goodies
# NOTE(review): re, os and randint are not referenced in the cells visible
# here - confirm whether they are still needed before removing.
import re
import os
from random import randint
import protfasta
from datetime import datetime
# `time` was already imported at the top of this cell; re-importing is harmless
import time
import numpy as np
# stop the timer and report the elapsed wall-clock time, rounded to 2 d.p.
end = time.time()
print(f'Packages installed and ready to go (setup took {np.round(end-start,2)} seconds)!')



In [None]:
#@title Step 2: Choose a `.fasta` file to make predictions.
#@markdown Press the play button then choose the .fasta file containing sequences you'd like to predict disorder for. The outcome of the analysis will be printed below.

start = time.time()

# upload and save. The returned object is used below as a mapping of
# filename -> raw file bytes.
uploaded = files.upload()
print('Uploading sequences...')

# Nothing selected (or the upload was cancelled): fail with a clear message.
# This is checked explicitly, rather than by wrapping everything in a broad
# try/except, so that a genuine failure while writing the file below is
# reported as what it is instead of being mislabelled 'No file uploaded'.
if not uploaded:
  raise Exception('No file uploaded')

# get filename - only the FIRST uploaded file is analysed
fn = next(iter(uploaded))

# this ENSURES we overwrite an existing
# file if it was there before...
with open(fn,'wb') as fh:
  fh.write(uploaded[fn])


# read sequences

# Options handed to protfasta: duplicate headers are allowed, records are
# returned as a list (each entry is indexed below as [0]=header,
# [1]=sequence), and invalid sequences use the 'convert' action
# (NOTE(review): see the protfasta docs for exactly what 'convert' rewrites).
fasta_read_options = dict(
  expect_unique_header=False,
  return_list=True,
  invalid_sequence_action='convert',
)

try:
  input_seqs = protfasta.read_fasta(fn, **fasta_read_options)
except Exception as parse_error:
  # give the user some guidance first, then re-raise so the cell stops here
  print('ERROR: An exception occured when parsing your FASTA file.\n\nSorry about that! Please make sure you FASTA file is an appropriately formatted\nFASTA file, the error message below may help but if not please report this\nerror on the metapredict issue tracker:\n\nhttps://github.com/idptools/metapredict/issues ')
  raise Exception(parse_error)

print(f'Read in FASTA file and found {len(input_seqs)} sequences')
end = time.time()

# if we get here the FASTA file parsed without raising, but it may still
# have contained no records. The summary printed at the end of this cell
# divides by the number of proteins, so stop now with a clear message rather
# than failing later with a ZeroDivisionError.
if len(input_seqs) == 0:
  raise Exception('No sequences found in the uploaded FASTA file - nothing to predict')

start = time.time()

# timestamp of when this analysis was run.
# NOTE: nothing in this cell writes an output file or otherwise uses this
# string; it is kept in case a downstream cell relies on it.
now = datetime.now()
now_string = now.strftime("%d_%m_%Y_%H_%M_%S")

# not populated in this cell; kept for the same reason as now_string
disorder_out = {}

# note - we key on the record's position (idx) here because we don't require
# FASTA headers to be unique. Each record is indexed as [0]=header,
# [1]=sequence.
#
# idx2name: commas in the header are swapped for semicolons so the name would
# be safe to place in a CSV field.
idx2name = {idx: record[0].replace(',',';') for idx, record in enumerate(input_seqs)}

# clean_seqs is a unique index-to-sequence mapping, even if sequences are not
# unique OR the names associated with the sequences are not unique
clean_seqs = {idx: record[1] for idx, record in enumerate(input_seqs)}

# run the batch predictor over every sequence; return_domains=True is passed
# because the summary code reads .disordered_domains and .sequence from each
# returned entry.
batch_out = meta.predict_disorder_batch(clean_seqs,return_domains=True)

# Per-protein tallies, collected in the iteration order of batch_out.
disorder_per_protein = []       # fraction of each protein inside disordered domains
disorder_res_per_protein = []   # summed length of each protein's disordered domains

for prediction in batch_out.values():
    # total length of all disordered domains predicted for this protein
    n_disordered = np.sum([len(domain) for domain in prediction.disordered_domains])
    frac_disordered = n_disordered/len(prediction.sequence)

    disorder_per_protein.append(frac_disordered)
    disorder_res_per_protein.append(n_disordered)

# c = number of proteins with a non-zero disordered fraction
c = sum(1 for frac in disorder_per_protein if frac > 0)

# --- report ---
print('')
print('OUTPUT:')
print(f"Fraction of proteins with disordered regions {c/len(batch_out):.3f} ({c}/{len(batch_out)})")

# summary statistics of the per-protein disordered fraction
print('')
print(f"Mean fraction disordered across all proteins {np.mean(disorder_per_protein):.3f}")
print(f"Median fraction disordered across all proteins {np.median(disorder_per_protein):.3f}")
print(f"Max fraction disordered across all proteins {np.max(disorder_per_protein):.3f}")
print(f"Min fraction disordered across all proteins {np.min(disorder_per_protein):.3f}")

# summary statistics of the per-protein disordered residue count; int()
# truncates, so the mean and median are reported as whole residues
print('')
print(f"Mean number of disordered residues across all proteins {int(np.mean(disorder_res_per_protein))}")
print(f"Median number of disordered residues across all proteins {int(np.median(disorder_res_per_protein))}")
print(f"Max number of disordered residues across all proteins {int(np.max(disorder_res_per_protein))}")
print(f"Min number of disordered residues across all proteins {int(np.min(disorder_res_per_protein))}")


