# Batch disorder predictions using metapredict
#### *Version 1.2* (updated 2022-08-30)
This notebook provides a simple interface for performing batch disorder predictions using metapredict V2.

For more information on how metapredict works, [please see our preprint]().


## TL;DR
Upload a FASTA file, get a CSV file with per-residue disorder scores for the sequences. There is no limit on the number of sequences that can be submitted, but in general Google Colab notebooks can crash...

## Known issues:
Some anti-tracking tools and other plugins designed to prevent ads will impact the ability of the notebook to work.

Known errors include:

* `TypeError: google.colab._files is undefined`

To diagnose this, we suggest visiting the notebook in an Incognito window, noting you'll still need to sign in.

## More info
More details at the end of this page!

In [None]:
#@title Download metapredict
#@markdown Press play to download metapredict.
# Setup cell: installs metapredict (and protfasta), imports everything the
# prediction cell relies on, and reports how long the setup took.
import time
# start the setup timer before the installs so they are included in the total
start = time.time()
# install then import metapredict
# (the leading `!` runs the line as a shell command from the notebook, so
# this cell only works inside a notebook environment, not as a plain script)
!pip install metapredict --quiet

# included for good measure but metapredict should already pull this in
# as a dependency!
!pip install protfasta --quiet
import metapredict as meta
# Colab helper used for uploading the FASTA file and downloading the results
from google.colab import files

# import other goodies
# NOTE(review): re, os and randint are not used by the cells visible here -
# confirm nothing else needs them before removing.
import re
import os
from random import randint
import protfasta
from datetime import datetime
# (time was already imported at the top of this cell; re-importing is harmless)
import time
import numpy as np
end = time.time()
print(f'Packages installed and ready to go (setup took {np.round(end-start,2)} seconds)!')



In [None]:
#@title Choose a `.fasta` file to make predictions.
#@markdown Press the play button then choose the .fasta file containing sequences you'd like to predict disorder for. Your browser will download the disorder prediction results!
#@markdown The file will download as `disorder_scores_<date_and_time>.csv`.

# Prediction cell: upload a FASTA file, predict per-residue disorder for every
# sequence in it, write the results to a CSV file and trigger a download.
#
# Relies on names imported by the setup cell: time, datetime, np, files,
# protfasta and meta.

# NOTE: the timer starts before the upload dialog, so the execution time
# reported at the end includes the time spent choosing/uploading the file.
start = time.time()

# upload and save
uploaded = files.upload()
print('Uploading sequences...')

# nothing came back from the upload dialog, so there is nothing to predict
if not uploaded:
    raise Exception('No file uploaded')

# only the first uploaded file is used. Writing it out in 'wb' mode ENSURES
# we overwrite an existing file of the same name if it was there before...
fn = next(iter(uploaded))
with open(fn, 'wb') as fh:
    fh.write(uploaded[fn])

# read sequences
try:
    input_seqs = protfasta.read_fasta(fn, expect_unique_header=False, return_list=True, invalid_sequence_action='convert')
except Exception:
    print('ERROR: An exception occured when parsing your FASTA file.\n\nSorry about that! Please make sure you FASTA file is an appropriately formatted\nFASTA file, the error message below may help but if not please report this\nerror on the metapredict issue tracker:\n\nhttps://github.com/idptools/metapredict/issues ')
    # bare raise keeps the original exception type and traceback intact
    raise

n_seqs = len(input_seqs)
print(f'Read in FASTA file and found {n_seqs} sequences')

# if we get here assume we've read things in OK...

# get datetime string for output file - this helps avoid overwriting
# and tells people when they generated the file!
now_string = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")

# maps sequence index -> [name, disorder scores, sequence].
# note - we key on the index because we don't require FASTA headers to be
# unique.
disorder_out = {}

print('Predicting disorder scores...')
for idx, entry in enumerate(input_seqs):

    # progress report every 100 sequences (but not for the very first one)
    if idx and idx % 100 == 0:
        print(f'On sequence {idx} of {n_seqs}')

    # remove commas from the name so they cannot add columns to the CSV
    name = entry[0].replace(',', ';')
    seq = entry[1]

    # use a try/except block so that IF things fail we only
    # skip that sequence and can continue on our merry way with
    # the remaining sequences
    try:
        dis = meta.predict_disorder(seq)
    except Exception:
        print(f'Failed on >{name}\n{seq}\n\n ({idx} of {n_seqs})\nSkipping...\n\n')
        continue

    # save to the out dict
    disorder_out[idx] = [name, dis, seq]

# write the results once, after every sequence has been processed. The file
# is created even when no prediction succeeded so the download below always
# has something to send.
outstring = f'disorder_scores_{now_string}.csv'
with open(outstring, 'w') as fh:
    for seq_idx, (name, disorder, seq) in disorder_out.items():

        # convert the per-residue scores to a comma-separated string
        disorder_string = ", ".join(str(score) for score in disorder)

        # one line per sequence: index, name, sequence, then the scores
        fh.write(f"{seq_idx}, {name}, {seq}, {disorder_string}\n")

end = time.time()
elapsed = end - start
n_res = sum(len(entry[1]) for entry in input_seqs)

print('\nPerformance statistics:')
print('----------------------------------')
print(f'Execution time was {time.strftime("%H:%M:%S", time.gmtime(elapsed))} (hr:min:sec)')

# the per-item rates are undefined for an empty input, so skip them rather
# than dividing by zero
if n_res:
    seconds_per_residue = np.round(elapsed / n_res, 7)
    print(f'{seconds_per_residue} seconds per residue')
if n_seqs:
    seconds_per_sequence = np.round(elapsed / n_seqs, 3)
    print(f'{seconds_per_sequence} seconds per sequence')


# finally prompt the output file
files.download(outstring)
print('Done!')


