## Local IDRome constructor

* Version 1.0 (initial release, May 27th 2023)

This notebook enables a complete IDR-ome annotation to be generated from a UniProt FASTA file.

Specifically, this notebook will:

1. Predict all IDRs
2. Calculate sequence properties for each IDR
3. Predict ensemble properties using ALBATROSS
4. Return a CSV file with all this information for easy exploration.


### Input:
The only input file required is a FASTA file obtained from UniProt.

### Output:
Once the notebook is complete, a CSV file called `local_IDRome_all.csv` will be written.

### Performance:
If GPU credits are available, the human proteome takes ~1 minute. If no GPU credits are available, the human proteome takes closer to 6-7 minutes.

In [13]:
#@title Setup 

from google.colab import files
import io
import protfasta

from sparrow import Protein
from shephard.apis import fasta, uniprot
from shephard.apis import metapredict
from sparrow.predictors import batch_predict
import numpy as np

In [14]:
%%time


# Path to the input FASTA file (the example run used the UniProt human proteome)
input_fasta_file = '<human proteome FASTA file>'

# Read the FASTA file into a proteome object, then let metapredict annotate it
# with disordered domains (the call's return value is not used; the domains
# are read back from `proteome.domains` below).
proteome = uniprot.uniprot_fasta_to_proteome(input_fasta_file)
metapredict.annotate_proteome_with_disordered_domains(proteome)

# Map "<protein ID>_<start>_<end>" -> domain sequence for every domain on the
# proteome. Commas are swapped for semicolons so the ID is safe to use as a
# CSV field later on.
data = {
    f"{idr.protein.unique_ID}_{idr.start}_{idr.end}".replace(',', ';'): idr.sequence
    for idr in proteome.domains
}





100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2154/2154 [05:08<00:00,  6.99it/s]


CPU times: user 4min 54s, sys: 24.7 s, total: 5min 19s
Wall time: 5min 18s


In [15]:
%%time
def _predict_with(network_name):
    """Run one batch-prediction network over every sequence in `data`."""
    return batch_predict.batch_predict(data, network=network_name)


# One prediction pass per ensemble property; each result is indexed by the
# same IDs used as keys in `data`.
rg = _predict_with('scaled_rg')
re = _predict_with('scaled_re')
asph = _predict_with('asphericity')
nu = _predict_with('scaling_exponent')
pref = _predict_with('prefactor')


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1079/1079 [00:54<00:00, 19.90it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1079/1079 [00:53<00:00, 20.07it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1079/1079 [01:49<00:00,  9.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1079/1079 [01:38<00:00, 10.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1079/1079 [01:01<00:00, 17.50it/s]

CPU times: user 5min 52s, sys: 30.5 s, total: 6min 22s
Wall time: 6min 18s





In [16]:

# Write one row per annotated domain: identifiers, predicted ensemble
# properties and sequence-derived properties.
#
# NOTE: fields are separated by ", " (comma followed by a space). When loading
# the file with pandas, pass `skipinitialspace=True` to `pd.read_csv` so that
# column names and values do not carry a leading space.
outname = 'local_IDRome_all.csv'

# Column names, in the same order as the per-domain fields written below.
column_names = [
    "IDR ID",
    "FASTA header",
    "UniProtID",
    "IDR start",
    "IDR end",
    "IDR len",
    "Rg (A)",
    "Re (A)",
    "asphericity",
    "scaling_exponent",
    "prefactor",
    "FCR",
    "NCPR",
    "kappa",
    "frac_negative",
    "fract_positive",
    "fract_aro",
    "fract_pro",
    "fract_polar",
    "fract_ali",
    "sequence",
]

# Opening in 'w' mode truncates any existing file, so no explicit removal is
# needed beforehand. The context manager guarantees the handle is closed even
# if a lookup or property calculation raises part-way through the loop.
with open(outname, 'w') as fh:
    fh.write(", ".join(column_names) + "\n")

    for d in proteome.domains:
        # The prediction dictionaries are indexed by this ID, so it has to be
        # built exactly like the keys of `data` (commas replaced by semicolons).
        name = f"{d.protein.unique_ID}_{d.start}_{d.end}".replace(',', ';')

        # Commas in the FASTA header would otherwise split the CSV field.
        fasta_header = d.protein.name.replace(',', ';')

        # Identifiers and predicted ensemble properties.
        # Each prediction entry is indexed with [1] for the numeric value
        # (presumably [0] holds the sequence -- TODO confirm against sparrow).
        fields = [
            f"{name}",
            f"{fasta_header}",
            f"{d.protein.unique_ID}",
            f"{d.start}",
            f"{d.end}",
            f"{len(d.sequence)}",
            f"{rg[name][1]:.2f}",
            f"{re[name][1]:.2f}",
            f"{asph[name][1]:.3f}",
            f"{nu[name][1]:.3f}",
            f"{pref[name][1]:.3f}",
        ]

        # Sequence-derived properties, rounded to three decimals.
        local_protein = Protein(d.sequence)
        fields += [
            f"{round(local_protein.FCR,3)}",
            f"{round(local_protein.NCPR,3)}",
            f"{round(local_protein.kappa,3)}",
            f"{round(local_protein.fraction_negative,3)}",
            f"{round(local_protein.fraction_positive,3)}",
            f"{round(local_protein.fraction_aromatic,3)}",
            f"{round(local_protein.fraction_proline,3)}",
            f"{round(local_protein.fraction_polar,3)}",
            f"{round(local_protein.fraction_aliphatic,3)}",
            f"{d.sequence}",
        ]

        fh.write(", ".join(fields) + "\n")

