## About
This notebook computes predicted ensemble average properties for the human IDR-ome, and generates a SHEPHARD-compliant file. That file is then used to generate the panels in **Fig. 4** of Lotthammer et al.

In [13]:
import datetime

import numpy as np

from afrc import AnalyticalFRC
from sparrow import Protein
from shephard import interfaces, apis
from sparrow.predictors import batch_predict

In [7]:
# Directory holding the proteome FASTA and the IDR domains file (both available
# from https://github.com/holehouse-lab/shephard-data). The paths below are
# built as f'{rootdir}/<filename>', so an empty string would point at the
# filesystem root; default to the current working directory instead.
rootdir = '.'

In [48]:
# Read the proteome FASTA into a SHEPHARD proteome object
proteome_fasta = f'{rootdir}/human_proteome_clean.fasta'
human_proteome = apis.uniprot.uniprot_fasta_to_proteome(proteome_fasta)

# Attach the pre-computed IDR annotations to the proteome as domains
idr_domains_file = f'{rootdir}/shprd_domains_idrs.tsv'
interfaces.si_domains.add_domains_from_file(human_proteome, idr_domains_file)

### Extract out IDRs
The code below extracts the human IDR-ome so that we can run batch predictions on it.

In [54]:
# Build a {key: sequence} dictionary covering EVERY domain in the proteome.
# The key "<unique_ID>_<start>_<end>" is rebuilt in the same way when the
# predictions are written back onto the domains, so all domains must be
# included here (a debugging slice such as [0:10] would make that later
# lookup fail with a KeyError).
idrs = {}
for d in human_proteome.domains:
    name = f"{d.protein.unique_ID}_{d.start}_{d.end}"
    idrs[name] = d.sequence

## Run the predictions
For clarity, we explicitly define the network version being used here, although in general we recommend using the latest network version.

In [29]:
network_version = 2

# (network name passed to batch_predict, label used in the timing message).
# The trailing space in 'Asphericity ' keeps the printed message unchanged.
networks = (
    ('rg', 'Rg'),
    ('re', 'Re'),
    ('scaled_rg', 'Scaled rg'),
    ('scaled_re', 'Scaled re'),
    ('scaling_exponent', 'Scaling exponent'),
    ('prefactor', 'Scaling prefactor'),
    ('asphericity', 'Asphericity '),
)

# Run each network in turn over the same set of IDRs, reporting how long each took
predictions = {}
for network, label in networks:
    start = datetime.datetime.now()
    predictions[network] = batch_predict.batch_predict(idrs, network=network, version=network_version)
    end = datetime.datetime.now()
    print(f'{label} took {end-start}s')

# Expose the results under the names used by the rest of the notebook
predicted_rg = predictions['rg']
predicted_re = predictions['re']
predicted_scaled_rg = predictions['scaled_rg']
predicted_scaled_re = predictions['scaled_re']
predicted_scaling = predictions['scaling_exponent']
predicted_prefactors = predictions['prefactor']
predicted_asph = predictions['asphericity']




100%|███████████████████████████████████████████| 27/27 [00:01<00:00, 17.94it/s]
100%|███████████████████████████████████████| 1052/1052 [00:50<00:00, 20.97it/s]


Rg took 0:00:51.741073s


100%|███████████████████████████████████████████| 27/27 [00:01<00:00, 18.22it/s]
100%|███████████████████████████████████████| 1052/1052 [00:48<00:00, 21.48it/s]


Re took 0:00:50.606831s


100%|███████████████████████████████████████| 1079/1079 [00:53<00:00, 20.35it/s]


Scaled rg took 0:00:53.110851s


100%|███████████████████████████████████████| 1079/1079 [00:53<00:00, 20.29it/s]


Scaled re took 0:00:53.370196s


100%|███████████████████████████████████████| 1079/1079 [01:36<00:00, 11.14it/s]


Scaling exponent took 0:01:36.925283s


100%|███████████████████████████████████████| 1079/1079 [00:59<00:00, 18.19it/s]


Scaling prefactor took 0:00:59.331811s


100%|███████████████████████████████████████| 1079/1079 [01:48<00:00,  9.96it/s]

Asphericity  took 0:01:48.422547s





In [49]:
# Write the predicted properties onto every domain as SHEPHARD attributes.
# Requires numpy (np) for rounding.
for idx, d in enumerate(human_proteome.domains):

    # Same "<unique_ID>_<start>_<end>" key that was used to submit the sequence
    name = f"{d.protein.unique_ID}_{d.start}_{d.end}"

    # NOTE(review): element [1] of each prediction entry is taken to be the
    # predicted value - confirm against the batch_predict return format.
    rg = np.round(predicted_rg[name][1], 2)
    re = np.round(predicted_re[name][1], 2)

    d.add_attribute('radius_of_gyration', rg)
    d.add_attribute('radius_of_gyration_sn', np.round(predicted_scaled_rg[name][1], 2))

    d.add_attribute('end_to_end_distance', re)
    d.add_attribute('end_to_end_distance_sn', np.round(predicted_scaled_re[name][1], 2))

    d.add_attribute('scaling_exponent', np.round(predicted_scaling[name][1], 4))
    d.add_attribute('scaling_prefactor', np.round(predicted_prefactors[name][1], 4))
    d.add_attribute('asphericity', np.round(predicted_asph[name][1], 3))

    # Normalize the (already rounded) predicted dimensions by the AFRC
    # reference values computed for the same sequence
    afrcp = AnalyticalFRC(d.sequence)

    d.add_attribute('rg_afrc_norm', rg/afrcp.get_mean_radius_of_gyration())
    d.add_attribute('re_afrc_norm', re/afrcp.get_mean_end_to_end_distance())

    # Lightweight progress report every 500 domains
    if idx % 500 == 0:
        print(f"On {idx}")


On 0
On 500
On 1000
On 1500
On 2000
On 2500
On 3000
On 3500
On 4000
On 4500
On 5000
On 5500
On 6000
On 6500
On 7000
On 7500
On 8000
On 8500
On 9000
On 9500
On 10000
On 10500
On 11000
On 11500
On 12000
On 12500
On 13000
On 13500
On 14000
On 14500
On 15000
On 15500
On 16000
On 16500
On 17000
On 17500
On 18000
On 18500
On 19000
On 19500
On 20000
On 20500
On 21000
On 21500
On 22000
On 22500
On 23000
On 23500
On 24000
On 24500
On 25000
On 25500
On 26000
On 26500
On 27000
On 27500
On 28000
On 28500
On 29000
On 29500
On 30000
On 30500
On 31000
On 31500
On 32000
On 32500
On 33000
On 33500


In [53]:
# Save the annotated domains; the f-string embeds the network version in the
# filename (without the f prefix the name would contain the literal text
# "{network_version}").
interfaces.si_domains.write_domains(human_proteome, f'IDRs_with_properties_v{network_version}.tsv')