## Notebook to build folded domains based on pLDDT
This notebook uses SHEPHARD to build the 'folded' domains, defined based on their predicted pLDDT scores. This will include large folded domains, but also short helices, as we set the minimum size of a 'domain' to 5 residues.

In [2]:
from shephard import interfaces
from shephard.apis import fasta
from shephard.tools import domain_tools

def pLDDT_binerize(vals, threshold=65):
    """
    Convert per-residue pLDDT scores into a binary (1/0) mask.

    Parameters
    ----------
    vals : iterable of numbers
        Per-residue pLDDT scores for a single sequence.

    threshold : int or float, optional
        Cutoff used to binarize the scores. A residue is flagged with 1
        only if its score is strictly greater than this value.
        Default = 65, which reproduces the original hard-coded cutoff.

    Returns
    -------
    list of int
        A list with one entry per input score: 1 where the score is
        greater than ``threshold`` and 0 otherwise. An empty input
        gives an empty list.
    """
    # strict '>' is deliberate: a score exactly equal to the threshold maps to 0
    return [1 if score > threshold else 0 for score in vals]

In [3]:
# Read in the yeast proteome from the FASTA file. use_header_as_unique_ID=True is
# passed so that (presumably) each FASTA header becomes the protein's unique ID,
# which the track file below would then be keyed on - TODO confirm against the data files.
yeast_proteome = fasta.fasta_to_proteome('data/yeast_sequence_dataset.fasta',use_header_as_unique_ID=True)
# Attach the per-residue predicted pLDDT scores to the proteome as tracks.
# mode='values' is assumed to load them as numerical (rather than symbolic) tracks - verify in the SHEPHARD docs.
interfaces.si_tracks.add_tracks_from_file(yeast_proteome,'data/pLDDT_scores_SHPRD.tsv', mode='values')


In [4]:
# Use the SHEPHARD build_domains_from_track_values function to define discrete domains based on the pLDDT scores.
#   'pLDDT'               - name of the track to read (presumably the track name used in the scores file loaded above)
#   pLDDT_binerize        - function defined in this notebook that maps each per-residue score to 1 or 0
#   'structured_domains'  - label passed for the resulting domains (presumably the domain type)
#   minimum_region_size=5 - smallest region kept as a 'domain', so short helices are retained too
# The result is stored in pLDDT_domains and later handed to add_domains_from_dictionary.
pLDDT_domains = domain_tools.build_domains_from_track_values(yeast_proteome,'pLDDT', pLDDT_binerize, 'structured_domains', minimum_region_size=5,)

On 500 of 5430
On 1000 of 5430
On 1500 of 5430
On 2000 of 5430
On 2500 of 5430
On 3000 of 5430
On 3500 of 5430
On 4000 of 5430
On 4500 of 5430
On 5000 of 5430


In [5]:
# Add the pLDDT-derived domains (pLDDT_domains) to the yeast proteome.
# NOTE: this call is given yeast_proteome itself and its result is not assigned, so
# re-running this cell on the same kernel may attempt to add the same domains twice.
interfaces.si_domains.add_domains_from_dictionary(yeast_proteome, pLDDT_domains)

In [6]:
# Write the domains annotated on the yeast proteome out to a file for future use
interfaces.si_domains.write_domains(yeast_proteome,'data/pLDDT_domains_SHPRD.tsv')