## Build all LCDs as SHEPHARD domains
##### Last updated 2022-09-01
This notebook provides code for building all single-amino acid or chemically consistent low-complexity domains.

Note that to use this code requires the Python package [sparrow](https://github.com/idptools/sparrow). sparrow is in active development and we'd encourage you to avoid integrating it into your standard workflow, however, we have provided it so all analysis in the paper can be unambigiously reproduced with ease.

In [1]:
from shephard.apis import uniprot
from shephard import interfaces
from sparrow import Protein

In [2]:
# name of a FASTA file from uniprot. The example here uses the cleaned human proteome
# - i.e., the human proteome with proteins that lack non-standard amino acids, but this
# could be any FASTA file generated from UniProt (e.g. mouse proteome etc)
filename = '../../shprd_data/human_proteome_validated.fasta'

In [3]:
# read in FASTA file from uniprot
human_proteome = uniprot.uniprot_fasta_to_proteome(filename)

In [4]:
residue_groupings = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

for r in residue_groupings:
    for p in human_proteome:
        s = Protein(p.sequence)

        # get pLCDs from the sequencd
        b = s.low_complexity_domains(mode='holt', residue_selector=r, max_interruption=5, minimum_length=20, fractional_threshold=0.25)

        # if we found 1 or more pLCDs...
        if len(b) >0:
            for d in b:
                p.add_domain(d[1]+1, d[2], f'{r}_at_25pct')

    
    
    

In [5]:
residue_groupings = ['ILVMA','EDRK','RGYWF','PEST']

for r in residue_groupings:
    for p in human_proteome:
        s = Protein(p.sequence)

        # get pLCDs from the sequencd
        b = s.low_complexity_domains(mode='holt', residue_selector=r, max_interruption=5, minimum_length=20, fractional_threshold=0.5)

        # if we found 1 or more pLCDs...
        if len(b) >0:
            for d in b:
                p.add_domain(d[1]+1, d[2], f'{r}_at_50pct')

    
    
    

In [6]:
residue_groupings = ['ILVMA','EDRK','RGYWF','PEST']

for r in residue_groupings:
    for p in human_proteome:
        s = Protein(p.sequence)

        # get pLCDs from the sequencd
        b = s.low_complexity_domains(mode='holt', residue_selector=r, max_interruption=5, minimum_length=20, fractional_threshold=0.75)

        # if we found 1 or more pLCDs...
        if len(b) >0:
            for d in b:
                p.add_domain(d[1]+1, d[2], f'{r}_at_50pct')

    
    
    

ProteinException: Domain [ILVMA_at_50pct_193_220] already found in proteins sp|A6NCQ9|RN222_HUMAN RING finger protein 222 OS=Homo sapiens OX=9606 GN=RNF222 PE=4 SV=1

In [None]:
residue_groupings = ['ED','RK']

for r in residue_groupings:
    for p in human_proteome:
        s = Protein(p.sequence)

        # get pLCDs from the sequencd
        b = s.low_complexity_domains(mode='holt', residue_selector=r, max_interruption=5, minimum_length=20, fractional_threshold=0.5)

        # if we found 1 or more pLCDs...
        if len(b) >0:
            for d in b:
                p.add_domain(d[1]+1, d[2], f'{r}_at_50pct')

    
    
    

In [None]:
residue_groupings = ['YWF']

for r in residue_groupings:
    for p in human_proteome:
        s = Protein(p.sequence)

        # get pLCDs from the sequencd
        b = s.low_complexity_domains(mode='holt', residue_selector=r, max_interruption=5, minimum_length=20, fractional_threshold=0.25)

        # if we found 1 or more pLCDs...
        if len(b) >0:
            for d in b:
                p.add_domain(d[1]+1, d[2], f'{r}_at_25pct')

    
    
    

In [None]:
interfaces.si_domains.write_domains(human_proteome, 'shprd_domains_human_LCDs_all.tsv')

In [None]:
d = human_proteome.get_domains_by_type('PEST_at_50pct')