## About
This notebook uses the precomputed full-protein sequence alignments to extract paralogous/orthologous IDRs. It unavoidably makes extensive use of internal Holehouse lab tools, to the extent that extracting that code from this notebook is not trivial.

If you are reading this and you want intermediate data that you can't access, please message Alex and he'll rewrite this so that the intermediate data is read in instead of being calculated. However, this would require a major restructuring of the code, so I didn't want to spend the time doing this until someone needed it. Happy to do it though :).

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import pickle

import protfasta
import metapredict as meta

from finches import Mpipi_frontend
from finches.data import fingerprints
# Shared Mpipi frontend instance; its epsilon() method is called further down
# to score each orthologous IDR against the fingerprint sequences.
mf = Mpipi_frontend()

In [None]:
from yeastevo import Pillars
from housetools.sequence_tools.sequence_conservation import ConservationCalculator
from housetools.interfaces.eggnog import nogtools
import yeastevo

# instantiate the conservation calculator up front
CC = ConservationCalculator()

# construct the Pillars object that serves the sequence and alignment data
fungi_matrix = Pillars()

# collect every S. cerevisiae ID (YSN) for which aligned sequences exist
all_valid_IDs = fungi_matrix.all_aligned_scerevisiae_YSNs()

# map each YSN onto its S. cerevisiae amino acid sequence
ysn2seq = {}
for ysn in all_valid_IDs:
    # take element [1] of the first 'Scerevisiae' entry
    # (presumably an (identifier, sequence) pair - confirm against yeastevo)
    scer_entries = fungi_matrix.get_orthologs(ysn, silent=True)['Scerevisiae']
    ysn2seq[ysn] = scer_entries[0][1]

In [None]:
# OPTIONS - settings that can change the notebook's output are defined here

# metapredict network version; passed as `version=` to every
# meta.predict_disorder() call and embedded in the output file/directory names
metapredict_version = 'v3'

# shortest IDR length (counted without alignment gap characters) intended for
# an orthologous IDR to be kept. Embedded in the output pickle file names; the
# default of get_orthologous_idrs(min_idr_length=...) mirrors this value.
min_idr_length = 50

# minimum level of disorder we'll accept for an orthologous "IDR" (identified by
# excising equivalent positions based on the S. cerevisiae IDR). Embedded in the
# output pickle file names; the default of
# get_orthologous_idrs(fraction_disordered_threshold=...) mirrors this value.
fraction_disordered = 0.8

# intended to define whether we generate output pickle files with globally
# aligned or unaligned sequences
# NOTE(review): this flag is not referenced anywhere else in the visible
# notebook - confirm whether it is still needed.
remove_gaps = True


In [None]:
# Run metapredict over every S. cerevisiae sequence so that IDR boundaries can
# be looked up per YSN later on (return_domains=True gives back domain-level
# objects rather than bare scores). Expect this to take ~20-30 seconds.
ysn2disorder = meta.predict_disorder(
    ysn2seq,
    version=metapredict_version,
    return_domains=True,
    show_progress_bar=True,
)

In [None]:
def get_orthologous_idrs(ysn, fungi_matrix, ysn2disorder, min_idr_length=50, fraction_disordered_threshold=0.8):
    """
    Extract orthologous IDRs using S. cerevisiae as the reference frame.

    Algorithmically, this function does the following:

    1. Using S. cerevisiae as the reference, takes the start/end of each IDR in
    the protein associated with the input gene name (ysn) from
    ``ysn2disorder[ysn].disordered_domain_boundaries``.

    2. Gets the corresponding subregions from the orthologs via
    ``nogtools.get_alignment_subregion_v2``. Each region is fetched twice: once
    with the helper's default settings (used for length checks and disorder
    prediction) and once with ``remove_gaps=False`` (the version that is
    returned).

    3. Filters those orthologous IDRs. First, sequences for which the excised
    region is empty are dropped. Second, disorder is predicted for the
    remaining regions and only those that are strictly more than
    ``fraction_disordered_threshold`` disordered AND strictly longer than
    ``min_idr_length`` residues are kept. The S. cerevisiae reference entry is
    always kept, regardless of these two filters.

    The return structure is

        idr_dict_filtered[IDR_IDENTIFIER][SEQUENCE_KEY] = aligned IDR sequence

    Note that this function relies on the notebook-level names ``nogtools``,
    ``meta`` and ``metapredict_version`` (so the same metapredict version is
    used here as for the initial S. cerevisiae prediction).

    Parameters
    -----------

    ysn : str
        Yeast systematic name of the S. cerevisiae gene of interest.

    fungi_matrix : yeastevo.pillars.Pillars
        Pre-constructed and initialized Pillars object from the yeastevo
        repository.

    ysn2disorder : dict
        Dictionary that maps the ysn to a DisorderObject, which holds the
        disordered domains and their boundaries.

    min_idr_length : int
        Ortholog regions must be strictly longer than this to be kept.

    fraction_disordered_threshold : float
        Ortholog regions must have strictly more than this fraction of their
        residues inside predicted disordered domains to be kept. Must lie
        between 0 and 1.

    Returns
    -----------
    dict
        Dictionary keyed by ``IDR{X}_{START}_{END}``, where X is a counter that
        starts at 1 and START/END are the (inclusive) positions of the IDR in
        the S. cerevisiae protein. Each value is a dictionary mapping the
        alignment's sequence keys to the corresponding region taken from the
        ``remove_gaps=False`` extraction (i.e. presumably with alignment gap
        characters retained). If the S. cerevisiae region itself is empty the
        value is an empty dictionary.

    Raises
    -----------
    ValueError
        If ``fraction_disordered_threshold`` is outside [0, 1] or
        ``min_idr_length`` is smaller than 1.

    Exception
        If no key containing 'Scerevisiae' is found in the alignment.

    """

    if fraction_disordered_threshold < 0 or fraction_disordered_threshold > 1:
        raise ValueError('Fraction disordered only makes sense between 0 and 1')

    if min_idr_length < 1:
        raise ValueError('Shortest IDR length only makes sense as a positive integer')

    # get the aligned sequences (reference + orthologs) for this gene
    seq_dict = fungi_matrix.get_aligned_scerevisiae_sequences(ysn)

    ## get the S. cerevisiae protein key from the seq_dict
    # Note this warns if we find multiple Scerevisiae keys but always chooses
    # the first in the list.
    possible_yeast_indices = [x for x in seq_dict.keys() if 'Scerevisiae' in x]

    if not possible_yeast_indices:
        raise Exception('Should not get here...')

    reference_key = possible_yeast_indices[0]
    if len(possible_yeast_indices) > 1:
        print(possible_yeast_indices)
        print(f'Found multiple S. cerevisiae indices; using {reference_key}')

    # build out the orthologous IDRs, both in the helper's default form and in
    # the aligned (remove_gaps=False) form
    idr_dict = {}
    idr_dict_aligned = {}

    # for each IDR in S. cerevisiae
    for idr_idx, boundaries in enumerate(ysn2disorder[ysn].disordered_domain_boundaries, start=1):

        start = boundaries[0]
        end = boundaries[1] - 1  # note we do a -1 because the boundaries are in Python slice syntax

        idr_name = f"IDR{idr_idx}_{start}_{end}"

        idr_dict[idr_name] = nogtools.get_alignment_subregion_v2(seq_dict,
                                                                 reference_key,
                                                                 start=start,
                                                                 end=end)

        idr_dict_aligned[idr_name] = nogtools.get_alignment_subregion_v2(seq_dict,
                                                                         reference_key,
                                                                         start=start,
                                                                         end=end,
                                                                         remove_gaps=False)

    # now we filter through, note we ALWAYS include S. cerevisiae
    idr_dict_filtered = {}

    # for each IDR in our newly built IDR dictionary
    for idr_name, regions in idr_dict.items():

        # remove orthologs where no IDR residues were found
        nonempty = {}
        nonempty_aligned = {}
        for k in regions:

            # WEIRD situation where for some reason the cerevisiae IDR is length 0? This in theory should
            # never happen so we warn about it but deal
            if k == reference_key and len(regions[k]) == 0:
                print(f'WARNING: Somehow  S. cervisiae IDR was length 0 for {k}. Skipping...')
                nonempty = {}
                nonempty_aligned = {}
                break

            # if the region has at least one residue
            if len(regions[k]) > 0:
                nonempty[k] = regions[k]
                nonempty_aligned[k] = idr_dict_aligned[idr_name][k]

        # nothing to predict on (e.g. the reference IDR was skipped above), so
        # record an empty result rather than calling metapredict on no input
        if not nonempty:
            idr_dict_filtered[idr_name] = {}
            continue

        # predict disorder (note we use same metapredict version as used earlier!)
        predictions = meta.predict_disorder(nonempty,
                                            return_domains=True,
                                            silence_warnings=True,
                                            version=metapredict_version)

        # finally filter based on IDR length and fraction disordered
        kept = {}
        for k in predictions:

            # if we're on the S. cerevisiae protein include regardless of anything else...
            if k == reference_key:
                kept[k] = nonempty_aligned[k]
                continue

            # fraction of the region that falls within a predicted disordered domain
            region_length = len(predictions[k].sequence)
            disordered_residues = sum(len(x) for x in predictions[k].disordered_domains)
            frac_disordered = disordered_residues / region_length

            # keep the ortholog only if it is sufficiently disordered and sufficiently long
            if frac_disordered > fraction_disordered_threshold and region_length > min_idr_length:
                kept[k] = nonempty_aligned[k]

        idr_dict_filtered[idr_name] = kept

    return idr_dict_filtered
    

In [None]:
# ysn -> {IDR key -> per-fingerprint variance of length-normalised epsilon}
ysn2idrvar = {}

# ysn -> {IDR key -> {sequence key -> aligned IDR sequence}} for the IDRs used
ysn2idrs = {}

# an IDR is only analysed if strictly more than this many sequences survive filtering
min_ortholog_count = 15

# the fingerprint sequences do not change between IDRs, so look them up once
# (same iteration order as looping over fingerprints.mpipi_fingerprints directly)
fingerprint_seqs = [fingerprints.mpipi_fingerprints[f] for f in fingerprints.mpipi_fingerprints]

# for each YSN calculate variance in epsilon values across orthologous IDRs
for ysn in tqdm(all_valid_IDs):

    # get filtered orthologous IDRs associated with this YSN. The notebook-level
    # options are passed explicitly so that the thresholds actually applied
    # always match the ones written into the output file names.
    orth_idrs = get_orthologous_idrs(ysn,
                                     fungi_matrix,
                                     ysn2disorder,
                                     min_idr_length=min_idr_length,
                                     fraction_disordered_threshold=fraction_disordered)

    idr2eps_var = {}
    idr2eps_count = {}  # never filled in this cell; kept so the name remains defined
    tmp_ysn2idrs = {}

    # for each orthologous IDR calculate variance in epsilon values across the orthologs
    for idr_key in orth_idrs:
        orthologs = orth_idrs[idr_key]

        # skip IDRs with too few sequences
        if len(orthologs) <= min_ortholog_count:
            continue

        tmp_ysn2idrs[idr_key] = orthologs

        # one row per sequence, one column per fingerprint
        ortholog_matrix = []
        for k in orthologs:

            # strip alignment gap characters before computing epsilon
            local_seq = orthologs[k].replace('-', '')

            # epsilon against every fingerprint, normalised by IDR length
            ortholog_matrix.append([mf.epsilon(local_seq, f_seq) / len(local_seq)
                                    for f_seq in fingerprint_seqs])

        # variance across sequences, separately for each fingerprint
        idr2eps_var[idr_key] = np.var(np.array(ortholog_matrix), axis=0)

    ysn2idrvar[ysn] = idr2eps_var
    ysn2idrs[ysn] = tmp_ysn2idrs
    
    
        
    

In [None]:
# Save the per-YSN epsilon-variance dictionary; the file name records the
# metapredict version and the disorder/length options.
idrvar_pickle = f'generated_data/ysn2idrvar_metapredict_{metapredict_version}_disorder_{fraction_disordered}_{min_idr_length}.pkl'
with open(idrvar_pickle, 'wb') as handle:
    pickle.dump(ysn2idrvar, handle)

In [None]:
# Save the per-YSN orthologous IDR sequences.
# NOTE(review): the metapredict version appears twice in this file name. The
# FASTA-generation cell further down reads the identical name, so the two must
# be changed together if this is ever cleaned up.
idrs_pickle = f'generated_data/ysn2idrs_{metapredict_version}_{metapredict_version}_disorder_{fraction_disordered}_{min_idr_length}.pkl'
with open(idrs_pickle, 'wb') as handle:
    pickle.dump(ysn2idrs, handle)

## Finally generate sequence data
Finally, we save out FASTA files for the aligned and unaligned IDRs.

In [2]:
# NB: REBUILD is left as False because the FASTA files do not need to be
# regenerated on every run; flip it to True to write them out again.
REBUILD = False

if REBUILD:

    IN_PICKLE = f'generated_data/ysn2idrs_{metapredict_version}_{metapredict_version}_disorder_{fraction_disordered}_{min_idr_length}.pkl'
    OUTDIR = f'idr_msas/metapredict_{metapredict_version}/full_protein_aligned/'

    # load the IDR dictionary that was pickled earlier in this notebook
    with open(IN_PICKLE, 'rb') as handle:
        data = pickle.load(handle)

    ## Pass 1: write the ALIGNED sequences, i.e. the IDR columns taken from the
    ## alignment that was computed on the full-length proteins
    for ysn, idr_map in data.items():
        for idr, aligned_seqs in idr_map.items():
            protfasta.write_fasta(aligned_seqs, f'{OUTDIR}/ALIGNED_{ysn}_{idr}.fasta')

    ## Pass 2: write JUST the ortholog IDR sequences with the gap characters
    ## stripped - these can then be re-aligned after the fact with clustal-omega
    OUTDIR = f'idr_msas/metapredict_{metapredict_version}/'
    for ysn, idr_map in data.items():
        for idr, aligned_seqs in idr_map.items():
            ungapped = {name: seq.replace('-', '') for name, seq in aligned_seqs.items()}
            protfasta.write_fasta(ungapped, f'{OUTDIR}/{ysn}_{idr}.fasta')
