## Synthetic IDR evolution
NOTE - this notebook probably doesn't actually need to be run...

In [None]:
import os

import numpy as np
import matplotlib

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
	
# Set such that PDF fonts export in a manner that they
# are editable in illustrator/affinity
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# set to define axes linewidths
matplotlib.rcParams['axes.linewidth'] = 0.5

# this defines some prefactors so inline figures look nice
# on a retina macbook. These can be commented out without any
# issue and are solely asthetic.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# UPDATE 2020-12-31 (my preferred font is Avenir...)
font = {'family' : 'avenir',
    	'weight' : 'normal'}

matplotlib.rc('font', **font)
from sparrow import Protein
from sparrow.predictors import batch_predict

import protfasta

from finches import CALVADOS_frontend, Mpipi_frontend

# Force-field frontends from finches. `mf` (Mpipi) is the one used further down
# for intermolecular_idr_matrix() and interaction_figure().
# NOTE(review): `cf` (CALVADOS) is not referenced anywhere else in the visible
# notebook - confirm whether it is still needed.
mf = Mpipi_frontend()
cf = CALVADOS_frontend()
from sparrow import Protein

from shephard import interfaces, apis
from housetools import nucleic_acid_tools 

from mutagenesis import run_mutagenesis

from tqdm import tqdm

def compare(s1,s2, verbose=True):
    """
    Count the positions at which two sequences differ.

    Positions are walked over the length of ``s1`` and ``s2`` is indexed at
    the same position, so ``s2`` must be at least as long as ``s1``
    (otherwise an IndexError is raised); anything in ``s2`` beyond
    ``len(s1)`` is not examined.

    Parameters
    ----------
    s1 : str
        First sequence; defines the positions that are compared.
    s2 : str
        Second sequence, compared position-by-position against ``s1``.
    verbose : bool
        If True, print one line per differing position (1-indexed)
        followed by a summary line with the total number of changes.

    Returns
    -------
    int
        Number of positions where ``s1`` and ``s2`` differ.
    """
    n_mismatches = 0

    for idx, residue in enumerate(s1):
        partner = s2[idx]
        if residue != partner:
            n_mismatches += 1
            if verbose:
                # report positions using 1-based numbering
                print(f"{idx+1}: {residue} vs. {partner}")

    if verbose:
        print(f"{n_mismatches} changes")

    return n_mismatches

def average_mutations(lib, reference=None):
    """
    Mean number of positions at which the library sequences differ from a
    reference sequence.

    Parameters
    ----------
    lib : dict
        Mapping whose values are sequences; each value is compared against
        the reference with ``compare(..., verbose=False)``.
    reference : str, optional
        Sequence every library member is compared against. If omitted, the
        notebook-level ``abf1_idr`` is used, which must then be defined
        before this function is called.

    Returns
    -------
    float
        ``np.mean`` of the per-sequence difference counts.
    """
    if reference is None:
        # nb abf1_idr must be defined outside!
        reference = abf1_idr

    return np.mean([compare(reference, lib[k], verbose=False) for k in lib])

# Sequence passed to run_mutagenesis() further down as the starting sequence
idr2 = 'NNNNNNDGELSGTNLRSNSIDYAKHQEISSAGTSSNTTKNVNNNKNDSNDDNNGNNNNDASNLMESVLDKTSSHRYQPKKMPSVNKWSKPDQITHSDVSMVGLDESNDGGNENVHPTLAEVDAQEARETAQLAIDKINSYKRSIDDKNGDGHNNSSRNVVDENLINDMDSEDAHKSKRQHLSDITLEERNEDDKLPHEVAEQLRLLSSHLKEVE'

from housetools.sequence_tools.sequence_conservation import ConservationCalculator

from shephard.apis import uniprot

# Build the yeast proteome from the FASTA file, then annotate it with the
# domains (IDRs) file and the protein-attributes file (Abf1 interactors)
yp = uniprot.uniprot_fasta_to_proteome('data/s_cerevisiae_clean.fasta')
interfaces.si_domains.add_domains_from_file(yp,'data/shprd_domains_idrs_s_cerevisiae.tsv')
interfaces.si_protein_attributes.add_protein_attributes_from_file(yp, 'data/abf1_interactors_stringdb.tsv')

# NOTE: the Mpipi frontend (`mf`) is already instantiated earlier in this cell,
# so it is not created a second time here.

In [None]:
# Abf1 entry (ID P14164) looked up in the yeast proteome
abf1 = yp.protein('P14164')

# The two IDR sequences compared in this cell
rad7_idr = 'MYRSRNRPKRGGENEVKGPNSALTQFLREEGISAENIKQKWYQRQSKKQEDATDEKKGKAEDDSFTAEISRVVEDEEIDEIGTGSGTETERAQVSYDARMKLVPADSDEEEYETSHISDTPVSLSSANDRESLTKKRQNTAKIIQNRRRKRKRAADLLDRRVNKVSSLQSLCITKISENISKWQKEADESSKLVFNKLRDVLGGVST'
abf1_idr = 'NNNNNNDGELSGTNLRSNSIDYAKHQEISSAGTSSNTTKNVNNNKNDSNDDNNGNNNNDASNLMESVLDKTSSHRYQPKKMPSVNKWSKPDQITHSDVSMVGLDESNDGGNENVHPTLAEVDAQEARETAQLAIDKINSYKRSIDDKNGDGHNNSSRNVVDENLINDMDSEDAHKSKRQHLSDITLEERNEDDKLPHEVAEQLRLLSSHLKEVE'

# Mpipi-based intermolecular calculation between the two IDRs.
# NOTE(review): x[0][0] is used here as the interaction matrix - confirm
# against the return layout of intermolecular_idr_matrix().
x = mf.intermolecular_idr_matrix(abf1_idr, rad7_idr)
interaction_matrix = x[0][0]

# Collapse down the attractive (negative) entries only
abf1_idr1_attractive_interaction_score = np.sum(interaction_matrix[interaction_matrix < 0])
print(abf1_idr1_attractive_interaction_score)

In [None]:
# The code here selects candidate partner IDRs ("hits"): IDRs from the
# Abf1 interactors that are between 80 and 170 residues long and whose summed
# attractive interaction with abf1_idr is below -2000.

hits = []

# this is a list of uniprot IDs for the STRING-DB-defined interactors for Abf1
abf1_interactors = abf1.attribute('partners').split(', ')

# for each interactor
for interactor in abf1_interactors:

    # for each IDR in the interactor
    for idr in yp.protein(interactor).domains:

        idr_length = len(idr.sequence)

        # make sure IDRs are long enough!!
        if idr_length <= 80:
            continue

        # calculate the interaction matrix for IDR2 to the normal sequence
        # NOTE(review): x[0][0] is used as the interaction matrix - confirm
        # against the return layout of intermolecular_idr_matrix().
        x = mf.intermolecular_idr_matrix(abf1_idr, idr.sequence)

        # collapse down all the attractive (negative) interactions only
        attractive_score = np.sum(x[0][0][x[0][0] < 0])

        # safe=False means we can overwrite if we re-run this
        idr.add_attribute('idr2_interaction_score', attractive_score, safe=False)

        # keep IDRs that are shorter than 170 (they are already longer than 80)
        # and attractive enough (-2000 is a relatively arbitrary cutoff)
        if idr_length < 170 and attractive_score < -2000:
            hits.append(idr)

# make sure the output directory exists so figures can be written on a
# fresh checkout
os.makedirs('figures', exist_ok=True)

# generate intermaps for those hits
for idr in hits:
    mf.interaction_figure(abf1_idr, idr.sequence, fname=f'figures/{idr.protein.unique_ID}_{idr.start}_{idr.end}.pdf', zero_folded=False)

## Selection without compensation

In [None]:
# Show every selected hit followed by its sequence and a blank separator line
for hit in hits:
    print(hit, hit.sequence, "", sep="\n")

### Build synthetic libraries
The code below generates two synthetic libraries of orthologs for each hit: a real library (`real_library`) generated under selection, and a null library (`null_library`) generated with no selection, which uses a BIG delta threshold so that everything gets accepted.

In [None]:
libsize = 100

# make sure the output directory exists so the FASTA files can be written on
# a fresh checkout
os.makedirs('out_libraries', exist_ok=True)

# settings shared by the null and the real library
shared_settings = dict(num_mutants=libsize,
                       target_size=30,
                       number_of_events=3,
                       mode='mpipi',
                       verbose=True)

# NOTE(review): hits[1:] skips the first hit - confirm this is intentional.
for idr in tqdm(hits[1:]):

    # null library: BIG delta threshold so everything gets accepted
    null_library = run_mutagenesis(idr2,
                                   idr.sequence,
                                   delta_threshold=2000,
                                   num_loops=14,
                                   **shared_settings)

    # real library: under selection
    # NOTE(review): num_loops differs from the null library (19 vs 14) -
    # presumably tuned so both end up with similar mutation counts; confirm.
    real_library = run_mutagenesis(idr2,
                                   idr.sequence,
                                   delta_threshold=50,
                                   num_loops=19,
                                   selection_type='attractive_sumdiff',
                                   **shared_settings)

    # average number of changed positions per library, rounded for the filename
    null_average = round(average_mutations(null_library))
    real_average = round(average_mutations(real_library))

    # <protein ID>_<start>_<end> tag shared by both output files
    idr_tag = f'{idr.protein.unique_ID}_{idr.start}_{idr.end}'

    protfasta.write_fasta(null_library, f'out_libraries/null_{idr_tag}_size_{len(null_library)}_muts_{null_average}.fasta')
    protfasta.write_fasta(real_library, f'out_libraries/real_{idr_tag}_size_{len(real_library)}_muts_{real_average}.fasta')


