# About
This notebook generates the synthetic IDR libraries

In [24]:
import numpy as np
import matplotlib

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
	
# Set such that PDF fonts export in a manner that they
# are editable in illustrator/affinity
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# set to define axes linewidths
matplotlib.rcParams['axes.linewidth'] = 0.5

# these settings make inline figures render crisply on a retina
# MacBook display. They can be commented out without any issue
# and are solely aesthetic.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# UPDATE 2020-12-31 (my preferred font is Avenir...)
font = {'family' : 'avenir',
    	'weight' : 'normal'}

matplotlib.rc('font', **font)

from tqdm import tqdm


import protfasta
from finches import CALVADOS_frontend, Mpipi_frontend

mf = Mpipi_frontend()
cf = CALVADOS_frontend()


# NB: this function requires the housetools package, which is a holehouselab
# specific package. If you want to run this code yourself, please contact us
# so we can provide you with a newer version of this code!
from mutagenesis import run_mutagenesis, compare


# define the IDRs we are going to use for the figure 5 library generation.
# Both are plain one-line sequence strings. Further down, idr2 is the
# reference that library members are compared against when counting
# mutations, and rad7_idr is the second sequence passed alongside it to
# run_mutagenesis and to the intermap figures.
idr2 = 'NNNNNNDGELSGTNLRSNSIDYAKHQEISSAGTSSNTTKNVNNNKNDSNDDNNGNNNNDASNLMESVLDKTSSHRYQPKKMPSVNKWSKPDQITHSDVSMVGLDESNDGGNENVHPTLAEVDAQEARETAQLAIDKINSYKRSIDDKNGDGHNNSSRNVVDENLINDMDSEDAHKSKRQHLSDITLEERNEDDKLPHEVAEQLRLLSSHLKEVE'
rad7_idr = 'MYRSRNRPKRGGENEVKGPNSALTQFLREEGISAENIKQKWYQRQSKKQEDATDEKKGKAEDDSFTAEISRVVEDEEIDEIGTGSGTETERAQVSYDARMKLVPADSDEEEYETSHISDTPVSLSSANDRESLTKKRQNTAKIIQNRRRKRKRAADLLDRRVNKVSSLQSLCITKISENISKWQKEADESSKLVFNKLRDVLGGVST'


  r.loc['H','q'] = 1. / ( 1 + 10**(self.pH-6) )


## Selection without compensation
For the libraries built here, the selection type used is the default (`maxdiff`), which means that the allowable maximum delta is the difference between any pair of elements in the interaction matrix. This means no pair of residues can lead to a big change and no compensatory changes are tolerated.

In [None]:
# Toggle: True regenerates the four 350-member libraries from scratch and
# writes each to a FASTA file in the working directory; False loads the
# previously generated copies from the libraries/ directory.
rebuild_library = False

# if True, will rebuild the sequence libraries
if rebuild_library:

    print("Rebuilding sequence libraries...")

    # keyword arguments shared by every run_mutagenesis call in this cell;
    # only delta_threshold and num_loops differ between the libraries
    common_kwargs = dict(num_mutants=350,
                         target_size=30,
                         number_of_events=3,
                         mode='mpipi',
                         verbose=True)

    # run mutagenesis with specified parameters. These settings
    # should yield ~28 mutations on average
    new_seq_library = run_mutagenesis(idr2,
                                      rad7_idr,
                                      delta_threshold=0.5,
                                      num_loops=20,
                                      **common_kwargs)
    protfasta.write_fasta(new_seq_library, f'chemlib_{len(new_seq_library)}.fasta')

    # tighter threshold; should be 22 mutations on average
    new_seq_library_tight = run_mutagenesis(idr2,
                                            rad7_idr,
                                            delta_threshold=0.1,
                                            num_loops=30,
                                            **common_kwargs)
    protfasta.write_fasta(new_seq_library_tight, f'chemlib_{len(new_seq_library_tight)}_tight.fasta')

    # null library; should be 28 mutations on average
    new_seq_library_null_28 = run_mutagenesis(idr2,
                                              rad7_idr,
                                              delta_threshold=20,
                                              num_loops=14,
                                              **common_kwargs)
    # name the file after the library actually being written (previously the
    # size of the *tight* library was used here by mistake)
    protfasta.write_fasta(new_seq_library_null_28, f'chemlib_{len(new_seq_library_null_28)}_null_28.fasta')

    # null library; should be 21 mutations on average
    new_seq_library_null_21 = run_mutagenesis(idr2,
                                              rad7_idr,
                                              delta_threshold=20,
                                              num_loops=11,
                                              **common_kwargs)
    # same fix as above: use this library's own size in the filename
    protfasta.write_fasta(new_seq_library_null_21, f'chemlib_{len(new_seq_library_null_21)}_null_21.fasta')

else:
    print("Reading in existing sequence libraries...")
    new_seq_library = protfasta.read_fasta('libraries/chemlib_350.fasta')
    new_seq_library_tight = protfasta.read_fasta('libraries/chemlib_350_tight.fasta')

    # note - THIS is going to be the null we use (has 28 mutations on average), but note
    # that even though we're gonna compare this with the selection done using attractive_sumdiff
    # instead of maxdiff, because we're applying NO selection it doesn't matter the selection
    # regime we're using as everything gets accepted.
    new_seq_library_null_28 = protfasta.read_fasta('libraries/chemlib_350_null_28.fasta')
    new_seq_library_null_21 = protfasta.read_fasta('libraries/chemlib_350_null_21.fasta')


Reading in existing sequence libraries...


## Selection with compensation
Here we define the `selection_type` as mode `attractive_sumdiff`, which means the difference allowed reflects the sum of the differences between the interaction matrix of the wild type and the mutant, but only for the attractive interactions (that is, an increase in repulsive interactions is tolerated as long as it is not at the expense of attractive interactions, and attraction can be redistributed across the sequence). Note that the numerical `delta_threshold` value must change depending on the `selection_type` (i.e. here it's 50, which is MUCH less permissive than 20 if `maxdiff` were used, for example).

In [None]:
# Toggle: False loads the stored compensated library from libraries/,
# True regenerates it and writes a fresh FASTA to the working directory.
rebuild_library = False

if not rebuild_library:
    print("Reading in existing sequence libraries...")
    new_seq_library_v2 = protfasta.read_fasta('libraries/chemlib_v2_350.fasta')

else:
    # parameters for the attractive_sumdiff selection run;
    # this gives ~28 mutations per sequence
    v2_settings = {
        'num_mutants': 350,
        'delta_threshold': 50,
        'num_loops': 19,
        'target_size': 30,
        'number_of_events': 3,
        'mode': 'mpipi',
        'selection_type': 'attractive_sumdiff',
        'verbose': True,
    }
    new_seq_library_v2 = run_mutagenesis(idr2, rad7_idr, **v2_settings)

    # the output filename carries the number of sequences generated
    v2_outfile = f'chemlib_v2_{len(new_seq_library_v2)}.fasta'
    protfasta.write_fasta(new_seq_library_v2, v2_outfile)


Reading in existing sequence libraries...


In [27]:
# define the library of interest for the remainder of the analysis.
# This binds a second name to the compensated (v2) library object - no copy
# is made - so the intermap export below iterates over that same library.
LIBRARY_OF_INTEREST = new_seq_library_v2

### Write intermaps out
If we set `WRITE_MAPS` to true, the cell below will generate an intermap image for each synthetic ortholog to enable easy visualization of how these variants change the instantaneous intermap.

In [28]:
# Toggle: when True, one intermap PNG is written per library member.
WRITE_MAPS = False

if WRITE_MAPS:

    # this will take 2-3 minutes to run
    for seq_name, variant_seq in tqdm(LIBRARY_OF_INTEREST.items()):
        # draw the variant-vs-rad7 intermap on a fresh figure
        mf.interaction_figure(variant_seq, rad7_idr, no_disorder=True, zero_folded=False)

        # save under the sequence name, then close the figure so that
        # open figures do not accumulate over the loop
        out_png = f'figures/individual_intermaps/idr2_rad7_intermap_{seq_name}.png'
        plt.savefig(out_png)
        plt.close()
    

In [29]:
# Count mutations relative to idr2 for every member of the compensated
# ("real") library and of the 28-mutation null library.
mutations_real = [
    compare(idr2, variant, verbose=False)
    for variant in new_seq_library_v2.values()
]
mutations_null = [
    compare(idr2, variant, verbose=False)
    for variant in new_seq_library_null_28.values()
]

# summarise each library as mean +/- standard deviation
real_mean, real_std = np.mean(mutations_real), np.std(mutations_real)
null_mean, null_std = np.mean(mutations_null), np.std(mutations_null)

print(f"Real library: {real_mean:.2f} +/- {real_std:.2f} mutations per sequence")
print(f"Null library: {null_mean:.2f} +/- {null_std:.2f} mutations per sequence")

Real library: 27.69 +/- 4.13 mutations per sequence
Null library: 27.83 +/- 2.91 mutations per sequence


## Large library construction
Finally, we go ahead and generate a large library. 

In [11]:
# BIG LIBRARIES
# Toggle: True regenerates both 650-member libraries and writes them to the
# working directory; False loads the stored copies from libraries/.
rebuild_library = False

if rebuild_library:
    new_seq_library_big = run_mutagenesis(idr2,
                                          rad7_idr,
                                          num_mutants=650,
                                          delta_threshold=0.5,
                                          num_loops=20,
                                          target_size=30,
                                          number_of_events=3,
                                          mode='mpipi',
                                          verbose=True)

    # should be 28 mutations on average
    protfasta.write_fasta(new_seq_library_big, f'chemlib_{len(new_seq_library_big)}.fasta')


    new_seq_library_null_28_big = run_mutagenesis(idr2,
                                                  rad7_idr,
                                                  num_mutants=650,
                                                  delta_threshold=20,
                                                  num_loops=14,
                                                  target_size=30,
                                                  number_of_events=3,
                                                  mode='mpipi',
                                                  verbose=True)

    # should be 28 on average.
    # The filename must use the size of THIS library. It previously used
    # len(new_seq_library_tight), i.e. the 350-member library from an earlier
    # cell, which produced the small null library's filename instead of the
    # chemlib_650_null_28.fasta name the read branch below expects.
    protfasta.write_fasta(new_seq_library_null_28_big, f'chemlib_{len(new_seq_library_null_28_big)}_null_28.fasta')
else:
    print("Reading in existing sequence libraries...")
    new_seq_library_big = protfasta.read_fasta('libraries/chemlib_650.fasta')
    new_seq_library_null_28_big = protfasta.read_fasta('libraries/chemlib_650_null_28.fasta')




Reading in existing sequence libraries...
