## About
This notebook will generate the performance figure (Fig. S2)

In [None]:
import numpy as np
import matplotlib

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
	
# Set such that PDF fonts export in a manner that they
# are editable in illustrator/affinity
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# set to define axes linewidths
matplotlib.rcParams['axes.linewidth'] = 0.5

matplotlib.rcParams['xtick.major.size'] = 1.5  # length of major xticks
matplotlib.rcParams['xtick.major.width'] = 0.25   # width of major xticks
matplotlib.rcParams['ytick.major.size'] = 1.5  # length of major yticks
matplotlib.rcParams['ytick.major.width'] = 0.25   # width of major yticks


# this defines some prefactors so inline figures look nice
# on a retina macbook. These can be commented out without any
# issue and are solely asthetic.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# UPDATE 2020-12-31 (my preferred font is Avenir...)
font = {'family' : 'avenir',
    	'weight' : 'normal'}

matplotlib.rc('font', **font)

from tqdm import tqdm
import pickle
from sparrow import Protein
import protfasta
from tqdm.auto import tqdm

In [None]:
from goose import create
import datetime

from finches import Mpipi_frontend
# Single Mpipi frontend instance; reused for every epsilon calculation in the
# performance benchmark further down.
mf = Mpipi_frontend()

In [None]:
# Sequence lengths to benchmark: 50 evenly spaced integer lengths from 40 to 2000 residues.
all_seq_lens = np.linspace(start=40, stop=2000, num=50, dtype=int)

### Create sequences
The code below uses GOOSE to generate a large number of sequences which we'll then run through FINCHES-based predictions. We separate this out because right now the sequence generation part is actually slower than the FINCHES epsilon prediction part...

In [None]:
# Number of sequences generated for each sequence length.
n_seqs = 50

# Lengths to generate sequences for (same grid as the cell above).
all_seq_lens = np.linspace(40, 2000, 50, dtype=int)

# Maps each sequence length to a list of n_seqs GOOSE-generated sequences
# of that length.
all_sequences = {}

for seq_len in tqdm(all_seq_lens):
    all_sequences[seq_len] = []
    all_sequences[seq_len].extend(
        create.seq_fractions(int(seq_len)) for _ in range(n_seqs)
    )
        
    
    

## Predict performance
Finally, using the sequences generated in the cell above, we predict homotypic epsilon for all of these to gauge how prediction scales with sequence length.

In [None]:
# Time how long it takes to compute homotypic epsilon for every sequence of a
# given length. One entry is appended to `timings` per length, in the same
# order as `all_seq_lens`.
#
# time.perf_counter() is used instead of datetime.datetime.now() because it is
# a monotonic, high-resolution clock intended for measuring intervals; the
# wall clock can jump if the system time is adjusted during the run. Each
# elapsed time is still stored as a datetime.timedelta so that `timings` keeps
# the same element type for downstream cells.
import time

timings = []
for i in tqdm(all_seq_lens):
    start_time = time.perf_counter()
    for s in all_sequences[i]:
        # Homotypic prediction: the sequence is paired with itself. The value
        # itself is not used by this benchmark, only the time taken.
        e = mf.epsilon(s, s)
    elapsed_seconds = time.perf_counter() - start_time
    timings.append(datetime.timedelta(seconds=elapsed_seconds))



In [None]:
# Convert each per-length elapsed time (a datetime.timedelta) into throughput:
# number of sequences processed divided by the elapsed seconds.
#
# total_seconds() is used rather than `.seconds + .microseconds / 1e6`
# because the latter ignores the `.days` component of a timedelta and would
# under-report any interval of 24 hours or longer.
sequences_per_second = [
    n_seqs / elapsed.total_seconds()
    for elapsed in timings
]
    

In [None]:
# Single-panel figure: prediction throughput against sequence length, with a
# logarithmic y-axis. The figure is written to 'performance.pdf'.
fig, ax = plt.subplots(figsize=(4.5, 2.5), dpi=450)

ax.plot(all_seq_lens, sequences_per_second, color='k', lw=0.5)
ax.set_yscale('log')

ax.set_ylabel('Sequence per second', fontsize=9)
ax.set_xlabel('Sequence length (residues)', fontsize=9)

# Tick-label font sizes are applied to the current axes (the one created above).
plt.xticks(fontsize=7)
plt.yticks(fontsize=7)

fig.tight_layout()
fig.savefig('performance.pdf')