## Build linear sequence profiles
This notebook constructs the data needed for Figure 5. There is no need to re-run it, because this repository already ships with the computed output of this notebook (`shprd_re_vector_track.tsv.tgz`). However, we provide the notebook so that the full workflow can be reproduced!

In [None]:
import numpy as np
import matplotlib

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
	
# Configure PDF/PS font embedding so that text in exported figures
# stays editable in illustrator/affinity
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('ps', fonttype=42)

# line width used when drawing the axes frame
matplotlib.rc('axes', linewidth=0.5)

# These settings make inline figures look sharp on a retina
# MacBook display. They can be commented out without any
# issue and are solely aesthetic.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Font used for all figure text.
# (Note from 2020-12-31 update: my preferred font is Avenir...)
font = dict(family='arial', weight='normal')

matplotlib.rc('font', **font)

from afrc import AnalyticalFRC
from sparrow import Protein
from shephard import interfaces, apis

from scipy.signal import savgol_filter

from sparrow.predictors import batch_predict

In [1]:
# Define the directory where the proteome-wide data are held. These data can be obtained from
# https://github.com/holehouse-lab/shephard-data/tree/main/data/proteomes/human
#
# NOTE: set this to your local data directory before running the cells below.
# The FASTA path is built as f'{rootdir}/human_proteome_clean.fasta', so leaving
# rootdir as an empty string yields the path '/human_proteome_clean.fasta'.

rootdir = ''

In [None]:
# Build the path to the cleaned human proteome FASTA file inside rootdir,
# then parse it into a proteome object
proteome_fasta = f'{rootdir}/human_proteome_clean.fasta'
human_proteome = apis.uniprot.uniprot_fasta_to_proteome(proteome_fasta)


In [None]:
#
# The function below takes in an amino acid sequence and constructs a linear vector based on the predictor network
# defined by the keyword network
#

def build_track(seq, fragsize=51, network='scaled_re', show_progress_bar=False):
    """
    Internal function that constructs tracks for linear sequence properties.

    Slides a window of ``fragsize`` residues along the sequence one residue
    at a time, runs the predictor selected by ``network`` on every window in
    a single batch, and collects one value per window. The first and last
    window values are then repeated at the N- and C-terminal ends so that
    the returned track has exactly one value per residue in ``seq``.

    A total of ``fragsize - 1`` padding values are needed. For an odd
    ``fragsize`` they are split evenly between the two termini. For an even
    ``fragsize`` they cannot be split evenly, so the C-terminal end receives
    the one extra value.

    Parameters
    --------------------
    seq : str
        Amino acid string

    fragsize : int
        Window size being used

    network : str
        Selector that defines which network to use. Default
        is scaled_re (i.e. the default end-to-end distance)

    show_progress_bar : bool
        Flag which defines if a progress bar is shown 
        or not during predictions

    Returns
    --------------------
    np.array
        Returns an array which is the associated track, with one value
        per residue in ``seq``. If ``seq`` is shorter than ``fragsize``
        an empty array is returned.

    Raises
    --------------------
    Exception
        If the assembled track does not have the same length as ``seq``.

    """

    # no complete window fits in the sequence, so there is nothing to predict
    if len(seq) < fragsize:
        return np.array([])

    # one fragment per window start position, keyed by that position
    n_windows = (len(seq) - fragsize) + 1
    all_frags = {idx: seq[idx:idx+fragsize] for idx in range(n_windows)}

    # NOTE(review): the result is indexed by the same integer keys as all_frags
    # and element [1] of each entry is used as the predicted value - presumably
    # each entry is a (sequence, prediction) pair; confirm against sparrow
    bps = batch_predict.batch_predict(all_frags, network, show_progress_bar=show_progress_bar)

    vector_s = [bps[i][1] for i in range(len(bps))]

    # split the fragsize-1 missing positions between the two termini; integer
    # arithmetic keeps the total correct for even window sizes as well
    n_pad = (fragsize - 1) // 2
    c_pad = (fragsize - 1) - n_pad

    track = [vector_s[0]]*n_pad + vector_s + [vector_s[-1]]*c_pad

    # sanity check - the track must be the same length as the sequence
    if len(track) != len(seq):
        raise Exception(
            f'Track length ({len(track)}) does not match sequence length ({len(seq)}) '
            f'[fragsize={fragsize}, n_pad={n_pad}, c_pad={c_pad}, predictions={len(vector_s)}]'
        )

    return np.array(track)

    
    

### Slow step
The cell below takes a few hours on CPUs to run because we're performing predictions for ~2 million 51-residue IDRs. Performance is much faster on GPUs and the code COULD be optimized to run even faster on GPUs, but given we really only need to run this once, ever, this is probably fine...

In [None]:
# Walk every protein in the proteome and attach an 're_vector' track to it,
# reporting the running index every 50 proteins as a simple progress indicator
for idx, prot in enumerate(human_proteome):
    if idx % 50 == 0:
        print(idx)

    # the track already exists on this protein - nothing to do
    if 're_vector' in prot.track_names:
        continue

    re_vector = build_track(prot.sequence,
                            fragsize=51,
                            network='scaled_re',
                            show_progress_bar=False)

    # an empty array means the sequence was shorter than the window, so no
    # track is added for this protein
    if not len(re_vector):
        continue

    prot.add_track('re_vector', values=re_vector, safe=False)
    
    


In [None]:

# Write the value tracks of every protein in the proteome out to a single file.
# Note - this file is already written and saved as a compressed shprd_re_vector_track.tgz file
output_tsv = 'shprd_re_vector_track.tsv'
interfaces.si_tracks.write_all_values_tracks_single_file(human_proteome, output_tsv)