### Code to extract intrinsically disordered regions (IDRs) from the set of transcription factors
This notebook combines raw sequence data with experimental data to generate an annotated proteome, which in turn can be used for later analysis.

Run using SHEPHARD v0.1.4 or higher and metapredict v2 or higher (we use v2 in this notebook instead of metapredict-hybrid).

In [2]:
import numpy as np
import matplotlib

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Export PDF/PS text with fonttype 42 so that fonts remain editable
# in Illustrator/Affinity
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# set the linewidth used for the axes frame
matplotlib.rcParams['axes.linewidth'] = 0.5

# IPython magics: render figures inline and at retina resolution so they
# look nice on a retina macbook. These two lines can be commented out
# without any issue and are solely aesthetic.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# set the default font family and weight for all figure text
font = {'family' : 'arial', 'weight' : 'normal'}

matplotlib.rc('font', **font)

# import from shephard and metapredict
from shephard.apis import uniprot
from shephard import interfaces
import metapredict as meta
from shephard import Proteome
from shephard import tools

import protfasta

## Read in data
The next two cells read in the sequence information and then the experimental data

In [3]:
# Read the UniProt FASTA file into a SHEPHARD Proteome, then annotate every
# protein with the gene name parsed from the 'GN=<name>' field of its header.
F = uniprot.uniprot_fasta_to_proteome('data/uniprot_sequences.fasta')
for protein in F:
    header_fields = protein.name.split('GN=')

    # A header without a (non-empty) GN= field would otherwise surface as a
    # bare IndexError with no hint about which sequence is at fault.
    if len(header_fields) < 2 or not header_fields[1].split():
        raise ValueError(
            f'No gene name (GN=) found in the header of [{protein.unique_ID}]: {protein.name}'
        )

    # the gene name is the first whitespace-delimited token after 'GN='
    gn = header_fields[1].split()[0]
    protein.add_attribute('gene_name', gn)


In [4]:
# Read in the TF data and parse it into a dictionary that maps the name in
# the first column to [column 2, column 3] parsed as floats (these are the
# values later stored as 'activity' and 'intensity').
with open('data/real_data.tsv','r') as fh:
    content = fh.readlines()

exp_data = {}
for line_number, line in enumerate(content, start=1):

    # tolerate blank lines (e.g. a trailing empty line at the end of the file)
    if not line.strip():
        continue

    sline = line.strip().split('\t')

    # fail with the offending line rather than an anonymous IndexError
    if len(sline) < 3:
        raise ValueError(
            f'Line {line_number} of data/real_data.tsv has {len(sline)} column(s), expected at least 3: {line!r}'
        )

    exp_data[sline[0]] = [float(sline[1]), float(sline[2])]


### Sanity check 1
Checks for proteins found in the experimental set for which we don't have sequences. This should ONLY be the control.

In [5]:
# Keep only the experimental entries whose name contains the gene name of
# exactly one protein in the sequence dataset; report every other entry.
exp_data_validates = {}
for n, measurements in exp_data.items():

    # number of proteins whose gene name occurs inside this entry's name
    hit_count = sum(1 for protein in F if protein.attribute('gene_name') in n)

    if hit_count == 1:
        exp_data_validates[n] = measurements
    elif hit_count == 0:
        print(f'Did not find [{n}] in the protein sequence dataset')
    else:
        print(f'Found multiple gene names compatible with experimental line [{n}]')
        
        

Did not find [Control] in the protein sequence dataset


### Sanity check 2
Checks for proteins found in the sequence dataset for which we don't have experimental data. This should yield nothing.

In [6]:
# for each protein in the sequence dataset...
for protein in F:

    gn = protein.attribute('gene_name')

    # gather every experimental entry whose name contains this gene name
    matching_names = [n for n in exp_data if gn in n]

    if not matching_names:
        print(f'Did not find [{gn}] in the experimental data dataset')

    elif len(matching_names) > 1:
        print(f'Found multiple experimental entries compatible with gene name [{gn}]')

    else:
        # exactly one experimental entry matched the gene name, so attach
        # its two measured values and its name to the protein
        exp_name = matching_names[0]
        measurements = exp_data[exp_name]
        protein.add_attribute('activity', measurements[0])
        protein.add_attribute('intensity', measurements[1])
        protein.add_attribute('exp_name', exp_name)
        
        
        

## Annotate proteins with disorder

In [8]:
for protein in F:

    # predict the per-residue disorder profile and the IDR / folded domain
    # boundaries for this sequence
    prediction = meta.predict_disorder_domains(protein.sequence)

    labelled_boundaries = (
        ('IDR', prediction.disordered_domain_boundaries),
        ('folded', prediction.folded_domain_boundaries),
    )

    # each boundary's first index is shifted by +1 before being handed to
    # add_domain; the second index is passed through unchanged
    for domain_type, boundaries in labelled_boundaries:
        for boundary in boundaries:
            protein.add_domain(boundary[0]+1, boundary[1], domain_type)

    protein.add_track('disorder', values=prediction.disorder)

## Plotting functions
Define functions that plot annotated IDRs, including separating IDRs and folded domains

In [10]:
def plot_disorder_domain(p, outdir='figures_metapredict_v2'):
    """
    Plot the per-residue disorder track of a protein with its IDR and folded
    domains shaded, and save the figure as a PNG.

    Parameters
    ----------
    p : shephard Protein
        Protein that carries a 'disorder' track, domains of type 'IDR'
        and/or 'folded', and an 'exp_name' attribute (used for the title
        and the output filename).

    outdir : str
        Directory the figure is written to. It is created if it does not
        already exist. Defaults to 'figures_metapredict_v2'.

    Returns
    -------
    None
        The figure is saved as '<outdir>/<exp_name>.png' (any '/' in
        exp_name is replaced by '-') and then closed.
    """
    import os

    # explicit figure/axes rather than the implicit pyplot "current figure"
    fig, ax = plt.subplots(figsize=(4, 1.5), dpi=300, facecolor='w', edgecolor='k')

    try:
        # plot against 1-based residue numbers so the trace, the shaded
        # domains and the 'Residue number' axis share one coordinate system
        disorder = p.track('disorder').values
        residue_numbers = list(range(1, len(disorder) + 1))
        ax.plot(residue_numbers, disorder, 'k', linewidth=0.5)

        # c counts the residues that fall inside IDRs (for the title)
        c = 0
        for domain in p.domains:
            if domain.domain_type == 'IDR':
                # +/- 0.5 centres the shading on the residues it covers and
                # keeps adjacent domains contiguous
                ax.axvspan(domain.start-0.5, domain.end+0.5, color='r', alpha=0.3, linewidth=0.0)
                c = c + len(domain)
            elif domain.domain_type == 'folded':
                ax.axvspan(domain.start-0.5, domain.end+0.5, color='skyblue', alpha=0.3, linewidth=0.0)

        ax.set_ylim([0, 1])
        ax.set_xlim([1, len(p)])
        ax.tick_params(axis='both', labelsize=8)
        ax.set_title('%s: %1.1f %% disordered'%(p.attribute('exp_name'), (c/len(p))*100), fontsize=8)
        ax.set_xlabel('Residue number', fontsize=7)
        ax.set_ylabel('Disorder/structure', fontsize=7)

        fig.tight_layout()

        # '/' cannot appear in a filename, so swap it for '-'
        newname = p.attribute('exp_name').replace('/','-')

        # make sure the output directory exists before saving
        os.makedirs(outdir, exist_ok=True)
        fig.savefig(os.path.join(outdir, f'{newname}.png'))

    finally:
        # always release the figure, even if plotting or saving failed, so
        # figures do not accumulate when this is called in a loop
        plt.close(fig)
    

## Plot data
Next we plot the data using the two methods

In [11]:
# generate and save one disorder/domain figure per protein in the proteome
for protein in F:
    plot_disorder_domain(protein)

## Write out
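Finally we write the data out to SHEPHARD domains files. First, we build a reduced proteome containing only the proteins whose combined IDR sequence is longer than 60 residues.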

In [12]:
# Collect the proteins whose IDRs, concatenated into a single string, are
# longer than 60 residues in total.
long_idr_proteins = []

for p in F:

    idr_domains = [domain for domain in p.domains if domain.domain_type == 'IDR']
    combined_idr_sequence = tools.sequence_tools.build_mega_string(idr_domains)

    if len(combined_idr_sequence) > 60:
        long_idr_proteins.append(p)

# new proteome holding only the proteins selected above
P_reduced = Proteome([])
P_reduced.add_proteins(long_idr_proteins)

## Save data
Write out annotations into the `filtered_v2/` directories. Note in the paper we use the metapredict-hybrid data which is basically identical, but these notebooks perform analysis in the way it should be done going forward.

In [13]:
# Each entry is (writer function, output path, extra keyword arguments).
# The writers are called in this order, each receiving the reduced proteome.
output_writers = [
    (uniprot.uniprot_proteome_to_fasta, 'data/filtered_v2/sequences_final.fasta', {}),
    (interfaces.si_proteins.write_proteins, 'data/filtered_v2/proteins.tsv', {}),
    (interfaces.si_domains.write_domains, 'data/filtered_v2/shprd_IDRs.tsv', {'domain_types': ['IDR']}),
    (interfaces.si_tracks.write_all_values_tracks_single_file, 'data/filtered_v2/shprd_tracks.tsv', {}),
    (interfaces.si_protein_attributes.write_protein_attributes, 'data/filtered_v2/shprd_protein_attributes.tsv', {}),
]

for write_function, output_path, extra_kwargs in output_writers:
    write_function(P_reduced, output_path, **extra_kwargs)

In [14]:
# Map a FASTA header (gene name, unique ID and the IDR's start/end) to the
# sequence of every IDR found in the reduced proteome.
out = {
    f"{p.attribute('gene_name')} {p.unique_ID} start={d.start} end={d.end}": d.sequence
    for p in P_reduced
    for d in p.domains
    if d.domain_type == 'IDR'
}

# NOTE(review): this file is written under 'shared_data/' whereas the other
# outputs of this notebook go under 'data/' - confirm this is intended.
protfasta.write_fasta(out,'shared_data/filtered_v2/all_idrs_analyzed.fasta')