## Comparing subproteome vs. full proteome
This notebook compares proteome-wide statistics for the subproteome where we have Ts/Tp data vs. all yeast proteins.


In [1]:
import numpy as np
import matplotlib
import os
import sys

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
	
# Export PDF/PS text as Type 42 (TrueType) fonts so that text in saved
# figures stays editable in Illustrator/Affinity
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# Default line width used for the axes frame
matplotlib.rcParams['axes.linewidth'] = 0.5

# Render inline figures at retina resolution so they look sharp on
# high-DPI displays (e.g. a retina MacBook). These two magics are purely
# aesthetic and can be commented out without affecting any result.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# UPDATE 2020-12-31: the font family is set to Arial here (the original
# author notes that their preferred font is Avenir)
font = {'family' : 'arial',
    	'weight' : 'normal'}

# Apply the font settings to all figures created in this notebook
matplotlib.rc('font', **font)

from shephard.apis.fasta import fasta_to_proteome
from shephard import interfaces
from shephard.tools import attribute_tools

from sparrow import Protein

from shephard import Proteome
    

In [2]:
# Subproteome: the proteins for which we have mass spec data, subsequently
# annotated with the SJF dataset.
# NOTE(review): safe=False presumably relaxes the loader's consistency
# checks (e.g. attribute rows for proteins absent from the proteome) -
# confirm against the shephard documentation.
subproteome = Proteome()
interfaces.si_proteins.add_proteins_from_file(
    subproteome,
    '../data/experimental_data/shprd_proteome_soluble_only_v2.tsv',
)
interfaces.si_protein_attributes.add_protein_attributes_from_file(
    subproteome,
    '../data/shprd_files/sjf_data/shprd_fried_yeast.tsv',
    safe=False,
)

# Full yeast proteome: read from the FASTA file and annotated with the same
# SJF attribute file as the subproteome.
full_yp = fasta_to_proteome(
    '../data/UP000002311_559292_YEAST_v4_uniprot_id.fasta',
    use_header_as_unique_ID=True,
)
interfaces.si_protein_attributes.add_protein_attributes_from_file(
    full_yp,
    '../data/shprd_files/sjf_data/shprd_fried_yeast.tsv',
    safe=False,
)


In [5]:
# Report how many proteins each of the two proteomes contains
print(
    f"Subproteome size: {len(subproteome)}",
    f"Yeast proteome size: {len(full_yp)}",
    sep="\n",
)

Subproteome size: 3226
Yeast proteome size: 6039


In [3]:
def get_vals(local_proteome, attribute):
    """
    Collect the numeric values of one attribute across a Proteome.

    Practically, the code cycles through each Protein in the Proteome and
    asks (1) does the protein carry the requested attribute, (2) can the
    stored value be converted to a float, and (3) is the converted value
    greater than -1. Only values that pass all three checks are returned.
    NaN values fail check (3) and are therefore dropped as well.

    Parameters
    ----------
    local_proteome : iterable
        Proteome (or any iterable of protein-like objects) whose items
        expose ``.attributes`` and ``.attribute(name)``.

    attribute : str
        Name of the attribute to extract.

    Returns
    -------
    list of float
        The accepted values, in iteration order. Empty if no protein
        carries a usable value.
    """
    vals = []
    for p in local_proteome:
        if attribute not in p.attributes:
            continue

        try:
            v = float(p.attribute(attribute))
        except (TypeError, ValueError, OverflowError):
            # The stored value is missing or non-numeric (e.g. None or free
            # text). Skip this protein only; unrelated errors still surface.
            continue

        # NOTE(review): values <= -1 look like a missing-data sentinel in the
        # annotation file - confirm against the dataset description.
        if v > -1:
            vals.append(v)
    return vals

def compute_values(local_proteome, fx):
    """
    Apply a sequence-level function to every protein in a Proteome.

    Parameters
    ----------
    local_proteome : iterable
        Proteome (or any iterable of protein-like objects) whose items
        expose a ``.sequence`` attribute.

    fx : callable
        Function that takes a single sequence and returns one value.

    Returns
    -------
    list
        ``fx(protein.sequence)`` for each protein, in iteration order.
    """
    return [fx(protein.sequence) for protein in local_proteome]



In [149]:
def plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax, logy=False,
                outdir='outfigures/proteome_comparison'):
    """
    Draw a two-violin comparison plot and save it as a PDF.

    The first violin (x tick 'Exp.') shows ``data_subproteome`` and the
    second (x tick 'All') shows ``data_yp``. Medians are drawn; means are
    not. The figure is written to ``<outdir>/<ylabel with spaces replaced
    by underscores>.pdf`` and then closed, so two calls that share a
    ``ylabel`` overwrite each other's file.

    Parameters
    ----------
    data_subproteome : sequence of float
        Values for the subproteome.

    data_yp : sequence of float
        Values for the full proteome.

    ylabel : str
        Y-axis label; also used to build the output file name.

    ymin, ymax : float
        Y-axis limits.

    logy : bool
        If True, use a logarithmic y axis. Default is False.

    outdir : str
        Directory the PDF is written to; created if it does not exist.
        Default is 'outfigures/proteome_comparison'.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(figsize=(2, 2.5), dpi=200)

    vp = ax.violinplot([data_subproteome, data_yp], showmeans=False, showmedians=True)

    # one fill color per dataset, in the same order as the data passed above
    colors = ['skyblue', 'lightcoral']
    for body, color in zip(vp['bodies'], colors):
        body.set_facecolor(color)
        body.set_edgecolor('black')
        body.set_alpha(1)
        body.set_linewidth(1)

    # median markers (means are not drawn, see showmeans=False)
    vp['cmedians'].set_color('black')

    # central bar and min/max caps, when present in the returned artists
    for part in ['cbars', 'cmins', 'cmaxes']:
        if part in vp:
            vp[part].set_color('black')
            vp[part].set_linewidth(0.8)

    # violinplot places the datasets at x = 1 and x = 2
    ax.set_xticks([1, 2])
    ax.set_xticklabels(['Exp.', 'All'])

    ax.set_ylabel(ylabel)

    # choose the scale first, then apply the requested limits to it
    if logy:
        ax.set_yscale('log')
    ax.set_ylim([ymin, ymax])

    fig.tight_layout()

    # make sure the target directory exists so savefig cannot fail on a
    # fresh checkout
    os.makedirs(outdir, exist_ok=True)
    outname = ylabel.replace(' ','_')
    fig.savefig(f'{outdir}/{outname}.pdf')

    # close this specific figure to release its memory
    plt.close(fig)
    





# Molecular weight: subproteome vs. full yeast proteome
ylabel = 'Molecular weight'
ymin, ymax = 0, 310000

data_subproteome = get_vals(subproteome, 'Molecular_Weight')
data_yp = get_vals(full_yp, 'Molecular_Weight')

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)



In [150]:
# Isoelectric point: subproteome vs. full yeast proteome
ylabel = 'Isoelectric Point'
ymin, ymax = 2, 14

data_subproteome = get_vals(subproteome, 'Isoelectric_Point')
data_yp = get_vals(full_yp, 'Isoelectric_Point')

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [151]:
# List the attribute names available on the annotated proteins. The names
# are collected across the whole subproteome (in first-seen order), so the
# cell runs on a fresh kernel instead of relying on a leftover variable `p`.
list(dict.fromkeys(name for protein in subproteome for name in protein.attributes))

['Locus_Tag',
 'Gene',
 'SGD_ID',
 'UniProt_ID',
 'Length',
 'Molecular_Weight',
 'Isoelectric_Point',
 'Median_Abundance',
 'Cellular_Components',
 'Complexes_SGD',
 'Complexes_CYC2008',
 'pSup_42C',
 'delta_pSup_30C_42C',
 'delta_pSup_42C_46C',
 'E_Values',
 'Fold_Regions',
 'Architectures',
 'X_Groups',
 'T_Groups',
 'F_Groups',
 'Number_of_Domains',
 'Percentage_ECOD_Annotated',
 'Sequence',
 'Percent_Disordered',
 'Disordered_Ranges',
 'Folded_Ranges',
 'CCT_sites',
 'Ssb_sites',
 'SG_MayorLab',
 'SG_ParkerLab',
 'SG_BukauLab',
 'SG_DrummondLab']

In [152]:
# Median abundance (log-scaled y axis): subproteome vs. full yeast proteome
ylabel = 'Median Abundance'
ymin, ymax = 1, 1000000

data_subproteome = get_vals(subproteome, 'Median_Abundance')
data_yp = get_vals(full_yp, 'Median_Abundance')

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax, logy=True)

In [153]:
# Percentage of disorder: subproteome vs. full yeast proteome
ylabel = 'Percentage disorder'
ymin, ymax = 0, 100

data_subproteome = get_vals(subproteome, 'Percent_Disordered')
data_yp = get_vals(full_yp, 'Percent_Disordered')

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [154]:
# Number of domains: subproteome vs. full yeast proteome
ylabel = 'Number of domains'
ymin, ymax = 0, 9

data_subproteome = get_vals(subproteome, 'Number_of_Domains')
data_yp = get_vals(full_yp, 'Number_of_Domains')

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [155]:
def fx(s):
    """Return the ``FCR`` property of a sparrow Protein built from sequence ``s``."""
    protein = Protein(s)
    return protein.FCR

# Fraction of charged residues: subproteome vs. full yeast proteome
ylabel = 'Fraction of charged residues (FCR)'
ymin, ymax = 0, 0.75

data_subproteome = compute_values(subproteome, fx)
data_yp = compute_values(full_yp, fx)

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [156]:
def fx(s):
    """Return the ``NCPR`` property of a sparrow Protein built from sequence ``s``."""
    protein = Protein(s)
    return protein.NCPR

# Net charge per residue: subproteome vs. full yeast proteome
ylabel = 'Net charge per residues'
ymin, ymax = -0.45, 0.75

data_subproteome = compute_values(subproteome, fx)
data_yp = compute_values(full_yp, fx)

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [157]:
def fx(s):
    """Return the ``hydrophobicity`` property of a sparrow Protein built from sequence ``s``."""
    protein = Protein(s)
    return protein.hydrophobicity

# Hydrophobicity: subproteome vs. full yeast proteome
ylabel = 'Hydrophobicity'
ymin, ymax = 1.5, 7

data_subproteome = compute_values(subproteome, fx)
data_yp = compute_values(full_yp, fx)

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [158]:
def fx(s):
    """Return the number of residues in sequence ``s``."""
    n_residues = len(s)
    return n_residues

# Sequence length: subproteome vs. full yeast proteome
ylabel = 'Length'
ymin, ymax = 0, 3000

data_subproteome = compute_values(subproteome, fx)
data_yp = compute_values(full_yp, fx)

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [159]:
def fx(s):
    """Return the ``fraction_aromatic`` property of a sparrow Protein built from sequence ``s``."""
    protein = Protein(s)
    return protein.fraction_aromatic

# Fraction of aromatic residues: subproteome vs. full yeast proteome
ylabel = 'Fraction aromatic residues'
ymin, ymax = 0, 0.4

data_subproteome = compute_values(subproteome, fx)
data_yp = compute_values(full_yp, fx)

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [160]:
def fx(s):
    """Return the ``fraction_aliphatic`` property of a sparrow Protein built from sequence ``s``."""
    protein = Protein(s)
    return protein.fraction_aliphatic

# Fraction of aliphatic residues: subproteome vs. full yeast proteome
ylabel = 'Fraction aliphatic residues'
ymin, ymax = 0, 0.7

data_subproteome = compute_values(subproteome, fx)
data_yp = compute_values(full_yp, fx)

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [161]:
def fx(s):
    """Return the ``fraction_polar`` property of a sparrow Protein built from sequence ``s``."""
    protein = Protein(s)
    return protein.fraction_polar

# Fraction of polar residues: subproteome vs. full yeast proteome
ylabel = 'Fraction polar residues'
ymin, ymax = 0, 0.7

data_subproteome = compute_values(subproteome, fx)
data_yp = compute_values(full_yp, fx)

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)

In [162]:
def fx(s):
    """Return the ``fraction_proline`` property of a sparrow Protein built from sequence ``s``."""
    return Protein(s).fraction_proline

# Fraction of proline residues: subproteome vs. full yeast proteome
data_subproteome = compute_values(subproteome, fx)
data_yp = compute_values(full_yp, fx)

# plot_violin builds the output file name from this label, so it has to be
# unique per figure; reusing the polar-residue label here would mislabel the
# axis and overwrite the polar-residue PDF.
ylabel = 'Fraction proline residues'
ymin=0
ymax=0.3

plot_violin(data_subproteome, data_yp, ylabel, ymin, ymax)