In [4]:
import numpy as np
import matplotlib

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
	
# Set such that PDF fonts export in a manner that they
# are editable in illustrator/affinity
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# set to define axes linewidths
matplotlib.rcParams['axes.linewidth'] = 0.5

# These settings make inline figures look nice on a retina MacBook.
# They can be commented out without any issue and are
# solely aesthetic.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# UPDATE 2020-12-31 (my preferred font is Avenir...)
font = {'family' : 'avenir',
    	'weight' : 'normal'}

matplotlib.rc('font', **font)
from scipy.signal import savgol_filter
from sparrow import Protein
import protfasta
import random
from scipy import stats
import metapredict as meta


In [5]:
# Load the two ortholog sets (viable / inviable) from FASTA files. The results
# are iterated by name and indexed as viable[name] further down, i.e. they are
# used as name -> sequence mappings.
viable = protfasta.read_fasta('data/orthologs_viable.fasta')
inviable = protfasta.read_fasta('data/orthologs_inviable.fasta')

# Alignment of the orthologs, looked up by the same names as the sets above.
# NOTE(review): alignment=True presumably tells protfasta to accept gap ('-')
# characters -- confirm against the protfasta documentation.
aligned = protfasta.read_fasta('data/orthologs_aligned.fasta', alignment=True)

In [7]:
def build_analysis(s):
    """
    Build the list of per-sequence features for one amino acid sequence.

    Parameters
    ----------
    s : str
        Amino acid sequence; passed directly to sparrow's ``Protein``.

    Returns
    -------
    list
        Thirteen values, in the fixed order given by the index comments
        below. Callers address features by these positions, so the order
        must not change.
    """
    protein = Protein(s)

    features = [
        len(protein),                                               # 0
        protein.FCR,                                                # 1
        protein.NCPR,                                               # 2
        protein.fraction_aliphatic,                                 # 3
        protein.fraction_aromatic,                                  # 4
        protein.fraction_aromatic + protein.fraction_aliphatic,     # 5
        protein.hydrophobicity,                                     # 6
        # mean over the values returned by the sparrow disorder predictor
        np.mean(protein.predictor.disorder()),                      # 7
        protein.fraction_polar,                                     # 8
        # mean over the values returned by metapredict's pLDDT predictor
        np.mean(meta.predict_pLDDT(protein.sequence)),              # 9
        protein.fraction_positive,                                  # 10
        protein.fraction_negative,                                  # 11
        protein.fraction_proline,                                   # 12
    ]

    return features


# build_analysis yields one feature list per sequence; transposing makes
# row k of each array hold feature k across every sequence of that class.
viable_features = [build_analysis(viable[name]) for name in viable]
viable_ana = np.array(viable_features).T

inviable_features = [build_analysis(inviable[name]) for name in inviable]
inviable_ana = np.array(inviable_features).T


In [8]:
def plot_points(s1, s2, yaxis_name, outname):
    """
    Save a jittered two-group comparison plot ('viable' vs 'inviable').

    Each value of ``s1`` is drawn as a blue point around x=1 and each value
    of ``s2`` as a red point around x=2, with a horizontal line at each
    group's mean. The title reports the p-value of a two-sample t-test run
    with ``equal_var=False``.

    Parameters
    ----------
    s1 : sequence of numbers
        Values for the first group (labelled 'viable').
    s2 : sequence of numbers
        Values for the second group (labelled 'inviable').
    yaxis_name : str
        Label for the y axis.
    outname : str
        Path the figure is written to (passed to ``savefig``).

    Returns
    -------
    None
        The figure is written to ``outname`` and then closed.
    """

    # Horizontal jitter (normal, sd 0.05) so overlapping points stay visible.
    # All of s1's jitter is drawn before s2's, so the sequence of calls to
    # the random module is unchanged.
    jitter_s1 = np.array([random.normalvariate(0,0.05) for _ in range(len(s1))])
    jitter_s2 = np.array([random.normalvariate(0,0.05) for _ in range(len(s2))])

    x1 = jitter_s1 + 1
    x2 = jitter_s2 + 2

    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so this function cannot draw onto a figure left open by other code.
    fig, ax = plt.subplots(figsize=(2, 3.5), dpi=200, facecolor='w', edgecolor='k')

    try:
        ax.plot(x1, s1,'bo', alpha=0.3, markeredgewidth=0)
        ax.plot(x2, s2,'ro', alpha=0.3, markeredgewidth=0)

        mean_s1 = np.mean(s1)
        mean_s2 = np.mean(s2)
        ax.plot([0.75,1.25], [mean_s1, mean_s1],'-b')
        ax.plot([1.75,2.25], [mean_s2, mean_s2],'-r')

        ax.set_xticks([1,2])
        ax.set_xticklabels(['viable','inviable'])

        t = stats.ttest_ind(s1, s2, equal_var = False)

        if t.pvalue > 0.05:
            ax.set_title('p > 0.05')
        else:
            rounded = round(t.pvalue,5)
            if rounded == 0:
                # Rounding to 5 decimals would print a misleading "p = 0.0"
                # for very small p-values; show them in scientific notation.
                ax.set_title(f'p = {t.pvalue:.1e}')
            else:
                ax.set_title(f'p = {rounded}')

        ax.set_ylabel(yaxis_name)
        fig.tight_layout()
        fig.savefig(outname)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

    
    
# One entry per feature: (row index in viable_ana / inviable_ana,
# y-axis label, output PDF path).
feature_plots = [
    (0, 'length', 'figures/lengths.pdf'),
    (1, 'FCR', 'figures/FCR.pdf'),
    (2, 'NCPR', 'figures/NCPR.pdf'),
    (3, 'f_ali', 'figures/f_ali.pdf'),
    (4, 'f_aro', 'figures/f_aro.pdf'),
    (5, 'f_ali+f_aro', 'figures/f_aro_f_ali.pdf'),
    (6, 'hydrophobicity', 'figures/hydrophobicity.pdf'),
    (7, 'disorder', 'figures/disorder.pdf'),
    (8, 'f_polar', 'figures/f_polar.pdf'),
    (9, 'ppLDDT', 'figures/ppLDDT.pdf'),
    (10, 'f_positive', 'figures/f_positive.pdf'),
    (11, 'f_negative', 'figures/f_negative.pdf'),
    (12, 'f_proline', 'figures/f_proline.pdf'),
]

# Draw the viable-vs-inviable comparison for every feature, in table order.
for row, label, pdf_path in feature_plots:
    plot_points(viable_ana[row], inviable_ana[row], yaxis_name=label, outname=pdf_path)

### Compute identity and similarity from the alignment


In [9]:
def calculate_identity(wt,s):
    """
    Percent identity between two rows of the same alignment.

    A column counts as a hit when ``wt`` holds a residue (not '-') and ``s``
    holds the same character. The hit count is divided by the ungapped
    length of whichever sequence is shorter.

    Parameters
    ----------
    wt : str
        Aligned reference sequence; '-' marks a gap.
    s : str
        Aligned sequence to compare; expected to be at least as long as
        ``wt``, since it is indexed at every non-gap position of ``wt``.

    Returns
    -------
    float
        Identity in percent. Returns 0.0 when either sequence has no
        residues at all (empty or all gaps), where the original would
        have raised ZeroDivisionError.
    """

    hits = 0
    for i in range(len(wt)):
        if wt[i] == '-':
            continue
        if wt[i] == s[i]:
            hits = hits + 1

    # Normalise by the shorter ungapped sequence (computed once).
    shorter = min(len(wt.replace('-','')), len(s.replace('-','')))
    if shorter == 0:
        return 0.0

    return 100*(hits/shorter)
    
        

In [13]:
# For each residue (key), the residues treated as "similar" to it.
# NOTE: the table is not symmetric (e.g. 'L' is listed under 'A', but 'A' is
# not listed under 'L'), and 'H' does not list itself. calculate_similarity
# below always looks the table up by the residue of the compared sequence.
lookup = {
    'A': ['A','G','L'],
    'C': ['C','S','A','G'],
    'D': ['D','E'],
    'E': ['D','E'],
    'F': ['F','W','Y'],
    'G': ['S','Q','N','T','G','A'],
    'H': ['S','Q','N','T','G'],
    'I': ['I','L','V','M'],
    'K': ['K','R'],
    'L': ['I','L','V','M'],
    'M': ['I','L','V','M'],
    'N': ['S','Q','N','T','G'],
    'P': ['P'],
    'Q': ['S','Q','N','T','G'],
    'R': ['K','R'],
    'S': ['S','Q','N','T','G'],
    'T': ['S','Q','N','T','G'],
    'V': ['I','L','V','M'],
    'W': ['F','W','Y'],
    'Y': ['F','W','Y'],
}


def calculate_similarity(wt,s):
    """
    Percent similarity between two rows of the same alignment.

    Columns where either sequence has a gap ('-') are skipped. A remaining
    column counts as a hit when the two residues are identical, or when the
    ``wt`` residue appears in ``lookup`` under the ``s`` residue. The hit
    count is divided by the ungapped length of whichever sequence is shorter.

    Parameters
    ----------
    wt : str
        Aligned reference sequence; '-' marks a gap.
    s : str
        Aligned sequence to compare; expected to be at least as long as
        ``wt``.

    Returns
    -------
    float
        Similarity in percent. Returns 0.0 when either sequence has no
        residues at all (empty or all gaps).

    Notes
    -----
    Identical residues always count as similar. Without this, a conserved
    'H' would be scored as dissimilar because ``lookup['H']`` does not
    contain 'H'. Residues that are not keys of ``lookup`` (e.g. 'X') no
    longer raise KeyError; they are similar only to themselves.
    """

    hits = 0
    for i in range(len(wt)):
        if wt[i] == '-' or s[i] == '-':
            continue
        if wt[i] == s[i] or wt[i] in lookup.get(s[i], ()):
            hits = hits + 1

    # Normalise by the shorter ungapped sequence (computed once).
    shorter = min(len(wt.replace('-','')), len(s.replace('-','')))
    if shorter == 0:
        return 0.0

    return 100*(hits/shorter)
    
        

In [14]:
# Aligned (gapped) wild-type sequence. It is the reference that every other
# sequence is compared against in the identity/similarity loop further down.
wt_aligned = aligned['WT_IDR2 [VIABLE]']
print(wt_aligned)

---------NNNNNNDGELSGTNLRS-NSI-------------DYAKHQEISSAGTSSNTTKNVNN---NKNDSNDDNNGNNNNDAS-----------NLMESVLDKTSS---HRYQPKKMPSVNKWSKPDQITHSDVSM--------VGLDES----NDGGNENVHPTLAEVDAQEARETAQLAIDKIN------SY--------KRSIDDK----NGDG--------HN------------------------------------------------------NSSRN--V--VDENL------IN--------DMDS-E--DAHKSKRQHLSDI----------------TLEER-NEDDKLPHEVAEQLRLLSSHLKEVE


In [15]:
# Text records ("Identity, <name>, <value>" / "Similarity, <name>, <value>"),
# appended in processing order.
out_data = []

identity_viable, similarity_viable = [], []
identity_inviable, similarity_inviable = [], []

# (sequence set, identity scores, similarity scores, names to leave out).
# The wild type is the reference itself, so it is left out of the viable set;
# nothing is left out of the inviable set.
score_groups = (
    (viable, identity_viable, similarity_viable, {'WT_IDR2 [VIABLE]'}),
    (inviable, identity_inviable, similarity_inviable, set()),
)

for records, identity_scores, similarity_scores, left_out in score_groups:
    for name in records:
        if name in left_out:
            continue

        aligned_seq = aligned[name]

        pct_identity = calculate_identity(wt_aligned, aligned_seq)
        identity_scores.append(pct_identity)
        out_data.append(f"Identity, {name}, {round(pct_identity,3)}")

        pct_similarity = calculate_similarity(wt_aligned, aligned_seq)
        similarity_scores.append(pct_similarity)
        out_data.append(f"Similarity, {name}, {round(pct_similarity,3)}")

    
        
        

In [16]:
# Percent identity to the wild type, viable vs. inviable orthologs.
# This cell used to repeat the body of plot_points line for line; calling the
# function draws the same figure and keeps the plotting logic in one place.
plot_points(identity_viable, identity_inviable,
            yaxis_name='Identity (%)', outname='figures/identity.pdf')


In [17]:
# Percent similarity to the wild type, viable vs. inviable orthologs.
# This cell used to repeat the body of plot_points line for line; calling the
# function draws the same figure and keeps the plotting logic in one place.
plot_points(similarity_viable, similarity_inviable,
            yaxis_name='Similarity (%)', outname='figures/similarity.pdf')
