# Generating panels for Figure S3

In [None]:
import os, progressbar, re, subprocess, time

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.sandbox.stats.multicomp as multicomp

from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq

In [None]:
def is_significant(mut_tol):
    """Classify a z-score against the +/-2 cut-offs.

    Returns 'HIGH' when ``mut_tol`` is strictly greater than 2, 'LOW' when it
    is strictly less than -2, and 'non significant' for everything else
    (including the boundary values and NaN, for which both comparisons are
    false).
    """
    label = 'non significant'
    if mut_tol > 2:
        label = 'HIGH'
    elif mut_tol < -2:
        label = 'LOW'
    return label


def cluster(x):
    """Return 'HIGH' if ``x`` is one of the ``proteinID_x`` values of the
    notebook-level ``HG_high_agg`` table, otherwise 'LOW'.

    Note that any ID not found in ``HG_high_agg`` is labelled 'LOW'; the
    function does not consult ``HG_low_agg``.
    """
    high_ids = HG_high_agg['proteinID_x'].values
    return 'HIGH' if x in high_ids else 'LOW'

In [None]:
# FASTA files parsed (and translated) by the validity checks in the next cells.
# NOTE(review): the files carry a .faa extension but are treated as nucleotide
# CDS (they are passed through Seq.translate) - confirm the content type.
HG_cds_fasta = '../../data/ortholog_dataset/uni_HG_cds_orthologs.faa'
MM_cds_fasta = '../../data/ortholog_dataset/uni_MM_cds_orthologs.faa'

In [None]:
# Collect the IDs of MM records whose CDS does not yield a protein starting
# with methionine: either the first translated residue is not 'M', or the
# sequence cannot be translated / translates to nothing.
MM_non_valids = []
for seqRecord in SeqIO.parse(MM_cds_fasta, format='fasta'):
    try:
        AA_seq = seqRecord.seq.translate(to_stop=True)
        if AA_seq[0] != 'M':
            MM_non_valids.append(seqRecord.id)
    except (ValueError, IndexError):
        # ValueError: Biopython translation errors (TranslationError is a
        # ValueError subclass). IndexError: empty translation, e.g. the CDS
        # is empty or starts with a stop codon. A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit, so only these are caught.
        MM_non_valids.append(seqRecord.id)

In [None]:
# Collect the IDs of HG records whose CDS does not yield a protein starting
# with methionine: either the first translated residue is not 'M', or the
# sequence cannot be translated / translates to nothing.
HG_non_valids = []
for seqRecord in SeqIO.parse(HG_cds_fasta, format='fasta'):
    try:
        AA_seq = seqRecord.seq.translate(to_stop=True)
        if AA_seq[0] != 'M':
            HG_non_valids.append(seqRecord.id)
    except (ValueError, IndexError):
        # ValueError: Biopython translation errors (TranslationError is a
        # ValueError subclass). IndexError: empty translation, e.g. the CDS
        # is empty or starts with a stop codon. A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit, so only these are caught.
        HG_non_valids.append(seqRecord.id)

In [None]:
# Display how many HG records were flagged as non-valid above.
len(HG_non_valids)

In [None]:
#### Table with all per-protein aggregation propensity scores
# delta_aggregation is Aggregation_x minus Aggregation_y; its z-score is then
# labelled HIGH / LOW / non significant by is_significant (cut-offs at +/-2).
prot_agg_table = pd.read_csv(
    '../../data/aggregation_propensity/HGMM_agg_scores.csv',
    sep=',',
)
prot_agg_table['delta_aggregation'] = (
    prot_agg_table['Aggregation_x'] - prot_agg_table['Aggregation_y']
)
prot_agg_table['delta_agg_z-scores'] = stats.zscore(
    prot_agg_table['delta_aggregation']
)
prot_agg_table['difference_of_aggregation'] = (
    prot_agg_table['delta_agg_z-scores'].map(is_significant)
)

# Mutation-tolerance scores (tab-separated despite the .csv extension).
prot_muttol_table = pd.read_csv(
    '../../data/mutation_tolerance/all_mt_scores_for_revisions.csv',
    sep='\t',
)

# Proteins whose aggregation delta lies beyond +/-2 standard deviations.
HG_high_agg = prot_agg_table.loc[prot_agg_table['delta_agg_z-scores'] > 2]
HG_low_agg = prot_agg_table.loc[prot_agg_table['delta_agg_z-scores'] < -2]

# Mutation-tolerance rows restricted to those proteins, tagged HIGH / LOW.
sign_agg_table = prot_muttol_table[
    prot_muttol_table['proteinID_x'].isin(HG_high_agg['proteinID_x'])
    | prot_muttol_table['proteinID_x'].isin(HG_low_agg['proteinID_x'])
].reset_index(drop=True)
sign_agg_table['AGG_DIFF'] = sign_agg_table['proteinID_x'].map(cluster)

In [None]:
# Display the number of rows in the mutation-tolerance table.
len(prot_muttol_table)

In [None]:
# Display the rows whose proteinID_x was flagged in HG_non_valids.
prot_muttol_table[(prot_muttol_table['proteinID_x'].isin(HG_non_valids)) ]

In [None]:
# Display the rows where neither ortholog was flagged as non-valid.
# The negation must wrap the whole OR: `~A | B` negates only A (unary `~`
# binds tighter than `|`) and would keep rows whose proteinID_y is non-valid.
prot_muttol_table[~((prot_muttol_table['proteinID_x'].isin(HG_non_valids)) | (prot_muttol_table['proteinID_y'].isin(MM_non_valids)))]

In [None]:
# Rows where either ortholog failed the start-codon/translation check are
# treated as outliers; no_outliers is the table with those rows dropped.
outliers = prot_muttol_table.loc[
    prot_muttol_table['proteinID_x'].isin(HG_non_valids)
    | prot_muttol_table['proteinID_y'].isin(MM_non_valids)
]
no_outliers = prot_muttol_table.drop(index=outliers.index)

In [None]:
def generate_figure_4A(prot_muttol_table, col_x, col_y, plot_title):
    """Scatter ``col_x`` against ``col_y`` and print their Pearson correlation.

    Points are coloured by whether the z-scored difference ``col_x - col_y``
    is beyond +/-2 (see ``is_significant``). The x axis is labelled as naked
    mole-rat and the y axis as mouse.

    Parameters
    ----------
    prot_muttol_table : pandas.DataFrame
        Table containing ``col_x`` and ``col_y``. It is modified in place:
        the columns 'diff_mut', 'diff_mut_z-scores' and 'MT_DIFF' are
        added (or overwritten on repeated calls).
    col_x, col_y : str
        Column names plotted on the x and y axes respectively.
    plot_title : str
        Selects the axis label ('only_beneficial_mutations' and
        'only_detrimental_mutations' are special-cased) and is echoed in the
        printed header.

    Returns
    -------
    None
    """
    sns.set_context("paper", font_scale=2)
    sns.set_style("ticks")

    # NOTE: these assignments write onto the caller's DataFrame.
    prot_muttol_table['diff_mut'] = prot_muttol_table[col_x] - prot_muttol_table[col_y]
    prot_muttol_table['diff_mut_z-scores'] = stats.zscore(prot_muttol_table['diff_mut'])
    prot_muttol_table['MT_DIFF'] = prot_muttol_table['diff_mut_z-scores'].apply(is_significant)

    if plot_title == 'only_beneficial_mutations':
        label_y = 'Proportion of beneficial mutations'
    elif plot_title == 'only_detrimental_mutations':
        label_y = 'Proportion of detrimental mutations'
    else:
        # Species-neutral on purpose: ' in mouse' / ' in naked mole-rat' is
        # appended below (the previous 'Mouse ...' prefix produced
        # 'Mouse ... in naked mole-rat' on the x axis).
        label_y = 'Lenient mutation tolerance'

    fig = plt.figure(figsize=(8, 8))
    sns.scatterplot(x=prot_muttol_table[col_x], y=prot_muttol_table[col_y], hue=prot_muttol_table['MT_DIFF'], hue_order=['non significant', 'HIGH', 'LOW'], palette=['black', 'purple', 'salmon'], legend=False)
    plt.ylabel(f'{label_y} in mouse')
    plt.xlabel(f'{label_y} in naked mole-rat')

    # despine acts on the axes that already exist in the target figure, so it
    # has to run after plotting and is bound explicitly to this figure.
    sns.despine(fig=fig, offset=20)

    print(f'Stats for mouse vs. naked mole-rat {plot_title} ')
    corr, pval = stats.pearsonr(x=prot_muttol_table[col_x], y=prot_muttol_table[col_y])
    print('Correlation between mutation tolerance')
    print(corr, pval)

    # fig.savefig(f'../../figures/revisions/mutation_tolerance/{plot_title}.png', format='png', dpi=300)
    # fig.savefig(f'../../figures/revisions/mutation_tolerance/FIGURE_{plot_title}.svg', format='svg', dpi=300)

def generate_figure_4D(sign_agg_table, prot_muttol_table, col_y, plot_title):
    """Joint plot of ``col_y`` against 'Aggregation_y' plus HIGH-vs-LOW tests.

    Draws ``sign_agg_table`` as a seaborn joint plot coloured by 'AGG_DIFF'
    (axes labelled for mouse), then compares ``col_y`` between the rows of
    ``prot_muttol_table`` whose 'proteinID_x' is in the notebook-level
    ``HG_high_agg`` versus ``HG_low_agg`` tables, printing the two-sample
    KS statistic / p-value and the t-test result.

    Parameters
    ----------
    sign_agg_table : pandas.DataFrame
        Rows to plot; must contain ``col_y``, 'Aggregation_y' and 'AGG_DIFF'.
    prot_muttol_table : pandas.DataFrame
        Rows used for the statistical tests; must contain ``col_y`` and
        'proteinID_x'.
    col_y : str
        Column plotted on the y axis and compared between the two groups.
    plot_title : str
        Selects the y-axis label and is echoed in the printed header.

    Returns
    -------
    None
    """
    sns.set_context("paper", font_scale=2)
    sns.set_style("ticks")
    # Kept from the original: with no fig/ax argument this call styles the
    # figure that is current on entry (i.e. a panel drawn just before this
    # function), not the joint plot created below.
    sns.despine(offset=20)

    # Global matplotlib default, left as in the original.
    # NOTE(review): jointplot takes its size from its own `height` argument,
    # so this presumably does not affect the joint plot itself - confirm.
    plt.rcParams["figure.figsize"] = (8, 8)

    if plot_title == 'only_beneficial_mutations':
        label_y = 'Proportion of beneficial mutations in mouse'
    elif plot_title == 'only_detrimental_mutations':
        label_y = 'Proportion of detrimental mutations in mouse'
    else:
        label_y = 'Mouse lenient mutation tolerance'

    # NOTE(review): no hue_order is given, so which AGG_DIFF label gets 'red'
    # depends on the order seaborn derives from the data - confirm intended.
    j = sns.jointplot(data=sign_agg_table, y=col_y, x='Aggregation_y',
                      hue='AGG_DIFF', palette=['red', 'blue'], legend=False)
    j.set_axis_labels('Mouse whole-protein sequence \naggregation propensity score',
                      label_y, fontsize=16)
    # Apply the offset despine to this joint plot's own figure; the call at
    # the top of the function cannot reach it because it runs before the
    # figure exists.
    sns.despine(fig=j.ax_joint.figure, offset=20)

    MT_vs_HIGH_AGG = prot_muttol_table[prot_muttol_table['proteinID_x'].isin(
        HG_high_agg['proteinID_x'])]
    MT_vs_LOW_AGG = prot_muttol_table[prot_muttol_table['proteinID_x'].isin(
        HG_low_agg['proteinID_x'])]

    print(f'\nStats for {plot_title} in mouse')
    print('Difference of distribution in mutation tolerance between proteins with sgnificant difference of aggregation propensity in mouse')
    ks, pval = stats.kstest(MT_vs_HIGH_AGG[col_y], MT_vs_LOW_AGG[col_y])
    print(ks, pval)
    print(stats.ttest_ind(MT_vs_HIGH_AGG[col_y], MT_vs_LOW_AGG[col_y]))
    # plt.annotate(f'KS test p-value={pval:0.2e}', xy=(2,4), xycoords='figure points')

    # j.savefig(f'../../figures/revisions/mutation_tolerance/{plot_title}_mouse.png', format='png', dpi=300)
    # j.savefig('../../figures/revisions/mutation_tolerance/FIGURE4_C_v1.svg', format='svg', dpi=300)

def generate_figure_4E(sign_agg_table, prot_muttol_table, col_x, plot_title):
    """Joint plot of ``col_x`` against 'Aggregation_x' plus HIGH-vs-LOW tests.

    Draws ``sign_agg_table`` as a seaborn joint plot coloured by 'AGG_DIFF'
    (axes labelled for naked mole-rat), then compares ``col_x`` between the
    rows of ``prot_muttol_table`` whose 'proteinID_x' is in the
    notebook-level ``HG_high_agg`` versus ``HG_low_agg`` tables, printing the
    two-sample KS statistic / p-value and the t-test result.

    Parameters
    ----------
    sign_agg_table : pandas.DataFrame
        Rows to plot; must contain ``col_x``, 'Aggregation_x' and 'AGG_DIFF'.
    prot_muttol_table : pandas.DataFrame
        Rows used for the statistical tests; must contain ``col_x`` and
        'proteinID_x'.
    col_x : str
        Column plotted on the y axis of the joint plot and compared between
        the two groups.
    plot_title : str
        Selects the y-axis label and is echoed in the printed header.

    Returns
    -------
    None
    """
    sns.set_context("paper", font_scale=2)
    sns.set_style("ticks")
    # Kept from the original: with no fig/ax argument this call styles the
    # figure that is current on entry (i.e. a panel drawn just before this
    # function), not the joint plot created below.
    sns.despine(offset=20)

    # Global matplotlib default, left as in the original.
    # NOTE(review): jointplot takes its size from its own `height` argument,
    # so this presumably does not affect the joint plot itself - confirm.
    plt.rcParams["figure.figsize"] = (8, 8)

    if plot_title == 'only_beneficial_mutations':
        label_y = 'Proportion of beneficial mutations in naked mole-rat'
    elif plot_title == 'only_detrimental_mutations':
        label_y = 'Proportion of detrimental mutations in naked mole-rat'
    else:
        label_y = 'Naked mole-rat lenient mutation tolerance'

    # NOTE(review): no hue_order is given, so which AGG_DIFF label gets 'red'
    # depends on the order seaborn derives from the data - confirm intended.
    j = sns.jointplot(data=sign_agg_table, y=col_x, x='Aggregation_x',
                      hue='AGG_DIFF', palette=['red', 'blue'], legend=False)
    j.set_axis_labels('Naked mole-rat whole-protein sequence \naggregation propensity score',
                      label_y, fontsize=16)
    # Apply the offset despine to this joint plot's own figure; the call at
    # the top of the function cannot reach it because it runs before the
    # figure exists, so this panel would otherwise miss the offset styling.
    sns.despine(fig=j.ax_joint.figure, offset=20)

    MT_vs_HIGH_AGG = prot_muttol_table[prot_muttol_table['proteinID_x'].isin(
        HG_high_agg['proteinID_x'])]
    MT_vs_LOW_AGG = prot_muttol_table[prot_muttol_table['proteinID_x'].isin(
        HG_low_agg['proteinID_x'])]

    print(f'\nStats for {plot_title} in naked mole-rat')
    print('Difference of distribution in mutation tolerance between proteins with sgnificant difference of aggregation propensity in naked mole-rat')
    ks, pval = stats.kstest(MT_vs_HIGH_AGG[col_x], MT_vs_LOW_AGG[col_x])
    print(ks, pval)
    print(stats.ttest_ind(MT_vs_HIGH_AGG[col_x], MT_vs_LOW_AGG[col_x]))
    # plt.annotate(f'KS test p-value={pval:0.2e}', xy=(2,4), xycoords='figure points')
    # j.savefig(f'../../figures/revisions/mutation_tolerance/{plot_title}_nkr.png', format='png', dpi=300)
    # j.savefig('../../figures/revisions/mutation_tolerance/FIGURE4_D_v1.svg', format='svg', dpi=300)


In [None]:
# col1 = 'mutTol_0_x'
# col2= 'mutTol_0_y'
# title = 'mutation_tolerance_0'
# generate_figure_4A(no_outliers, col1, col2, title)
# generate_figure_4D(sign_agg_table, no_outliers, col2, title)
# generate_figure_4E(sign_agg_table, no_outliers, col1, title)

In [None]:
# Generate the three panels for the 'mutTol_minus0' score columns.
# 'mutation_minus0' is not one of the special-cased titles, so the plotting
# functions fall back to their default axis labels.
col1 = 'mutTol_minus0_x'
col2= 'mutTol_minus0_y'
title = 'mutation_minus0'
generate_figure_4A(no_outliers, col1, col2, title)
generate_figure_4D(sign_agg_table, no_outliers, col2, title)
generate_figure_4E(sign_agg_table, no_outliers, col1, title)

In [None]:
# col_x = 'mutTol_plus0_x'
# col_y = 'mutTol_plus0_y'
# title = 'mutation_plus0'
# generate_figure_4A(no_outliers, col_x, col_y, title)
# generate_figure_4D(sign_agg_table, no_outliers, col_y, title)
# generate_figure_4E(sign_agg_table, no_outliers, col_x, title)

In [None]:
# Generate the three panels for the 'bm_r' columns; the title selects the
# 'Proportion of beneficial mutations' axis labels in the plotting functions.
col1 = 'bm_r_x'
col2= 'bm_r_y'
title = 'only_beneficial_mutations'
generate_figure_4A(no_outliers, col1, col2, title)
generate_figure_4D(sign_agg_table, no_outliers, col2, title)
generate_figure_4E(sign_agg_table, no_outliers, col1, title)

In [None]:
# col1 = 'dm_r_x'
# col2= 'dm_r_y'
# title = 'only_detrimental_mutations'
# generate_figure_4A(no_outliers, col1, col2, title)
# generate_figure_4D(sign_agg_table, no_outliers, col2, title)
# generate_figure_4E(sign_agg_table, no_outliers, col1, title)