In [12]:
import pandas as pd
import plotly.io as pio
from pathlib import Path
import matrix_transform
import numpy as np
import plotly.graph_objects as go
import snakemake_funcs as sf
from Bio.Seq import Seq
import visualize
from scipy import stats
import plotly.express as px
import re
import function_bio_rep
import statsmodels.stats.multitest as multitest

# Notebook-wide display/export configuration.
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask plotly's orca export backend to run under Xvfb
# (NOTE(review): presumably needed because this runs on a headless Linux host — confirm).
pio.orca.config.use_xvfb = True
# Four-colour hex palette shared by the plots in this notebook.
colors = ['#D81B60', '#1E88E5', '#FFC107', '#31B547']

In [2]:
# One-letter amino-acid codes followed by '*', in a fixed order.
# NOTE(review): the ordering presumably groups residues by side-chain
# property and '*' presumably marks a stop codon — confirm.
grouped_aa = list('HKRDECMNQSTAILVFWYGP*')

# Reference protein sequence (the name suggests the full-length wild type),
# assembled from fixed-width segments to keep the literal readable.
wt_full = ''.join([
    'MSGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICT',
    'SEDMLNPNYEDLLIRKSNHNFLVQAGNVQLRVIGHSMQNCVLKLKV',
    'DTANPKTPKYKFVRIQPGQTFSVLACYNGSPSGVYQCAMRPNFTIK',
    'GSFLNGSCGSVGFNIDYDCVSFCYMHHMELPTGVHAGTDLEGNFYG',
    'PFVDRQTAQAAGTDTTITVNVLAWLYAAVINGDRWFLNRFTTTLND',
    'FNLVAMKYNYEPLTQDHVDILGPLSAQTGIAVLDMCASLKELLQNG',
    'MNGRTILGSALLEDEFTPFDVVRQCSGVTFQ',
])

# File name of the sample spreadsheet (relative to the working directory).
spreadsheet = 'sample_spreadsheet_042021.csv'

In [3]:
def p_values_means(comp1, comp2, cond, base_dir='/home/jennysheng/Yeast'):
    '''
    Welch t-test between two conditions for every (codon, residue) cell.

    For each condition three wide CSV tables are read from ``base_dir``:

    * ``wt_STOP_matrices<cond>/<comp>.csv`` -- per-cell means
    * ``std_wt_STOP<cond>/<comp>.csv``      -- per-cell standard deviations
    * ``len_variants<cond>/<comp>.csv``     -- per-cell number of observations

    Each table is melted to long form (row label -> ``index``, column label
    -> ``variable``) and the six tables are inner-joined on those two keys,
    so only cells present in every table are reported.

    Parameters
    ----------
    comp1 : str
        Condition on the x axis (CSV file stem).
    comp2 : str
        Condition on the y axis (CSV file stem).
    cond : str
        Directory suffix, either '_nosyn' or ''.
        NOTE(review): presumably '_nosyn' selects tables computed without
        synonymous codings and '' the ones with them -- confirm.
    base_dir : str or pathlib.Path, optional
        Directory containing the three table folders. Defaults to the
        location the notebook was originally run against.

    Returns
    -------
    pandas.DataFrame
        Columns ``residue`` (the table column label), ``codon`` (the table
        row label), ``t_stat`` and ``p_value``. ``t_stat`` / ``p_value`` are
        NaN for cells where either condition has at most one observation or
        a mean of exactly 0.
    '''
    root = Path(base_dir)
    keys = ['index', 'variable']

    def _load_pair(prefix, label):
        # Read the same kind of table for both conditions, melt each to long
        # form and join them into `<label>_x` / `<label>_y` columns.
        long_tables = []
        for comp, suffix in ((comp1, '_x'), (comp2, '_y')):
            wide = pd.read_csv(root / (prefix + cond) / (comp + '.csv'),
                               index_col=0)
            melted = pd.melt(wide.reset_index(), id_vars='index')
            long_tables.append(
                melted.rename(columns={'value': label + suffix}))
        return long_tables[0].merge(long_tables[1], on=keys)

    means = _load_pair('wt_STOP_matrices', 'value')
    stds = _load_pair('std_wt_STOP', 'std')
    nobs = _load_pair('len_variants', 'nobs')
    all_stats = means.merge(stds, on=keys).merge(nobs, on=keys)

    # A t-test needs more than one observation per sample; cells whose mean
    # is exactly 0 are skipped as well.
    # NOTE(review): 0 presumably marks a cell with no measurement -- confirm.
    testable = (
        (all_stats['nobs_x'] > 1)
        & (all_stats['nobs_y'] > 1)
        & (all_stats['value_x'] != 0)
        & (all_stats['value_y'] != 0)
    ).to_numpy()

    t_stat = np.full(len(all_stats), np.nan)
    p_value = np.full(len(all_stats), np.nan)
    if testable.any():
        chosen = all_stats.loc[testable]
        # Argument order is (mean, std, nobs) per sample: the second value of
        # each triple must be the standard deviation, not the mean again.
        t_stat[testable], p_value[testable] = stats.ttest_ind_from_stats(
            chosen['value_x'].to_numpy(dtype=float),
            chosen['std_x'].to_numpy(dtype=float),
            chosen['nobs_x'].to_numpy(dtype=float),
            chosen['value_y'].to_numpy(dtype=float),
            chosen['std_y'].to_numpy(dtype=float),
            chosen['nobs_y'].to_numpy(dtype=float),
            equal_var=False)

    summary = pd.DataFrame({
        'residue': all_stats['variable'].to_numpy(),
        'codon': all_stats['index'].to_numpy(),
        't_stat': t_stat,
        'p_value': p_value,
    })
    return summary

In [4]:
# Per-cell t statistics and p values for Glu_Gal (x axis) vs Glu_Gc (y axis),
# read from the un-suffixed table directories (cond='').
p_val_syn = p_values_means(comp1='Glu_Gal', comp2='Glu_Gc', cond='')
# Optional export, left disabled:
# p_val_syn.to_csv('pval_Glu_Gal_Glu_Gc_syn.csv')

In [5]:
# Same comparison, Glu_Gal (x axis) vs Glu_Gc (y axis), read from the
# '_nosyn' table directories.
p_val_nosyn = p_values_means(comp1='Glu_Gal', comp2='Glu_Gc', cond='_nosyn')
# Optional export, left disabled:
# p_val_nosyn.to_csv('pval_Glu_Gal_Glu_Gc_nosyn.csv')