In [1]:
# Loads the all-against-all statistical comparison generated with EC50_plot_4 and applies a p-value correction for multiple hypothesis testing.

# The outcome is printed with the same format as the input file:


# Day:	0
# 	Non-survival	Vs	Mild	->	pval: 0.12613769431297328
# 		Non-survival: 4 instances
# 		Mild: 13 instances
# 	Non-survival	Vs	Severe	->	pval: 0.3894739290014503
# 		Non-survival: 4 instances
# 		Severe: 9 instances
# 	Mild	Vs	Severe	->	pval: 0.46248196059978564
# 		Mild: 13 instances
# 		Severe: 9 instances
# ###################################
# Day:	1
# 	Non-survival	Vs	Mild	->	pval: 0.09076155031139914
# 		Non-survival: 8 instances
# 		Mild: 16 instances
# 	Non-survival	Vs	Severe	->	pval: 0.1928126253627327
# 		Non-survival: 8 instances
# 		Severe: 15 instances
# 	Mild	Vs	Severe	->	pval: 0.8275489161908975
# 		Mild: 16 instances
# 		Severe: 15 instances
# ###################################


In [19]:
# imports
import pandas as pd
import numpy as np
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multitest import fdrcorrection

In [6]:
class CORRECTPVAL(object):
    """Parse a pairwise-comparison report and correct its p-values for multiple testing.

    The input is a tab-separated text report made of blocks like::

        Day:<TAB>0
        <TAB>Non-survival<TAB>Vs<TAB>Mild<TAB>-><TAB>pval: 0.126
        <TAB><TAB>Non-survival: 4 instances
        <TAB><TAB>Mild: 13 instances
        ###################################

    Typical use: ``readfile()`` -> ``correctpval_alldays(alpha)`` and/or
    ``correctpval_byday(alpha)`` -> ``save_correction()``.

    Attributes
    ----------
    infile : str
        Path of the report to parse.
    df : pandas.DataFrame
        One row per comparison with columns Day, Comparison, Size1, Size2, pval.
    df_filt : pandas.DataFrame
        Rows of ``df`` where both group sizes are >= ``mindatsize``; the
        correction methods add their result columns to this frame.
    mindatsize : int
        Minimum number of instances required in each group of a comparison.
    """

    # Column layout of the parsed table.
    _COLUMNS = ['Day', 'Comparison', 'Size1', 'Size2', 'pval']
    # Line that separates the per-day sections of the report.
    _SEPARATOR = '###################################'

    def __init__(self, infile, mindatsize=5):
        """Store the input path and create empty result tables.

        Parameters
        ----------
        infile : str
            Path of the comparison report.
        mindatsize : int, optional
            Minimum group size kept by ``readfile`` (default 5, the value that
            used to be hard-coded).
        """
        self.infile = infile
        self.df = pd.DataFrame(columns=self._COLUMNS)
        self.df_filt = pd.DataFrame()
        self.mindatsize = mindatsize

    @staticmethod
    def update_df(df, day, pair, size1, size2, pval):
        """Return a new DataFrame equal to ``df`` plus one comparison row.

        ``DataFrame.append`` no longer exists in pandas >= 2.0, so the row is
        added with ``pd.concat``. ``df`` itself is not modified.

        Parameters
        ----------
        df : pandas.DataFrame or None
            Table to extend; ``None`` or an empty table yields a one-row table.
        day, pair, size1, size2, pval
            Values stored under Day, Comparison, Size1, Size2 and pval.

        Returns
        -------
        pandas.DataFrame
        """
        row = pd.DataFrame([{'Day': day, 'Comparison': pair, 'Size1': size1,
                             'Size2': size2, 'pval': pval}])
        if df is None or len(df) == 0:
            # Concatenating with an empty frame is deprecated in recent pandas;
            # return the row alone but keep the column layout of `df`.
            if df is not None and len(df.columns) > 0:
                ordered = list(df.columns) + [c for c in row.columns
                                              if c not in df.columns]
                row = row.reindex(columns=ordered)
            return row
        return pd.concat([df, row], ignore_index=True)

    def readfile(self):
        """Parse ``self.infile`` into ``self.df`` and build ``self.df_filt``.

        ``self.df`` is rebuilt from scratch on every call, so calling this
        method twice does not duplicate rows. Blank lines and lines that match
        none of the known layouts are skipped.

        Raises
        ------
        ValueError
            If an "instances" line names a category that is not part of the
            comparison currently being read.
        """
        records = []
        day = cat1 = cat2 = pair = pval = size1 = None

        # `with` guarantees the handle is closed even when parsing fails.
        with open(self.infile, 'r') as handle:
            for lineno, raw in enumerate(handle, start=1):
                line = raw.rstrip()
                if not line.strip() or line == self._SEPARATOR:
                    continue
                fields = line.split('\t')

                if fields[0] == 'Day:':
                    day = int(fields[1])
                elif len(fields) > 2 and fields[2] == 'Vs':
                    # <TAB>cat1<TAB>Vs<TAB>cat2<TAB>-><TAB>pval: X
                    cat1, cat2 = fields[1], fields[3]
                    pair = cat1 + '_Vs_' + cat2
                    pval = float(fields[5].replace('pval: ', ''))
                    # Forget the size of the previous comparison so it cannot
                    # leak into this one if its first size line is missing.
                    size1 = None
                elif len(fields) > 2 and 'instances' in fields[2]:
                    # <TAB><TAB>category: N instances
                    cat, _, count = fields[2].rpartition(': ')
                    size = int(count.split(' ')[0])
                    if cat == cat1:
                        size1 = size
                    elif cat == cat2:
                        # The second size line completes the comparison.
                        records.append({'Day': day, 'Comparison': pair,
                                        'Size1': size1, 'Size2': size,
                                        'pval': pval})
                    else:
                        raise ValueError(
                            'Unrecognized category {!r} on line {} of {}'.format(
                                cat, lineno, self.infile))

        # Build the table once instead of growing it row by row.
        self.df = pd.DataFrame(records, columns=self._COLUMNS)
        big_enough = ((self.df['Size1'] >= self.mindatsize)
                      & (self.df['Size2'] >= self.mindatsize))
        self.df_filt = self.df.loc[big_enough].dropna(how='any').copy()

    @staticmethod
    def _fdr_bh(pvals, alphasig):
        """Run statsmodels' ``multipletests`` with method 'fdr_bh'.

        Returns the first two elements of its result: the boolean reject array
        and the array of corrected p-values, both in the order of ``pvals``.
        """
        result = multipletests(pvals, alpha=alphasig, method='fdr_bh',
                               is_sorted=False, returnsorted=False)
        return result[0], result[1]

    def correctpval_alldays(self, alphasig):
        """Correct all p-values of ``df_filt`` together (one family of tests).

        Adds the columns ``pvalcor_alldays`` and
        ``rejectNull_alldays_<alphasig>`` to ``self.df_filt``. Does nothing if
        ``df_filt`` has no rows.

        Parameters
        ----------
        alphasig : float
            Significance level passed to ``multipletests`` as ``alpha``.
        """
        if len(self.df_filt) == 0:
            return None

        pvals = self.df_filt['pval'].astype(float).to_numpy()
        reject, pvalcor = self._fdr_bh(pvals, alphasig)
        # The arrays are in row order, so they can be assigned directly.
        self.df_filt['pvalcor_alldays'] = pvalcor
        self.df_filt['rejectNull_alldays_' + str(alphasig)] = reject

    def correctpval_byday(self, alphasig):
        """Correct the p-values of ``df_filt`` separately for each Day.

        Adds the columns ``pvalcor_byday`` and ``rejectNull_byday_<alphasig>``
        to ``self.df_filt``. Does nothing if ``df_filt`` has no rows.

        Parameters
        ----------
        alphasig : float
            Significance level passed to ``multipletests`` as ``alpha``.
        """
        if len(self.df_filt) == 0:
            return None

        days = self.df_filt['Day'].to_numpy()
        pvals = self.df_filt['pval'].astype(float).to_numpy()
        pvalcor_all = np.full(len(pvals), np.nan)
        reject_all = np.zeros(len(pvals), dtype=bool)

        for day in pd.unique(days):
            # Positional mask: independent of the DataFrame index labels.
            in_day = days == day
            reject, pvalcor = self._fdr_bh(pvals[in_day], alphasig)
            pvalcor_all[in_day] = pvalcor
            reject_all[in_day] = reject

        self.df_filt['pvalcor_byday'] = pvalcor_all
        self.df_filt['rejectNull_byday_' + str(alphasig)] = reject_all

    def _output_path(self):
        """Return the Excel path derived from ``self.infile``.

        Only a trailing '.txt' is stripped, so a '.txt' elsewhere in the path
        is left alone, and an input without that extension never maps onto
        itself.
        """
        stem = self.infile
        if stem.endswith('.txt'):
            stem = stem[:-len('.txt')]
        return stem + '_pvalcor.xlsx'

    def save_correction(self):
        """Write ``self.df_filt`` to ``<infile without .txt>_pvalcor.xlsx``."""
        self.df_filt.to_excel(self._output_path())