In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from os import path, listdir, mkdir
from scipy.stats import iqr
# Global seaborn theme: white figure background, then the 'whitegrid' axes style.
sns.set(rc={"figure.facecolor": "white"})
sns.set(style="whitegrid")
# Marker glyph per regulation direction; metrics() passes this mapping to
# seaborn when UP and DOWN scores are drawn on the same axes.
markers = dict(UP="o", DOWN="X")

In [50]:
def _de_thresholds(df, method, fold_change, alpha):
    """Return (t_h, up_fold, down_fold) cut-offs for one result table.

    'static' uses -log10(alpha) and +/-log2(fold_change); 'semi-dynamic'
    replaces the significance cut-off with Q3 + 1.5*IQR of the
    '-log10(fdr_BH)' column; 'dynamic' additionally derives the fold-change
    cut-offs as Q3 + 1.5*IQR / Q1 - 1.5*IQR of 'log2(fold_change)'.
    """
    if method == 'static':
        t_h = -np.log10(alpha)
    else:
        significance = df['-log10(fdr_BH)']
        t_h = np.quantile(significance, 0.75) + 1.5 * iqr(significance)

    if method == 'dynamic':
        log_fc = df['log2(fold_change)']
        fc_spread = 1.5 * iqr(log_fc)
        up_fold = np.quantile(log_fc, 0.75) + fc_spread
        down_fold = np.quantile(log_fc, 0.25) - fc_spread
    else:
        up_fold = np.log2(fold_change)
        down_fold = np.log2(1 / fold_change)
    return t_h, up_fold, down_fold


def _score_summary(de_df):
    """Summarise |log2FC * -log10(FDR)| over the rows of ``de_df``."""
    scores = np.abs(de_df['log2(fold_change)'].to_numpy(dtype=float)
                    * de_df['-log10(fdr_BH)'].to_numpy(dtype=float))
    # The raw product overflows float64 after a few hundred features, so its
    # log10 is accumulated as a sum of logs instead of log10(prod).
    with np.errstate(over='ignore'):
        product = np.prod(scores)
    return {'b1': np.sum(scores),
            'b2': product,
            'log10(b2)': np.sum(np.log10(scores))}


def metrics(path_to_data, begin_pattern, end_pattern, method, reg_type, output_dir, 
            dataname_to_plot = None, fold_change = 2, alpha = 0.01):
    """Score differential-expression tables and plot the scores per sample.

    Every file in the given directories whose name starts with
    ``begin_pattern`` is read as a tab-separated table with the columns
    'log2(fold_change)' and '-log10(fdr_BH)'. Rows passing the thresholds are
    summarised as b1 = sum |log2FC * -log10(FDR)| and b2 = the product of the
    same terms (plus log10(b2)).

    Parameters
    ----------
    path_to_data : dict
        Maps a dataset label to the directory holding its result files.
    begin_pattern, end_pattern : str
        The sample name is the part of the file name between these two strings.
    method : {'static', 'semi-dynamic', 'dynamic'}
        How the significance / fold-change thresholds are chosen.
    reg_type : {'UP', 'DOWN', 'DE', 'UP+DOWN'}
        Which regulated features are scored. 'UP+DOWN' scores up- and
        down-regulated features separately (two rows per sample).
    output_dir : str
        Directory for the TSV and PNG outputs; created if missing.
    dataname_to_plot : dict, optional
        Maps a raw sample name to its display label. Names missing from the
        mapping are kept as they are.
    fold_change : float
        Linear fold-change cut-off ('static' and 'semi-dynamic' only).
    alpha : float
        FDR cut-off ('static' only).

    Returns
    -------
    None
        Invalid arguments, missing input directories and an empty file
        selection are reported with a printed 'Error: ...' message.

    Writes 'metrics_sample_file.tsv', 'metrics_<method>_<reg_type>.tsv' and
    one 'metric_<column>_<method>_<reg_type>.png' per plotted column into
    ``output_dir``. Relies on the module-level ``markers`` mapping.
    """
    from os import makedirs  # local import: the file only imports os.mkdir

    if method not in ('semi-dynamic', 'static', 'dynamic'):
        print('Error: check method')
        return
    if reg_type not in ('DOWN', 'DE', 'UP+DOWN', 'UP'):
        print('Error: check reg_type')
        return
    for data_dir in path_to_data.values():
        if not path.exists(data_dir):
            print('Error: {} not exist'.format(data_dir))
            return

    # Collect the input files; listdir order is arbitrary, so sort for
    # reproducible output.
    sample_records = []
    for dataset, data_dir in path_to_data.items():
        for file in sorted(listdir(data_dir)):
            if not file.startswith(begin_pattern):
                continue
            dataname = file.split(begin_pattern)[1].split(end_pattern)[0]
            if dataname_to_plot:
                # Fall back to the raw name rather than raising KeyError.
                dataname = dataname_to_plot.get(dataname, dataname)
            sample_records.append({'path to file': path.join(data_dir, file),
                                   'dataset': dataset,
                                   'dataname': dataname})

    if not sample_records:
        # Nothing to score or plot (a zero-height figure would fail later).
        print('Error: no files starting with {} found'.format(begin_pattern))
        return

    # The directory must exist before the first file is written into it.
    makedirs(output_dir, exist_ok=True)

    sample_df = pd.DataFrame(sample_records, columns=['path to file', 'dataset', 'dataname'])
    sample_df.to_csv(path.join(output_dir, 'metrics_sample_file.tsv'), sep='\t', index=True)

    rows = []
    for sample in sample_records:
        df = pd.read_csv(sample['path to file'], sep='\t')
        df = df[['log2(fold_change)', '-log10(fdr_BH)']]
        t_h, up_fold, down_fold = _de_thresholds(df, method, fold_change, alpha)

        significant = df['-log10(fdr_BH)'] > t_h
        is_up = df['log2(fold_change)'] >= up_fold
        is_down = df['log2(fold_change)'] < down_fold

        if reg_type == 'UP':
            de_df = df[significant & is_up]
        elif reg_type == 'DOWN':
            de_df = df[significant & is_down]
        else:  # 'DE' and 'UP+DOWN' both start from all regulated features
            de_df = df[significant & (is_up | is_down)]

        if reg_type == 'UP+DOWN':
            # Split by the sign of the fold change, one score row per direction.
            groups = [('UP', de_df[de_df['log2(fold_change)'] > 0]),
                      ('DOWN', de_df[de_df['log2(fold_change)'] < 0])]
        else:
            groups = [(reg_type, de_df)]

        for direction, group in groups:
            row = _score_summary(group)
            row.update({'dataname': sample['dataname'],
                        'dataset': sample['dataset'],
                        'reg_type': direction})
            rows.append(row)

    metrics_df = pd.DataFrame(
        rows, columns=['b1', 'b2', 'log10(b2)', 'dataname', 'dataset', 'reg_type'])
    metrics_df.to_csv(path.join(output_dir, 'metrics_{}_{}.tsv'.format(method, reg_type)), sep='\t')

    order = sample_df['dataset'].unique()
    sns.set_palette('hls', len(order))
    # Half an inch per sample, with a floor so small runs still fit the labels.
    figsize = (10, max(len(sample_records) * 0.5, 2))

    # Raw strings: the LaTeX backslashes are not Python escape sequences.
    metric = ['b1', 'log10(b2)']
    metric_name = [
        r'$ \pi_{1} = \sum_{i = 1}^{n} | \  log2FC_{i} \cdot log10FDR_{i} \ | $',
        r'$ \pi_{2} = log_{10} ( \prod_{i = 1}^{n} \ | \  log2FC_{i} \cdot log10FDR_{i} \ | ) $',
    ]

    for m, n in zip(metric, metric_name):
        f, ax = plt.subplots(figsize=figsize)
        plot_kwargs = dict(x=m, y='dataname', hue='dataset', hue_order=order,
                           data=metrics_df.sort_values(by=m, ascending=False),
                           ax=ax, s=100)
        if reg_type == 'UP+DOWN':
            sns.scatterplot(style='reg_type', markers=markers, **plot_kwargs)
        else:
            sns.scatterplot(marker='o', **plot_kwargs)

        ax.tick_params(axis='both', labelsize=10)
        ax.set_ylabel(None)
        ax.set_xlabel(n, fontsize=12)
        ax.set_title('{} thresholds'.format(method), fontsize=10)
        f.tight_layout()
        ax.legend(fontsize=10)
        plot_name = 'metric_{}_{}_{}.png'.format(m, method, reg_type)
        f.savefig(path.join(output_dir, plot_name), dpi=300)
        plt.close(f)
        




In [51]:
# Dataset label -> directory holding that dataset's result tables.
# Insertion order is kept; metrics() derives the legend order from it.
path_to_data = {
    "DBTRG&HOS IFNa": "/home/kae-13-1/Metrics/NSAF_files/new_mv_strategy/kNN/a172_dbtrg",
    "GBM2017 IFNa": "/home/kae-13-1/Metrics/NSAF_files/new_mv_strategy/kNN/glioblastoma_2017_infa",
    "GBM2017 IFNb": "/home/kae-13-1/Metrics/NSAF_files/new_mv_strategy/kNN/glioblastoma_2017_infb",
    "GBM2019 IFNa": "/home/kae-13-1/Metrics/NSAF_files/new_mv_strategy/kNN/glioblastoma_2019",
    "MRC5 IFNa": "/home/kae-13-1/Metrics/NSAF_files/new_mv_strategy/kNN/time_infa",
    "MRC5 IFNg": "/home/kae-13-1/Metrics/NSAF_files/new_mv_strategy/kNN/time_infg",
}

# Sample name as it appears in the file name -> label shown on the plot axis.
dataname_to_plot = {
    "time_infg_48": "MRC5 IFNg 500 48h",
    "time_infg_24": "MRC5 IFNg 500 24h",
    "glioblastoma_2017_infa_b": "GBM2017b IFNa-2b 100 24h",
    "a172_dbtrg_DBTRG": "DBTRG IFNa-2b 1000 24h",
    "a172_dbtrg_A172": "HOS IFNa-2b 1000 24h",
    "time_infg_4": "MRC5 IFNg 500 4h",
    "glioblastoma_2017_infa_i": "GBM2017i IFNa-2b 100 24h",
    "glioblastoma_2019_5522": "GBM5522 IFNa-2b 100 24h",
    "time_infa_4": "MRC5 IFNa-2b 500 4h",
    "glioblastoma_2019_AN": "AN IFNa-2b 100 24h",
    "glioblastoma_2017_infb_i": "GBM2017i IFNb-2b 1000 24h",
    "glioblastoma_2017_infb_b": "GBM2017b IFNb-2b 1000 24h",
    "time_infa_24": "MRC5 IFNa-2b 500 24h",
    "glioblastoma_2019_6067": "GBM6067 IFNa-2b 100 24h",
    "time_infa_48": "MRC5 IFNa-2b 500 48h",
    "glioblastoma_2019_3821": "GBM3821 IFNa-2b 100 24h",
    "glioblastoma_2019_4114": "GBM4114 IFNa-2b 100 24h",
}

# The sample name is the part of a file name between these two strings.
begin_pattern = "NSAF_t-test_"
end_pattern = ".tsv"

# Run parameters.
method = "static"        # one of: 'static', 'semi-dynamic', 'dynamic'
reg_type = "UP+DOWN"     # one of: 'UP', 'DOWN', 'DE', 'UP+DOWN'
fold_change = 3
alpha = 0.01
output_dir = "/home/kae-13-1/Metrics/test_fun"

In [52]:
# Score every dataset with quantile/IQR-derived ('dynamic') thresholds,
# reporting up- and down-regulated features separately.
# NOTE(review): these literals are passed instead of the configuration
# variables defined above (method is 'static' there) - confirm which is intended.
metrics(
    path_to_data,
    begin_pattern='NSAF_t-test_',
    end_pattern='.tsv',
    method='dynamic',
    reg_type='UP+DOWN',
    output_dir='/home/kae-13-1/Metrics/test_fun',
    dataname_to_plot=dataname_to_plot,
)