In [4]:
from os import listdir, makedirs, mkdir, path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import iqr
# Plot theme: seaborn "whitegrid" style on a white figure background.
sns.set(style = 'whitegrid', rc = {'figure.facecolor': 'white'})

In [5]:
def _upper_fence(values):
    """Return the upper Tukey fence of ``values``: Q3 + 1.5 * IQR."""
    return np.quantile(values, 0.75) + 1.5 * iqr(values)


def _lower_fence(values):
    """Return the lower Tukey fence of ``values``: Q1 - 1.5 * IQR."""
    return np.quantile(values, 0.25) - 1.5 * iqr(values)


def volcano(plt_dir, data_dir, data_name,
            method, begin_pattern = 'NSAF_t-test', end_pattern = None, alpha = 0.05, fold_change = 2):
    """Draw one volcano plot per matching result table found in ``data_dir``.

    Every file in ``data_dir`` whose name starts with ``begin_pattern`` (and does
    not contain '.string.enrichment.tsv', '.string.enrichment_targets_in_terms.csv'
    or 'clusters') is read as a tab-separated table that must provide the columns
    '-log10(fdr_BH)' and 'log2(fold_change)'. For each table a seaborn JointGrid
    (scatter plot with marginal box plots) is saved as
    ``<plt_dir>/<data_name>/volcano_<data_name>_<strain><_method>.png``.

    Parameters
    ----------
    plt_dir : str
        Parent directory for the plots; ``<plt_dir>/<data_name>`` is created
        (including missing parents) if it does not exist.
    data_dir : str
        Directory that is scanned for input tables.
    data_name : str
        Name of the output sub-directory, also embedded in the file names.
    method : {'static', 'semi-dynamic', 'dynamic'}
        How the thresholds used for the up/down counts are chosen:
        'static'       - FDR threshold from ``alpha``, fold change from ``fold_change``;
        'semi-dynamic' - FDR threshold is Q3 + 1.5*IQR of '-log10(fdr_BH)',
                         fold change from ``fold_change``;
        'dynamic'      - FDR threshold as in 'semi-dynamic', fold-change
                         thresholds are the Tukey fences of 'log2(fold_change)'.
    begin_pattern : str
        Required file-name prefix; the label printed and used in the file name
        (``strain``) is the text that follows it.
    end_pattern : str or None
        Text at which the strain label is cut. With ``None`` the remainder is
        split on whitespace (``str.split(None)``) and its first token is used.
    alpha : float
        FDR level for the 'static' threshold; always drawn as the red line.
    fold_change : float
        Linear fold change for the 'static' and 'semi-dynamic' thresholds.

    Returns
    -------
    None
        Problems with ``method`` or ``data_dir`` are reported with ``print``
        and the function returns early without creating anything.
    """
    fdr_col = '-log10(fdr_BH)'
    fc_col = 'log2(fold_change)'
    skipped_markers = ('.string.enrichment.tsv',
                       '.string.enrichment_targets_in_terms.csv',
                       'clusters')

    if method not in ('semi-dynamic', 'static', 'dynamic'):
        print('Error: check method')
        return
    # isdir (not exists): a plain file here would otherwise crash in listdir.
    if not path.isdir(data_dir):
        print('Error: {} not exist'.format(data_dir))
        return

    output = path.join(plt_dir, data_name)
    # makedirs also creates a missing plt_dir and tolerates an existing folder.
    makedirs(output, exist_ok = True)

    # Sorted so that the processing/printing order is reproducible.
    for rf in sorted(listdir(data_dir)):
        if not rf.startswith(begin_pattern) or any(marker in rf for marker in skipped_markers):
            continue

        input_file = path.join(data_dir, rf)
        strain = rf.split(begin_pattern)[1].split(end_pattern)[0]
        print(strain)

        d = pd.read_csv(input_file, delimiter = '\t', header = 0)
        if d.empty:
            # Quantiles and max are undefined for an empty table.
            print('Warning: {} has no rows, skipped'.format(rf))
            continue

        # Data-driven FDR threshold (upper Tukey fence); it is always drawn,
        # and it is the active threshold for every method except 'static'.
        b = _upper_fence(d[fdr_col])
        dyn = 10**(-b)

        th = -np.log10(alpha) if method == 'static' else b
        if method == 'dynamic':
            up_fold = _upper_fence(d[fc_col])
            down_fold = _lower_fence(d[fc_col])
            print(round(up_fold, 3), round(down_fold, 3))
        else:
            up_fold = np.log2(fold_change)
            down_fold = np.log2(1/fold_change)
        add_name = '_' + method

        significant = d[fdr_col] > th
        # NOTE(review): the upper bound is inclusive (>=) while the lower bound is
        # strict (<); kept as in the original - confirm whether this is intended.
        n_up = int((significant & (d[fc_col] >= up_fold)).sum())
        n_down = int((significant & (d[fc_col] < down_fold)).sum())

        # Scatter plot
        y_lim = np.max(d[fdr_col]) + 5
        g = sns.JointGrid(x = fc_col, y = fdr_col,
                          data = d, ylim = (-0.25, y_lim), height = 8)
        g.plot_joint(sns.scatterplot, color = 'green', s = 10, label = '%s' % strain)

        # Vertical fold-change thresholds
        g.ax_joint.axvline(up_fold, color = "grey",
                           label = 'f_c up = %.3f' % 2**(up_fold))
        g.ax_joint.axvline(down_fold, color = "grey",
                           label = 'f_c down = %.3f' % 2**(down_fold))

        # Marginal box plots
        g.plot_marginals(sns.boxplot, linewidth = 0.5, fliersize = 3)

        # Horizontal line at the data-driven (optimized) FDR threshold
        g.ax_joint.axhline(b, color = "black", linestyle = ':',
                           label = 'fdr = %.5f' % dyn)

        # Horizontal line at the fixed alpha level (0.05 by default)
        g.ax_joint.axhline(-np.log10(alpha), color = "red", linestyle = ':',
                           label = 'fdr = %g' % alpha)

        g.ax_joint.set_xlabel('log2(fold_change)', fontsize = 12)
        g.ax_joint.set_ylabel('-log10(fdr_BH)', fontsize = 12)
        g.ax_joint.tick_params(axis = 'both', labelsize = 12)
        g.ax_joint.legend(loc = 'upper left', fontsize = 12)

        # Attach the summary to the joint axes explicitly instead of relying on
        # whichever axes pyplot currently considers "current".
        g.ax_joint.text(x = 0.7, y = 0.9,
                        s = 'up = {}\ndown = {}\nDE = {}'.format(n_up, n_down, n_up + n_down),
                        horizontalalignment = 'left',
                        verticalalignment = 'top',
                        transform = g.ax_joint.transAxes, fontsize = 12)

        fig = g.ax_joint.figure
        fig.savefig(path.join(output, 'volcano_{}_{}{}.png'.format(data_name, strain, add_name)), dpi = 300)
        # Close this specific figure so figures do not accumulate over the loop.
        plt.close(fig)
            


In [6]:
# Static thresholds (alpha = 0.05, fold change = 1.5) for the 'plants_US' tables;
# input tables are read from, and plots written under, the same directory.
volcano(
    data_name = 'plants_US',
    data_dir = '/home/kae-13-1/plants',
    plt_dir = '/home/kae-13-1/plants',
    begin_pattern = 'ms1diffacto_out_',
    end_pattern = '_sum_each_run.txt',
    method = 'static',
    alpha = 0.05,
    fold_change = 1.5,
)

Leaves_US_Roots_US
