In [72]:
import random
import time
import urllib.error
import urllib.request
from os import path, listdir, mkdir

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pyplot import imread, imshow

Data processing pipeline (pay attention that begin_pattern and end_pattern are changing throughout the pipeline):
1. de_gene_list selects DE proteins using the chosen method (see 'method')
        input: tab-separated file with quantitative analysis results; 
        name of the file should fit the following structure begin_pattern|dataname|end_pattern and dataname should be
        unique; 
        file should contain 'log2(fold_change)', '-log10(fdr_BH)' and 'Gene' columns
        output: 3 comma-separated files with positively and negatively regulated DE proteins and their aggregate
        (UP, DOWN, DE):
            genes_*_UP.csv, genes_*_DOWN.csv, genes_*_DE.csv
2. choose which protein set you would like to analyse further by setting reg_type (see 'reg_type')
3. show_string_picture and load_go_enrichment provide GO terms enrichment analysis 
        input:  comma-separated file with DE proteins
        output: protein network plot dataname_reg_type.png and tab-separated file with identified GO terms 
            *.string.enrichment.tsv
4. enrichment_calculation selects processes from the GO analysis results and calculates Enrichment and GO score (formulas can be
    found below)
        input: tab-separated file with identified GO terms
        output: tab-separated file with identified processes, their enrichments and GO scores (sorted by GO score):
            *.string.enrichment_processes.tsv
5. top_processes combine topN processes for different data in one dataframe 
        input: tab-separated files with identified processes, their enrichments and GO scores (sorted by GO score)
        output: tab-separated file with processes and their GO scores calculated on different data:
            topN_processes_*.tsv
6. processes_scatterplot creates plot showing GO score value of topN processes in different data
        input: tab-separated file with processes and their GO scores calculated on different data
        output: plot showing GO score value of topN processes in different data:
            processes_scatterplot_*.png
            
        

$$ Enrichment = log_{10}(\frac{proteome \ size}{number \ of \ DE \ genes} \cdot \frac{number \ of \ DE \ genes \ contributing \ to \ process}{number \ of \ genes \ in \ background}) $$

$$ GO \ score = Enrichment \cdot |log_{10} (fdr)| $$

Parameters:

**common**

begin_pattern - string corresponding to the beginning of all filenames

end_pattern - string, corresponding to the end of all filenames

output_dir - output path

**de_gene_list**

method - how the thresholds for DE proteins should be chosen :
    'static' - thresholds for the fdr and fold change are fixed and the same for all the files (see 'fold_change' and
        'alpha')
    'semi-dynamic' - thresholds for fold change are fixed (see 'fold_change'); the threshold for -log10(fdr) is
        Q3 + 1.5 IQR of the -log10(fdr_BH) values in the file
    'dynamic' - upper and lower log2 fold change thresholds are Q3 + 1.5 IQR and Q1 - 1.5 IQR respectively;
        the threshold for -log10(fdr) is Q3 + 1.5 IQR
                
reg_type - what proteins should be used for calculation:
    'UP' - only positively regulated proteins that passed the thresholds
    'DOWN' - only negatively regulated proteins that passed the thresholds
    'DE' - all the proteins that passed the thresholds; metrics are calculated across all proteins
    'UP+DOWN' - all the proteins that passed the thresholds; separate metrics for positively and negatively regulated
        proteins are calculated
    
fold_change - threshold value for fold change; used if method is 'static' or 'semi-dynamic'

alpha - threshold value for fdr; used if method is 'static'

**show_string_picture** and **load_go_enrichment**

genes - list of DE genes from comma-separated file with DE proteins

filename - name of the output file

species - ID of the organism from https://string-db.org/cgi/input?sessionId=bpn8YVolT20O&input_page_active_form=organisms

**top_processes**

N - number of top processes to be taken into analysis from each file; default 10

dataname_to_plot - dictionary containing draft data names from filenames as keys and final datanames displayed
    in the plot as values; default None
    
add_name - additional description appended to the output filename; default None


In [193]:
def de_gene_list(path_to_dir, output_dir, method, begin_pattern, end_pattern, fold_change = 2,
    alpha = 0.01):
    """Write UP / DOWN / DE protein lists for every matching quantitation table.

    Every file in ``path_to_dir`` whose name starts with ``begin_pattern`` and ends
    with ``end_pattern`` is read as a tab-separated table that must contain the
    columns 'log2(fold_change)', '-log10(fdr_BH)' and 'Gene'. The part of the file
    name between the two patterns is used as the data name.

    Parameters
    ----------
    path_to_dir : str
        Directory with the input tables.
    output_dir : str
        Existing directory that receives the CSV files.
    method : {'static', 'semi-dynamic', 'dynamic'}
        'static': -log10(fdr) threshold is -log10(alpha), fold-change thresholds
        are log2(fold_change) and log2(1/fold_change).
        'semi-dynamic': same fold-change thresholds, -log10(fdr) threshold is
        Q3 + 1.5*IQR of the '-log10(fdr_BH)' column.
        'dynamic': -log10(fdr) threshold as in 'semi-dynamic'; fold-change
        thresholds are Q3 + 1.5*IQR and Q1 - 1.5*IQR of 'log2(fold_change)'.
    begin_pattern, end_pattern : str
        Prefix and suffix an input file name must have.
    fold_change : float, default 2
        Linear fold-change threshold ('static' and 'semi-dynamic').
    alpha : float, default 0.01
        FDR threshold ('static' only).

    Raises
    ------
    ValueError
        If ``method`` is not one of the three supported values.

    Output
    ------
    For each input table three files are written to ``output_dir``:
    ``genes_<dataname>_<method>_UP.csv``, ``..._DOWN.csv`` and ``..._DE.csv``,
    each with the columns 'fold_change', '-log10(fdr_BH)' and 'Gene' plus the
    original row index.
    """
    known_methods = ('static', 'semi-dynamic', 'dynamic')
    if method not in known_methods:
        # A misspelled method used to be silently treated as 'dynamic'.
        raise ValueError("method must be one of {}, got {!r}".format(known_methods, method))

    def _outlier_fences(values):
        # Returns (Q1 - 1.5*IQR, Q3 + 1.5*IQR) of the given values.
        q1, q3 = np.quantile(values, [0.25, 0.75])
        margin = 1.5 * (q3 - q1)
        return q1 - margin, q3 + margin

    for file in listdir(path_to_dir):
        if not (file.startswith(begin_pattern) and file.endswith(end_pattern)):
            continue

        # Strip the prefix/suffix by length so a pattern that re-occurs inside
        # the data name cannot truncate it.
        dataname = file[len(begin_pattern):len(file) - len(end_pattern)]
        df = pd.read_csv(path.join(path_to_dir, file), delimiter = '\t')
        log_fdr = df['-log10(fdr_BH)']
        log_fc = df['log2(fold_change)']

        if method == 'static':
            t_h = -np.log10(alpha)
        else:
            t_h = _outlier_fences(log_fdr)[1]

        if method == 'dynamic':
            down_fold, up_fold = _outlier_fences(log_fc)
        else:
            up_fold = np.log2(fold_change)
            down_fold = np.log2(1/fold_change)

        df['fold_change'] = 2 ** log_fc

        significant = log_fdr > t_h
        # The upper bound is inclusive and the lower bound is exclusive,
        # exactly as in the original selection.
        is_up = log_fc >= up_fold
        is_down = log_fc < down_fold

        subsets = (
            ('UP', df[significant & is_up]),
            ('DOWN', df[significant & is_down]),
            ('DE', df[significant & (is_down | is_up)]),
        )
        for label, subset in subsets:
            out_name = '{}_{}_{}_{}.csv'.format('genes', dataname, method, label)
            subset[['fold_change', '-log10(fdr_BH)', 'Gene']].to_csv(
                path.join(output_dir, out_name), index = True)


def show_string_picture(genes, filename, species):
    """Download the STRING network image for ``genes`` and save it as ``filename``.

    The image is requested from the STRING ``image/network`` endpoint, stored in
    a temporary directory, re-rendered with matplotlib at 600 dpi and written
    to ``filename``.

    Parameters
    ----------
    genes : iterable
        Gene identifiers; each one is converted to ``str`` and URL-quoted.
    filename : str
        Path of the picture to write.
    species : str or int
        STRING organism identifier (e.g. '9606').

    Returns
    -------
    None. If the request fails with an HTTP error the error is printed and no
    picture is written (previously a stale or missing ``string.png`` in the
    working directory was read in that case).
    """
    import tempfile
    from urllib.parse import quote

    # STRING expects the identifiers separated by an encoded carriage return.
    identifiers = "%0d".join(quote(str(gene), safe="") for gene in genes)
    request_url = "https://string-db.org/api/image/network?species={}&identifiers={}".format(
        quote(str(species), safe=""), identifiers)

    # A private scratch directory keeps parallel/previous runs from clobbering
    # (or being mistaken for) this download.
    with tempfile.TemporaryDirectory() as scratch_dir:
        raw_png = path.join(scratch_dir, "string.png")
        try:
            urllib.request.urlretrieve(request_url, filename=raw_png)
        except urllib.error.HTTPError as exception:
            print(exception)
            return
        img = imread(raw_png)

    # Pause after a successful request, as the original code did.
    time.sleep(3)

    fig = plt.figure(dpi = 600)
    imshow(img)
    plt.savefig(filename, bbox_inches='tight')
    plt.close(fig)
        
def load_go_enrichment(genes, filename, species):
    """Download the STRING enrichment table for ``genes`` into ``filename``.

    Parameters
    ----------
    genes : iterable
        Gene identifiers; each one is converted to ``str`` and URL-quoted so
        that characters such as spaces or '&' cannot corrupt the query string.
    filename : str
        Path of the tab-separated file to write.
    species : str or int
        STRING organism identifier (e.g. '9606').

    Returns
    -------
    None. An HTTP error is printed and no file is written in that case.
    """
    from urllib.parse import quote

    # STRING expects the identifiers separated by an encoded carriage return.
    identifiers = "%0d".join(quote(str(itm), safe="") for itm in genes)
    request_url = "https://string-db.org/api/tsv/enrichment?identifiers={}&species={}".format(
        identifiers, quote(str(species), safe=""))
    try:
        urllib.request.urlretrieve(request_url, filename)
        # Pause after a successful request, as the original code did.
        time.sleep(3)
    except urllib.error.HTTPError as exception:
        print(exception)
        
def enrichment_calculation(path_to_dir, end_pattern, proteome_size = 20000):
    """Compute enrichment and GO score for the 'Process' rows of each enrichment table.

    Every file in ``path_to_dir`` whose name ends with ``end_pattern`` is read
    as a tab-separated table with the columns 'category', 'fdr',
    'number_of_genes' and 'number_of_genes_in_background'. Rows with
    ``category == 'Process'`` are kept and three columns are added:

    * '-log10(fdr)'  = -log10(fdr)
    * 'enrichment'   = log10(number_of_genes * proteome_size
                             / number_of_genes_in_background / n)
    * 'metric'       = '-log10(fdr)' * 'enrichment'

    The result is written next to the input as
    ``<dataname>.string.enrichment_processes.tsv`` where ``<dataname>`` is the
    file name without ``end_pattern``.

    Parameters
    ----------
    path_to_dir : str
        Directory with the enrichment tables; also receives the output.
    end_pattern : str
        Suffix identifying the enrichment tables.
    proteome_size : int, default 20000
        Proteome size used in the enrichment formula (previously hard-coded).
    """
    for file in listdir(path_to_dir):
        if not file.endswith(end_pattern):
            continue

        dataname = file[:len(file) - len(end_pattern)]
        table = pd.read_csv(path.join(path_to_dir, file), sep = '\t')

        # NOTE(review): n is the number of rows of the enrichment table (all
        # categories), as in the original code. The formula in the notebook
        # text calls this term "number of DE genes" - confirm this is intended.
        n_g = table.shape[0]

        # .copy() so that the new columns are added to an independent frame
        # rather than to a slice of `table`.
        d = table[table['category'] == 'Process'].copy()
        d['-log10(fdr)'] = -np.log10(d['fdr'])
        d['enrichment'] = np.log10(
            d['number_of_genes']*proteome_size/d['number_of_genes_in_background']/n_g)
        d['metric'] = d['-log10(fdr)'] * d['enrichment']

        d.to_csv(path.join(path_to_dir, dataname + '.string.enrichment_processes.tsv'), sep = '\t')
            
def top_processes(path_to_dir, begin_pattern, end_pattern, output_dir, N = 10, dataname_to_plot = None, add_name = None):
    """Combine the top-N processes of several score tables into one table.

    Every file in ``path_to_dir`` whose name starts with ``begin_pattern`` and
    ends with ``end_pattern`` is read as a tab-separated table with (at least)
    the columns 'description' and 'metric'. The N descriptions with the
    highest 'metric' of each file are pooled; the returned table has one row
    per pooled description and one column per file holding that file's
    'metric' for the description (NaN when the file does not list it).

    Parameters
    ----------
    path_to_dir : str
        Directory with the per-dataset score tables.
    begin_pattern, end_pattern : str
        Prefix and suffix of the input file names; the text between them is
        the data name used as column header.
    output_dir : str
        Directory that receives the combined table.
    N : int, default 10
        Number of top processes taken from each file.
    dataname_to_plot : dict or None
        Optional mapping from data name to the column header to use; data
        names missing from the mapping keep their original name.
    add_name : str or None
        Optional suffix for the output file name.

    Returns
    -------
    pandas.DataFrame
        The combined table, also written to ``top<N>_processes[_<add_name>].tsv``
        in ``output_dir``.
    """
    # Both patterns are checked (the prefix is needed to derive the data
    # name); files are sorted so the column and row order is reproducible.
    matching = sorted(
        file for file in listdir(path_to_dir)
        if file.startswith(begin_pattern) and file.endswith(end_pattern)
    )

    # Read every table once and collect the pooled descriptions in
    # first-seen order.
    tables = {}
    pooled = []
    seen = set()
    for file in matching:
        table = pd.read_csv(path.join(path_to_dir, file), sep = '\t')
        tables[file] = table
        best = table.sort_values(by = 'metric', ascending = False).head(N)
        for description in best['description'].values:
            if description not in seen:
                seen.add(description)
                pooled.append(description)

    processes = pd.DataFrame(pooled, columns = ['description'])
    for file in matching:
        dataname = file[len(begin_pattern):len(file) - len(end_pattern)]
        if dataname_to_plot:
            dataname = dataname_to_plot.get(dataname, dataname)
        scores = tables[file][['description', 'metric']].rename(columns = {'metric' : dataname})
        processes = processes.merge(right = scores, how = 'left', on = 'description')

    if add_name:
        output_filename = 'top{}_processes_{}.tsv'.format(N, add_name)
    else:
        output_filename = 'top{}_processes.tsv'.format(N)
    processes.to_csv(path.join(output_dir, output_filename), sep = '\t')
    return processes

def processes_scatterplot(path_to_dir, output_dir, N = 10, add_name = None):
    """Plot the GO score of every process in each dataset as a scatter plot.

    Every file in ``path_to_dir`` whose name starts with ``top<N>_`` and
    contains '_processes' is read as a tab-separated table with a
    'description' column and one score column per dataset. Missing scores are
    replaced by 0, the table is reshaped to long form (one row per
    description/dataset pair) and drawn with score on the x axis, process on
    the y axis and dataset as hue.

    Parameters
    ----------
    path_to_dir : str
        Directory with the combined top-N tables.
    output_dir : str
        Directory that receives the figure.
    N : int, default 10
        N used in the name of the table(s) to plot.
    add_name : str or None
        Optional suffix for the figure name.

    Notes
    -----
    The figure name does not depend on the input file, so when several files
    match, each one overwrites the figure written for the previous one.

    NOTE(review): ``sns`` (presumably seaborn) is used here but is not imported
    in the visible notebook cells - it has to be imported before calling this.
    """
    begin_pattern = 'top{}_'.format(N)
    if add_name:
        output_figname = 'processes_scatterplot_{}.png'.format(add_name)
    else:
        output_figname = 'processes_scatterplot.png'

    for file in listdir(path_to_dir):
        if not (file.startswith(begin_pattern) and '_processes' in file):
            continue

        dataset = pd.read_csv(path.join(path_to_dir, file), sep = '\t', index_col = 0)
        dataset = dataset.set_index('description').fillna(value = 0)

        # Wide -> long in one step (one row per description/dataset pair,
        # dataset-major order) instead of concatenating one-row frames.
        tmp_df = dataset.reset_index().melt(id_vars = 'description', var_name = 'data',
                                            value_name = 'value')
        tmp_df = tmp_df[['description', 'value', 'data']]
        tmp_df['description'] = tmp_df['description'].apply(
            lambda x: x.replace('Antigen processing and presentation', 'APP'))

        tmp_df = tmp_df.sort_values(by = 'value', ascending = False)

        # Half an inch of height per distinct process keeps labels readable.
        y_len = len(tmp_df['description'].unique())
        f, ax = plt.subplots(figsize = (10, y_len*0.5))
        sns.scatterplot(x = 'value', y = 'description', hue = 'data',
                        edgecolor = 'black', linewidth = .1,
                        data = tmp_df, ax = ax, s = 30, alpha = 0.9)
        ax.set_xlabel('Score')
        ax.set_ylabel(None)
        f.savefig(path.join(output_dir, output_figname), dpi = 600, bbox_inches = 'tight')
        plt.close(f)

## Example usage

In [183]:
# Folders with the quantitation tables that STEP 1 iterates over.
ffolders = [
    '/home/kae-13-1/Metrics/Mar2022/16_03/kNN/a172_dbtrg',
    '/home/kae-13-1/Metrics/Mar2022/16_03/kNN/glioblastoma_2017_infa',
    '/home/kae-13-1/Metrics/Mar2022/16_03/kNN/glioblastoma_2017_infb',
    '/home/kae-13-1/Metrics/Mar2022/16_03/kNN/glioblastoma_2019',
    '/home/kae-13-1/Metrics/Mar2022/16_03/kNN/time_infa',
    '/home/kae-13-1/Metrics/Mar2022/16_03/kNN/time_infg',
]

# Input file names look like <begin_pattern><dataname><end_pattern>.
begin_pattern, end_pattern = 'NSAF_t-test_', '.tsv'

# DE selection settings passed to de_gene_list.
method = 'static'
fold_change, alpha = 2, 0.01

# Where results are written, and the suffix added to the final file names.
output_dir = '/home/kae-13-1/Metrics/test_fun'
add_name = 'GO_terms_analysis_testing'

In [195]:
### STEP 1
# Write the UP / DOWN / DE gene lists of every input folder into output_dir.
for folder in ffolders:
    de_gene_list(folder, output_dir, method, begin_pattern, end_pattern, fold_change, alpha)

### STEP 2
# From here on the patterns describe the gene-list files written by STEP 1.
begin_pattern = 'genes_'
reg_type = 'UP'
end_pattern = reg_type + '.csv'
species = '9606' #human

### STEP 3
# The GO folder is prepared before the loop so that output_dir_GO is defined
# for STEP 4 even when no gene list matches the patterns.
output_dir_GO = path.join(output_dir, 'GO')
if not path.exists(output_dir_GO):
    mkdir(output_dir_GO)

for file in listdir(output_dir):
    if file.endswith(end_pattern) and file.startswith(begin_pattern):
        df = pd.read_csv(path.join(output_dir, file))
        gene = df.loc[df['Gene'].notna(), 'Gene']
        if gene.empty:
            # Nothing to send to STRING for this dataset.
            continue
        # e.g. 'genes_x_static_UP.csv' -> 'genes_x_static_UP'
        dataname = file.split(end_pattern)[0] + reg_type
        show_string_picture(gene, path.join(output_dir_GO, dataname + '.png'), species)
        time.sleep(3)
        load_go_enrichment(gene, path.join(output_dir_GO, dataname + '.string.enrichment.tsv'), species)
        time.sleep(3)

### STEP 4
end_pattern = '.string.enrichment.tsv'
enrichment_calculation(output_dir_GO, end_pattern)

### STEP 5
end_pattern = '.string.enrichment_processes.tsv'
N = 10
processes_df = top_processes(output_dir_GO, begin_pattern, end_pattern, output_dir, N, add_name = add_name)

### STEP 6
processes_scatterplot(output_dir, output_dir, N, add_name)