In [72]:
import random
import time
import urllib.error
import urllib.request
from os import listdir, makedirs, mkdir, path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pyplot import imread, imshow
from scipy.stats import iqr

In [92]:
def de_gene_list(data_dir, output_dir, method, begin_pattern, end_pattern, fold_change = 2,
    alpha = 0.01):
    """Write UP / DOWN / DE gene tables for every matching file in ``data_dir``.

    Every file whose name starts with ``begin_pattern`` and ends with
    ``end_pattern`` is read as a tab-separated table. The table must contain
    the columns ``'-log10(fdr_BH)'``, ``'log2(fold_change)'`` and ``'Gene'``.
    For each file three CSV files are written to ``output_dir``:
    ``genes_<dataname>_<method>_UP.csv``, ``..._DOWN.csv`` and ``..._DE.csv``,
    where ``<dataname>`` is the file name without the two patterns.

    Parameters
    ----------
    data_dir : str
        Directory that is scanned for input tables.
    output_dir : str
        Directory for the CSV files; created if it does not exist.
    method : str
        ``'static'``: significance threshold is ``-log10(alpha)`` and the
        fold-change bounds are ``log2(fold_change)`` / ``log2(1/fold_change)``.
        ``'semi-dynamic'``: significance threshold is Q3 + 1.5*IQR of the
        ``'-log10(fdr_BH)'`` column, fold-change bounds as for ``'static'``.
        Any other value: both the significance threshold and the fold-change
        bounds are derived from the data (Q3 + 1.5*IQR and Q1 - 1.5*IQR).
    begin_pattern, end_pattern : str
        Required prefix and suffix of the input file names.
    fold_change : float, optional
        Linear fold-change cut-off (used by ``'static'`` and ``'semi-dynamic'``).
    alpha : float, optional
        FDR cut-off (used by ``'static'`` only).

    Notes
    -----
    ``iqr`` is expected to be ``scipy.stats.iqr``.
    """
    fdr_col = '-log10(fdr_BH)'
    fc_col = 'log2(fold_change)'
    out_cols = ['fold_change', fdr_col, 'Gene']

    # to_csv does not create missing directories, so make sure the target exists.
    makedirs(output_dir, exist_ok = True)

    for file in listdir(data_dir):
        if not (file.startswith(begin_pattern) and file.endswith(end_pattern)):
            continue

        df = pd.read_csv(path.join(data_dir, file), delimiter = '\t')
        # Strip exactly the prefix and the suffix. Splitting on the patterns
        # would truncate the name if a pattern occurred again inside it.
        dataname = file[len(begin_pattern):len(file) - len(end_pattern)]

        # Significance threshold on the -log10(FDR) axis.
        if method == 'static':
            t_h = -np.log10(alpha)
        else:
            t_h = np.quantile(df[fdr_col], 0.75) + 1.5*iqr(df[fdr_col])

        # Fold-change bounds on the log2 axis.
        if method in ('static', 'semi-dynamic'):
            up_fold = np.log2(fold_change)
            down_fold = np.log2(1/fold_change)
        else:
            up_fold = np.quantile(df[fc_col], 0.75) + 1.5*iqr(df[fc_col])
            down_fold = np.quantile(df[fc_col], 0.25) - 1.5*iqr(df[fc_col])

        # Float base keeps this valid for integer-typed log2 columns as well.
        df['fold_change'] = 2.0 ** df[fc_col]

        significant = df[fdr_col] > t_h
        # NOTE(review): the upper bound is inclusive (>=) while the lower bound
        # is strict (<); kept exactly as in the original implementation.
        is_up = significant & (df[fc_col] >= up_fold)
        is_down = significant & (df[fc_col] < down_fold)

        for suffix, mask in (('UP', is_up), ('DOWN', is_down), ('DE', is_up | is_down)):
            out_name = '{}_{}_{}_{}.csv'.format('genes', dataname, method, suffix)
            df.loc[mask, out_cols].to_csv(path.join(output_dir, out_name), index = True)

In [166]:
def show_string_picture(genes, filename, species):
    """Download the STRING network image for ``genes`` and save it to ``filename``.

    The image is first downloaded to the scratch file ``string.png`` in the
    current working directory, then drawn with matplotlib and saved to
    ``filename`` at 600 dpi.

    Parameters
    ----------
    genes : iterable
        Identifiers to query; each item is converted with ``str``.
    filename : str
        Path of the figure to write.
    species : str
        Species identifier appended to the request as ``species=...``.

    If the server answers with an HTTP error, the error is printed and the
    function returns without writing ``filename``. Previously it went on to
    read ``string.png`` anyway, which either failed or silently re-used the
    image of an earlier call.
    """
    output_format = "image"
    method = "network"
    scratch_png = "string.png"

    request_url = "https://string-db.org/api" + "/" + output_format + "/" + method + "?"
    request_url += "species=" + species
    # "%0d" is the identifier separator used for the request; str() allows
    # non-string items, as load_go_enrichment already does.
    request_url += "&identifiers=" + "%0d".join([str(itm) for itm in list(genes)])

    try:
        urllib.request.urlretrieve(request_url, filename=scratch_png)
        time.sleep(3)  # pause after a successful request before the next one
    except urllib.error.HTTPError as exception:
        print(exception)
        return

    img = imread(scratch_png)
    fig = plt.figure(dpi = 600)
    try:
        imshow(img)
        fig.savefig(filename, bbox_inches='tight')
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)
        
def load_go_enrichment(genes, filename, species):
    """Download the STRING enrichment table (TSV) for ``genes`` into ``filename``.

    The request URL is assembled from the identifiers (each converted with
    ``str`` and joined with ``%0d``) and the ``species`` value. After a
    successful download the function sleeps for three seconds. An HTTP error
    is printed and otherwise ignored, in which case nothing is downloaded.
    """
    endpoint = "https://string-db.org/api/" + "tsv" + "/" + "enrichment" + "?"
    identifiers = "%0d".join(str(gene) for gene in list(genes))
    query = "&".join(["identifiers=" + identifiers, "species=" + species])

    try:
        urllib.request.urlretrieve(endpoint + query, filename)
        time.sleep(3)
    except urllib.error.HTTPError as exception:
        print(exception)
        
def enrichment_calculation(path_to_dir, end_pattern, background_size = 20000):
    """Add enrichment scores to every enrichment table in ``path_to_dir``.

    Each file whose name ends with ``end_pattern`` is read as a tab-separated
    table with the columns ``'category'``, ``'fdr'``, ``'number_of_genes'``
    and ``'number_of_genes_in_background'``. Only rows with
    ``category == 'Process'`` are kept, and three columns are added:

    * ``'-log10(fdr)'``
    * ``'enrichment'`` = ``log10(number_of_genes * background_size
      / number_of_genes_in_background / n)``
    * ``'metric'`` = ``'-log10(fdr)' * 'enrichment'``

    ``n`` is the number of rows of the unfiltered input table.
    NOTE(review): this looks like it is meant to be the size of the submitted
    gene list, but the code counts table rows - confirm with the author.

    The result is written next to the input as
    ``<dataname>.string.enrichment_updated.tsv``, where ``<dataname>`` is the
    file name without ``end_pattern``.

    Parameters
    ----------
    path_to_dir : str
        Directory holding the input tables; outputs are written there too.
    end_pattern : str
        Suffix selecting the input files.
    background_size : int, optional
        Scaling constant of the enrichment formula (was the hard-coded value
        20000, which remains the default).
    """
    for file in listdir(path_to_dir):
        if not file.endswith(end_pattern):
            continue

        # Strip exactly the suffix; split() would cut at the first occurrence.
        dataname = file[:len(file) - len(end_pattern)]

        table = pd.read_csv(path.join(path_to_dir, file), sep = '\t')
        n_g = table.shape[0]

        # .copy() so the new columns are added to an independent frame rather
        # than to a view of ``table`` (avoids SettingWithCopyWarning).
        d = table[table['category'] == 'Process'].copy()
        d['-log10(fdr)'] = -np.log10(d['fdr'])

        ratio = d['number_of_genes']*background_size/d['number_of_genes_in_background']/n_g
        d['enrichment'] = np.log10(ratio)

        d['metric'] = d['-log10(fdr)'] * d['enrichment']

        d.to_csv(path.join(path_to_dir, dataname + '.string.enrichment_updated.tsv'), sep = '\t')
            
def top_processes(path_to_dir, begin_pattern, end_pattern, output_dir, dataname_to_plot = None,
                  n_top = 10):
    """Collect the top-scoring processes of every table into one wide table.

    Each file in ``path_to_dir`` whose name ends with ``end_pattern`` is read
    as a tab-separated table with the columns ``'description'`` and
    ``'metric'``. The ``n_top`` descriptions with the highest ``metric`` of
    every file are pooled, and for each pooled description the ``metric`` of
    every file is looked up (missing values stay NaN). The result has one
    ``'description'`` column plus one column per file, and is also written to
    ``<output_dir>/processes.tsv``.

    Parameters
    ----------
    path_to_dir : str
        Directory holding the input tables.
    begin_pattern : str
        Text removed from the start of the dataset name; everything up to and
        including its first occurrence in the file name is dropped. Files that
        do not contain it are still processed under their suffix-less name.
    end_pattern : str
        Suffix selecting the input files.
    output_dir : str
        Directory that receives ``processes.tsv``.
    dataname_to_plot : mapping, optional
        Maps dataset names to display names; names that are not in the
        mapping are used unchanged.
    n_top : int, optional
        Number of top rows taken from every file (default 10).

    Returns
    -------
    pandas.DataFrame
        The pooled table.
    """
    # Read every table once; sorted() makes the column order reproducible.
    tables = []
    for file in sorted(listdir(path_to_dir)):
        if not file.endswith(end_pattern):
            continue
        stem = file[:len(file) - len(end_pattern)]
        if begin_pattern and begin_pattern in stem:
            dataname = stem.split(begin_pattern, 1)[1]
        else:
            dataname = stem
        if dataname_to_plot:
            dataname = dataname_to_plot.get(dataname, dataname)
        d = pd.read_csv(path.join(path_to_dir, file), sep = '\t')
        d = d.sort_values(by = 'metric', ascending = False)
        tables.append((dataname, d))

    # Pool the top descriptions, keeping first-seen order (a set would give a
    # different row order on every run).
    top_descriptions = []
    for _, d in tables:
        top_descriptions.extend(d['description'].head(n_top))
    processes = pd.DataFrame(list(dict.fromkeys(top_descriptions)), columns = ['description'])

    for dataname, d in tables:
        # Keep the best-scoring row per description so that the left merge
        # cannot multiply rows when a description occurs more than once.
        scores = (d[['description', 'metric']]
                  .drop_duplicates(subset = 'description')
                  .rename(columns = {'metric' : dataname}))
        processes = processes.merge(right = scores, how = 'left', on = 'description')

    processes.to_csv(path.join(output_dir, 'processes.tsv'), sep = '\t')
    return processes

def processes_scatterplot(dataset, plt_dir):
    """Draw one scatter point per (process, dataset) score and save the figure.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Wide table with a ``'description'`` column and one score column per
        dataset. Missing scores are plotted as 0.
    plt_dir : str
        Directory that receives ``processes_scatterplot.png`` (600 dpi).

    The text ``'Antigen processing and presentation'`` inside a description is
    abbreviated to ``'APP'``. Rows are ordered by descending score before
    plotting.
    """
    # The function already depended on seaborn as ``sns``, but the notebook's
    # import cell does not import it; importing here keeps it runnable on a
    # fresh kernel.
    import seaborn as sns

    scores = dataset.set_index('description').fillna(value = 0)

    # Build the long table in one go instead of concatenating a one-row frame
    # per cell, which is quadratic in the number of cells.
    records = [(col, description, value)
               for col in scores.columns
               for description, value in scores[col].items()]
    tmp_df = pd.DataFrame(records, columns = ['data', 'description', 'value'])
    tmp_df['description'] = tmp_df['description'].str.replace(
        'Antigen processing and presentation', 'APP', regex = False)

    # Stable sort: rows with equal scores keep their original relative order.
    tmp_df = tmp_df.sort_values(by = 'value', ascending = False, kind = 'mergesort')

    y_len = len(tmp_df['description'].unique())
    # A zero-height figure cannot be rendered; use one row of height for empty input.
    f, ax = plt.subplots(figsize = (10, max(y_len, 1)*0.5))
    try:
        sns.scatterplot(x = 'value', y = 'description', hue = 'data',
                        edgecolor = 'black', linewidth = .1,
                        data = tmp_df, ax = ax, s = 30, alpha = 0.9)
        ax.set_xlabel('Score')
        ax.set_ylabel(None)
        f.savefig(path.join(plt_dir, 'processes_scatterplot.png'), dpi = 600, bbox_inches = 'tight')
    finally:
        # Close this specific figure even if plotting or saving fails.
        plt.close(f)

## Example usage

In [167]:
# Directory that holds one sub-folder of result tables per dataset.
# Change this single value to point the example at another location.
knn_dir = '/home/kae-13-1/Metrics/Mar2022/16_03/kNN'
dataset_names = ['a172_dbtrg',
                 'glioblastoma_2017_infa',
                 'glioblastoma_2017_infb',
                 'glioblastoma_2019',
                 'time_infa',
                 'time_infg']
# Input folders passed one by one to de_gene_list.
ffolders = ['{}/{}'.format(knn_dir, name) for name in dataset_names]

# Prefix and suffix of the input file names inside each folder.
begin_pattern = 'NSAF_t-test_'
end_pattern = '.tsv'
# Which gene list is sent on for enrichment; used as the suffix that selects
# the '<...>UP.csv' files written by de_gene_list.
reg_type = 'UP'
# Thresholding settings passed to de_gene_list.
method = 'static'
fold_change = 2
alpha = 0.01
# Directory for all generated tables and figures.
output_dir = '/home/kae-13-1/Metrics/test_fun'
species = '9606' ## HUMAN 

In [168]:
# Step 1: write the UP / DOWN / DE gene tables for every input folder.
for folder in ffolders:
    de_gene_list(folder, output_dir, method, begin_pattern, end_pattern, fold_change, alpha)

# Step 2: query STRING for every gene table of the selected regulation type.
# Separate names are used here instead of rebinding begin_pattern and
# end_pattern: rebinding made a second run of this cell call de_gene_list
# with the wrong patterns, so it silently found no input files.
genes_prefix = 'genes_'
genes_suffix = reg_type + '.csv'

# Create the GO directory once, before the loop, so that the later steps also
# find it when no gene table matches.
output_dir_GO = path.join(output_dir, 'GO')
if not path.exists(output_dir_GO):
    mkdir(output_dir_GO)

for file in listdir(output_dir):
    if not (file.startswith(genes_prefix) and file.endswith(genes_suffix)):
        continue
    genes_df = pd.read_csv(path.join(output_dir, file))
    gene = genes_df.loc[genes_df['Gene'].notna(), 'Gene']
    if gene.empty:
        # Nothing to send to STRING for this table.
        continue
    dataname = file.split(genes_suffix)[0] + reg_type
    show_string_picture(gene, path.join(output_dir_GO, dataname + '.png'), species)
    time.sleep(3)
    load_go_enrichment(gene, path.join(output_dir_GO, dataname + '.string.enrichment.tsv'), species)
    time.sleep(3)

# Step 3: score the downloaded enrichment tables.
enrichment_calculation(output_dir_GO, '.string.enrichment.tsv')

# Step 4: pool the top processes of all tables and plot them.
processes_df = top_processes(output_dir_GO, genes_prefix, '.string.enrichment_updated.tsv', output_dir)
processes_scatterplot(processes_df, output_dir)