In [18]:
import sys, os
import numpy as np
import pandas as pd
import scipy.stats as stat

In [24]:
# Read the EGSEA R script template once; its placeholders are filled in per
# target when the individual R scripts are generated.
# The context manager closes the handle even if read() raises.
with open('../template_script/egsea_template.R', 'r') as template_file:
    template_file_info = template_file.read()

In [25]:
# load gene expression files in raw read count matrix format
expr_dir = '../data/count_matrix/'
# Skip hidden entries (e.g. .ipynb_checkpoints, .DS_Store): splitting such a
# name on '.' gives an empty target name, so they must not reach the
# script-generation loop.
all_file = sorted(name for name in os.listdir(expr_dir) if not name.startswith('.'))
all_file

['AARS.txt',
 'AATF.txt',
 'ABCF1.txt',
 'ADAR.txt',
 'AGO1.txt',
 'AKAP1.txt',
 'AKAP8.txt',
 'AKAP8L.txt',
 'APOBEC3C.txt',
 'ASCC1.txt']

In [26]:
# Path substituted for the 'pathway_file_name' placeholder of the R template
# (presumably a gene-set file in GMT format, judging by the extension -- confirm).
pathway_file = '../pathway_files/K562_DESEq2.gmt'

In [27]:
# directory for storing all output results (one '<target>.egsea_result.txt' per target);
# the trailing '/' is required because paths are built by string concatenation
result_dir = '../example_results/EGSEA_results/'
# directory for the generated R scripts: each target gets a separate script
# ('<target>.r') so that the targets can be run in parallel
script_dir = '../example_scripts/EGSEA_scripts/'

In [28]:
# Generate one R script per target by filling in the template placeholders.
# Create the script directory on first use so that open() below does not fail
# when the directory has not been created yet.
if not os.path.isdir(script_dir):
    os.makedirs(script_dir)

for expr_file in all_file:
    # target name is the file name up to the first '.', e.g. 'AARS.txt' -> 'AARS'
    target = expr_file.split('.')[0]
    # substitute, in order: input count matrix, pathway file, output result file
    new_template_info = (
        template_file_info
        .replace('TARGET_data_matrix.txt', expr_dir + expr_file)
        .replace('pathway_file_name', pathway_file)
        .replace('output_file', result_dir + target + '.egsea_result.txt')
    )
    # the context manager flushes and closes the script even if write() raises
    with open(script_dir + target + '.r', 'w') as script:
        script.write(new_template_info)

Run the generated R scripts (written to example_scripts/EGSEA_scripts/) before continuing. Their results will be saved in example_results/EGSEA_results/.

In [36]:
# Extract, for every target, the rank of the row labelled with the target's
# name according to the EGSEA ensemble and to each of its underlying methods.
methods_egsea = ['EGSEA', 'camera', 'safe', 'gage', 'padog', 'plage',
                 'zscore', 'gsva', 'ssgsea', 'ora']
# Skip hidden entries (e.g. .ipynb_checkpoints): they are not result tables
# and pd.read_csv would fail on them.
all_result_file = sorted(name for name in os.listdir(result_dir)
                         if not name.startswith('.'))

# rank_dict[method][target] -> rank of the row labelled `target` under `method`
rank_dict = dict()
for m in methods_egsea:
    rank_dict[m] = dict()

for result_file in all_result_file:
    # target name is the file name up to the first '.'
    target = result_file.split('.')[0]
    # NOTE(review): the .loc lookups below require the table to be indexed by
    # the same labels as the target names (e.g. row names written by R, which
    # pandas picks up as the index) -- confirm against the R template.
    df = pd.read_csv(result_dir + result_file, sep='\t')

    # EGSEA ensemble: rank the rows by their 'avg.rank' column
    # (rankdata gives rank 1 to the smallest value; ties share the average rank)
    df['pathway_rank'] = stat.rankdata(df['avg.rank'])
    # single-step .loc[row, column] instead of chained indexing
    rank_dict['EGSEA'][target] = df.loc[target, 'pathway_rank']

    # extract results of underlying methods of EGSEA (one column per method)
    for method in methods_egsea[1:]:
        rank_col = method + '_rank'
        df[rank_col] = stat.rankdata(df[method])
        rank_dict[method][target] = df.loc[target, rank_col]

In [41]:
# Collect the ranks into one table: outer keys (methods) become the columns,
# inner keys (targets) become the index.
rank_df = pd.DataFrame.from_dict(rank_dict)
# write the results to separate files, one per method
for m in methods_egsea:
    # header=False is the documented way to omit the header row
    # (header=None only worked because it is falsy)
    rank_df[m].to_csv('../example_results/' + m + '_result.txt', sep='\t', header=False)