# Prepare GSEA

## Content
- GSEA needs two files as input, one is expression dataset in TXT format, the other is phenotype dataset in CLS format
- See the explaination of TXT and CLS format in GSEA doc: http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats
- This is just some spetial format needed by GSEA, so we prepare based on their requirment...

**NOTE: GSEA is mainly designed for human data, so I will not perform this with mouse data... But one the files are prepared, its pretty straight forward to run it...**

In [12]:
import pandas as pd
import pathlib

In [13]:
# make a sub dir called GSEA for new files
pathlib.Path('GSEA').mkdir(exist_ok=True)

## Load data

In [23]:
gene_meta = pd.read_csv('gene_metadata.csv.gz', index_col='gene_id')

## Prepare GSEA files

In [63]:
deg_result_paths = list(pathlib.Path().glob('*vs*.deg_results.csv.gz'))

In [71]:
for path in deg_result_paths:
    pair_name = '.'.join(path.name.split('.')[:-3])
    """
    Prepare expression data
    """
    
    # load the original DESeq2 output to get the normalized counts
    deg_with_norm_count = pd.read_csv(path, index_col=0)
    
    # take the last four columns, which are nrom counts
    # use reindex to select only DEGs
    nrom_count_df = deg_with_norm_count.iloc[:, -4:].reindex(deg_for_this_pair.index)
    
    # change the index into gene names
    nrom_count_df.index = nrom_count_df.index.map(gene_meta['gene_name'])
    
    # Add modifications to match the TXT format as requied by GSEA
    # See here: http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#CLS:_Categorical_.28e.g_tumor_vs_normal.29_class_file_format_.28.2A.cls.29
    nrom_count_df.index.name = 'NAME'
    nrom_count_df['DESCRIPTION'] = 'na' # add a DESCRIPTION col
    
    # move DESCRIPTION col into the first
    col_names = list(nrom_count_df.columns)
    reordered_col_names = ['DESCRIPTION'] + col_names[:-1]
    nrom_count_df = nrom_count_df[reordered_col_names]
    
    nrom_count_df.to_csv(f'GSEA/{pair_name}.expression_data.txt', sep='\t')
    
    """
    Prepare phenotype data
    """
    # prepare the CLS format as required by GSEA
    # see here: http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#CLS:_Categorical_.28e.g_tumor_vs_normal.29_class_file_format_.28.2A.cls.29
    
    # get sample names and their class number (0 for time 1, 1 for time 2)
    time1, time2 = pair_name.split('_vs_')
    sample_names = nrom_count_df.columns[1:]
    sample_dev_times = sample_names.str.split('_').str[1]

    cls_format_str = f"""
4 2 1
# {time1} {time2}
{' '.join(sample_dev_times)}
"""
    # the above cls_format_str create a string like this:
    """    
4 2 1
# E10.5 E14.5
E10.5 E14.5 E14.5 E10.5
"""
    with open(f'GSEA/{pair_name}.phenotype_data.cls', 'w') as f:
        f.write(cls_format_str)


In [72]:
print(cls_format_str)


4 2 1
# E10.5 P0
P0 E10.5 P0 E10.5



In [73]:
nrom_count_df

Unnamed: 0_level_0,DESCRIPTION,forebrain_P0_2,forebrain_E10.5_1,forebrain_P0_1,forebrain_E10.5_2
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cdc45,na,314.416946,3516.109813,230.042058,3574.620925
Dlg3,na,244.669525,1009.880927,242.044252,1113.534396
Uhrf1,na,859.111091,23704.051793,759.138790,24678.145384
Fndc5,na,453.911788,32.548427,528.096550,54.604139
Ube2c,na,774.971345,12599.760032,812.148482,12513.123384
...,...,...,...,...,...
Gm49325,na,498.195865,524.293582,435.079544,585.044341
Sfta3-ps,na,63.104810,449.520169,106.019383,511.913799
Bc1,na,13236.510573,540.127952,7083.295007,584.069267
Gm50241,na,21.034937,1599.271363,188.034377,1500.638735
