# Prepare GSEA

## Content
- GSEA needs two files as input, one is expression dataset in TXT format, the other is phenotype dataset in CLS format
- See the explaination of TXT and CLS format in GSEA doc: http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats
- This is just some spetial format needed by GSEA, so we prepare based on their requirment...

**NOTE: GSEA is mainly designed for human data, so I will not perform this with mouse data... But one the files are prepared, its pretty straight forward to run it...**

In [1]:
import pandas as pd
import pathlib

In [2]:
# make a sub dir called GSEA for new files
pathlib.Path('GSEA').mkdir(exist_ok=True)

## Load data

In [3]:
gene_meta = pd.read_csv('gene_metadata.csv.gz', index_col='gene_id')

## Prepare GSEA files

In [4]:
deg_result_paths = list(pathlib.Path().glob('*vs*.deg_results.csv.gz'))

In [5]:
for path in deg_result_paths:
    pair_name = '.'.join(path.name.split('.')[:-3])
    """
    Prepare expression data
    """
    
    # load the original DESeq2 output to get the normalized counts
    deg_with_norm_count = pd.read_csv(path, index_col=0)
    
    # take the last four columns, which are nrom counts
    nrom_count_df = deg_with_norm_count.iloc[:, -4:].copy()
    
    # change the index into gene names
    nrom_count_df.index = nrom_count_df.index.map(gene_meta['gene_name'])
    
    # Add modifications to match the TXT format as requied by GSEA
    # See here: http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#CLS:_Categorical_.28e.g_tumor_vs_normal.29_class_file_format_.28.2A.cls.29
    nrom_count_df.index.name = 'NAME'
    nrom_count_df['DESCRIPTION'] = 'na' # add a DESCRIPTION col
    
    # move DESCRIPTION col into the first
    col_names = list(nrom_count_df.columns)
    reordered_col_names = ['DESCRIPTION'] + col_names[:-1]
    nrom_count_df = nrom_count_df[reordered_col_names]
    
    nrom_count_df.to_csv(f'GSEA/{pair_name}.expression_data.txt', sep='\t')
    
    """
    Prepare phenotype data
    """
    # prepare the CLS format as required by GSEA
    # see here: http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#CLS:_Categorical_.28e.g_tumor_vs_normal.29_class_file_format_.28.2A.cls.29
    
    # get sample names and their class number (0 for time 1, 1 for time 2)
    time1, time2 = pair_name.split('_vs_')
    sample_names = nrom_count_df.columns[1:]
    sample_dev_times = sample_names.str.split('_').str[1]

    cls_format_str = f"""
4 2 1
# {time1} {time2}
{' '.join(sample_dev_times)}
"""
    # the above cls_format_str create a string like this:
    """    
4 2 1
# E10.5 E14.5
E10.5 E14.5 E14.5 E10.5
"""
    with open(f'GSEA/{pair_name}.phenotype_data.cls', 'w') as f:
        f.write(cls_format_str)


In [6]:
print(cls_format_str)


4 2 1
# E10.5 P0
P0 E10.5 P0 E10.5



In [7]:
nrom_count_df

Unnamed: 0_level_0,DESCRIPTION,forebrain_P0_2,forebrain_E10.5_1,forebrain_P0_1,forebrain_E10.5_2
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gnai3,na,9202.231171,14212.226704,8662.583742,14009.861823
Cdc45,na,314.416946,3516.109813,230.042058,3574.620925
H19,na,676.439274,3572.409795,578.105693,4100.185758
Scml2,na,157.208473,312.288963,181.033097,275.945914
Apoh,na,6.642612,0.000000,6.001097,0.000000
...,...,...,...,...,...
AC093451.1,na,4.428408,0.000000,0.000000,0.000000
AC154013.1,na,0.000000,1.759374,0.000000,3.900296
AC127285.1,na,2.214204,6.157811,1.000183,20.476552
CT868690.1,na,7.749713,0.879687,2.000366,0.000000
