In [None]:
# https://gseapy.readthedocs.io/en/latest/gseapy_example.html

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import gseapy as gp

In [None]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Fetch the gene-set library names via gseapy, presumably to check that
# 'GO_Biological_Process_2023' (used in the Enrichr loop) is available.
# NOTE(review): `dbs` is not referenced by any later visible cell.
dbs = gp.get_library_name()


## Read in naming file

In [None]:
inputFile = 'data/omics_revamped_LLM_DF.tsv'

In [None]:
outputFile = 'data/omics_test_revamped_LLM_Enrichr_DF.tsv' 

In [None]:
genesets_LLM = pd.read_csv(inputFile, sep = "\t") 

In [None]:
# genesets_LLM.GeneSetID.unique()

In [None]:
# Remove "HALLMARK_" prefix for MSigDB gene sets
genesets_LLM['GeneSetName'] = genesets_LLM['GeneSetName'].str.removeprefix("HALLMARK_").str.replace('_', ' ')


In [None]:
genesets_LLM.columns

In [None]:
# Columns of the input table that are carried over into the Enrichr output.
columnsToKeep = [
    "Source",
    "GeneSetID",
    "GeneSetName",
    "GeneList",
    "n_Genes",
    "LLM Name",
    "LLM Analysis",
    "Score",
]

# Delimiter between gene symbols inside one GeneList cell.
geneSeparator = " "

In [None]:
LLM_DF = genesets_LLM

In [None]:
nGeneSets = LLM_DF.shape[0]

In [None]:
nGeneSets

In [None]:
LLM_repeated_DF = LLM_DF[LLM_DF[['Source', 'GeneSetID', 'GeneSetName', 'GeneList']].duplicated() == True]


In [None]:
LLM_DF.shape

## Go through each gene set and run Enrichr

In [None]:
# Each gene set will have multiple rows (one per GO term returned by Enrichr)

In [None]:
additionalCols = ['Rank', 'Overlap', 'P-value', 'Adjusted P-value', 'Genes', 'GO term', 'GO ID' ]

In [None]:
expandedColumnsToKeep =columnsToKeep;
expandedColumnsToKeep.extend(additionalCols)

In [None]:
expanded_LLM_genes_DF = pd.DataFrame(columns=expandedColumnsToKeep)

In [None]:
genesCol = 'GeneList'

In [None]:
# Interactive help lookup for gp.enrichr (IPython `?` syntax); it defines
# nothing that later cells use.
?gp.enrichr

In [None]:
# Run Enrichr (GO Biological Process 2023) for every gene set and build one
# output row per returned GO term, annotated with the gene set's own columns.
# Per-gene-set frames are collected in a list and concatenated once, which
# avoids re-copying the growing table on every iteration and makes the cell
# idempotent (a re-run starts from scratch instead of appending to old rows).
enrichedFrames = []

for geneSetInd in range(nGeneSets):
    print(geneSetInd)
    geneSetRow = LLM_DF.iloc[geneSetInd]
    geneSetGenes = geneSetRow[genesCol].split(geneSeparator)

    # Perform enrichment analysis
    enr = gp.enrichr(gene_list=geneSetGenes,
                     gene_sets=['GO_Biological_Process_2023'],
                     organism='human',
                     outdir=None,  # don't write to disk
                     )

    # No significance threshold: every gene set should end up with a name.
    # The stable sort keeps Enrichr's own order among tied adjusted p-values,
    # and the fresh 0..n-1 index lines up with the repeated gene-set rows below.
    sigRes_DF = (
        enr.results
        .sort_values('Adjusted P-value', kind='stable')
        .reset_index(drop=True)
    )

    if sigRes_DF.shape[0] == 0:
        print([geneSetInd, ' has not enirchment'])
        # Placeholder row so the gene set still appears in the output. It is
        # built explicitly rather than by enlarging the empty result frame.
        # NOTE(review): the placeholder uses Rank 1 while real results are
        # ranked from 0 -- kept as in the original; confirm this is intended.
        sigRes_DF = pd.DataFrame([{
            'GO term': 'NaN',
            'GO ID': 'NaN',
            'Rank': 1,
            'Overlap': 0,
            'P-value': 1,
            'Adjusted P-value': 1,
            'Genes': '',
        }])
    else:
        # Terms are expected to look like "<name> (GO:0000000)". Only the LAST
        # parenthesised group is taken as the ID, so names that contain
        # parentheses themselves are parsed correctly. A term without any
        # trailing "(...)" keeps its full text as the name and gets a missing ID.
        termParts = sigRes_DF['Term'].str.extract(r'^(.*)\(([^()]*)\)\s*$')
        sigRes_DF['GO term'] = termParts[0].fillna(sigRes_DF['Term'])
        sigRes_DF['GO ID'] = termParts[1]

        # Rank of each enriched GO term (0 = smallest adjusted p-value)
        sigRes_DF['Rank'] = range(sigRes_DF.shape[0])

    # Column-bind: repeat the gene-set row once per GO term; both frames carry
    # a 0..n-1 index, so the bind is a plain side-by-side join.
    geneSetRow_DF = geneSetRow.to_frame().T
    geneSetRow_repeated_DF = (
        geneSetRow_DF
        .loc[geneSetRow_DF.index.repeat(sigRes_DF.shape[0])]
        .reset_index(drop=True)
    )
    geneSetRowRow_sigRes = pd.concat([geneSetRow_repeated_DF, sigRes_DF], axis=1)
    enrichedFrames.append(geneSetRowRow_sigRes[expandedColumnsToKeep])

    if (geneSetInd % 10 == 1):
        # Periodic checkpoint so an interrupted run does not lose its work; the
        # partial table also stays available as expanded_LLM_genes_DF.
        expanded_LLM_genes_DF = pd.concat(enrichedFrames, axis=0)
        expanded_LLM_genes_DF.to_csv(outputFile, sep="\t")

# Row-bind everything once; with zero gene sets the empty frame created
# earlier is left untouched.
if enrichedFrames:
    expanded_LLM_genes_DF = pd.concat(enrichedFrames, axis=0)

In [None]:
expanded_LLM_genes_DF.to_csv(outputFile, sep = "\t") # Too large 

In [None]:
expanded_LLM_genes_DF.shape

In [None]:
expanded_LLM_genes_copy_DF = expanded_LLM_genes_DF;

In [None]:
def select_min_rows(df, group_vars, min_var):
    """Return, for each group, the row holding the smallest ``min_var`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    group_vars : str or list of str
        Column(s) that define the groups.
    min_var : str
        Column to minimise. Values are coerced with ``pd.to_numeric``;
        anything non-numeric becomes NaN and is ignored when looking for the
        minimum.

    Returns
    -------
    pandas.DataFrame
        One row per group, ordered by the group keys, with ``min_var`` as a
        numeric column. Rows keep their original index labels; an unnamed
        single-level index is named ``'index'``. Groups without any numeric
        ``min_var`` value are omitted. On ties the first row in ``df`` wins.
    """
    # Work on a copy so the numeric coercion never leaks into the caller's frame.
    working = df.copy()
    working[min_var] = pd.to_numeric(working[min_var], errors='coerce')

    # Positional 0..n-1 labels make idxmin unambiguous even when the original
    # index contains repeated labels.
    positional = working.reset_index(drop=True)

    # Drop NaN rows first: an all-NaN group has no minimum and would otherwise
    # yield a missing position that cannot be used for row selection.
    comparable = positional[positional[min_var].notna()]
    min_positions = comparable.groupby(group_vars)[min_var].idxmin()

    selected = working.iloc[min_positions.to_numpy(dtype=int)]

    # Keep the historical output shape: an unnamed index is labelled 'index',
    # unless that would clash with an existing column of the same name.
    if (
        selected.index.nlevels == 1
        and selected.index.name is None
        and 'index' not in selected.columns
    ):
        selected = selected.rename_axis('index')
    return selected

In [None]:
reduced_LLM_genes_DF = select_min_rows(expanded_LLM_genes_copy_DF, ['Source', 'GeneSetID', 'GeneSetName', 'GeneList'], 'Adjusted P-value')

In [None]:
reduced_LLM_genes_DF.shape

In [None]:
reduced_LLM_genes_DF.to_csv('data/omics_test_revamped_LLM_Enrichr_reduced_DF.tsv' , sep = "\t")

In [None]:
set(LLM_DF.GeneSetID.to_list()).difference(set(reduced_LLM_genes_DF.GeneSetID.to_list()))

In [None]:
reduced_LLM_genes_DF[reduced_LLM_genes_DF[['Source', 'GeneSetID', 'GeneSetName', 'GeneList']].duplicated() == True]


In [None]:
# reduced_LLM_genes_DF = expanded_LLM_genes_DF.loc[expanded_LLM_genes_DF.reset_index().groupby(['Source', 'GeneSetID', 'GeneSetName', 'GeneList'])['Adjusted P-value'].idxmin()]

In [None]:
# Display the reduced table returned by select_min_rows (the row with the
# smallest adjusted p-value per gene-set key combination).
reduced_LLM_genes_DF