In [None]:
# https://gseapy.readthedocs.io/en/latest/gseapy_example.html

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import gseapy as gp

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
dbs = gp.get_library_name()


In [4]:
#dbs

In [5]:
# Edit these parameters
dataType = "MSigDB"; # "NeST", "MSigDB"
runVersion = "test"

## Read in naming file

In [6]:
if runVersion == "test":
    infix = '_subset'
else: 
    infix = ''

In [7]:
inputFile = 'data/' + dataType + '_table' + infix + '_LLM_DF.tsv' 

In [8]:
inputFile

'data/MSigDB_table_subset_LLM_DF.tsv'

In [9]:
genesets_LLM = pd.read_csv(inputFile, sep = "\t") 

In [10]:
# Remove "HALLMARK_" prefix for MSigDB gene sets
if dataType == "MSigDB":
    genesets_LLM['Name'] = genesets_LLM['Name'].str.removeprefix("HALLMARK_").str.replace('_', ' ')


In [11]:
genesets_LLM.columns

Index(['Unnamed: 0', 'Name', 'Genes', 'LLM Name', 'LLM Analysis'], dtype='object')

In [12]:
### NeST Specific
# NeST_LLM_genes_raw_DF = pd.read_excel('data/Test_NeST_Nodes_LLM_enhanced.xlsx', sheet_name='Test_NeST_Nodes')
## remove  rows with missing values
#NeST_LLM_genes_DF=NeST_LLM_genes_raw_DF.dropna(subset=['NEST ID'])
#columnsToKeep = ['NEST ID','Size_All', 'All_Genes', 'name_new', 'LLM_name', 'LLM_description']
#NeST_LLM_genes_DF = NeST_LLM_genes_DF[columnsToKeep]

In [13]:
if dataType == "MSigDB":
    columnsToKeep = ['Name', 'Genes', 'LLM Name', 'LLM Analysis']
    geneSeparator = " "
elif dataType == "NeST":
    columnsToKeep = ['NEST ID', 'name_new', 'Genes', 'LLM Name', 'LLM Analysis']
    geneSeparator = ","
else:
    print("Data type not implemented yet")

In [14]:
geneSeparator

' '

In [15]:
LLM_DF = genesets_LLM # or = NeST_LLM_genes_DF

In [16]:
nSystems = LLM_DF.shape[0]

In [17]:
nSystems

3

## Go through each system and run Enrichr 

In [18]:
# Each system will have multiple rows

In [19]:
additionalCols = ['Rank', 'Overlap', 'P-value', 'Adjusted P-value', 'Genes', 'GO term', 'GO ID' ]

In [20]:
expandedColumnsToKeep =columnsToKeep;
expandedColumnsToKeep.extend(additionalCols)

In [21]:
expanded_LLM_genes_DF = pd.DataFrame(columns=expandedColumnsToKeep)

In [22]:
genesCol = 'Genes'

In [23]:
for systemInd in range(nSystems):
    print(systemInd)
    systemRow = LLM_DF.iloc[systemInd]
    systemGenes = systemRow[genesCol].split(geneSeparator)

    # Perform enrichment analysis
    enr = gp.enrichr(gene_list=systemGenes, # or "./tests/data/gene_list.txt",
                 gene_sets=['GO_Biological_Process_2023'],
                 organism='human', # don't forget to set organism to the one you desired! e.g. Yeast
                 outdir=None, # don't write to disk
                )
    
    # threshold
    # sigRes_DF =  enr.results.loc[enr.results['Adjusted P-value'] <=0.1]
    sigRes_DF = enr.results # remove thresholding, because want a name for each one
    sigRes_DF = sigRes_DF.sort_values('Adjusted P-value')
    if sigRes_DF.shape[0] == 0:
        
        print(['skipping ', systemInd])
        continue # skip to next gene set
        
        ## add something 
        
    
        
    expandedCols = sigRes_DF["Term"].str.split('(', expand = True)
    if expandedCols.shape[1] == 3:
        sigRes_DF[['GO term', 'GO ID', 'nothing']] = expandedCols
    else: # ==2
        sigRes_DF[['GO term', 'GO ID']] = expandedCols

    sigRes_DF[['GO ID', 'nothing']] = sigRes_DF["GO ID"].str.split(')', expand = True)

    # Get ranking of enriched GO terms
    sigRes_DF['Rank'] = sigRes_DF.reset_index().index

    
    # Doing what should be a simple colbind 
    systemRow_DF = systemRow.to_frame().T
    systemRow_repeated_DF = systemRow_DF.loc[systemRow_DF.index.repeat(sigRes_DF.shape[0])]
    systemRow_repeated_DF = systemRow_repeated_DF.reset_index()
    
    # system_Row_sigRes = systemRow_repeated_DF.join(sigRes_DF)
    system_Row_sigRes = pd.concat([systemRow_repeated_DF, sigRes_DF],axis=1)
    
    # Doing what should be a simple  rowbind
    if expanded_LLM_genes_DF.shape[0] == 0:
        expanded_LLM_genes_DF = system_Row_sigRes[expandedColumnsToKeep]
    else:
        expanded_LLM_genes_DF =  pd.concat([expanded_LLM_genes_DF, system_Row_sigRes[expandedColumnsToKeep]],axis=0)
    

0
1
2


In [24]:
expanded_LLM_genes_DF

Unnamed: 0,Name,Genes,Genes.1,LLM Name,LLM Analysis,Rank,Overlap,P-value,Adjusted P-value,Genes.2,Genes.3,GO term,GO ID
0,ALLOGRAFT REJECTION,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CD86;ITK;CD40;CD80;BRCA1;TNF;IL12B;IL12A;JAK2;...,Immune Response Regulation,The system of interacting proteins primarily p...,0,55/320,1.489019e-52,3.091203e-49,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CD86;ITK;CD40;CD80;BRCA1;TNF;IL12B;IL12A;JAK2;...,Positive Regulation Of Cytokine Production,GO:0001819
1,ALLOGRAFT REJECTION,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CCL13;CXCL9;SPI1;CCL11;IL2RG;CXCL13;TNF;IL27RA...,Immune Response Regulation,The system of interacting proteins primarily p...,1,46/257,1.366750e-44,1.418686e-41,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CCL13;CXCL9;SPI1;CCL11;IL2RG;CXCL13;TNF;IL27RA...,Cytokine-Mediated Signaling Pathway,GO:0019221
2,ALLOGRAFT REJECTION,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CCL13;CD40;CXCL9;CCL11;ITGB2;LY75;CXCL13;ITGAL...,Immune Response Regulation,The system of interacting proteins primarily p...,2,42/236,1.492656e-40,1.032918e-37,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CCL13;CD40;CXCL9;CCL11;ITGB2;LY75;CXCL13;ITGAL...,Inflammatory Response,GO:0006954
3,ALLOGRAFT REJECTION,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CD86;CD80;CD1D;THY1;CD3E;HLA-DMA;HLA-DMB;CCL5;...,Immune Response Regulation,The system of interacting proteins primarily p...,3,31/107,3.709867e-37,1.925421e-34,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CD86;CD80;CD1D;THY1;CD3E;HLA-DMA;HLA-DMB;CCL5;...,Positive Regulation Of T Cell Activation,GO:0050870
4,ALLOGRAFT REJECTION,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CCL13;CD40;CCL11;BRCA1;IL2RG;HIF1A;IL27RA;IKBK...,Immune Response Regulation,The system of interacting proteins primarily p...,4,39/308,1.038071e-31,4.310071e-29,AARS1 ABCE1 ABI1 ACHE ACVR2A AKT1 APBB1 B2M BC...,CCL13;CD40;CCL11;BRCA1;IL2RG;HIF1A;IL27RA;IKBK...,Cellular Response To Cytokine Stimulus,GO:0071345
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1370,ADIPOGENESIS,ABCA1 ABCB8 ACAA2 ACADL ACADM ACADS ACLY ACO2 ...,UBQLN1,Fatty Acid Metabolism and Oxidative Phosphoryl...,The most prominent biological process performe...,1370,1/322,9.617092e-01,9.645151e-01,ABCA1 ABCB8 ACAA2 ACADL ACADM ACADS ACLY ACO2 ...,UBQLN1,Organelle Assembly,GO:0070925
1371,ADIPOGENESIS,ABCA1 ABCB8 ACAA2 ACADL ACADM ACADS ACLY ACO2 ...,STAT5A;ESRRA;BCL6;CHUK;FZD4;ADIPOQ;PDCD4;BAZ2A...,Fatty Acid Metabolism and Oxidative Phosphoryl...,The most prominent biological process performe...,1371,12/1922,9.750666e-01,9.771987e-01,ABCA1 ABCB8 ACAA2 ACADL ACADM ACADS ACLY ACO2 ...,STAT5A;ESRRA;BCL6;CHUK;FZD4;ADIPOQ;PDCD4;BAZ2A...,Regulation Of DNA-templated Transcription,GO:0006355
1372,ADIPOGENESIS,ABCA1 ABCB8 ACAA2 ACADL ACADM ACADS ACLY ACO2 ...,RNF11,Fatty Acid Metabolism and Oxidative Phosphoryl...,The most prominent biological process performe...,1372,1/367,9.758321e-01,9.772536e-01,ABCA1 ABCB8 ACAA2 ACADL ACADM ACADS ACLY ACO2 ...,RNF11,Ubiquitin-Dependent Protein Catabolic Process,GO:0006511
1373,ADIPOGENESIS,ABCA1 ABCB8 ACAA2 ACADL ACADM ACADS ACLY ACO2 ...,RNF11,Fatty Acid Metabolism and Oxidative Phosphoryl...,The most prominent biological process performe...,1373,1/434,9.878426e-01,9.885615e-01,ABCA1 ABCB8 ACAA2 ACADL ACADM ACADS ACLY ACO2 ...,RNF11,Protein Ubiquitination,GO:0016567


In [25]:
expanded_LLM_genes_DF.shape

(4450, 13)

In [29]:
outputFile = 'data/' + dataType + '_table' + infix + '_LLM_Enrichr_DF.tsv' 

In [31]:
expanded_LLM_genes_DF.to_csv(outputFile, sep = "\t")