In [227]:
import pandas as pd
import os


In [228]:
# Thresholds used by the success calls in the analysis cells below.
# An LLM name counts as a success when its 'Score' is >= this value.
LLM_score_thresh = 0.8
# An Enrichr result fails when its 'Adjusted P-value' is > this value.
enrichr_adj_pval_thresh = 0.05
# An Enrichr result fails when its Jaccard index ('enrichr_JI') is < this value.
enrichr_JI_thresh = 0.1
# In the coverage-based analysis, an Enrichr result fails when 'coverage' is < this value.
coverage_thresh = 0.2

## TODO: LLM Coverage/JI
To be even-handed in the comparison with Enrichr, we need to process the LLM analyses to determine how many genes are used to support the chosen name, so that coverage and JI can be computed for the LLM as well.

Plan: present the 300 analyses to an LLM to extract the genes supporting the name and the genes discussed.

## Load the Enrichr query data

In [229]:

import os
import pandas as pd

# Load the tab-separated LLM / Enrichr results table and preview the first rows
LLM_genes_DF = pd.read_csv(
    "data/omics_revamped_LLM_Enrichr_simVals_DF.tsv",
    sep="\t",
)
LLM_genes_DF.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Source,GeneSetID,GeneSetName,GeneList,n_Genes,LLM Name,LLM Analysis,Score,Rank,Overlap,P-value,Adjusted P-value,Genes,Term,GO term,GO ID,GO_term_genes,LLM_name_GO_term_sim
0,0,0,NeST,Cluster1-10,Cluster1-10,CTRL HSD17B14 KIAA0232 PAQR8 PLA2G1B RNF145 SG...,12,Lipid Metabolism and Membrane Dynamics,"1. CTRL, or carboxyl-terminal esterase/lipase,...",0.85,0,2/21,6.9e-05,0.008264,PLA2G1B;HSD17B14,Lipid Catabolic Process (GO:0016042),Lipid Catabolic Process,GO:0016042,CES1 PNLIPRP1 PLA2G1B CES1P1 AOAH HSD17B14 APO...,0.603573
1,1,0,NeST,Cluster1-11,Cluster1-11,LMF1 MFHAS1 MR1 PLA2G1B RASL11A RNF145 SLC2A6 ...,12,Lipid Metabolism and Membrane Trafficking,1. LMF1 (Lipase Maturation Factor 1) is crucia...,0.85,0,2/85,0.001146,0.050873,PLA2G1B;MR1,Defense Response To Gram-positive Bacterium (G...,Defense Response To Gram-positive Bacterium,GO:0050830,GBP7 CHGA GBP6 GSDMD DEFB104A H2BC8 RNASE12 RN...,0.317063
2,2,0,NeST,Cluster1-12,Cluster1-12,AMY2B CNPY2 EGFL7 LDLR LPL LRP8 LRPAP1 MYLIP P...,12,Lipid Metabolism and Receptor-Mediated Endocyt...,"1. AMY2B (Amylase, alpha 2B) is an enzyme that...",0.88,0,4/189,4e-06,0.000671,VLDLR;SORL1;LRP8;LDLR,Endocytosis (GO:0006897),Endocytosis,GO:0006897,RALBP1 SCARA5 TBC1D2B AP1S1 TSC2 ARHGAP27 TICA...,0.607433
3,3,1,NeST,Cluster1-12,Cluster1-12,AMY2B CNPY2 EGFL7 LDLR LPL LRP8 LRPAP1 MYLIP P...,12,Lipid Metabolism and Receptor-Mediated Endocyt...,"1. AMY2B (Amylase, alpha 2B) is an enzyme that...",0.88,1,2/6,5e-06,0.000671,LRPAP1;LDLR,Negative Regulation Of Lipoprotein Particle Cl...,Negative Regulation Of Lipoprotein Particle Cl...,GO:0010985,KHSRP APOC3 APOC2 PCSK9 LDLR LRPAP1,0.44078
4,4,2,NeST,Cluster1-12,Cluster1-12,AMY2B CNPY2 EGFL7 LDLR LPL LRP8 LRPAP1 MYLIP P...,12,Lipid Metabolism and Receptor-Mediated Endocyt...,"1. AMY2B (Amylase, alpha 2B) is an enzyme that...",0.88,2,4/233,8e-06,0.000671,RELN;VLDLR;SORL1;LRP8,Positive Regulation Of Developmental Process (...,Positive Regulation Of Developmental Process,GO:0051094,ACVR1 KRT2 ISL1 SMARCA2 SMARCA4 CDC20 NEURL1 T...,0.150215


## Common functions

In [230]:
# functions to compute the Jaccard Index and coverage for the query gene list and the GO term genes
def get_JI(GeneList: str, enrichr_genes) -> float:
    """Jaccard index between a query gene list and a GO term's gene list.

    Both arguments are expected to be whitespace-separated strings of gene
    symbols.

    Args:
        GeneList: the query gene set, e.g. "LDLR LPL LRP8".
        enrichr_genes: the GO term's genes in the same format; may be a
            non-string (e.g. NaN from a missing TSV cell).

    Returns:
        |intersection| / |union| of the two symbol sets as a float, or 0.0
        when either argument is not a string or both lists are empty.
    """
    # Missing values (NaN) mean there is nothing to compare against
    if not isinstance(GeneList, str) or not isinstance(enrichr_genes, str):
        return 0.0

    # split() without an argument collapses repeated/leading/trailing
    # whitespace, so no empty-string "gene" sneaks into either set
    query_genes = set(GeneList.split())
    term_genes = set(enrichr_genes.split())

    union = query_genes | term_genes
    if not union:
        return 0.0

    return len(query_genes & term_genes) / len(union)

def get_coverage(GeneList: str, enrichr_genes) -> float:
    """Fraction of the query gene list that is found in a GO term's gene list.

    Both arguments are expected to be whitespace-separated strings of gene
    symbols.

    Args:
        GeneList: the query gene set, e.g. "LDLR LPL LRP8".
        enrichr_genes: the GO term's genes in the same format; may be a
            non-string (e.g. NaN from a missing TSV cell).

    Returns:
        |query ∩ term| / |query| as a float, computed on unique symbols, or
        0.0 when either argument is not a string or the query list is empty.
    """
    # Missing values (NaN) mean there is nothing to compare against
    if not isinstance(GeneList, str) or not isinstance(enrichr_genes, str):
        return 0.0

    # split() without an argument collapses repeated/leading/trailing
    # whitespace, so no empty-string "gene" inflates the denominator
    query_genes = set(GeneList.split())
    if not query_genes:
        return 0.0

    term_genes = set(enrichr_genes.split())

    # Numerator and denominator both count unique symbols, so a fully
    # covered list scores exactly 1.0 even if a symbol is repeated
    return len(query_genes & term_genes) / len(query_genes)

def create_success_contingency_table(df):
    """Build the 2x2 Enrichr-vs-LLM success table with row/column totals.

    Args:
        df: DataFrame with boolean columns 'enrichr_success_TF' and
            'LLM_success_TF'.

    Returns:
        DataFrame whose rows are enrichr_success_TF (True, False, 'Total')
        and whose columns are LLM_success_TF (True, False, 'Total'); every
        cell is a count.
    """
    # Cross-tabulate the two success flags
    contingency_table = pd.crosstab(
        df['enrichr_success_TF'],
        df['LLM_success_TF'],
        rownames=['enrichr_success_TF'],
        colnames=['LLM_success_TF'],
    )

    # Force a fixed True/False ordering. fill_value=0 matters when one of the
    # outcomes never occurs: without it the missing row/column is filled with
    # NaN instead of a zero count.
    contingency_table = contingency_table.reindex(
        index=[True, False], columns=[True, False], fill_value=0
    )

    # Add the total column and row
    contingency_table['Total'] = contingency_table.sum(axis=1)
    contingency_table.loc['Total'] = contingency_table.sum(axis=0)

    return contingency_table

def select_rows_and_columns(df, number_of_rows=None):
    """Return the report columns of `df`, optionally limited to the first rows.

    Args:
        df: DataFrame containing all of the report columns listed below.
        number_of_rows: if given, only the first `number_of_rows` rows are
            returned; if None, all rows are returned.

    Returns:
        DataFrame with the report columns in a fixed order.
    """
    # Fixed column order for the summary view
    report_columns = [
        'n_Genes', 'GeneList', 'LLM Name', 'Score',
        'GO term', 'GO_term_genes', 'Genes', 'Overlap',
        'Adjusted P-value', 'coverage', 'enrichr_JI',
        'LLM_success_TF', 'enrichr_success_TF',
    ]

    report = df.loc[:, report_columns]
    return report if number_of_rows is None else report.head(number_of_rows)

## Add JI and coverage scores

In [231]:
# Score every Enrichr row against its query gene list: Jaccard index first,
# then coverage, each stored as a new column on LLM_genes_DF
LLM_genes_DF['enrichr_JI'] = LLM_genes_DF.apply(
    lambda rec: get_JI(rec['GeneList'], rec['GO_term_genes']),
    axis=1,
)

LLM_genes_DF['coverage'] = LLM_genes_DF.apply(
    lambda rec: get_coverage(rec['GeneList'], rec['GO_term_genes']),
    axis=1,
)
LLM_genes_DF.shape

(11310, 22)

## Original: select Enrichr result with best JI (Jaccard index), then threshold JI and APV (adjusted p-value)

In [232]:
# Reproducing the original analysis: for each gene set keep the Enrichr row
# with the highest Jaccard index, then apply the success thresholds.

def get_max_enrichr_JI(df):
    """Return the row of `df` whose 'enrichr_JI' value is the largest."""
    best_row_label = df['enrichr_JI'].idxmax()
    return df.loc[best_row_label]

# One group per combination of Source, GeneSetID, GeneSetName, GeneList
grouped = LLM_genes_DF.groupby(["Source", "GeneSetID", "GeneSetName", "GeneList"])

# Pick the best-JI row per group, then add the two success flags:
#   LLM_success_TF     - True when 'Score' >= LLM_score_thresh
#   enrichr_success_TF - False when 'Adjusted P-value' > enrichr_adj_pval_thresh
#                        or 'enrichr_JI' < enrichr_JI_thresh; True otherwise
reduced_LLM_genes_by_JI_DF = (
    grouped.apply(get_max_enrichr_JI)
    .reset_index(drop=True)
    .assign(
        LLM_success_TF=lambda picked: picked['Score'] >= LLM_score_thresh,
        enrichr_success_TF=lambda picked: ~(
            (picked['Adjusted P-value'] > enrichr_adj_pval_thresh)
            | (picked['enrichr_JI'] < enrichr_JI_thresh)
        ),
    )
)

# Print the contingency table
print(create_success_contingency_table(reduced_LLM_genes_by_JI_DF))

result_df = select_rows_and_columns(reduced_LLM_genes_by_JI_DF, 5)
result_df

LLM_success_TF      True  False  Total
enrichr_success_TF                    
True                  36      4     40
False                 97    163    260
Total                133    167    300


Unnamed: 0,n_Genes,GeneList,LLM Name,Score,GO term,GO_term_genes,Genes,Overlap,Adjusted P-value,coverage,enrichr_JI,LLM_success_TF,enrichr_success_TF
0,24,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 OR...,Cellular Matrix Remodeling and Tissue Development,0.85,Regulation Of Peroxisome Proliferator Activate...,ASXL2 ASXL1 PTGIS BMP2 CITED2 GPS2 LMO3 PLIN5 ...,CITED2;TWIST1,2/10,0.02383,0.083333,0.0625,True,False
1,47,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD4...,Cellular Adhesion and Extracellular Matrix Int...,0.85,Purine Ribonucleoside Monophosphate Catabolic ...,PNP NT5C1A NT5E HPRT1 SORD PRTFDC1 NT5C2,NT5E;HPRT1,2/7,0.059507,0.042553,0.038462,True,False
2,59,ABAT ASS1 CHI3L1 CHST2 CLDN3 EIF5B FRZB GAL HE...,System of unrelated proteins,0.0,Response To Steroid Hormone,MED1 TGFB1 PAQR7 TGFB3 PAQR8 TGFB2 HSPA8 NR1H3...,CAV1;AKR1B1;THBS1,3/29,0.047162,0.050847,0.035294,False,False
3,9,AMIGO2 AREG GAS6 GPR37 IFT57 PELI1 SQLE AKAP12...,System of unrelated proteins,0.0,B Cell Chemotaxis,HSD3B7 CXCL13 PIK3CD CYP7B1 GAS6,GAS6,1/5,0.037716,0.111111,0.076923,False,False
4,15,1060P11.3 CPE EFNB2 HIST1H2AC IL1R2 INHBB LYPD...,System of unrelated proteins,0.0,ERK1 And ERK2 Cascade,DUSP5 DUSP6 EGF MAP2K2 MAP2K1 KARS1 PTGER4 ZFP...,CTSH;SOX9,2/24,0.010442,0.133333,0.054054,False,False


## Alt #1: Select Enrichr result with best APV, then threshold JI and APV

In [233]:
# Alternative analysis: for each gene set keep the Enrichr row with the lowest
# 'Adjusted P-value', then apply the same JI and adjusted p-value thresholds.

def get_min_adj_p_value(df):
    """Return the row of `df` whose 'Adjusted P-value' is the smallest."""
    best_row_label = df['Adjusted P-value'].idxmin()
    return df.loc[best_row_label]

# One group per combination of Source, GeneSetID, GeneSetName, GeneList
grouped = LLM_genes_DF.groupby(["Source", "GeneSetID", "GeneSetName", "GeneList"])

# Pick the lowest-p-value row per group, then add the two success flags:
#   LLM_success_TF     - True when 'Score' >= LLM_score_thresh
#   enrichr_success_TF - False when 'Adjusted P-value' > enrichr_adj_pval_thresh
#                        or 'enrichr_JI' < enrichr_JI_thresh; True otherwise
reduced_LLM_genes_by_apv_DF = (
    grouped.apply(get_min_adj_p_value)
    .reset_index(drop=True)
    .assign(
        LLM_success_TF=lambda picked: picked['Score'] >= LLM_score_thresh,
        enrichr_success_TF=lambda picked: ~(
            (picked['Adjusted P-value'] > enrichr_adj_pval_thresh)
            | (picked['enrichr_JI'] < enrichr_JI_thresh)
        ),
    )
)

# Print the contingency table
print(create_success_contingency_table(reduced_LLM_genes_by_apv_DF))

result_df = select_rows_and_columns(reduced_LLM_genes_by_apv_DF, 5)
result_df

LLM_success_TF      True  False  Total
enrichr_success_TF                    
True                  21      4     25
False                112    163    275
Total                133    167    300


Unnamed: 0,n_Genes,GeneList,LLM Name,Score,GO term,GO_term_genes,Genes,Overlap,Adjusted P-value,coverage,enrichr_JI,LLM_success_TF,enrichr_success_TF
0,24,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 OR...,Cellular Matrix Remodeling and Tissue Development,0.85,Regulation Of Peroxisome Proliferator Activate...,ASXL2 ASXL1 PTGIS BMP2 CITED2 GPS2 LMO3 PLIN5 ...,CITED2;TWIST1,2/10,0.02383,0.083333,0.0625,True,False
1,47,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD4...,Cellular Adhesion and Extracellular Matrix Int...,0.85,Purine Ribonucleoside Monophosphate Catabolic ...,PNP NT5C1A NT5E HPRT1 SORD PRTFDC1 NT5C2,NT5E;HPRT1,2/7,0.059507,0.042553,0.038462,True,False
2,59,ABAT ASS1 CHI3L1 CHST2 CLDN3 EIF5B FRZB GAL HE...,System of unrelated proteins,0.0,Response To Steroid Hormone,MED1 TGFB1 PAQR7 TGFB3 PAQR8 TGFB2 HSPA8 NR1H3...,CAV1;AKR1B1;THBS1,3/29,0.047162,0.050847,0.035294,False,False
3,9,AMIGO2 AREG GAS6 GPR37 IFT57 PELI1 SQLE AKAP12...,System of unrelated proteins,0.0,Positive Regulation Of Protein Tyrosine Kinase...,PTPN1 IL34 ACE EGF FBXW7 SRCIN1 PRNP AGT EREG ...,GAS6;AREG,2/38,0.023945,0.222222,0.044444,False,False
4,15,1060P11.3 CPE EFNB2 HIST1H2AC IL1R2 INHBB LYPD...,System of unrelated proteins,0.0,SMAD Protein Signal Transduction,GDF11 TGFB1 TGFB3 JUN TGFB2 GDF15 BMP8B BMP8A ...,NUP93;GDF15;INHBB,3/51,0.002024,0.2,0.047619,False,False


## Alt #2: Select Enrichr result with best APV, threshold on APV, no JI requirement

In [234]:
# Alternative analysis in which the reduced DF is based on the minimum of
# Adjusted P-value with no JI requirement.

# Create reduced_LLM_genes_by_apv_no_JI_DF containing the row with the lowest
# 'Adjusted P-value' for each combination of Source, GeneSetID, GeneSetName, GeneList.
grouped = LLM_genes_DF.groupby(["Source", "GeneSetID", "GeneSetName", "GeneList"])

# get_min_adj_p_value is defined in the Alt #1 cell above (it returns the row
# with the minimum 'Adjusted P-value'); it is reused here rather than redefined.
reduced_LLM_genes_by_apv_no_JI_DF = grouped.apply(get_min_adj_p_value).reset_index(drop=True)

# Both flags are computed from THIS cell's frame. They were previously read
# from reduced_LLM_genes_by_apv_DF, which made the cell depend on the Alt #1
# cell having been run and on the two frames sharing the same row order.
# LLM_success_TF is True when 'Score' >= LLM_score_thresh.
reduced_LLM_genes_by_apv_no_JI_DF['LLM_success_TF'] = (
    reduced_LLM_genes_by_apv_no_JI_DF['Score'] >= LLM_score_thresh
)

# enrichr_success_TF is True when 'Adjusted P-value' <= enrichr_adj_pval_thresh.
# There is no JI requirement in this variant. Written with <= so that a
# missing (NaN) p-value counts as a failure rather than a success.
reduced_LLM_genes_by_apv_no_JI_DF['enrichr_success_TF'] = (
    reduced_LLM_genes_by_apv_no_JI_DF['Adjusted P-value'] <= enrichr_adj_pval_thresh
)

# Print the contingency table
print(create_success_contingency_table(reduced_LLM_genes_by_apv_no_JI_DF))

result_df = select_rows_and_columns(reduced_LLM_genes_by_apv_no_JI_DF, 5)
result_df

LLM_success_TF      True  False  Total
enrichr_success_TF                    
True                 117    103    220
False                 16     64     80
Total                133    167    300


Unnamed: 0,n_Genes,GeneList,LLM Name,Score,GO term,GO_term_genes,Genes,Overlap,Adjusted P-value,coverage,enrichr_JI,LLM_success_TF,enrichr_success_TF
0,24,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 OR...,Cellular Matrix Remodeling and Tissue Development,0.85,Regulation Of Peroxisome Proliferator Activate...,ASXL2 ASXL1 PTGIS BMP2 CITED2 GPS2 LMO3 PLIN5 ...,CITED2;TWIST1,2/10,0.02383,0.083333,0.0625,True,True
1,47,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD4...,Cellular Adhesion and Extracellular Matrix Int...,0.85,Purine Ribonucleoside Monophosphate Catabolic ...,PNP NT5C1A NT5E HPRT1 SORD PRTFDC1 NT5C2,NT5E;HPRT1,2/7,0.059507,0.042553,0.038462,True,False
2,59,ABAT ASS1 CHI3L1 CHST2 CLDN3 EIF5B FRZB GAL HE...,System of unrelated proteins,0.0,Response To Steroid Hormone,MED1 TGFB1 PAQR7 TGFB3 PAQR8 TGFB2 HSPA8 NR1H3...,CAV1;AKR1B1;THBS1,3/29,0.047162,0.050847,0.035294,False,True
3,9,AMIGO2 AREG GAS6 GPR37 IFT57 PELI1 SQLE AKAP12...,System of unrelated proteins,0.0,Positive Regulation Of Protein Tyrosine Kinase...,PTPN1 IL34 ACE EGF FBXW7 SRCIN1 PRNP AGT EREG ...,GAS6;AREG,2/38,0.023945,0.222222,0.044444,False,True
4,15,1060P11.3 CPE EFNB2 HIST1H2AC IL1R2 INHBB LYPD...,System of unrelated proteins,0.0,SMAD Protein Signal Transduction,GDF11 TGFB1 TGFB3 JUN TGFB2 GDF15 BMP8B BMP8A ...,NUP93;GDF15;INHBB,3/51,0.002024,0.2,0.047619,False,True


## Alt #3: Select Enrichr result with best APV that also meets JI threshold

In [235]:
# Alternative analysis: per gene set, prefer the lowest-'Adjusted P-value' row
# among those that pass both Enrichr requirements (JI and adjusted p-value).

def filter_and_select(df):
    """Return the lowest-'Adjusted P-value' row, preferring rows that pass both thresholds.

    Rows with 'enrichr_JI' >= enrichr_JI_thresh and 'Adjusted P-value' <=
    enrichr_adj_pval_thresh are considered first; if no row qualifies, the
    lowest-'Adjusted P-value' row of the whole group is returned instead.
    """
    passes_both = (df['enrichr_JI'] >= enrichr_JI_thresh) & (df['Adjusted P-value'] <= enrichr_adj_pval_thresh)

    # Fall back to the whole group when nothing passes
    candidates = df[passes_both] if passes_both.any() else df
    return candidates.loc[candidates['Adjusted P-value'].idxmin()]

# One group per combination of Source, GeneSetID, GeneSetName, GeneList
grouped = LLM_genes_DF.groupby(["Source", "GeneSetID", "GeneSetName", "GeneList"])

# Select one row per group, then add the two success flags:
#   LLM_success_TF     - True when 'Score' >= LLM_score_thresh
#   enrichr_success_TF - False when 'Adjusted P-value' > enrichr_adj_pval_thresh
#                        or 'enrichr_JI' < enrichr_JI_thresh; True otherwise
reduced_LLM_genes_by_apv_best_DF = (
    grouped.apply(filter_and_select)
    .reset_index(drop=True)
    .assign(
        LLM_success_TF=lambda picked: picked['Score'] >= LLM_score_thresh,
        enrichr_success_TF=lambda picked: ~(
            (picked['Adjusted P-value'] > enrichr_adj_pval_thresh)
            | (picked['enrichr_JI'] < enrichr_JI_thresh)
        ),
    )
)

print(create_success_contingency_table(reduced_LLM_genes_by_apv_best_DF))

result_df = select_rows_and_columns(reduced_LLM_genes_by_apv_best_DF, 5)
result_df

LLM_success_TF      True  False  Total
enrichr_success_TF                    
True                  36      4     40
False                 97    163    260
Total                133    167    300


Unnamed: 0,n_Genes,GeneList,LLM Name,Score,GO term,GO_term_genes,Genes,Overlap,Adjusted P-value,coverage,enrichr_JI,LLM_success_TF,enrichr_success_TF
0,24,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 OR...,Cellular Matrix Remodeling and Tissue Development,0.85,Regulation Of Peroxisome Proliferator Activate...,ASXL2 ASXL1 PTGIS BMP2 CITED2 GPS2 LMO3 PLIN5 ...,CITED2;TWIST1,2/10,0.02383,0.083333,0.0625,True,False
1,47,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD4...,Cellular Adhesion and Extracellular Matrix Int...,0.85,Purine Ribonucleoside Monophosphate Catabolic ...,PNP NT5C1A NT5E HPRT1 SORD PRTFDC1 NT5C2,NT5E;HPRT1,2/7,0.059507,0.042553,0.038462,True,False
2,59,ABAT ASS1 CHI3L1 CHST2 CLDN3 EIF5B FRZB GAL HE...,System of unrelated proteins,0.0,Response To Steroid Hormone,MED1 TGFB1 PAQR7 TGFB3 PAQR8 TGFB2 HSPA8 NR1H3...,CAV1;AKR1B1;THBS1,3/29,0.047162,0.050847,0.035294,False,False
3,9,AMIGO2 AREG GAS6 GPR37 IFT57 PELI1 SQLE AKAP12...,System of unrelated proteins,0.0,Positive Regulation Of Protein Tyrosine Kinase...,PTPN1 IL34 ACE EGF FBXW7 SRCIN1 PRNP AGT EREG ...,GAS6;AREG,2/38,0.023945,0.222222,0.044444,False,False
4,15,1060P11.3 CPE EFNB2 HIST1H2AC IL1R2 INHBB LYPD...,System of unrelated proteins,0.0,SMAD Protein Signal Transduction,GDF11 TGFB1 TGFB3 JUN TGFB2 GDF15 BMP8B BMP8A ...,NUP93;GDF15;INHBB,3/51,0.002024,0.2,0.047619,False,False


## Alt #4: Select Enrichr result with best APV that also passes coverage threshold

In [240]:

# Per gene set, prefer the lowest-'Adjusted P-value' row among those that pass
# both the coverage and the adjusted p-value requirement.
def filter_and_select_coverage(df):
    """Return the lowest-'Adjusted P-value' row, preferring rows that pass both thresholds.

    Rows with 'coverage' >= coverage_thresh and 'Adjusted P-value' <=
    enrichr_adj_pval_thresh are considered first; if no row qualifies, the
    lowest-'Adjusted P-value' row of the whole group is returned instead.
    """
    passes_both = (df['coverage'] >= coverage_thresh) & (df['Adjusted P-value'] <= enrichr_adj_pval_thresh)

    # Fall back to the whole group when nothing passes
    candidates = df[passes_both] if passes_both.any() else df
    return candidates.loc[candidates['Adjusted P-value'].idxmin()]

# One group per combination of Source, GeneSetID, GeneSetName, GeneList
grouped = LLM_genes_DF.groupby(["Source", "GeneSetID", "GeneSetName", "GeneList"])

# Select one row per group, then add the two success flags:
#   LLM_success_TF     - True when 'Score' >= LLM_score_thresh
#   enrichr_success_TF - False when 'Adjusted P-value' > enrichr_adj_pval_thresh
#                        or 'coverage' < coverage_thresh; True otherwise
reduced_LLM_genes_by_coverage_DF = (
    grouped.apply(filter_and_select_coverage)
    .reset_index(drop=True)
    .assign(
        LLM_success_TF=lambda picked: picked['Score'] >= LLM_score_thresh,
        enrichr_success_TF=lambda picked: ~(
            (picked['Adjusted P-value'] > enrichr_adj_pval_thresh)
            | (picked['coverage'] < coverage_thresh)
        ),
    )
)

print(create_success_contingency_table(reduced_LLM_genes_by_coverage_DF))

result_df = select_rows_and_columns(reduced_LLM_genes_by_coverage_DF, 20)
result_df

LLM_success_TF      True  False  Total
enrichr_success_TF                    
True                  72     17     89
False                 61    150    211
Total                133    167    300


Unnamed: 0,n_Genes,GeneList,LLM Name,Score,GO term,GO_term_genes,Genes,Overlap,Adjusted P-value,coverage,enrichr_JI,LLM_success_TF,enrichr_success_TF
0,24,CITED2 COL5A1 CRABP2 KCTD12 MDFIC MMP2 NRP1 OR...,Cellular Matrix Remodeling and Tissue Development,0.85,Regulation Of Peroxisome Proliferator Activate...,ASXL2 ASXL1 PTGIS BMP2 CITED2 GPS2 LMO3 PLIN5 ...,CITED2;TWIST1,2/10,0.02383046,0.083333,0.0625,True,False
1,47,1060P11.3 ADM AHR AMIGO2 ARL4C ATP10D CAV2 CD4...,Cellular Adhesion and Extracellular Matrix Int...,0.85,Purine Ribonucleoside Monophosphate Catabolic ...,PNP NT5C1A NT5E HPRT1 SORD PRTFDC1 NT5C2,NT5E;HPRT1,2/7,0.0595073,0.042553,0.038462,True,False
2,59,ABAT ASS1 CHI3L1 CHST2 CLDN3 EIF5B FRZB GAL HE...,System of unrelated proteins,0.0,Response To Steroid Hormone,MED1 TGFB1 PAQR7 TGFB3 PAQR8 TGFB2 HSPA8 NR1H3...,CAV1;AKR1B1;THBS1,3/29,0.04716222,0.050847,0.035294,False,False
3,9,AMIGO2 AREG GAS6 GPR37 IFT57 PELI1 SQLE AKAP12...,System of unrelated proteins,0.0,Positive Regulation Of Protein Tyrosine Kinase...,PTPN1 IL34 ACE EGF FBXW7 SRCIN1 PRNP AGT EREG ...,GAS6;AREG,2/38,0.0239453,0.222222,0.044444,False,True
4,15,1060P11.3 CPE EFNB2 HIST1H2AC IL1R2 INHBB LYPD...,System of unrelated proteins,0.0,SMAD Protein Signal Transduction,GDF11 TGFB1 TGFB3 JUN TGFB2 GDF15 BMP8B BMP8A ...,NUP93;GDF15;INHBB,3/51,0.002023951,0.2,0.047619,False,True
5,38,1060P11.3 ADH5 ADM ATF3 CASP1 CLEC2B CPA3 CRIM...,Stress Response and Apoptosis Regulation,0.85,Axon Regeneration,GAP43 RTN4RL1 RTN4RL2 TNC NREP MTR MAPK8IP3 FO...,TNC;FOLR1,2/9,0.05985517,0.052632,0.044444,True,False
6,76,1060P11.3 ADAM17 AKAP12 ALCAM AMFR BIK CASP2 C...,Cellular Stress Response and Apoptosis Regulation,0.85,Regulation Of Fibroblast Proliferation,CD74 PDGFRA CD300A PLA2G2C WNT5A SPHK1 MED9 FN...,FTH1;S100A6;CTNNB1;TP53,4/46,0.01120218,0.052632,0.033898,True,False
7,34,CASP1 CAST CD52 CLC COL1A2 COL3A1 COL5A2 CPA3 ...,Extracellular Matrix Organization and Cell Sig...,0.85,Collagen Fibril Organization,COL27A1 LUM TGFB2 CRTAP SERPINF2 OPTC P3H4 TGF...,COL3A1;COL1A2;LUM;COL5A2,4/42,0.0003028291,0.117647,0.055556,True,False
8,33,1060P11.3 ADM CD44 CD9 CHGB CHST1 CYP4B1 FRZB ...,System of unrelated proteins,0.0,Eye Morphogenesis,TENM3 RARG STAT3 IFT122 SHROOM2 NKD1 VEGFA EFE...,ALDH1A3;VEGFA,2/19,0.08621919,0.060606,0.04,False,False
9,24,1060P11.3 CLEC2B CPB1 GREM1 HIST1H1C IGJ KRT81...,System of unrelated proteins,0.0,Regulation Of Chondrocyte Differentiation,ZBTB16 MBOAT2 PTPN11 GDF5 WNT9A GDF6 TGFBR1 PT...,GREM1;TRPS1,2/25,0.1015541,0.083333,0.042553,False,False


In [239]:
# Write the DataFrames to TSV files
# The export of the best-JI selection (reduced_LLM_genes_by_JI_DF) is currently disabled.
#reduced_LLM_genes_by_JI_DF.to_csv("data/omics_revamped_LLM_Enrichr_simVals_failure_by_JI_DF.tsv", sep="\t", index=False)
# Save reduced_LLM_genes_by_apv_best_DF as a tab-separated file, without the row index.
reduced_LLM_genes_by_apv_best_DF.to_csv("data/omics_revamped_LLM_Enrichr_simVals_failure_by_apv_best_DF.tsv", sep="\t", index=False)

## Translated R analysis

~~~

get_JI = function(GeneList, enrichr_genes){
    # assume separation is the same
    geneSetGenes = str_split(string = GeneList, pattern = " ")[[1]]
    enrichRGenes = str_split(string = enrichr_genes, pattern = " ")[[1]]
    JI = length(intersect(geneSetGenes, enrichRGenes))/ length(union(geneSetGenes, enrichRGenes))
    
    return(JI)
    }   

-----------------------------

LLM_genes_DF = LLM_genes_DF %>%
rowwise() %>%
mutate(enrichr_JI = get_JI(GeneList, GO_term_genes))

LLM_genes_reduced_DF = LLM_genes_DF %>%
group_by("Source", "GeneSetID", "GeneSetName")

reduced_LLM_genes_DF = LLM_genes_DF %>%
group_by(Source, GeneSetID, GeneSetName, GeneList) %>%
slice(which.max(enrichr_JI))

reduced_LLM_genes_DF = reduced_LLM_genes_DF %>%
rowwise() %>%
mutate(enrichr_JI = get_JI(GeneList, GO_term_genes)) %>%
mutate(LLM_success_TF = ifelse(Score < 0.8, FALSE, TRUE),
       enrichr_success_TF = ifelse(((`Adjusted P-value` > enrichr_adj_pval_thresh) | (enrichr_JI < enrichr_JI_thresh)), FALSE, TRUE))

-------------------------------
Single cell - is this the same?

reduced_LLM_genes_DF = LLM_genes_DF %>%
group_by(Source, GeneSetID, GeneSetName, GeneList) %>%
slice(which.max(enrichr_JI))

reduced_LLM_genes_DF = reduced_LLM_genes_DF %>%
rowwise() %>%
mutate(enrichr_JI = get_JI(GeneList, GO_term_genes)) %>%
mutate(LLM_success_TF = ifelse(Score < 0.8, FALSE, TRUE),
       enrichr_success_TF = ifelse(((`Adjusted P-value` > enrichr_adj_pval_thresh) | (enrichr_JI < enrichr_JI_thresh)), FALSE, TRUE))

------------------------------
figure from the paper:

LLM_success_TF      True   False  Total
enrichr_success_TF              
True                  36      4     40
False                 97    163    260 
Total                133    167    300

The LLM succeeds 133 out of 300 times
Enrichment + JI succeeds 40 out of 300
~~~


In [238]:
# In 20 cases, there is no overlap with any GO_term's gene set
# Use this function to find those cases
def find_non_string(df):
    """Print and count the rows whose 'GO_term_genes' value is not a string.

    Args:
        df: DataFrame with a 'GO_term_genes' column.

    Returns:
        The number of rows whose 'GO_term_genes' is not a str (for example a
        NaN from an empty cell). Each such row is printed with its index, and
        the final count is printed as well.
    """
    nonstring = 0
    for index, row in df.iterrows():
        if not isinstance(row["GO_term_genes"], str):
            nonstring += 1
            # Report only the offending rows; printing every row would bury
            # the cases this helper is meant to surface
            print(f'index: {index}\n{row}\n---------\n')
    print(nonstring)
    return nonstring