# Gene Enrichment

This notebook performs gene set enrichment analysis on the most frequently occurring genes across all aggregated gene files generated by Motif. It identifies biologically relevant patterns and functional annotations associated with these genes.

In [8]:
import os
import pandas as pd
import gseapy as gp

In [10]:
# Folder that receives every enrichment output file.
# `exist_ok=True` keeps the cell re-runnable: an already existing folder is
# left untouched instead of raising.
outdir = 'results/gseapy_cancer_enrichr_results'
os.makedirs(name=outdir, exist_ok=True)

In [11]:
# Load Input Data
# The grid-search evaluation table is tab-separated. Only the
# `intersec_entrez` column (comma-separated IDs, one string per row) is echoed
# as a quick sanity check.
grid_search_tsv = 'results/evaluation_grid_search/grid_search_results.tsv'
df = pd.read_csv(grid_search_tsv, sep="\t")
print("\nResult Grid Search Evaluation:", df['intersec_entrez'], sep="\n")


Result Grid Search Evaluation:
0    794,1602,3848,4925,5217,5251,5318,5476,5775,58...
1    794,1602,2679,4925,5217,5251,5318,5476,5775,58...
Name: intersec_entrez, dtype: object


In [None]:
# Prepare Gene Lists
# Convert each comma-separated string into a Python list of IDs


def _split_gene_ids(raw_ids):
    """Split one `intersec_entrez` cell into a list of ID strings.

    Parameters
    ----------
    raw_ids : str, int, float or missing value
        Comma-separated IDs as read from the table. pandas may hand over a
        number instead of a string when a row holds a single ID, and NaN when
        the cell is empty.

    Returns
    -------
    list of str
        IDs in their original order, whitespace-stripped, with empty tokens
        (e.g. from a trailing comma) dropped. A missing cell yields `[]`.
    """
    if pd.isna(raw_ids):
        return []
    # A lone ID in a column that also contains NaN is read as a float
    # (794 -> 794.0); turn it back into an integer before stringifying.
    if isinstance(raw_ids, float) and raw_ids.is_integer():
        raw_ids = int(raw_ids)
    return [token.strip() for token in str(raw_ids).split(',') if token.strip()]


# Parse the column into a list of strings
df['gene_list'] = df['intersec_entrez'].apply(_split_gene_ids)

# Check the parsed lists
df[['intersec_entrez', 'gene_list']].head()

Unnamed: 0,intersec_entrez,gene_list
0,"794,1602,3848,4925,5217,5251,5318,5476,5775,58...","[794, 1602, 3848, 4925, 5217, 5251, 5318, 5476..."
1,"794,1602,2679,4925,5217,5251,5318,5476,5775,58...","[794, 1602, 2679, 4925, 5217, 5251, 5318, 5476..."


In [None]:
# Define Cancer-related Gene Sets

# Organism handed to Enrichr when the enrichment is run
organism = 'Human'

# Case-sensitive substrings that mark a library name as cancer-related
cancer_keywords = ('Cancer', 'Onco')

# Fetch the names of all available Enrichr libraries, then keep only those
# whose name contains one of the keywords above
all_libs = gp.get_library_name()
cancer_gene_sets = [
    name for name in all_libs
    if any(keyword in name for keyword in cancer_keywords)
]

print("Filtered cancer-related libraries:")
for name in cancer_gene_sets:
    print(name)

Filtered cancer-related libraries:
Cancer_Cell_Line_Encyclopedia
MSigDB_Oncogenic_Signatures
NCI-60_Cancer_Cell_Lines


In [None]:
# Run Enrichment Analysis

# Iterate over each gene list and run Enrichr. Terms that pass the adjusted
# p-value threshold are kept in memory (`enrich_results`, keyed "Sample_<idx>")
# and written to one CSV file per sample.

# gseapy documents `cutoff` as affecting only the output figure, not the
# result table, so the table is filtered explicitly below.
adj_p_cutoff = 0.05

# NOTE(review): the gene lists hold Entrez IDs, while Enrichr's online
# libraries appear to be keyed by gene symbols. If so, no term can overlap and
# every result table stays empty until the IDs are mapped to symbols - confirm.

enrich_results = {}
for idx, gene_list in df['gene_list'].items():
    sample = f"Sample_{idx}"
    print(f"Running cancer enrichment for sample {idx} ({len(gene_list)} genes)")

    # Nothing to submit; avoid a pointless request to the Enrichr service.
    if len(gene_list) == 0:
        print(f"Skipping {sample}: empty gene list.")
        continue

    # gseapy's own report files are named per library rather than per sample
    # (TODO confirm for the installed version), so each sample gets its own
    # sub-directory to keep one sample from overwriting another.
    sample_outdir = os.path.join(outdir, sample)

    try:
        os.makedirs(sample_outdir, exist_ok=True)
        res = gp.enrichr(
            gene_list=gene_list,
            gene_sets=cancer_gene_sets,
            organism=organism,
            outdir=sample_outdir,
            cutoff=adj_p_cutoff,
            no_plot=True  # skip plotting to avoid errors when no terms pass cutoff
        )
        df_res = res.results

        # An empty table has no columns to filter on, so it is passed through.
        if df_res.empty:
            df_sig = df_res
        else:
            df_sig = df_res[df_res['Adjusted P-value'] < adj_p_cutoff]

        if df_sig.empty:
            print(f"No significant enrichment for {sample} at cutoff {adj_p_cutoff}.")
        else:
            # Save results
            enrich_results[sample] = df_sig
            df_sig.to_csv(f"{outdir}/CancerEnrich_{sample}_results.csv", index=False)
            print(f"Results saved for {sample}.")
    except Exception as e:
        # Deliberate best effort: a failing sample (e.g. a network error) is
        # reported and must not abort the remaining samples.
        print(f"Error processing {sample}: {e}")

print("Cancer-focused enrichment analysis complete.")

Running cancer enrichment for sample 0 (30 genes)
No significant enrichment for Sample_0 at cutoff 0.05.
Running cancer enrichment for sample 1 (30 genes)
No significant enrichment for Sample_1 at cutoff 0.05.
Cancer-focused enrichment analysis complete.


In [None]:
# Inspect Results

# Display top enriched terms for a selected sample

# Example: view top 5 results for Sample_0
# Jupyter renders only the value of a cell's final top-level expression; an
# expression inside an if-branch is evaluated and silently discarded. The
# table is therefore bound to a name, and that name ends the cell. When the
# sample is missing, the name is None and nothing is rendered after the
# message.
top_terms = enrich_results['Sample_0'].head(5) if 'Sample_0' in enrich_results else None
if top_terms is None:
    print("Sample_0 results not found.")
top_terms