# Gene set overexpression analysis

You must `pip install gseapy` to run this notebook.

In [1]:
import os
import sys
import json
from collections import defaultdict

import pandas as pd
import gseapy as gp

from perturbqa import load_gse

import warnings
warnings.filterwarnings("ignore")

Load tasks, gene sets, and background genes

In [2]:
# Gene lists for each task, keyed by task name ("pert" and "gene").
# NOTE(review): skip_empty=False presumably keeps empty gene lists in the
# result -- confirm against perturbqa.load_gse.
gene_lists = {task: load_gse(task, skip_empty=False) for task in ("pert", "gene")}

# Gene-set files (GO, CORUM, Reactome) that are merged into one dictionary.
fp_kgs = [
    "../perturbqa/datasets/kg/go_gsea.json",
    "../perturbqa/datasets/kg/corum_gsea.json",
    "../perturbqa/datasets/kg/reactome_gsea.json",
]

# Despite its name, go_gsea ends up holding the gene sets from all three files.
# dict.update means that on a key clash the later file wins.
go_gsea = {}
for fp in fp_kgs:
    with open(fp) as f:
        go_gsea.update(json.load(f))

# You may replace this with another list of background genes.
# The "non-targeting" entry is dropped, duplicates are removed, and the
# result is sorted so the background list has a stable order.
with open("../perturbqa/datasets/k562_gw_mapping_full.json") as f:
    all_genes = sorted({g for g in json.load(f) if g != "non-targeting"})

# Quick look at the first few background genes and the total count.
print(all_genes[:5], len(all_genes))

['A1BG', 'AAAS', 'AACS', 'AAGAB', 'AAK1'] 8454


Run gene set overexpression analysis for each set

In [3]:
# Run gseapy's enrichment for every gene list of every task, testing against
# the merged gene sets (go_gsea) with all_genes as the background.
# task_to_dfs[task][i] is the `.res2d` result for the i-th gene list of that
# task, or None if the enrichment call raised.
task_to_dfs = {}
for task, lists in gene_lists.items():
    cur_dfs = []
    for gene_list in lists:
        try:
            df = gp.enrich(gene_list=gene_list["genes"],
                           gene_sets=go_gsea,
                           background=all_genes,
                           outdir=None).res2d
        except Exception as err:
            # This branch is not expected to run. Catch Exception instead of
            # using a bare `except:` so that KeyboardInterrupt / SystemExit
            # can still stop this loop, and report the error itself rather
            # than silently discarding it.
            print(gene_list)
            print("  enrichment failed:", repr(err))
            df = None
        # Append even on failure so positions stay aligned with `lists`.
        cur_dfs.append(df)
    task_to_dfs[task] = cur_dfs

Visualization of results

In [4]:
# Sanity check: which tasks were processed, and how many result slots
# (one per gene list) exist for the "pert" task.
task_names = task_to_dfs.keys()
num_pert_results = len(task_to_dfs["pert"])
print(task_names, num_pert_results)

dict_keys(['pert', 'gene']) 64


In [5]:
# Display the enrichment result for the first "pert" gene list
# (the last expression of the cell is rendered as its output).
pert_results = task_to_dfs["pert"]
pert_results[0]

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Odds Ratio,Combined Score,Genes
0,gs_ind_0,Assembly of EGFR complex in clathrin-coated ve...,1/2,0.000946,0.015689,2414.142857,16809.992742,SH3GL1
1,gs_ind_0,Auxilin recruits HSPA8:ATP to the clathrin-coa...,1/62,0.029019,0.043793,58.463415,206.948685,SH3GL1
2,gs_ind_0,BAR domain proteins recruit dynamin,1/58,0.027166,0.043793,62.560248,225.578418,SH3GL1
3,gs_ind_0,Beta-Pix:CDC42:GTP binds CBL in EGF:p-6Y-EGFR:...,1/4,0.001892,0.015689,1034.387755,6485.961543,SH3GL1
4,gs_ind_0,CBL recruits CIN85:endophilin complex to InlB-...,1/2,0.000946,0.015689,2414.142857,16809.992742,SH3GL1
...,...,...,...,...,...,...,...,...
78,gs_ind_0,RGGT:CHM binds RABs,1/32,0.015058,0.039049,114.551020,480.640942,RAB4B
79,gs_ind_0,RNF11-SMURF2-STAMBP complex,1/3,0.001419,0.015689,1448.314286,9497.818091,RNF11
80,gs_ind_0,SNX9 recruits components of the actin polymeri...,1/56,0.026239,0.043793,64.830116,236.014996,SH3GL1
81,gs_ind_0,"SYNJ hydrolyze PI(4,5)P2 to PI(4)P",1/59,0.027630,0.043793,61.483516,220.655858,SH3GL1
