In [6]:
import os, pickle
import numpy as np
import pandas as pd

import gseapy as gp

# Directory holding the ontology input files; "kegg_gene_id.txt" is read from here.
ONTO_PATH = "/data2/zhoujb/project/cowpea_project/rawData/geneOntology/"
# Working directory for this analysis: "traits_region_gene.txt" is read from here
# and the pickled enrichment results are written back into it.
# NOTE(review): both paths are absolute and machine-specific — consider making
# them relative to a configurable data root.
GOKEGG_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/ML/GOKEGG"

In [2]:
# Load the KEGG annotation table and turn it into a {KEGG_NAME: [members]} dict.
# The "Term" column is a comma-separated string per row; it is split into a
# list and stored alongside the original column as "Term_list".
# NOTE(review): the list members are presumably gene IDs — confirm against the file.
kegg_table_path = os.path.join(ONTO_PATH, "kegg_gene_id.txt")
kegg_df = pd.read_table(kegg_table_path)
kegg_df["Term_list"] = kegg_df["Term"].str.split(",")

# Index by pathway name, then export the list column as a plain dict.
kegg_d = kegg_df.set_index("KEGG_NAME")["Term_list"].to_dict()

In [3]:
# Run a KEGG over-representation test (gseapy `enrich`) for every trait in
# "traits_region_gene.txt" and gather the significant terms into `all_gsea_res`.
data_term = pd.read_table(os.path.join(GOKEGG_PATH, "traits_region_gene.txt"))

# Background size handed to gseapy as an int.
# NOTE(review): presumably the total number of annotated genes — confirm.
N_BACKGROUND_GENES = 32563
# Terms are kept only when their adjusted p-value is at or below this cutoff.
ADJ_P_CUTOFF = 0.05

# Per-trait result frames are collected here and concatenated once after the
# loop; calling pd.concat inside the loop re-copies every accumulated row on
# each iteration and starts from an empty DataFrame.
trait_results = []
for group_name, group_data in data_term.groupby("traits"):
    # De-duplicate the trait's gene IDs (as strings); sorting makes the input
    # order identical from run to run instead of depending on set ordering.
    gene_list = sorted({str(x) for x in group_data["GeneID"].values})

    enr2 = gp.enrich(gene_list=gene_list,
                     gene_sets=kegg_d,  # kegg is a dict object
                     background=N_BACKGROUND_GENES,  # or "hsapiens_gene_ensembl", or int, or text file, or a list of genes
                     outdir=None,
                     verbose=True)

    # Keep significant terms only and tag them with the trait they belong to.
    significant = enr2.results
    significant = significant[significant["Adjusted P-value"] <= ADJ_P_CUTOFF].copy()
    significant["traits"] = group_name
    trait_results.append(significant)

    print("{} Done".format(group_name))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# input table contained no trait groups.
if trait_results:
    all_gsea_res = pd.concat(trait_results, axis=0, ignore_index=True)
else:
    all_gsea_res = pd.DataFrame()

print("ALL DONE")

2024-07-19 16:08:11,315 [INFO] Input dict object named with gs_ind_0
2024-07-19 16:08:11,317 [INFO] Run: gs_ind_0 
2024-07-19 16:08:11,378 [INFO] Done.
2024-07-19 16:08:11,386 [INFO] Input dict object named with gs_ind_0
2024-07-19 16:08:11,388 [INFO] Run: gs_ind_0 
2024-07-19 16:08:11,437 [INFO] Done.
2024-07-19 16:08:11,448 [INFO] Input dict object named with gs_ind_0
2024-07-19 16:08:11,450 [INFO] Run: gs_ind_0 
2024-07-19 16:08:11,514 [INFO] Done.
2024-07-19 16:08:11,523 [INFO] Input dict object named with gs_ind_0
2024-07-19 16:08:11,525 [INFO] Run: gs_ind_0 
2024-07-19 16:08:11,561 [INFO] Done.
2024-07-19 16:08:11,571 [INFO] Input dict object named with gs_ind_0
2024-07-19 16:08:11,575 [INFO] Run: gs_ind_0 


GNP Done
PL Done
PProtein Done
PStarch Done


2024-07-19 16:08:11,612 [INFO] Done.
2024-07-19 16:08:11,621 [INFO] Input dict object named with gs_ind_0
2024-07-19 16:08:11,623 [INFO] Run: gs_ind_0 
2024-07-19 16:08:11,671 [INFO] Done.


PSugar Done
TSW Done
ALL DONE


In [4]:
# Display the combined per-trait enrichment table (rows already filtered on
# "Adjusted P-value" when the table was assembled).
all_gsea_res

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Odds Ratio,Combined Score,Genes,traits
0,gs_ind_0,Glucosinolate biosynthesis,3/8,0.000317,0.010158,34.329804,276.534931,114179512;114179851;114178551,GNP
1,gs_ind_0,Pantothenate and CoA biosynthesis,6/45,0.000161,0.007745,8.913014,77.828166,114183207;114185898;114178474;114179851;114178...,GNP
2,gs_ind_0,Sesquiterpenoid and triterpenoid biosynthesis,5/26,9.6e-05,0.007745,13.840163,128.056305,114177509;114178712;114178713;114196187;114179677,GNP
3,gs_ind_0,Metabolic pathways,67/2665,0.000647,0.035352,1.58705,11.654833,114181889;114178440;114193674;114173069;114179...,PL
4,gs_ind_0,Sesquiterpenoid and triterpenoid biosynthesis,4/26,0.000884,0.035352,11.751515,82.628096,114178712;114178713;114177509;114179677,PL


In [7]:
# Persist the combined enrichment table as a pickle inside GOKEGG_PATH.
kegg_res_pickle_path = os.path.join(GOKEGG_PATH, "kegg_gsea_res.pkl")
with open(kegg_res_pickle_path, "wb") as pickle_handle:
    pickle.dump(all_gsea_res, pickle_handle)