In [2]:
import pandas as pd
from pandas import DataFrame as df
import gseapy as gp
from gseapy import gseaplot, heatmap, dotplot, ringplot
import matplotlib.pyplot as plt
import sys
import numpy as np
from gseapy import biomart 

In [2]:
gene_list = pd.read_csv('HUMAN_genes.txt', sep="\t")

In [3]:
gene_list.head()

Unnamed: 0,gene_id,log2FC,pvalue
0,ENSG00000260917,-2.622299,0.000321457
1,ENSG00000264462,-1.489412,3.12e-08
2,ENSG00000274060,6.671805,7.17e-21
3,ENSG00000275692,6.674949,6.69e-21
4,ENSG00000273937,8.937028,6.84e-71


In [None]:
def get_ensembl_mappings():                                   
    # Set up connection to server                                               
    server = biomart.BiomartServer('http://www.ensembl.org/biomart')         
    mart = server.datasets['hsapiens_gene_ensembl']   

In [14]:
gene_list = gene_list.rename(columns = {'symbol': 'Gene'})

In [19]:
gene_list = gene_list[gene_list.pvalue < 0.05].dropna()
gene_list

Unnamed: 0,gene_id,log2FC,pvalue
0,ENSG00000260917,-2.622299,3.214570e-04
1,ENSG00000264462,-1.489412,3.120000e-08
2,ENSG00000274060,6.671805,7.170000e-21
3,ENSG00000275692,6.674949,6.690000e-21
4,ENSG00000273937,8.937028,6.840000e-71
...,...,...,...
3737,ENSG00000188747,-5.309880,9.857150e-04
3738,ENSG00000178764,-5.449168,1.350000e-101
3739,ENSG00000119771,-5.510044,9.520000e-157
3740,ENSG00000130021,-6.079441,1.550000e-106


In [20]:
gene_list['Rank'] = -np.log10(gene_list.pvalue)*gene_list.log2FC
gene_list

Unnamed: 0,gene_id,log2FC,pvalue,Rank
0,ENSG00000260917,-2.622299,3.214570e-04,-9.159369
1,ENSG00000264462,-1.489412,3.120000e-08,-11.179296
2,ENSG00000274060,6.671805,7.170000e-21,134.400049
3,ENSG00000275692,6.674949,6.690000e-21,134.664248
4,ENSG00000273937,8.937028,6.840000e-71,627.066100
...,...,...,...,...
3737,ENSG00000188747,-5.309880,9.857150e-04,-15.962820
3738,ENSG00000178764,-5.449168,1.350000e-101,-549.655806
3739,ENSG00000119771,-5.510044,9.520000e-157,-859.684556
3740,ENSG00000130021,-6.079441,1.550000e-106,-643.263597


In [22]:
gene_list = gene_list.sort_values('Rank', ascending = False).reset_index(drop=True)
gene_list

Unnamed: 0,gene_id,log2FC,pvalue,Rank
0,ENSG00000132002,5.037748,2.080000e-229,1152.041984
1,ENSG00000204389,5.875244,9.180000e-168,981.384098
2,ENSG00000204388,5.465742,4.380000e-166,903.807066
3,ENSG00000149257,4.690143,2.140000e-193,903.647877
4,ENSG00000278775,8.752135,1.820000e-75,654.133952
...,...,...,...,...
3737,ENSG00000130021,-6.079441,1.550000e-106,-643.263597
3738,ENSG00000184640,-4.120346,5.060000e-163,-668.715103
3739,ENSG00000112977,-5.068988,1.290000e-137,-693.890782
3740,ENSG00000130638,-4.869234,1.570000e-152,-739.169684


In [25]:
ranking = gene_list[['gene_id', 'Rank']]
ranking

Unnamed: 0,gene_id,Rank
0,ENSG00000132002,1152.041984
1,ENSG00000204389,981.384098
2,ENSG00000204388,903.807066
3,ENSG00000149257,903.647877
4,ENSG00000278775,654.133952
...,...,...
3737,ENSG00000130021,-643.263597
3738,ENSG00000184640,-668.715103
3739,ENSG00000112977,-693.890782
3740,ENSG00000130638,-739.169684


In [28]:
gp.get_library_name()

['ARCHS4_Cell-lines',
 'ARCHS4_IDG_Coexp',
 'ARCHS4_Kinases_Coexp',
 'ARCHS4_TFs_Coexp',
 'ARCHS4_Tissues',
 'Achilles_fitness_decrease',
 'Achilles_fitness_increase',
 'Aging_Perturbations_from_GEO_down',
 'Aging_Perturbations_from_GEO_up',
 'Allen_Brain_Atlas_10x_scRNA_2021',
 'Allen_Brain_Atlas_down',
 'Allen_Brain_Atlas_up',
 'Azimuth_Cell_Types_2021',
 'BioCarta_2013',
 'BioCarta_2015',
 'BioCarta_2016',
 'BioPlanet_2019',
 'BioPlex_2017',
 'CCLE_Proteomics_2020',
 'CORUM',
 'COVID-19_Related_Gene_Sets',
 'COVID-19_Related_Gene_Sets_2021',
 'Cancer_Cell_Line_Encyclopedia',
 'CellMarker_Augmented_2021',
 'ChEA_2013',
 'ChEA_2015',
 'ChEA_2016',
 'ChEA_2022',
 'Chromosome_Location',
 'Chromosome_Location_hg19',
 'ClinVar_2019',
 'DSigDB',
 'Data_Acquisition_Method_Most_Popular_Genes',
 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
 'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
 'Descartes_Cell_Types_and_Tissue_2021',
 'Diabetes_Perturbations_GEO_2022',
 'DisGeNET',
 'Disease_

In [32]:
glist = gene_list['gene_id'].to_list()
glist

['ENSG00000132002',
 'ENSG00000204389',
 'ENSG00000204388',
 'ENSG00000149257',
 'ENSG00000278775',
 'ENSG00000274868',
 'ENSG00000276312',
 'ENSG00000277671',
 'ENSG00000273937',
 'ENSG00000273739',
 'ENSG00000275664',
 'ENSG00000276197',
 'ENSG00000149201',
 'ENSG00000151929',
 'ENSG00000120694',
 'ENSG00000204390',
 'ENSG00000147679',
 'ENSG00000241163',
 'ENSG00000139343',
 'ENSG00000184378',
 'ENSG00000117598',
 'ENSG00000173110',
 'ENSG00000178381',
 'ENSG00000158201',
 'ENSG00000144381',
 'ENSG00000182253',
 'ENSG00000113263',
 'ENSG00000244754',
 'ENSG00000162972',
 'ENSG00000225217',
 'ENSG00000170345',
 'ENSG00000198130',
 'ENSG00000164070',
 'ENSG00000163738',
 'ENSG00000119705',
 'ENSG00000249693',
 'ENSG00000184368',
 'ENSG00000189149',
 'ENSG00000115380',
 'ENSG00000120885',
 'ENSG00000258667',
 'ENSG00000119630',
 'ENSG00000004478',
 'ENSG00000247828',
 'ENSG00000269967',
 'ENSG00000146453',
 'ENSG00000095397',
 'ENSG00000283512',
 'ENSG00000167904',
 'ENSG00000169684',


In [33]:
enr = gp.enrichr(gene_list=glist, 
                  gene_sets=['KEGG_2021_Human','GO_Molecular_Function_2021'],
                  organism='human', #set organism to the one you desired! e.g. Yeast
                  outdir=None,                
                 )

In [40]:
enr.results.head(5)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
