Author: Irsyad Adam

Created: 6/15/2022

In [96]:
import pandas as pd
import numpy as np

import pyreadr # to read r object (stage5.rds)
import scanpy as sc # to read seurat object (so.Robj)
import scipy
import anndata

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

#default plt to sns
sns.set(font_scale = 1.5)
sns.set_theme()

import os
import gseapy as gs

import warnings
warnings.filterwarnings("ignore")

In [97]:
gene_list_dirs = ["../gene_extraction_by_stage/control_gene_list", "../gene_extraction_by_stage/single_injury_gene_list", "../gene_extraction_by_stage/double_injury_gene_list"]
working_dir = {}
for i in gene_list_dirs:
    working_dir[i] = [j for j in os.listdir(i) if j != "README.md"]
    
curr = os.listdir("../gsea")
curr = ["../gsea/" + i for i in curr if i != 'gsea_pathway_integration.ipynb']

final_dir = {}
for i, j in zip(working_dir.keys(), curr):
    final_dir[j] = [i + "/" + k for k in working_dir[i]]
final_dir

{'../gsea/single_injury_gsea': ['../gene_extraction_by_stage/control_gene_list/control_day56.csv',
  '../gene_extraction_by_stage/control_gene_list/day1.csv',
  '../gene_extraction_by_stage/control_gene_list/control_day28.csv'],
 '../gsea/control_gsea': ['../gene_extraction_by_stage/single_injury_gene_list/first_injury_day56.csv',
  '../gene_extraction_by_stage/single_injury_gene_list/day1.csv',
  '../gene_extraction_by_stage/single_injury_gene_list/first_injury_day28.csv'],
 '../gsea/double_injury_gsea': ['../gene_extraction_by_stage/double_injury_gene_list/double_injury_day30.csv',
  '../gene_extraction_by_stage/double_injury_gene_list/double_injury_day35.csv',
  '../gene_extraction_by_stage/double_injury_gene_list/day1.csv',
  '../gene_extraction_by_stage/double_injury_gene_list/double_injury_day42.csv',
  '../gene_extraction_by_stage/double_injury_gene_list/double_injury_day56.csv',
  '../gene_extraction_by_stage/double_injury_gene_list/double_injury_day28.csv']}

## Gene Enrichment Analysis with GSEAPY

In [98]:
#see all possible databases
import gseapy
names = gseapy.get_library_name()
print(names)

['ARCHS4_Cell-lines', 'ARCHS4_IDG_Coexp', 'ARCHS4_Kinases_Coexp', 'ARCHS4_TFs_Coexp', 'ARCHS4_Tissues', 'Achilles_fitness_decrease', 'Achilles_fitness_increase', 'Aging_Perturbations_from_GEO_down', 'Aging_Perturbations_from_GEO_up', 'Allen_Brain_Atlas_10x_scRNA_2021', 'Allen_Brain_Atlas_down', 'Allen_Brain_Atlas_up', 'Azimuth_Cell_Types_2021', 'BioCarta_2013', 'BioCarta_2015', 'BioCarta_2016', 'BioPlanet_2019', 'BioPlex_2017', 'CCLE_Proteomics_2020', 'CORUM', 'COVID-19_Related_Gene_Sets', 'COVID-19_Related_Gene_Sets_2021', 'Cancer_Cell_Line_Encyclopedia', 'CellMarker_Augmented_2021', 'ChEA_2013', 'ChEA_2015', 'ChEA_2016', 'Chromosome_Location', 'Chromosome_Location_hg19', 'ClinVar_2019', 'DSigDB', 'Data_Acquisition_Method_Most_Popular_Genes', 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019', 'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019', 'Descartes_Cell_Types_and_Tissue_2021', 'DisGeNET', 'Disease_Perturbations_from_GEO_down', 'Disease_Perturbations_from_GEO_up', 'Disease_Signatures

In [89]:
#test form this cell on
df = pd.read_csv("../gene_extraction_by_stage/double_injury_gene_list/double_injury_day30.csv", index_col=0)
pathways = df.index.to_list()

In [95]:
gs_res = gs.enrichr(gene_list=pathways[:5000], gene_sets= ['MSigDB_Hallmark_2020'], description='pathway', organism="Human", cutoff=0.5, outdir="../gsea/enrichr")


  self.results = self.results.append(res, ignore_index=True)


In [110]:
def run_gsea(model: dict.keys, model_dict: dict, gene_db: list) -> None:
    """ 
    ARGS: 
        model is the type of mode
        model is the key in the dict
        gene_db is the gene db to measure against
    """
    print("gsea for:", model)

    #iterate through the list of the csvs
    for csv in tqdm(model_dict[model], desc = "parsing model"):
        #test form this cell on
        df = pd.read_csv(csv, index_col=0)
        genes = df.index.to_list()

        #take top 5000 genes, measure against MSigDB
        gs_res = gs.enrichr(gene_list=genes[:5000], gene_sets= gene_db, description='pathway', organism="Human", cutoff=0.5, outdir=model + "/" + csv.split("/")[-1][:-4])


In [111]:
for i in final_dir.keys():
    run_gsea(i, final_dir, gene_db=['MSigDB_Hallmark_2020'])

gsea for: ../gsea/single_injury_gsea


parsing model: 100%|██████████| 3/3 [00:32<00:00, 10.91s/it]


gsea for: ../gsea/control_gsea


parsing model: 100%|██████████| 3/3 [00:33<00:00, 11.15s/it]


gsea for: ../gsea/double_injury_gsea


parsing model: 100%|██████████| 6/6 [01:11<00:00, 11.91s/it]
