# Manage pathway ontology

In [None]:
import gseapy as gp
import scanpy as sc
from lamin_examples import datasets as ds
import matplotlib.pyplot as plt

## Fetch `GO_Biological_Process_2023` pathways annotated with human genes through Enrichr

In [None]:
# Download the Enrichr library; it is used below as a mapping from pathway term to its genes
go_bp = gp.get_library("GO_Biological_Process_2023", organism="Human")

# how many pathways the library contains
len(go_bp)

In [None]:
go_bp["ATF6-mediated Unfolded Protein Response (GO:0036500)"]

Parse the ontology ID out of each key and convert the library into the format `{ontology_id: (name, genes)}`:

In [None]:
def parse_ontology_id_from_keys(key):
    """Split an Enrichr term into its ontology id and its name.

    "ATF6-mediated Unfolded Protein Response (GO:0036500)" -> ("GO:0036500", "ATF6-mediated Unfolded Protein Response")

    Surrounding whitespace is ignored, and only the trailing "(<id>)" is removed,
    so parentheses inside the name itself are preserved.

    Args:
        key: Term of the form "<name> (<ontology id>)".

    Returns:
        A tuple `(ontology_id, name)`.

    Raises:
        ValueError: If `key` does not end with a parenthesised, whitespace-free id.
    """
    term = key.strip()
    # Split on the last " (" only: everything before it is the name.
    name, opener, tail = term.rpartition(" (")
    ontology_id = tail[:-1]
    is_well_formed = (
        bool(opener)
        and tail.endswith(")")
        and bool(ontology_id)
        and not any(char.isspace() for char in ontology_id)
    )
    if not is_well_formed:
        # Failing loudly beats registering the last word of the name as an ontology id.
        raise ValueError(f"Cannot parse an ontology id from {key!r}")
    return (ontology_id, name.rstrip())

In [None]:
# Re-key the library as {ontology_id: (name, genes)}
go_bp_parsed = {}

for key, genes in go_bp.items():
    # Deliberately not named `id`: a top-level loop variable stays in the notebook
    # namespace, where it would hide the builtin `id` from every later cell.
    ontology_id, name = parse_ontology_id_from_keys(key)
    go_bp_parsed[ontology_id] = (name, genes)

In [None]:
go_bp_parsed["GO:0036500"]

## Register pathway ontology in LaminDB

Make sure you create or load a LaminDB instance before running the rest of this notebook!

In [None]:
# Create a LaminDB instance whose schema includes bionty (skip this cell if you have already loaded an instance)

!lamin init --storage enrichr --schema bionty

In [None]:
import lamindb as ln
from lnschema_bionty import Pathway, Gene

pathway_bionty = Pathway.bionty()  # equivalent to bionty.Pathway()

In [None]:
# Check which ontology of Pathway is being used in bionty

pathway_bionty

Next, we register all the pathways in LaminDB and link them with genes.

### Register pathway terms

In [None]:
# Only the ontology ids are passed here; the names parsed from the Enrichr keys are not used
pathway_records = ln.parse(go_bp_parsed.keys(), Pathway.ontology_id)

In [None]:
pathway_records[:3]

In [None]:
ln.save(pathway_records);

### Register gene symbols

In [None]:
# union of the genes listed under every pathway
all_genes = set().union(*go_bp.values())

len(all_genes)

In [None]:
gene_records = ln.parse(all_genes, Gene.symbol)

In [None]:
gene_records[:3]

In [None]:
ln.save(gene_records);

### Link pathway to genes

In [None]:
# symbol -> gene record lookup (the values are the records themselves)
gene_records_ids = {gene_record.symbol: gene_record for gene_record in gene_records}

In [None]:
for pathway_record in pathway_records:
    # go_bp_parsed maps ontology_id -> (name, genes). Direct indexing makes an unknown
    # ontology id or gene symbol fail with a KeyError that names it, instead of
    # subscripting None or handing None to the relation.
    pathway_genes = go_bp_parsed[pathway_record.ontology_id][1]
    pathway_genes_records = [gene_records_ids[gene] for gene in pathway_genes]
    pathway_record.genes.set(pathway_genes_records)

Now genes are linked to pathways:

In [None]:
# `pathway_record` is still bound to the last pathway visited by the linking loop
pathway_record.genes.values_list("symbol", flat=True)

## An interferon-beta-treated dataset

A PBMC dataset split into a stimulated and a control group. The stimulated group was treated with interferon beta.

From "SeuratData::ifnb"

In [None]:
adata = ds.anndata_seurat_ifnb()

adata

In [None]:
adata.obs["seurat_annotations"].value_counts()

Subset to "B Activated" cells:

In [None]:
is_b_activated = adata.obs["seurat_annotations"] == "B Activated"
adata_ba = adata[is_b_activated].copy()
adata_ba

## Over-representation analysis by Enrichr

Based on: https://gseapy.readthedocs.io/en/master/singlecell_example.html

In [None]:
# rank genes for the "STIM" group against the "CTRL" reference
sc.tl.rank_genes_groups(
    adata_ba,
    groupby="stim",
    groups=["STIM"],
    reference="CTRL",
    method="wilcoxon",
    use_raw=False,
)

# collect the ranking of the "STIM" group into a DataFrame
rank_genes_groups_df = sc.get.rank_genes_groups_df(adata_ba, group="STIM")

In [None]:
rank_genes_groups_df.head()

Select the up- and down-regulated differentially expressed genes (adjusted p-value < 0.05):

In [None]:
# the significance cut-off is shared by both directions, so compute it once
is_significant = rank_genes_groups_df["pvals_adj"] < 0.05
log_fold_change = rank_genes_groups_df["logfoldchanges"]

degs_up = rank_genes_groups_df[is_significant & (log_fold_change > 0)]
degs_dw = rank_genes_groups_df[is_significant & (log_fold_change < 0)]

In [None]:
degs_up.shape, degs_dw.shape

Run pathway enrichment analysis on DEGs and plot top 10 pathways:

In [None]:
# enrichment of the up-regulated genes against the same library fetched above
enr_up_result = gp.enrichr(degs_up["names"], gene_sets="GO_Biological_Process_2023")
enr_up = enr_up_result.res2d

gp.dotplot(enr_up, title="Up", cmap=plt.cm.autumn_r, figsize=(2, 3));

In [None]:
# enrichment of the down-regulated genes against the same library fetched above
enr_dw_result = gp.enrichr(degs_dw["names"], gene_sets="GO_Biological_Process_2023")
enr_dw = enr_dw_result.res2d

gp.dotplot(enr_dw, title="Down", cmap=plt.cm.winter_r, size=10, figsize=(2, 3));

## Track datasets with pathways in LaminDB

Let's enable tracking of the current notebook as the transform of this file:

In [None]:
ln.track()

In [None]:
file = ln.File(adata_ba, name="seurat_ifnb_activated_Bcells")

In [None]:
ln.save(file)

Register feature sets for `degs_up` and `degs_dw`:

In [None]:
degs_up_featureset = ln.FeatureSet.from_iterable(degs_up.names, Gene.symbol)

In [None]:
degs_dw_featureset = ln.FeatureSet.from_iterable(degs_dw.names, Gene.symbol)

In [None]:
# save the up-regulated set first, then the down-regulated one
for degs_featureset in (degs_up_featureset, degs_dw_featureset):
    ln.save(degs_featureset)

Link the top 10 pathways to the DEGs:

In [None]:
# ontology ids of the first 10 rows of each enrichment table
enr_up_top10 = [
    parse_ontology_id_from_keys(term)[0] for term in enr_up.head(10)["Term"]
]
enr_dw_top10 = [
    parse_ontology_id_from_keys(term)[0] for term in enr_dw.head(10)["Term"]
]

# pathway records for those ontology ids
enr_up_top10_pathways = ln.parse(enr_up_top10, Pathway.ontology_id)
enr_dw_top10_pathways = ln.parse(enr_dw_top10, Pathway.ontology_id)

In [None]:
degs_up_featureset.pathways.set(enr_up_top10_pathways)
degs_dw_featureset.pathways.set(enr_dw_top10_pathways)

In [None]:
degs_up_featureset.pathways.values_list("name", flat=True)

Link the file to features:

In [None]:
file.featuresets.add(degs_up_featureset)
file.featuresets.add(degs_dw_featureset)

Query a pathway:

In [None]:
# case-insensitive match, consistent with the file query on the same search string below
ln.select(Pathway).filter(name__icontains="interferon-beta").df()

Query pathways from a gene:

In [None]:
ln.select(Pathway).filter(genes__symbol="KIR2DL1").df()

Query files from a pathway:

In [None]:
ln.select(ln.File).filter(
    featuresets__pathways__name__icontains="interferon-beta"
).first()

Query featuresets from a pathway (from which geneset was this pathway computed?):

In [None]:
# NOTE(review): .one() presumably expects exactly one matching record — confirm
pathway = ln.select(Pathway, ontology_id="GO:0035456").one()

pathway

In [None]:
# NOTE(review): .one() presumably fails if this pathway is linked to more than one
# feature set (e.g. after the notebook has been run twice) — confirm
degs = ln.select(ln.FeatureSet).filter(pathways__ontology_id=pathway.ontology_id).one()

Now we can get the list of genes that are differentially expressed and belong to this pathway:

In [None]:
pathway_genes = set(pathway.genes.values_list("symbol", flat=True))
degs_genes = set(degs.genes.values_list("symbol", flat=True))

In [None]:
# genes that are in the pathway and also differentially expressed
pathway_genes & degs_genes