# McFarland 2020 dataset preprocessing for perturbation use case

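This notebook prepares the [McFarland et al. 2020](https://www.nature.com/articles/s41467-020-17440-w) dataset for the perturbation use case: it downloads the data, subsamples cells and genes, standardizes gene identifiers and cell metadata, and saves the preprocessed result as an artifact.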

In [None]:
# Load the `laminlabs/lamindata` instance with the lamin CLI before lamindb is imported below.
!lamin load laminlabs/lamindata

In [None]:
import lamindb as ln
import bionty as bt
import scanpy as sc
import anndata as ad
import numpy as np

# Pin this notebook's uid on the lamindb context, then start tracking
# (presumably so the artifact saved at the end is linked to this run — TODO confirm).
ln.context.uid = "13VINnFk89PE0003"
ln.context.track()

In [None]:
# Download the dataset from Zenodo; `-nc` (no-clobber) skips the download when the file already exists.
!wget -nc https://zenodo.org/record/7041849/files/McFarlandTsherniak2020.h5ad

In [None]:
# Read the downloaded h5ad file into an AnnData object.
adata = ad.read_h5ad("McFarlandTsherniak2020.h5ad")

In [None]:
# Downsample to 1000 cells x 2000 genes so the use case stays lightweight.
# `sc.pp.subsample` modifies `adata` in place; `random_state` is spelled out so
# the cell draw is visibly pinned.
sc.pp.subsample(adata, n_obs=1000, random_state=0)
# Draw the genes from a seeded generator so that Restart & Run All selects the
# same 2000 genes every time (the global NumPy RNG used before was unseeded).
# `.copy()` materializes the slice instead of keeping an AnnData view, because
# a later cell assigns to `adata.var.index`.
adata = adata[
    :, np.random.default_rng(0).choice(adata.var_names, size=2000, replace=False)
].copy()

In [None]:
# obs column -> registry field that the column is validated against.
categoricals = {
    # cell line identity
    "DepMap_ID": bt.CellLine.ontology_id,
    "cell_line": bt.CellLine.name,
    # biological context
    "disease": bt.Disease.name,
    "organism": bt.Organism.name,
    # experimental design and sample annotations
    "perturbation_type": ln.ULabel.name,
    "sex": bt.Phenotype.name,
    "time": ln.ULabel.name,
    "tissue_type": ln.ULabel.name,
}

# Pin the sources used for validation: release-112 of the human gene source
# for the var index, and the "depmap" source for both cell line columns.
sources = {
    "var_index": bt.Source.filter(
        entity="bionty.Gene", version="release-112", organism="human"
    ).one(),
    **{
        column: bt.Source.filter(name="depmap").one()
        for column in ("DepMap_ID", "cell_line")
    },
}

# First validation pass over the raw (subsampled) data: gene index plus all
# categorical obs columns listed above.
curate = ln.Curator.from_anndata(
    adata,
    var_index=bt.Gene.ensembl_gene_id,
    organism="human",
    categoricals=categoricals,
    sources=sources,
)

curate.validate()

In [None]:
# The var index holds a mix of Ensembl IDs and gene symbols; translate every
# symbol that bionty can resolve into its Ensembl gene ID.
gene_mapper = {
    symbol: ensembl_id
    for symbol, ensembl_id in bt.Gene.standardize(
        adata.var_names,
        field="symbol",
        return_field="ensembl_gene_id",
        return_mapper=True,
        organism="human",
    ).items()
    # entries mapped to None are dropped so those names stay as they are
    if ensembl_id is not None
}
# Names without a mapping entry are left unchanged.
adata.var.index = adata.var.index.map(lambda name: gene_mapper.get(name, name))

In [None]:
# Validate only the gene index (no obs categoricals) now that symbols have
# been mapped to Ensembl IDs.
curate = ln.Curator.from_anndata(
    adata,
    organism="human",
    var_index=bt.Gene.ensembl_gene_id,
    sources={
        "var_index": bt.Source.filter(
            entity="bionty.Gene", version="release-112", organism="human"
        ).one()
    },
)

curate.validate()

In [None]:
# Whatever is still unvalidated after the mapping is not a gene we want to
# keep, so drop those columns.
unvalidated_genes = curate.non_validated["var_index"]
adata = adata[:, ~adata.var_names.isin(unvalidated_genes)].copy()
# NOTE(review): this replaces the index produced by the symbol -> Ensembl
# mapping with the `ensembl_id` column of `adata.var` — confirm the column is
# the intended source of truth.
adata.var_names = adata.var["ensembl_id"]

In [None]:
# Disease: rename the four categories listed here; all other categories are
# kept as they are (presumably to match `bt.Disease` names — confirm).
adata.obs["disease"] = adata.obs["disease"].cat.rename_categories(
    {
        "colon/colorectal cancer": "colorectal cancer",
        "rhabdoid": "rhabdoid tumor",
        "bladder cancer": "urinary bladder carcinoma",
        "endometrial/uterine cancer": "uterine corpus cancer",
    }
)

# Cell line: standardize the names against the public "depmap" source, then
# store the column as categorical.
adata.obs["cell_line"] = bt.CellLine.public(
    source=bt.Source.filter(name="depmap").one()
).standardize(adata.obs["cell_line"], field="name")
adata.obs["cell_line"] = adata.obs["cell_line"].astype("category")

# Perturbation: lowercase every category label.
adata.obs["perturbation"] = adata.obs["perturbation"].cat.rename_categories(str.lower)

# Column housekeeping: rename `DepMap_ID` and remove `percent.mito`.
adata.obs = adata.obs.rename(columns={"DepMap_ID": "depmap_id"}).drop(
    columns="percent.mito"
)

In [None]:
# Validate the gene index again: `var_names` now come from the `ensembl_id`
# column, so some IDs may not be present in the pinned gene source.
curate = ln.Curator.from_anndata(
    adata,
    organism="human",
    var_index=bt.Gene.ensembl_gene_id,
    sources={
        "var_index": bt.Source.filter(
            entity="bionty.Gene", version="release-112", organism="human"
        ).one()
    },
)

curate.validate()

In [None]:
# Second filtering pass: drop the genes that the latest validation rejected.
unvalidated_genes = curate.non_validated["var_index"]
adata = adata[:, ~adata.var_names.isin(unvalidated_genes)].copy()
# NOTE(review): `var_names` were already set from this column before the
# previous validation, so this reassignment looks redundant — confirm.
adata.var_names = adata.var["ensembl_id"]

In [None]:
# Final check of the gene index after the second filtering pass.
curate = ln.Curator.from_anndata(
    adata,
    organism="human",
    var_index=bt.Gene.ensembl_gene_id,
    sources={
        "var_index": bt.Source.filter(
            entity="bionty.Gene", version="release-112", organism="human"
        ).one()
    },
)

curate.validate()

In [None]:
# Create an artifact from the preprocessed AnnData and save it.
mcfarland_af = ln.Artifact.from_anndata(adata, description="McFarland 2020 preprocessed").save()

In [None]:
# Mark the context tracked via `ln.context.track()` as finished.
ln.context.finish()