From d93857262ec561bc74c33294923b610a0d01dd1c Mon Sep 17 00:00:00 2001 From: Sunny Sun <38218185+sunnyosun@users.noreply.github.com> Date: Tue, 24 Oct 2023 05:57:05 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Add=20a=20faq=20for=20custom=20?= =?UTF-8?q?validator=20(#1249)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/faq.md | 1 + docs/faq/validator.ipynb | 150 ++++++++++++++++++++++++++++++++++ docs/validate.ipynb | 14 ++++ lamindb/dev/datasets/_core.py | 13 +-- 4 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 docs/faq/validator.ipynb diff --git a/docs/faq.md b/docs/faq.md index 614224d89..e4ae62125 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -10,4 +10,5 @@ faq/notebooks faq/setup faq/import-schema faq/reference-field +faq/validator ``` diff --git a/docs/faq/validator.ipynb b/docs/faq/validator.ipynb new file mode 100644 index 000000000..c3eac9447 --- /dev/null +++ b/docs/faq/validator.ipynb @@ -0,0 +1,150 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom validators" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!lamin init --storage test-validator --schema bionty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import lamindb as ln\n", + "import lnschema_bionty as lb\n", + "from lamin_utils import logger\n", + "\n", + "lb.settings.organism = \"human\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use an AnnData as the dataset to validate:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata = ln.dev.datasets.anndata_human_immune_cells(populate_registries=True)\n", + "adata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define validation criteria for an `AnnData`:" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validators = {\n", + " \"var\": {\"index\": lb.Gene.ensembl_gene_id},\n", + " \"obs\": {\n", + " \"donor\": ln.ULabel.name,\n", + " \"tissue\": lb.Tissue.name,\n", + " \"cell_type\": lb.CellType.name,\n", + " \"assay\": lb.ExperimentalFactor.name,\n", + " },\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run bulk validation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = ln.Feature.lookup()\n", + "\n", + "for slot, slot_validators in validators.items():\n", + " for name, validator in slot_validators.items():\n", + " # registry model\n", + " model = validator.field.model\n", + "\n", + " # validate index\n", + " if name == \"index\":\n", + " logger.print(f\"validating {slot}.{name}:\")\n", + " index = adata.__getattribute__(slot).index\n", + " validated = model.validate(index, validator)\n", + " if validated.sum() == len(index):\n", + " logger.print(\"🎉 PASS\")\n", + "\n", + " # validate columns\n", + " else:\n", + " logger.print(f\"\\nvalidating {slot}.{name}:\")\n", + " # check if the column name exists\n", + " if name not in adata.__getattribute__(slot).columns:\n", + " logger.warning(f\"{slot}.{name} field is missing\")\n", + " else:\n", + " # check if a feature is registered for the column\n", + " if not hasattr(features, name):\n", + " logger.warning(f\"feature '{name}' is not registered\")\n", + " # validate categorical labels in a column\n", + " else:\n", + " labels = adata.__getattribute__(slot)[name]\n", + " validated = model.validate(labels, validator)\n", + " if validated.sum() == len(labels):\n", + " logger.print(\"🎉 PASS\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!lamin delete --force test-validator" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/validate.ipynb b/docs/validate.ipynb index 73f653773..cd37b5a62 100644 --- a/docs/validate.ipynb +++ b/docs/validate.ipynb @@ -384,6 +384,20 @@ "For an overview of data formats used to model different data types, see {doc}`docs:by-datatype`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bulk validation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Write your own validators using LaminDB registries, see {doc}`/faq/validator`." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/lamindb/dev/datasets/_core.py b/lamindb/dev/datasets/_core.py index d67850db0..6f89ff8c1 100644 --- a/lamindb/dev/datasets/_core.py +++ b/lamindb/dev/datasets/_core.py @@ -156,7 +156,7 @@ def anndata_mouse_sc_lymph_node( populate_registries: pre-populate metadata records to simulate existing registries # noqa """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad") - adata = ad.read(filepath) + adata = ad.read_h5ad(filepath) # The column names are a bit lengthy, let's abbreviate them: adata.obs.columns = ( @@ -253,7 +253,7 @@ def anndata_pbmc68k_reduced() -> ad.AnnData: filepath, _ = urlretrieve( "https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad" ) - return ad.read(filepath) + return ad.read_h5ad(filepath) def anndata_file_pbmc68k_test() -> Path: @@ -283,7 +283,7 @@ def anndata_pbmc3k_processed() -> ad.AnnData: # pragma: no cover filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad" ) - pbmc3k = ad.read(filepath) + pbmc3k = 
ad.read_h5ad(filepath) pbmc3k.obs.rename(columns={"louvain": "cell_type"}, inplace=True) return pbmc3k @@ -306,8 +306,11 @@ def anndata_human_immune_cells( adata.write('human_immune.h5ad') """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad") - adata = ad.read(filepath) + adata = ad.read_h5ad(filepath) adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True) + adata.uns.pop("cell_type_ontology_term_id_colors") + adata.uns.pop("title") + adata.uns.pop("schema_version") adata.obs.columns = adata.obs.columns.str.replace("donor_id", "donor") columns = [col for col in adata.obs.columns if "ontology_term" not in col] adata.obs = adata.obs[columns] @@ -378,7 +381,7 @@ def anndata_suo22_Visium10X(): # pragma: no cover ) Path("suo22/").mkdir(exist_ok=True) filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad") - return ad.read(filepath) + return ad.read_h5ad(filepath) def mudata_papalexi21_subset(): # pragma: no cover