From d93857262ec561bc74c33294923b610a0d01dd1c Mon Sep 17 00:00:00 2001 From: Sunny Sun <38218185+sunnyosun@users.noreply.github.com> Date: Tue, 24 Oct 2023 05:57:05 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Add=20a=20faq=20for=20custom=20?= =?UTF-8?q?validator=20(#1249)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/faq.md | 1 + docs/faq/validator.ipynb | 150 ++++++++++++++++++++++++++++++++++ docs/validate.ipynb | 14 ++++ lamindb/dev/datasets/_core.py | 13 +-- 4 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 docs/faq/validator.ipynb diff --git a/docs/faq.md b/docs/faq.md index 614224d89..e4ae62125 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -10,4 +10,5 @@ faq/notebooks faq/setup faq/import-schema faq/reference-field +faq/validator ``` diff --git a/docs/faq/validator.ipynb b/docs/faq/validator.ipynb new file mode 100644 index 000000000..c3eac9447 --- /dev/null +++ b/docs/faq/validator.ipynb @@ -0,0 +1,150 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom validators" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!lamin init --storage test-validator --schema bionty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import lamindb as ln\n", + "import lnschema_bionty as lb\n", + "from lamin_utils import logger\n", + "\n", + "lb.settings.organism = \"human\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use an AnnData as the dataset to validate:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata = ln.dev.datasets.anndata_human_immune_cells(populate_registries=True)\n", + "adata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define validation criteria for an `AnnData`:" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validators = {\n", + " \"var\": {\"index\": lb.Gene.ensembl_gene_id},\n", + " \"obs\": {\n", + " \"donor\": ln.ULabel.name,\n", + " \"tissue\": lb.Tissue.name,\n", + " \"cell_type\": lb.CellType.name,\n", + " \"assay\": lb.ExperimentalFactor.name,\n", + " },\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run bulk validation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = ln.Feature.lookup()\n", + "\n", + "for slot, slot_validators in validators.items():\n", + " for name, validator in slot_validators.items():\n", + " # registry model\n", + " model = validator.field.model\n", + "\n", + " # validate index\n", + " if name == \"index\":\n", + " logger.print(f\"validating {slot}.{name}:\")\n", + " index = adata.__getattribute__(slot).index\n", + " validated = model.validate(index, validator)\n", + " if validated.sum() == len(index):\n", + " logger.print(\"🎉 PASS\")\n", + "\n", + " # validate columns\n", + " else:\n", + " logger.print(f\"\\nvalidating {slot}.{name}:\")\n", + " # check if the column name exists\n", + " if name not in adata.__getattribute__(slot).columns:\n", + " logger.warning(f\"{slot}.{name} field is missing\")\n", + " else:\n", + " # check if a feature is registered for the column\n", + " if not hasattr(features, name):\n", + " logger.warning(f\"feature '{name}' is not registered\")\n", + " # validate categorical labels in a column\n", + " else:\n", + " labels = adata.__getattribute__(slot)[name]\n", + " validated = model.validate(labels, validator)\n", + " if validated.sum() == len(labels):\n", + " logger.print(\"🎉 PASS\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!lamin delete --force test-validator" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/validate.ipynb b/docs/validate.ipynb index 73f653773..cd37b5a62 100644 --- a/docs/validate.ipynb +++ b/docs/validate.ipynb @@ -384,6 +384,20 @@ "For an overview of data formats used to model different data types, see {doc}`docs:by-datatype`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bulk validation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Write your own validators using LaminDB registries, see {doc}`/faq/validator`." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/lamindb/dev/datasets/_core.py b/lamindb/dev/datasets/_core.py index d67850db0..6f89ff8c1 100644 --- a/lamindb/dev/datasets/_core.py +++ b/lamindb/dev/datasets/_core.py @@ -156,7 +156,7 @@ def anndata_mouse_sc_lymph_node( populate_registries: pre-populate metadata records to simulate existing registries # noqa """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad") - adata = ad.read(filepath) + adata = ad.read_h5ad(filepath) # The column names are a bit lengthy, let's abbreviate them: adata.obs.columns = ( @@ -253,7 +253,7 @@ def anndata_pbmc68k_reduced() -> ad.AnnData: filepath, _ = urlretrieve( "https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad" ) - return ad.read(filepath) + return ad.read_h5ad(filepath) def anndata_file_pbmc68k_test() -> Path: @@ -283,7 +283,7 @@ def anndata_pbmc3k_processed() -> ad.AnnData: # pragma: no cover filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad" ) - pbmc3k = ad.read(filepath) + pbmc3k = 
ad.read_h5ad(filepath) pbmc3k.obs.rename(columns={"louvain": "cell_type"}, inplace=True) return pbmc3k @@ -306,8 +306,11 @@ def anndata_human_immune_cells( adata.write('human_immune.h5ad') """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad") - adata = ad.read(filepath) + adata = ad.read_h5ad(filepath) adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True) + adata.uns.pop("cell_type_ontology_term_id_colors") + adata.uns.pop("title") + adata.uns.pop("schema_version") adata.obs.columns = adata.obs.columns.str.replace("donor_id", "donor") columns = [col for col in adata.obs.columns if "ontology_term" not in col] adata.obs = adata.obs[columns] @@ -378,7 +381,7 @@ def anndata_suo22_Visium10X(): # pragma: no cover ) Path("suo22/").mkdir(exist_ok=True) filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad") - return ad.read(filepath) + return ad.read_h5ad(filepath) def mudata_papalexi21_subset(): # pragma: no cover