Skip to content

Commit

Permalink
📝 Add a FAQ for custom validator (#1249)
Browse files Browse the repository at this point in the history
  • Loading branch information
sunnyosun committed Oct 24, 2023
1 parent 0d8af7f commit d938572
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 5 deletions.
1 change: 1 addition & 0 deletions docs/faq.md
Expand Up @@ -10,4 +10,5 @@ faq/notebooks
faq/setup
faq/import-schema
faq/reference-field
faq/validator
```
150 changes: 150 additions & 0 deletions docs/faq/validator.ipynb
@@ -0,0 +1,150 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Custom validators"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!lamin init --storage test-validator --schema bionty"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import lamindb as ln\n",
"import lnschema_bionty as lb\n",
"from lamin_utils import logger\n",
"\n",
"lb.settings.organism = \"human\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's use an AnnData as the dataset to validate:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"adata = ln.dev.datasets.anndata_human_immune_cells(populate_registries=True)\n",
"adata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define validation criteria for an `AnnData`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"validators = {\n",
" \"var\": {\"index\": lb.Gene.ensembl_gene_id},\n",
" \"obs\": {\n",
" \"donor\": ln.ULabel.name,\n",
" \"tissue\": lb.Tissue.name,\n",
" \"cell_type\": lb.CellType.name,\n",
" \"assay\": lb.ExperimentalFactor.name,\n",
" },\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run bulk validation:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features = ln.Feature.lookup()\n",
"\n",
"for slot, slot_validators in validators.items():\n",
" for name, validator in slot_validators.items():\n",
" # registry model\n",
" model = validator.field.model\n",
"\n",
" # validate index\n",
" if name == \"index\":\n",
" logger.print(f\"validating {slot}.{name}:\")\n",
" index = adata.__getattribute__(slot).index\n",
" validated = model.validate(index, validator)\n",
" if validated.sum() == len(index):\n",
" logger.print(\"🎉 PASS\")\n",
"\n",
" # validate columns\n",
" else:\n",
" logger.print(f\"\\nvalidating {slot}.{name}:\")\n",
" # check if the column name exist\n",
" if name not in adata.__getattribute__(slot).columns:\n",
" logger.warning(f\"{slot}.{name} field is missing\")\n",
" else:\n",
" # check if a feature is registered for the column\n",
" if not hasattr(features, name):\n",
" logger.warning(f\"feature '{name}' is not registered\")\n",
" # validate categorical labels in a column\n",
" else:\n",
" labels = adata.__getattribute__(slot)[name]\n",
" validated = model.validate(labels, validator)\n",
" if validated.sum() == len(labels):\n",
" logger.print(\"🎉 PASS\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!lamin delete --force test-validator"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py39",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
14 changes: 14 additions & 0 deletions docs/validate.ipynb
Expand Up @@ -384,6 +384,20 @@
"For an overview of data formats used to model different data types, see {doc}`docs:by-datatype`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Bulk validation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Write your own validators using LaminDB registries, see {doc}`/faq/validator`."
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
13 changes: 8 additions & 5 deletions lamindb/dev/datasets/_core.py
Expand Up @@ -156,7 +156,7 @@ def anndata_mouse_sc_lymph_node(
populate_registries: pre-populate metadata records to simulate existing registries # noqa
"""
filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad")
adata = ad.read(filepath)
adata = ad.read_h5ad(filepath)

# The column names are a bit lengthy, let's abbreviate them:
adata.obs.columns = (
Expand Down Expand Up @@ -253,7 +253,7 @@ def anndata_pbmc68k_reduced() -> ad.AnnData:
filepath, _ = urlretrieve(
"https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad"
)
return ad.read(filepath)
return ad.read_h5ad(filepath)


def anndata_file_pbmc68k_test() -> Path:
Expand Down Expand Up @@ -283,7 +283,7 @@ def anndata_pbmc3k_processed() -> ad.AnnData: # pragma: no cover
filepath, _ = urlretrieve(
"https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad"
)
pbmc3k = ad.read(filepath)
pbmc3k = ad.read_h5ad(filepath)
pbmc3k.obs.rename(columns={"louvain": "cell_type"}, inplace=True)
return pbmc3k

Expand All @@ -306,8 +306,11 @@ def anndata_human_immune_cells(
adata.write('human_immune.h5ad')
"""
filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad")
adata = ad.read(filepath)
adata = ad.read_h5ad(filepath)
adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True)
adata.uns.pop("cell_type_ontology_term_id_colors")
adata.uns.pop("title")
adata.uns.pop("schema_version")
adata.obs.columns = adata.obs.columns.str.replace("donor_id", "donor")
columns = [col for col in adata.obs.columns if "ontology_term" not in col]
adata.obs = adata.obs[columns]
Expand Down Expand Up @@ -378,7 +381,7 @@ def anndata_suo22_Visium10X(): # pragma: no cover
)
Path("suo22/").mkdir(exist_ok=True)
filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad")
return ad.read(filepath)
return ad.read_h5ad(filepath)


def mudata_papalexi21_subset(): # pragma: no cover
Expand Down

0 comments on commit d938572

Please sign in to comment.