## Generate test data that can pass validation

In [1]:
# compact rewrite: validate from files, minimal helpers
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="mudata")
warnings.filterwarnings("ignore", category=FutureWarning, module="tqdm")

import numpy as np
from anndata import AnnData
from mudata import MuData
import mudata as md
from pathlib import Path

# -------------------------------
# helpers
# -------------------------------
def read_valid_set(path: str) -> set[str]:
    """Load a controlled vocabulary from a text file, one term per line.

    Surrounding whitespace is stripped from every line. Blank lines and
    comment lines (first non-blank character is ``#``) are skipped.

    Args:
        path: Location of the vocabulary file.

    Returns:
        The set of distinct terms found in the file.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file.
    """
    p = Path(path)
    if not p.is_file():
        raise FileNotFoundError(f"{path} not found")
    # Explicit encoding keeps parsing independent of the platform locale;
    # the "-sig" variant also drops a leading BOM so it cannot end up glued
    # to the first term.
    with p.open(encoding="utf-8-sig") as f:
        terms = (ln.strip() for ln in f)
        # Test the *stripped* text so indented "# ..." comments are ignored too.
        return {t for t in terms if t and not t.startswith("#")}

def pick_match(valid: set[str], *guesses: str) -> str:
    """Return the vocabulary term that best matches one of ``guesses``.

    Terms and guesses are compared case-insensitively with underscores
    removed. Guesses are tried in the order given; for each guess an exact
    (normalised) match is preferred, otherwise the first term whose
    normalised form contains the normalised guess is returned.

    Candidates are examined in sorted order of the original terms, so the
    result is the same on every run even when several terms qualify.

    Args:
        valid: Allowed vocabulary terms.
        *guesses: Candidate spellings, most preferred first.

    Returns:
        The matching term exactly as it is spelled in ``valid``.

    Raises:
        ValueError: If no guess matches any term in ``valid``.
    """
    # Build the lookup from a sorted view: iterating the raw set would make
    # the outcome depend on hash order whenever more than one term matches.
    norm: dict[str, str] = {}
    for v in sorted(valid):
        # setdefault: on a normalisation collision the first (sorted) term wins.
        norm.setdefault(v.lower().replace("_", ""), v)

    for g in guesses:
        k = g.lower().replace("_", "")
        if k in norm:
            return norm[k]
        # loose contains
        for vk, v in norm.items():
            if k in vk:
                return v
    raise ValueError(f"No match in {sorted(valid)} for guesses {guesses}")

# -------------------------------
# load vocab
# -------------------------------
# Vocabulary files are looked up relative to the current working directory.
VOCAB_FILES = {
    "analyte": "valid_analyte_classes.txt",
    "object": "valid_object_types.txt",
}
VALID_ANALYTE = read_valid_set(VOCAB_FILES["analyte"])
VALID_OBJECT = read_valid_set(VOCAB_FILES["object"])

In [2]:
# -------------------------------
# create data
# -------------------------------
# Seed the global NumPy RNG; the order of the random draws below is part of
# what makes the generated dataset reproducible.
np.random.seed(1)
n = d = k = 10

# Shared latent factors z (n x k): column j is drawn with mean j and
# standard deviation 2*j.
factor_index = np.arange(k)
z = np.random.normal(loc=factor_index, scale=2 * factor_index, size=(n, k))

# Both modalities use the same observation labels.
obs_labels = [f"obs_{i}" for i in range(1, n + 1)]

# Modality "A": d features obtained as a random linear readout of z,
# plus a random two-class annotation per observation.
w = np.random.normal(size=(d, k))
y = np.matmul(z, w.T)
adata = AnnData(y)
adata.obs_names = obs_labels
adata.var_names = [f"var_{j}" for j in range(1, d + 1)]
adata.obsm["annotation"] = np.random.choice(["cell_type_A", "cell_type_B"], size=n)

# Modality "B": d2 features read out from the same latent factors.
d2 = 50
w2 = np.random.normal(size=(d2, k))
y2 = np.matmul(z, w2.T)
adata2 = AnnData(y2)
adata2.obs_names = obs_labels
adata2.var_names = [f"var2_{j}" for j in range(1, d2 + 1)]

mdata = MuData({"A": adata, "B": adata2})

# -------------------------------
# fill required validation fields
# -------------------------------
# Object type: use the vocabulary's own spelling of "cell" when a vocabulary
# was loaded, otherwise fall back to the literal.
if VALID_OBJECT:
    obj_type = pick_match(VALID_OBJECT, "cell")
else:
    obj_type = "cell"
protocol = "10.17504/protocols.io.bxyz1234"

# Per-modality fields: original ids and object type in .obs, protocol in .uns.
for mod in mdata.mod.values():
    mod.obs["original_obs_id"] = mod.obs_names.astype(str)
    mod.obs["object_type"] = obj_type
    mod.uns["protocol"] = protocol

# choose analyte class per modality; keep simple with "RNA" guess
if VALID_ANALYTE:
    analyte = pick_match(VALID_ANALYTE, "RNA")
else:
    analyte = "RNA"
for ad in mdata.mod.values():
    ad.uns["analyte_class"] = analyte

# epic_type: "annotations" if any modality has a non-empty obsm["annotation"],
# "analyses" if any modality has a non-empty X.
has_annotations = any(map(
    lambda m: "annotation" in m.obsm and len(m.obsm["annotation"]) > 0,
    mdata.mod.values(),
))
has_analyses = any(map(
    lambda m: m.X is not None and getattr(m.X, "size", 0) > 0,
    mdata.mod.values(),
))

epic_type = []
if has_annotations:
    epic_type.append("annotations")
if has_analyses:
    epic_type.append("analyses")
mdata.uns["epic_type"] = epic_type

# -------------------------------
# write and reload
# -------------------------------
path_h5mu = "example.h5mu"
mdata.write(path_h5mu)

# Reload into memory rather than with backed=True. A backed object keeps an
# open HDF5 handle on the file for as long as `mdata_r` is alive, so running
# this cell a second time would fail at mdata.write() above because the file
# cannot be truncated while it is still open.
mdata_r = md.read(path_h5mu)

# Minimal round-trip check so the success message below is actually earned.
assert set(mdata_r.mod) == set(mdata.mod), "modalities changed during write/read"

print("Written and reloaded MuData successfully.")
print("epic_type:", mdata.uns["epic_type"])
print("object_type:", obj_type)
print("analyte_class:", analyte)

Written and reloaded MuData successfully.
epic_type: ['annotations', 'analyses']
object_type: cell
analyte_class: RNA


## Validate a demo MuData object
#### Load the h5mu file from its path and validate the resulting MuData object

In [3]:
# Validate the demo MuData object.
# The file at path_h5mu (set in the data-generation cell) is read back from
# disk and the loaded object - not the path itself - is passed to
# validate_mudata.
from mudata_validator import validate_mudata
import muon as mu
mdata = mu.read_h5mu(path_h5mu)  # rebinds `mdata` to the copy read from disk
validate_mudata(mdata)

  from .autonotebook import tqdm as notebook_tqdm
  if Version(scanpy.__version__) < Version("1.10"):


Validating overall MuData object...
Validating modality: A
The values in AnnData.obs.index will be used as the objects' unique identifiers. They look like:
Index(['obs_1', 'obs_2', 'obs_3', 'obs_4', 'obs_5'], dtype='object')
The HUGO symbol should be included as an annotation for genes and the Uniprot ID should be included as an annotation for proteins.




Standard plots are expected to be stored in .obsm['X_umap'], .obsm['X_harmony'], .obsm['X_tsne'] and .obsm['X_pca']
If this is spatial data, coordinates should go in .obsm['X_spatial']
Validating modality: B
The values in AnnData.obs.index will be used as the objects' unique identifiers. They look like:
Index(['obs_1', 'obs_2', 'obs_3', 'obs_4', 'obs_5'], dtype='object')
The HUGO symbol should be included as an annotation for genes and the Uniprot ID should be included as an annotation for proteins.




Standard plots are expected to be stored in .obsm['X_umap'], .obsm['X_harmony'], .obsm['X_tsne'] and .obsm['X_pca']
If this is spatial data, coordinates should go in .obsm['X_spatial']
Validation passed!


## Validate Object x analyte dataset ingested from TC-CMU:
#### /consortium/TC - Carnegie Mellon University/2995dc4b75f09b68c93fa79c7de44fc6/

In [4]:
# Validate the ingested dataset stored under data/ (path is relative to the
# current working directory).
# NOTE(review): this overwrites path_h5mu and mdata from the demo cells, so
# re-running the demo validation cell afterwards would read this file instead
# of example.h5mu.
path_h5mu = "data/RNA_raw.h5mu"
mdata = mu.read_h5mu(path_h5mu)
validate_mudata(mdata)

Validating overall MuData object...
Validating modality: RNA_raw
The values in AnnData.obs.index will be used as the objects' unique identifiers. They look like:
Index(['6c57274e7a40413dc042ec32442a228b-AAACCCAAGCAGCCTC',
       '6c57274e7a40413dc042ec32442a228b-AAACCCAAGCATCCTA',
       '6c57274e7a40413dc042ec32442a228b-AAACCCAAGCTAATCC',
       '6c57274e7a40413dc042ec32442a228b-AAACCCAAGCTATCCA',
       '6c57274e7a40413dc042ec32442a228b-AAACCCAAGGTTGAGC'],
      dtype='object', name='cell_id')
The HUGO symbol should be included as an annotation for genes and the Uniprot ID should be included as an annotation for proteins.




Standard plots are expected to be stored in .obsm['X_umap'], .obsm['X_harmony'], .obsm['X_tsne'] and .obsm['X_pca']
If this is spatial data, coordinates should go in .obsm['X_spatial']
Validation passed!
