# Tutorial: Model training

In [3]:
# pip install "zarr<3" lamindb lightning modlyn
import warnings
import os
from os.path import join
import lamindb as ln
import anndata as ad
import lightning as L
from tqdm import tqdm
from modlyn.io.datamodules import ClassificationDataModule
from modlyn.models.linear import Linear
from modlyn.io.loading import read_lazy

# Start tracking this notebook in LaminDB under the given transform uid and
# project; the log output below reports the Transform/Run that was created.
ln.track("UMQFXo0vs0Z6", project="DataLoader v2")

[92m→[0m found notebook train_linear.ipynb, making new version
[92m→[0m created Transform('UMQFXo0vs0Z60005'), started new Run('YsHf4azP...') at 2025-06-04 14:49:20 UTC
[92m→[0m notebook imports: anndata==0.12.0rc3 lamindb==1.5.3 lightning==2.5.1.post0 modlyn==0.0.2 tqdm==4.67.1


## Cache the pre-shuffled zarr store

In [4]:
# When running outside the arrayloader-benchmarks instance, query through
# .using(...) instead:
#   ln.Artifact.using("laminlabs/arrayloader-benchmarks").get(uid)
#
# Available stores:
#   "BQ6RplqNcT0akokn0000"  -> full 100M cells and 60k genes
#   "TuhkPw0wkzlUXN5k0000"  -> subsampled to 2k cells and 200 genes
tahoe_store_uid = "TuhkPw0wkzlUXN5k0000"
artifact_tahoe_store = ln.Artifact.get(tahoe_store_uid)
artifact_tahoe_store

Artifact(uid='TuhkPw0wkzlUXN5k0000', is_latest=True, key='tahoe100M/shuffled_plate3_subset_1000_100_AB', suffix='', size=3989166, hash='Y-pMgFWbhcdvZsvmXEt--w', n_files=108, space_id=1, storage_id=3, run_id=104, created_by_id=2, created_at=2025-05-10 15:05:47 UTC)

In [5]:
%%time
# Materialize the artifact in the local LaminDB cache and keep the returned path.
# For the full 100M-cell dataset this downloads 320GB spread over 36k zarr
# fragments (files), so it runs for a while even on AWS because of the sheer
# number of files.
store_path = artifact_tahoe_store.cache()

CPU times: user 327 ms, sys: 48.5 ms, total: 375 ms
Wall time: 4.37 s


In [11]:
# Show where the store was cached locally.
# To inspect the individual entries of the store instead, uncomment:
# list(store_path.iterdir())
store_path

PosixUPath('/home/ubuntu/.cache/lamindb/lamin-us-west-2/wXDsTYYd/tahoe100M/shuffled_plate3_subset_1000_100_AB')

## Train a linear model

In [14]:
# Show the anndata version used for this run. This reuses the `ad` alias
# imported in the first cell instead of importing the package a second time
# in the middle of the notebook.
ad.__version__

'0.12.0rc3'

In [9]:
# Open the cached store with modlyn's `read_lazy` and display the result.
# NOTE(review): the output saved with this cell is a ValueError (X is a
# `dask.array.core.Array`, which the installed anndata rejects) — presumably a
# dask/anndata version mismatch in the environment; confirm before re-running,
# since every cell below depends on `adata`.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # ignore zarr warnings that zarrv3 codec is not final yet
    adata = read_lazy(store_path)

adata

ValueError: X needs to be of one of <class 'numpy.ndarray'>, <class 'numpy.ma.MaskedArray'>, <class 'scipy.sparse._csr.csr_matrix'>, <class 'scipy.sparse._csc.csc_matrix'>, <class 'scipy.sparse._csr.csr_array'>, <class 'scipy.sparse._csc.csc_array'>, <class 'h5py._hl.dataset.Dataset'>, <class 'zarr.core.array.Array'>, <class 'anndata.compat.ZappyArray'>, <class 'anndata.abc.CSRDataset'>, <class 'anndata.abc.CSCDataset'>, <class 'anndata.compat.DaskArray'>, <class 'anndata.compat.CupyArray'>, <class 'anndata.compat.CupySparseMatrix'>, <class 'anndata.compat.AwkArray'>, or <class 'anndata.compat.XDataArray'>, not <class 'dask.array.core.Array'>.

In [None]:
# Encode the `cell_line` annotation as integer category codes and store them
# as 64-bit integers in the `y` column, which is used as the label column below.
cell_line_categorical = adata.obs["cell_line"].astype("category")
cell_line_codes = cell_line_categorical.cat.codes.to_numpy()
adata.obs["y"] = cell_line_codes.astype("i8")

In [None]:
# Split the pre-shuffled store into a leading training part and a trailing
# validation part.
#
# The split point is derived from the number of observations rather than being
# hard-coded: the previous fixed index (80_527_360 = 39_320 batches of 2048)
# only makes sense for the full 100M-cell store and leaves the validation slice
# empty for the subsampled store loaded above.
TRAIN_FRACTION = 0.8
BATCH_SIZE = 2048

n_train = int(adata.n_obs * TRAIN_FRACTION)
if n_train >= BATCH_SIZE:
    # The training loader uses drop_last=True, so align the training split to a
    # whole number of batches instead of silently discarding the remainder.
    n_train -= n_train % BATCH_SIZE

# With fewer training cells than one nominal batch, drop_last=True would yield
# no batches at all; shrink the training batch size so at least one is produced.
train_batch_size = max(1, min(BATCH_SIZE, n_train))

adata_train = adata[:n_train]
adata_val = adata[n_train:]

datamodule = ClassificationDataModule(
    adata_train=adata_train,
    adata_val=adata_val,
    label_column="y",
    train_dataloader_kwargs={
        "batch_size": train_batch_size,
        "drop_last": True,
    },
    val_dataloader_kwargs={
        "batch_size": BATCH_SIZE,
        "drop_last": False,  # keep the final partial batch for validation
    },
)

In [None]:
# Size the linear model from the data: one input per variable in `adata`, and
# `n_covariates` set to the number of distinct label codes in `obs["y"]`.
n_input_genes = adata.n_vars
n_label_codes = adata.obs["y"].nunique()

linear = Linear(n_genes=n_input_genes, n_covariates=n_label_codes, learning_rate=1e-2)

In [None]:
# Trainer settings collected in one place so they are easy to tweak.
trainer_options = {
    "max_epochs": 3,
    "log_every_n_steps": 100,
    "max_steps": 3000,  # only fit a few steps for the sake of this tutorial
}
trainer = L.Trainer(**trainer_options)

In [None]:
# Fit the linear model on the batches served by the datamodule.
trainer.fit(model=linear, datamodule=datamodule)

In [None]:
# Close the LaminDB run that was started by `ln.track(...)` in the first cell.
ln.finish()