# Partial streaming of annotated data matrices from the cloud

In [None]:
# Log in with the test account, then run `lndb init` with the CI bucket as storage.
# NOTE(review): the password is hardcoded in the notebook — presumably a throwaway
# CI account; confirm, or supply it through an environment variable instead.
!lndb login testuser1 --password "cEvcwMJFX4OwbsYVaMt2Os6GxxGgDUlBGILs2RyS"
!lndb init --storage "s3://lndb-setup-ci"

In [None]:
import lamindb as ln
import lamindb.schema as lns
import scanpy as sc

In [None]:
# NOTE(review): presumably emits lamindb's notebook header/metadata — confirm against the lamindb docs.
ln.nb.header()

## Retrieve test data

Here, we'll use `AnnData` objects. In the future, other data objects will provide similar functionality.

In [None]:
# Load scanpy's `pbmc68k_reduced` example dataset.
pbmc68k = sc.datasets.pbmc68k_reduced()

In [None]:
# Bare expression: the notebook displays the object's summary as the cell output.
pbmc68k

In [None]:
# Count the rows of `.obs` per `bulk_labels` value; two of these values are used in the queries further down.
pbmc68k.obs["bulk_labels"].value_counts()

In [None]:
# Load scanpy's `pbmc3k_processed` example dataset.
pbmc3k = sc.datasets.pbmc3k_processed()

In [None]:
# Bare expression: the notebook displays the object's summary as the cell output.
pbmc3k

In [None]:
# Count the rows of `.obs` per `louvain` value; two of these values are used in the queries further down.
pbmc3k.obs["louvain"].value_counts()

## Ingest the test data

Create a pipeline for this example.

In [None]:
# Build the pipeline record first, then register it. The value returned by
# `ln.add` is kept because its `id` and `v` are read when the run is created.
pipeline_record = lns.Pipeline(name="Streaming pipeline", v="1")
pipeline = ln.add(pipeline_record)

In [None]:
# Describe the ingestion run and tie it to the pipeline via the pipeline's id and version.
run = lns.Run(
    name="Ingest PBMCs",
    pipeline_id=pipeline.id,
    pipeline_v=pipeline.v,
)

Ingest the `AnnData` objects.

In [None]:
# Create one record per AnnData object, each linked to `run`.
# The records are handed to `ln.add` in the next cell.
pbmc68k_dobj = ln.record(pbmc68k, name="pbmc68k", run=run)
pbmc3k_dobj = ln.record(pbmc3k, name="pbmc3k", run=run)

In [None]:
# Add both records in a single call.
ln.add([pbmc68k_dobj, pbmc3k_dobj])

## Query the data records

Query the `AnnData` objects from this run. Note that this does **not** download the data.

In [None]:
# Select DObject records joined to this run (matched on the run's id),
# then fetch every matching record.
run_dobjects_query = ln.select(lns.DObject).join(lns.Run, id=run.id)
dobjects = run_dobjects_query.all()

Prepare the strings to query the columns of `.obs` for each `AnnData` object. For details see the [pandas docs](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html).

```{note}

Soon, we'll integrate the within-object queries with the SQL queries.

```

### Pandas-style query strings

In [None]:
# One pandas-style query string per dataset, to be evaluated against its `.obs`
# columns: keep rows whose label equals either of the two named values.
pbmc68k_obs = "bulk_labels == 'Dendritic' | bulk_labels == 'CD14+ Monocyte'"
pbmc3k_obs = "louvain == 'CD4 T cells' | louvain == 'CD14+ Monocytes'"

Subset the `AnnData` objects based on the query strings above and load them directly into memory.

No caching happens here!

Only the `.obs` columns are loaded fully to do the subsetting during the function execution. For all remaining data, **only the subsets** are loaded into memory.

In [None]:
# Subset every data object with its own query string, without concatenating.
# NOTE(review): `query_obs` is positional (pbmc68k first, pbmc3k second); this
# assumes `dobjects` comes back from the query in the same order — confirm.
adatas = ln.subset(
    dobjects,
    query_obs=[pbmc68k_obs, pbmc3k_obs],
    use_concat=False,
)

In [None]:
# Display the result of the non-concatenating subset call.
adatas

In [None]:
# Same subsetting as before, this time with `use_concat=True`.
# NOTE(review): presumably yields a single concatenated object — confirm.
adata = ln.subset(
    dobjects,
    query_obs=[pbmc68k_obs, pbmc3k_obs],
    use_concat=True,
)

In [None]:
# Display the result of the concatenating subset call.
adata

### Lazy query expressions

Lazy selectors allow convenient subsetting with complicated conditions. Operators, methods, and numpy functions are supported.

In [None]:
from lamindb import lazy
import numpy as np

In [None]:
# pbmc68k: rows whose bulk label is NOT one of the two listed values
# AND whose phase equals "G1".
label_excluded = ~lazy.bulk_labels.isin(("Dendritic", "CD14+ Monocyte"))
phase_is_g1 = lazy.phase == "G1"
pbmc68k_obs = label_excluded & phase_is_g1

# pbmc3k: a membership test expressed through a numpy function.
pbmc3k_obs = np.isin(lazy.louvain, ("CD4 T cells", "CD14+ Monocytes"))

In [None]:
# Subset again, passing the lazy expressions in place of query strings;
# `use_concat` is left at its default.
adatas = ln.subset(
    dobjects,
    query_obs=[pbmc68k_obs, pbmc3k_obs],
)

In [None]:
# Display the result of the lazy-expression subset call.
adatas

Contents of the cache of the current instance.

In [None]:
# The listed directory is `<cache_dir>/<instance name>`.
instance_cache_path = ln.settings.instance.cache_dir / ln.settings.instance.name
# The "*.*" pattern only matches entries whose name contains a dot.
list(instance_cache_path.glob("*.*"))

## Clean the test data from CI

Clean the test instance.

In [None]:
# Remove the two data-object records created by this notebook.
ln.delete(pbmc68k_dobj)
ln.delete(pbmc3k_dobj)

In [None]:
# Remove the pipeline and run records created by this notebook.
# NOTE(review): `run` references the pipeline (`pipeline_id`), yet the pipeline is
# deleted first — confirm this order cannot trip a foreign-key constraint.
ln.delete(pipeline)
ln.delete(run)