# Integrate scRNA datasets based on shared features/metadata

In [None]:
# Load the `test-scrna` LaminDB instance (shell command run via IPython's `!` escape).
!lamin load test-scrna

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import pandas as pd
import anndata as ad

# Raise the logging verbosity so that hints are shown.
ln.settings.verbosity = 3

In [None]:
# Start tracking this notebook run (presumably so that the queries and
# loads below are attributed to it -- confirm against the lamindb docs).
ln.track()

## Query files based on metadata

In [None]:
# Files linked to a tissue whose name contains "lymph node"
# (`icontains` is a case-insensitive substring lookup); shown as a DataFrame.
(
    ln.File.select(tissues__name__icontains="lymph node")
    .distinct()
    .df()
)

In [None]:
# Files linked to a cell type whose name contains "monocyte"
# (`icontains` is a case-insensitive substring lookup); shown as a DataFrame.
(
    ln.File.select(cell_types__name__icontains="monocyte")
    .distinct()
    .df()
)

In [None]:
# Files carrying a label named exactly "female"; shown as a DataFrame.
(
    ln.File.select(labels__name="female")
    .distinct()
    .df()
)

## Intersect measured genes between two datasets

In [None]:
# Fetch the two files to integrate by their descriptions; `.one()` is
# expected to yield exactly one record per query.
file1, file2 = (
    ln.File.select(description=file_description).one()
    for file_description in ("Conde22", "10x reference pbmc68k")
)

In [None]:
# Summarize file1 (the file registered with description "Conde22").
file1.describe()

In [None]:
# Summarize file2 (the file registered with description "10x reference pbmc68k").
file2.describe()

In [None]:
# Load both files into memory; they are indexed below like AnnData objects
# (`.obs`, `.var`).
file1_adata, file2_adata = file1.load(), file2.load()

Here we compute shared genes without loading files:

In [None]:
# Gene records linked to each file under the "var" feature slot.
file1_genes, file2_genes = file1.features["var"], file2.features["var"]

# Intersect the two gene sets and preview the first ten symbols.
shared_genes = file1_genes & file2_genes
shared_genes.list("symbol")[:10]

We also need to convert the Ensembl gene IDs in the var index of file1 to symbols, using the gene records of file2, so that the two datasets can be concatenated:

In [None]:
# Build a Series mapping ensembl_gene_id -> symbol from file2's gene records.
# The (ensembl_gene_id, symbol) tuples become positional columns 0 and 1.
gene_id_symbol_pairs = pd.DataFrame(
    file2_genes.values_list("ensembl_gene_id", "symbol")
)
# Keep only the first row per ensembl_gene_id so the Series index is unique.
unique_gene_id_pairs = gene_id_symbol_pairs.drop_duplicates(0)
mapper = unique_gene_id_pairs.set_index(0)[1]
mapper.head()

In [None]:
# Relabel the var index of file1_adata in place through `mapper`
# (ensembl_gene_id -> symbol, built from file2's genes). Index labels that
# are not keys of `mapper` are left unchanged by pandas.
file1_adata.var.rename(index=mapper, inplace=True)

## Intersect cell types

In [None]:
# All cell type records linked to each file.
file1_celltypes, file2_celltypes = file1.cell_types.all(), file2.cell_types.all()

# Cell types annotated on both files, and their names for filtering `.obs`.
shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names

We can now subset the two datasets by shared cell types:

In [None]:
# Boolean mask over observations: True where the cell type is a shared one.
file1_is_shared_celltype = file1_adata.obs["cell_type"].isin(shared_celltypes_names)
file1_adata_subset = file1_adata[file1_is_shared_celltype]

# Number of cells per cell type in the subset.
file1_adata_subset.obs["cell_type"].value_counts()

But when we subset the 2nd file, we don't see the two cell types. Why?

Because they are labeled with synonyms!

In [None]:
# Same filter applied to file2, before its cell type labels are standardized.
file2_is_shared_celltype = file2_adata.obs["cell_type"].isin(shared_celltypes_names)
file2_adata_subset = file2_adata[file2_is_shared_celltype]

# Number of cells per cell type in the subset.
file2_adata_subset.obs["cell_type"].value_counts()

We can easily standardize them using {meth}`~lamindb.dev.ORM.map_synonyms`:

In [None]:
# Map the cell type labels of file2 through CellType synonyms, then write the
# mapped labels back to the same obs column.
standardized_cell_types = lb.CellType.map_synonyms(file2_adata.obs["cell_type"])
file2_adata.obs["cell_type"] = standardized_cell_types

Now we have the two cell types:

In [None]:
# Recompute the filter for file2 from its current cell type labels.
file2_is_shared_celltype = file2_adata.obs["cell_type"].isin(shared_celltypes_names)
file2_adata_subset = file2_adata[file2_is_shared_celltype]

# Number of cells per cell type in the subset.
file2_adata_subset.obs["cell_type"].value_counts()

In [None]:
# Concatenate the two subsets; the "file" column records which file each
# observation came from, keyed by the file's description.
subsets_to_concat = [file1_adata_subset, file2_adata_subset]
subset_keys = [file1.description, file2.description]
adata_concat = ad.concat(subsets_to_concat, label="file", keys=subset_keys)
adata_concat

In [None]:
# Count how often each distinct combination of obs column values occurs
# in the concatenated dataset.
adata_concat.obs.value_counts()

In [None]:
# Clean up: delete the `test-scrna` instance, then recursively remove its
# local directory (shell commands run via IPython's `!` escape).
!lamin delete test-scrna
!rm -r ./test-scrna