# Validate and register metadata

In this notebook, we'll take a look at how to validate and register metadata.

We will combine metadata and deep learning embeddings into an `AnnData` object.

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import lnschema_lamin1 as ln1
import anndata as ad

In [None]:
# Set lamindb's log verbosity to the "hint" level for the rest of the notebook.
ln.settings.verbosity = "hint"

In [None]:
# Start lamindb tracking for this notebook (presumably so that records saved below
# are linked to this run — confirm against the lamindb docs).
ln.track()

## Load metadata

We read in the metadata of the wells:

In [None]:
# Look up the registered file whose key is "rxrx1.parquet"; `.one()` is expected to
# yield exactly one matching record.
meta_file = ln.File.filter(key="rxrx1.parquet").one()

In [None]:
# Load the file into memory; the result is used below as a table with columns such
# as `plate`, `site` and `well`.
meta = meta_file.load()

In [None]:
# Cast the identifier-like columns to strings (presumably so they are treated as
# labels rather than numbers — the plate and site values are turned into label
# names further down).
for column in ("plate", "site", "sirna_id"):
    meta[column] = meta[column].astype(str)

In [None]:
# Dimensions of the metadata table.
meta.shape

In [None]:
# Preview the first rows of the metadata table.
meta.head()

## Load embeddings

In [None]:
# Pick the raw embeddings file by its description. The annotated AnnData file that
# this notebook registers at the end (key "rxrx1_embeddings.h5ad") also has
# "embeddings" in its description, so it is excluded here; otherwise `.one()` would
# see two matches when the notebook is run a second time.
embedding_file = (
    ln.File.filter(description__contains="embeddings")
    .exclude(key="rxrx1_embeddings.h5ad")
    .one()
)

In [None]:
# Load the embeddings into memory; the result is used below as a table that has a
# `site_id` column.
embedding = embedding_file.load()

In [None]:
# Index the embeddings by `site_id`; the metadata is aligned on the same key when
# the AnnData object is built. Reassigning (instead of `inplace=True`) and checking
# the current index name keeps this cell safe to re-run: a second in-place
# `set_index("site_id")` would raise a KeyError because the column is already gone.
if embedding.index.name != "site_id":
    embedding = embedding.set_index("site_id")

In [None]:
# Dimensions of the embeddings table.
embedding.shape

In [None]:
# Preview the first rows of the embeddings table.
embedding.head()

## Validate and register metadata

### `cell_type`

We start by curating the metadata table against the ontologies from bionty, beginning with the `cell_line` metadata.
In this table the column is called "cell_type", but its values are in fact the cell lines used in the experiments.

In [None]:
# Distinct values of the `cell_type` column.
meta["cell_type"].unique()

In [None]:
# Build CellLine records from the `cell_type` values (presumably matched against
# the bionty cell line ontology — confirm against the lnschema_bionty docs).
cell_lines = lb.CellLine.from_values(meta["cell_type"])

In [None]:
# Persist the cell line records; `parents=False` presumably skips saving their
# ontology parents — confirm against the lamindb docs.
ln.save(cell_lines, parents=False)

### `dataset` (train/test labels)

Use `ULabel` for "train", "test":

In [None]:
# Distinct values of the `dataset` column; each becomes a label in the next cell.
meta["dataset"].unique()

In [None]:
# One ULabel per distinct value of the `dataset` column.
train_test = [
    ln.ULabel(name=split, description="ML train/test split")
    for split in meta["dataset"].unique()
]

ln.save(train_test)

### `experiment`

In [None]:
# Turn off the name search on record creation while the experiment records are
# built in bulk. The `finally` clause guarantees the setting is switched back on
# even if record creation or saving raises, so later cells never run with it
# silently disabled.
ln.settings.upon_create_search_name = False
try:
    # One Experiment record per distinct value of the `experiment` column.
    experiments = [
        ln1.Experiment(name=name, description="RxRx1")
        for name in meta["experiment"].unique()
    ]
    ln.save(experiments)
finally:
    ln.settings.upon_create_search_name = True

### `plate`

In [None]:
# Parent label under which the individual plate labels are grouped in the next cell.
is_plate = ln.ULabel(name="is_plate", description="parent of plates")
is_plate.save()

In [None]:
# One "Plate<value>" label per distinct value of the `plate` column.
plates = [ln.ULabel(name=f"Plate{plate_id}") for plate_id in meta["plate"].unique()]

ln.save(plates)
# Make the saved plate labels the children of the `is_plate` parent label.
is_plate.children.set(plates)

In [None]:
# Display the label hierarchy around `is_plate`, including its children.
is_plate.view_parents(with_children=True)

### `well`

We might also want to add the well information, so that we can link image files and parse images based on well coordinates. To do this, let's first extract well locations from the table:

In [None]:
# Turn off the name search on record creation while the well records are built in
# bulk. The `finally` clause guarantees the setting is switched back on even if
# record creation or saving raises (for example when a well name cannot be parsed),
# so later cells never run with it silently disabled.
ln.settings.upon_create_search_name = False
try:
    # The first character of a well name is stored as the row; the remaining
    # characters are parsed as the integer column.
    wells = [
        ln1.Well(name=well, row=well[0], column=int(well[1:]))
        for well in meta["well"].unique()
    ]
    ln.save(wells)
finally:
    ln.settings.upon_create_search_name = True

### `site`

In [None]:
# Parent label under which the individual site labels are grouped in the next cell.
is_site = ln.ULabel(name="is_site", description="parent of sites")
is_site.save()

In [None]:
# One "Site<value>" label per distinct value of the `site` column.
sites = [ln.ULabel(name=f"Site{site_id}") for site_id in meta["site"].unique()]

ln.save(sites)
# Make the saved site labels the children of the `is_site` parent label.
is_site.children.set(sites)

In [None]:
# Display the label hierarchy around `is_site`, including its children.
is_site.view_parents(with_children=True)

### `well_type`

In [None]:
# Parent label under which the individual well type labels are grouped in the next cell.
is_well_type = ln.ULabel(name="is_well_type", description="parent of well types")
is_well_type.save()

In [None]:
# One label per distinct value of the `well_type` column, named after the value itself.
well_types = [ln.ULabel(name=kind) for kind in meta["well_type"].unique()]

ln.save(well_types)
# Make the saved well type labels the children of the `is_well_type` parent label.
is_well_type.children.set(well_types)

In [None]:
# Display the label hierarchy around `is_well_type`, including its children.
is_well_type.view_parents(with_children=True)

### `sirna`

Add the siRNAs to the `Treatment` table:

In [None]:
# Turn off the name search on record creation while the treatment records are
# built in bulk. The `finally` clause guarantees the setting is switched back on
# even if record creation or saving raises, so later cells never run with it
# silently disabled.
ln.settings.upon_create_search_name = False
try:
    # One genetic siRNA Treatment record per distinct value of the `sirna` column.
    sirnas = [
        ln1.Treatment(
            name=sirna,
            type="genetic",
            system="siRNA",
            description="ThermoFisher ID of siRNA",
        )
        for sirna in meta["sirna"].unique()
    ]
    ln.save(sirnas)
finally:
    ln.settings.upon_create_search_name = True

### assay/readout

We can do the same for other ontologies that we are interested in curating, for example the type of readout. Here we choose the "high content screen" readout.

In [None]:
# Register a categorical feature named "assay"; it is used at the end of the
# notebook to attach the assay label to the file.
ln.Feature(name="assay", type="category").save()

In [None]:
# Create the "high content screen" ExperimentalFactor record via bionty and save it;
# `parents=False` presumably skips saving its ontology parents — confirm.
assay = lb.ExperimentalFactor.from_bionty(name="high content screen")
assay.save(parents=False)

### features

In [None]:
# Derive Feature records from the embeddings table (presumably one per column —
# confirm against the lamindb docs) and save them.
var_features = ln.Feature.from_df(embedding)
ln.save(var_features)

In [None]:
# Derive Feature records from the metadata table (presumably one per column —
# confirm against the lamindb docs) and save them.
obs_features = ln.Feature.from_df(meta)
ln.save(obs_features)

## Create `AnnData` object for embeddings and metadata

In [None]:
# Wrap the embeddings table in an AnnData object.
adata = ad.AnnData(embedding)
# Attach the metadata as observation annotations: index it by `site_id` and select
# its rows in the order of adata's observation index so both line up row by row.
# NOTE(review): assumes adata.obs.index carries the `site_id` index of `embedding`
# and that every one of those ids is present in `meta` — confirm.
adata.obs = meta.set_index("site_id").loc[adata.obs.index]

In [None]:
# Show a summary of the assembled AnnData object.
adata

## Register AnnData and link to metadata records

In [None]:
# Create a File record for the AnnData object under the key "rxrx1_embeddings.h5ad".
# `field=ln.Feature.name` presumably tells lamindb to match the AnnData's column
# names against registered feature names — confirm against the lamindb docs.
file = ln.File.from_anndata(
    adata,
    field=ln.Feature.name,
    key="rxrx1_embeddings.h5ad",
    description="Metadata annotated deep learning embeddings for each RxRx1 image.",
)

In [None]:
# Save the file record.
file.save()

In [None]:
# Lookup object that exposes registered features as attributes
# (e.g. `features.cell_type`); used in the next cell.
features = ln.Feature.lookup()

In [None]:
# Pair each group of label records with the feature it annotates, then link the
# pairs to the file one after another, in the listed order.
label_links = [
    # obs
    (cell_lines, features.cell_type),
    (train_test, features.dataset),
    (experiments, features.experiment),
    (plates, features.plate),
    (wells, features.well),
    (sites, features.site),
    (well_types, features.well_type),
    (sirnas, features.sirna),
    # external
    (assay, features.assay),
]
for label_records, feature in label_links:
    file.labels.add(label_records, feature)

In [None]:
# Show the labels linked to the file.
file.labels

In [None]:
# Show the features linked to the file.
file.features

In [None]:
# Show the file's feature sets in tabular form.
file.feature_sets.df()

In [None]:
# Show the features stored under the file's "obs" slot in tabular form.
file.features["obs"].df()