# Ingest the RxRx1 dataset

In [None]:
# Shell command (IPython `!` escape): initialize a lamin instance with storage
# "test-rxrx" and the `bionty` and `wetlab` schema modules.
!lamin init --storage test-rxrx --schema bionty,wetlab

In [None]:
import lamindb as ln
import bionty as bt
import wetlab as wl

# Pin this notebook to a fixed uid before tracking starts.
ln.context.uid = "Zo0qJt4IQPsb0000"
# Start tracking; `run` holds whatever `track()` returns
# (presumably the run record — confirm for the installed lamindb version).
run = ln.context.track()

The `metadata.csv` was originally downloaded from [here](https://www.rxrx.ai/rxrx1#Download) and deposited on S3.

## Load metadata

Read in the raw metadata of the wells:

In [None]:
# Describe the well-level metadata CSV stored on S3, then load it into memory.
design_description = (
    "Experimental design of RxRx1, e.g. what cell type and"
    " treatment are in each well."
)
meta = ln.Artifact(
    "s3://lamindata/rxrx1/metadata.csv", description=design_description
).load()
# Display (rows, columns) of the loaded table.
meta.shape

In [None]:
# Preview the first five rows of the raw metadata.
meta.head(5)

It seems that the column storing cell lines is erroneously called `cell_type`. 

Also, `dataset` refers to something that's typically called `split`.

Let's rename both columns:

In [None]:
# Rename the two misleading columns in place: the cell-line column and the
# train/test split column.
column_renames = {"cell_type": "cell_line", "dataset": "split"}
meta.rename(columns=column_renames, inplace=True)
# Preview the result.
meta.head(5)

Add a `paths` column - each entry is an aggregate placeholder that stands for the 6 image paths (one per channel) of a site. We'll expand it into one path per channel further down:

In [None]:
# Build one aggregate image path per metadata row. The "w1-w6" suffix is a
# placeholder for the six channel images; it is expanded to concrete
# per-channel paths later in the notebook.
#
# The needed columns are iterated in lockstep instead of using
# `DataFrame.iterrows()`, which constructs a full Series for every row and
# leaves `row`/`well`/`site` behind in the notebook namespace. The resulting
# strings are the same.
paths = [
    f"images/{split}/{experiment}/Plate{plate}/{well}_s{site}_w1-w6.png"
    for split, experiment, plate, well, site in zip(
        meta["split"],
        meta["experiment"],
        meta["plate"],
        meta["well"],
        meta["site"],
    )
]
meta["paths"] = paths

In [None]:
# Preview two rows to check the new `paths` column.
meta.head(2)

## Validate and register metadata

### `cell_line`

We can start with curating the metadata table based on the ontologies from bionty. For example, let's start with the `cell_line` metadata.
In the raw table this column was called `cell_type`, but its values are in fact the cell lines used in the experiments (hence the rename above).

In [None]:
# List the distinct values in the `cell_line` column.
meta["cell_line"].unique()

In [None]:
# Create bionty CellLine records from the values of the `cell_line` column
# (the whole column is passed, not only its unique values).
cell_lines = bt.CellLine.from_values(meta["cell_line"])

In [None]:
# Save the cell line records.
ln.save(cell_lines)

Define abbreviations:

In [None]:
# Attach a short abbreviation to each of four CellLine records.
# NOTE(review): the records are fetched by hard-coded identifiers; presumably
# these are the uids of the cell lines saved above — confirm they are stable
# across bionty/ontology versions.
bt.CellLine.get("30n7ByjL").set_abbr("HUVEC")
bt.CellLine.get("6EK4GXdy").set_abbr("U2OS")
bt.CellLine.get("og6IaxOV").set_abbr("RPE")
bt.CellLine.get("4ea731nb").set_abbr("HEPG2")
# Display the first four rows of the CellLine table.
bt.CellLine.df().head(4)

In [None]:
# Rebind `cell_lines` (previously the list of records) to a lookup object
# built on the `abbr` field, so records are reachable as attributes.
cell_lines = bt.CellLine.lookup(field="abbr")
# Show the parents of the HUVEC record.
cell_lines.huvec.view_parents()

In [None]:
# Show the parents of the HEPG2 record.
cell_lines.hepg2.view_parents()

### `split`

Use `ULabel` for "train", "test":

In [None]:
# List the distinct values in the `split` column.
meta["split"].unique()

In [None]:
# Create and save one ULabel per distinct value of the `split` column.
split_names = meta["split"].unique()
train_test = [
    ln.ULabel(name=split_name, description="ML split")
    for split_name in split_names
]
ln.save(train_test)

Define a parent:

In [None]:
# Create a parent label that groups the split labels.
is_split = ln.ULabel(name="is_split", description="ML split")
# The parent is saved before any children are linked to it.
is_split.save()
# Attach every split label as a child of `is_split`.
is_split.children.add(*train_test)
# Display the hierarchy, children included.
is_split.view_parents(with_children=True)

### `experiment`

In [None]:
# Create and save one wetlab Experiment record per distinct experiment name.
# `search_names` is switched off while the records are created
# (NOTE(review): presumably to skip lamindb's similar-name lookup during bulk
# creation — confirm).
# try/finally guarantees the setting is switched back on even if record
# creation or saving raises, so later cells never run with it left disabled.
ln.settings.creation.search_names = False
try:
    experiments = [
        wl.Experiment(name=name, description="RxRx1")
        for name in meta["experiment"].unique()
    ]
    ln.save(experiments)
finally:
    ln.settings.creation.search_names = True

### `plate`

In [None]:
# Create a parent label `is_plate` plus one "Plate<N>" ULabel per distinct
# plate value, then link the plate labels as children of the parent.
# `search_names` is switched off while the records are created
# (NOTE(review): presumably to skip lamindb's similar-name lookup during bulk
# creation — confirm).
# try/finally guarantees the setting is switched back on even if a step
# raises, so later cells never run with it left disabled.
ln.settings.creation.search_names = False
try:
    is_plate = ln.ULabel(name="is_plate", description="parent of plates")
    # The parent is saved before children are linked to it.
    is_plate.save()
    plates = [ln.ULabel(name=f"Plate{name}") for name in meta["plate"].unique()]
    ln.save(plates)
    is_plate.children.set(plates)
finally:
    ln.settings.creation.search_names = True

In [None]:
# Display the plate label hierarchy, children included.
is_plate.view_parents(with_children=True)

### `well`

We might also want to add the well information, so that we can link image files and parse images based on well coordinates. To do this, let's first extract well locations from the table:

In [None]:
# Create and save one wetlab Well record per distinct well name. The first
# character of the name is used as the row and the remainder, parsed as an
# integer, as the column.
# `search_names` is switched off while the records are created
# (NOTE(review): presumably to skip lamindb's similar-name lookup during bulk
# creation — confirm).
# try/finally guarantees the setting is switched back on even if parsing,
# record creation, or saving raises.
ln.settings.creation.search_names = False
try:
    wells = [
        wl.Well(name=well, row=well[0], column=int(well[1:]))
        for well in meta["well"].unique()
    ]
    ln.save(wells)
finally:
    ln.settings.creation.search_names = True

### `well_type`

In [None]:
# Create and save the parent label that groups the well-type labels.
is_well_type = ln.ULabel(name="is_well_type", description="parent of well types")
is_well_type.save()

In [None]:
# One ULabel per distinct value of the `well_type` column.
well_types = [ln.ULabel(name=name) for name in meta["well_type"].unique()]
ln.save(well_types)
# Link the saved labels as children of the `is_well_type` parent.
is_well_type.children.set(well_types)

In [None]:
# Display the well-type label hierarchy, children included.
is_well_type.view_parents(with_children=True)

### `sirna`

Register each `sirna` value as a `GeneticTreatment` record:

In [None]:
# Create and save one wetlab GeneticTreatment record (system "siRNA") per
# distinct value of the `sirna` column.
# `search_names` is switched off while the records are created
# (NOTE(review): presumably to skip lamindb's similar-name lookup during bulk
# creation — confirm).
# try/finally guarantees the setting is switched back on even if record
# creation or saving raises, so later cells never run with it left disabled.
ln.settings.creation.search_names = False
try:
    sirnas = [
        wl.GeneticTreatment(
            name=sirna,
            system="siRNA",
            # description="ThermoFisher ID of siRNA",
        )
        for sirna in meta["sirna"].unique()
    ]
    ln.save(sirnas)
finally:
    ln.settings.creation.search_names = True

### `readout`

In [None]:
# Register a feature named "readout" with dtype "cat"
# (presumably categorical — confirm against lamindb's dtype names).
ln.Feature(name="readout", dtype="cat").save()
# Fetch the "high content screen" ExperimentalFactor from the public source
# and save it.
readout = bt.ExperimentalFactor.from_public(name="high content screen")
readout.save()

In [None]:
# Display where the readout term sits in its hierarchy, children included.
readout.view_parents(with_children=True)

## Register all metadata features

Here we create a DataFrame with each row as a single image, similar to a link table but with multiple metadata columns:

In [None]:
# Expand every aggregate "w1-w6" path into six concrete per-channel paths and
# reshape the table so that each row describes exactly one image file.
channel_suffixes = [f"w{channel}.png" for channel in range(1, 7)]
meta_with_path = meta.copy()
meta_with_path["path"] = [
    [aggregate.replace("w1-w6.png", suffix) for suffix in channel_suffixes]
    for aggregate in meta_with_path["paths"]
]
# One row per channel path, fresh 0..n-1 index, aggregate column removed.
meta_with_path = (
    meta_with_path.explode("path").reset_index(drop=True).drop(columns="paths")
)
meta_with_path

Here we register all metadata features:

In [None]:
# Columns of the per-image table that are registered as features.
feature_columns = [
    "cell_line",
    "split",
    "experiment",
    "plate",
    "well",
    "well_type",
    "sirna",
    "path",
]
# Derive Feature records from those columns and save them.
obs_features = ln.Feature.from_df(meta_with_path[feature_columns])
ln.save(obs_features)

In [None]:
# Display the registered features as a table.
obs_features.df()

In [None]:
# Lookup object giving attribute access to Feature records
# (used below as `features.<name>`).
features = ln.Feature.lookup()

## Register metadata file

In [None]:
# Create an artifact from the per-image metadata table under the key
# "rxrx1/metadata.parquet". It is saved in a later cell.
meta_file = ln.Artifact.from_df(
    meta_with_path,
    key="rxrx1/metadata.parquet",
    description="Metadata with file paths for each RxRx1 image.",
)

In [None]:
# NOTE(review): calls an underscore-prefixed (private) lamindb method;
# presumably it links feature sets derived from the dataframe columns.
# Private API may change between lamindb versions — confirm.
meta_file.features._add_set_from_df()

In [None]:
# Save the metadata artifact.
meta_file.save()

Annotate with labels:

In [None]:
# Rebuild the list of CellLine records: `cell_lines` was rebound to a lookup
# object in the `cell_line` section, so the records are derived again here.
cell_lines = bt.CellLine.from_values(meta.cell_line.unique())

In [None]:
# Pair each group of label records with the feature it annotates, then attach
# the pairs to the metadata artifact in order.
label_links = [
    # columns
    (cell_lines, features.cell_line),
    (train_test, features.split),
    (experiments, features.experiment),
    (plates, features.plate),
    (wells, features.well),
    (well_types, features.well_type),
    (sirnas, features.sirna),
    # external
    (readout, features.readout),
]
for records, feature in label_links:
    meta_file.labels.add(records, feature)

In [None]:
# Print a summary of the annotated metadata artifact.
meta_file.describe()

## Register images

In [None]:
# Show the directory tree of the image bucket, limited via `level=2`.
ln.UPath("gs://rxrx1-europe-west4/images").view_tree(level=2)

Take a subset to run on CI:

In [None]:
# Create an artifact for a single folder (test/HEPG2-08) rather than the whole
# image bucket, to keep the run small.
images = ln.Artifact(
    "gs://rxrx1-europe-west4/images/test/HEPG2-08", description="RxRx1 image files"
)
# Display the artifact's `n_objects` attribute.
images.n_objects

In [None]:
# Display the artifact's `hash` attribute.
images.hash

In [None]:
# Save the image-folder artifact.
images.save()

In [None]:
# Bundle the image artifact with the metadata artifact (passed as
# `meta_artifact`) into a named, versioned collection, then save it.
collection = ln.Collection(
    images, name="Annotated RxRx1 images", meta_artifact=meta_file, version="1"
)
collection.save()

In [None]:
# Display the collection's `meta_artifact` attribute.
collection.meta_artifact

In [None]:
# Display the collection's `data_artifact` attribute.
collection.data_artifact

In [None]:
# Print a summary of the saved collection.
collection.describe()