# Ingest the RxRx1 dataset

In [None]:
# Initialize a LaminDB instance with storage location `test-rxrx` and the
# `bionty` and `wetlab` schema modules (shell escape, runs the `lamin` CLI).
!lamin init --storage test-rxrx --modules bionty,wetlab

In [None]:
import lamindb as ln
import bionty as bt
import wetlab as wl

# Track this notebook run in LaminDB; the argument is presumably the
# notebook's stable uid — TODO confirm.
ln.track("Zo0qJt4IQPsb")

The `metadata.csv` was originally downloaded from [here](https://www.rxrx.ai/rxrx1#Download) and deposited on S3.

## Load metadata

Read in the raw metadata of the wells:

In [None]:
# Point an artifact at the raw metadata CSV on S3 and load it into memory.
metadata_artifact = ln.Artifact(
    "s3://lamindata/rxrx1/metadata.csv",
    description=(
        "Experimental design of RxRx1, e.g. what cell type and treatment are in each well."
    ),
)
meta = metadata_artifact.load()

# Preview the first rows only.
meta.head()

The column storing cell lines is misleadingly named `cell_type`, and the `dataset` column holds what is typically called the `split`.
Let's rename both columns:

In [None]:
# Rename the two misleading columns; rebinding `meta` avoids in-place mutation.
meta = meta.rename(columns={"cell_type": "cell_line", "dataset": "split"})

Add a `paths` column: each entry is a single pattern (`..._w1-w6.png`) that stands for the 6 image paths of the 6 channels of one site. We'll expand it into one path per channel further down:

In [None]:
# Build one aggregate image path per row from the split, experiment, plate,
# well and site columns. Zipping only the needed columns avoids
# `DataFrame.iterrows()`, which constructs a full Series for every row and is
# slow on large tables; the resulting strings are unchanged.
# NOTE: this must run before the `plate` values get their "Plate" prefix,
# because the prefix is added explicitly in the path template here.
paths = [
    f"images/{split}/{experiment}/Plate{plate}/{well}_s{site}_w1-w6.png"
    for split, experiment, plate, well, site in zip(
        meta["split"],
        meta["experiment"],
        meta["plate"],
        meta["well"],
        meta["site"],
    )
]
meta["paths"] = paths

Use more meaningful plate names:

In [None]:
# Prefix every plate identifier with "Plate" (e.g. 1 -> "Plate1").
meta["plate"] = meta["plate"].map("Plate{}".format)

Create a DataFrame with each row as a single image, similar to a link table but with multiple metadata columns:

In [None]:
# For every aggregate path, derive the six per-channel paths (w1 ... w6).
keys_list = [
    [aggregate.replace("w1-w6.png", f"w{channel}.png") for channel in range(1, 7)]
    for aggregate in meta["paths"]
]

# Attach the per-channel lists, expand them to one row per image, and drop
# the aggregate `paths` column, which is no longer needed.
meta_with_path = (
    meta.copy()
    .assign(path=keys_list)
    .explode("path")
    .reset_index(drop=True)
    .drop(columns="paths")
)
meta_with_path

## Validate and register metadata

In [None]:
# Categorical features to validate, mapped to the registry that backs each one.
# Dict order is preserved, so features are saved in the order listed.
feature_dtypes = {
    "cell_line": bt.CellLine,
    "split": ln.ULabel,
    "experiment": wl.Experiment,
    "plate": ln.ULabel,
    "well_type": ln.ULabel,
    "sirna": wl.GeneticPerturbation,
}

schema = ln.Schema(
    features=[
        ln.Feature(name=feature_name, dtype=registry).save()
        for feature_name, registry in feature_dtypes.items()
    ],
    coerce_dtype=True,
).save()

# Curator that checks the per-image metadata table against the schema.
rxrx_curator = ln.curators.DataFrameCurator(meta_with_path, schema)

In [None]:
# Validation failures are caught and printed rather than raised, so the
# notebook keeps running and the report is shown as cell output.
try:
    rxrx_curator.validate()
except ln.errors.ValidationError as validation_error:
    print(validation_error)

In [None]:
# Standardize the cell line names.
rxrx_curator.cat.standardize("cell_line")

# Register the not-yet-known values of these columns as new records.
for column in ("split", "experiment", "plate", "well_type"):
    rxrx_curator.cat.add_new_from(column)

# `well` records need row and column information and `sirna` records need
# system information, so those records are created manually instead.

### `sirna`

Add the `sirna` values as records to the `GeneticPerturbation` table:

In [None]:
# One GeneticPerturbation record per distinct siRNA, all with system "siRNA".
sirna_names = meta["sirna"].unique()
sirnas = [
    wl.GeneticPerturbation(name=sirna_name, system="siRNA", _skip_validation=True)
    for sirna_name in sirna_names
]

# Save all records with a single call.
ln.save(sirnas)

### `cell_line`

Add commonly used abbreviations:

In [None]:
# CellLine record uid -> commonly used abbreviation.
cell_line_abbrs = {
    "30n7ByjL": "HUVEC",
    "6EK4GXdy": "U2OS",
    "og6IaxOV": "RPE",
    "4ea731nb": "HEPG2",
}
for uid, abbr in cell_line_abbrs.items():
    bt.CellLine.get(uid).set_abbr(abbr)

## Register metadata file

In [None]:
# Save the curated per-image metadata table as an artifact.
meta_af = rxrx_curator.save_artifact(
    description="Metadata with file paths for each RxRx1 image.",
    key="rxrx1/metadata.parquet",
)

# Add a `readout` label using the Experimental Factor Ontology:
# first the categorical feature, then the ontology term, then link both.
readout_feat = ln.Feature(name="readout", dtype="cat").save()
readout = bt.ExperimentalFactor.from_source(name="high content screen").save()
meta_af.labels.add(readout, readout_feat)

In [None]:
# Show a summary of the saved metadata artifact (presumably including the
# `readout` label — TODO confirm).
meta_af.describe()

## Register images

In [None]:
# Show the directory tree of the image bucket; `level=2` presumably limits
# the displayed depth to two levels — TODO confirm.
ln.UPath("gs://rxrx1-europe-west4/images").view_tree(level=2)

Take a subset to run on CI:

In [None]:
# Register a single experiment folder (test split, HEPG2-08) as one artifact.
images = ln.Artifact(
    "gs://rxrx1-europe-west4/images/test/HEPG2-08",
    description="RxRx1 image files",
).save()

# Display the artifact's `n_files` value.
images.n_files

In [None]:
# Bundle the image artifact with its metadata artifact into one collection.
collection = ln.Collection(
    images,
    meta_artifact=meta_af,
    key="Annotated RxRx1 images",
).save()

In [None]:
# Display the metadata artifact that was passed as `meta_artifact` when the
# collection was created.
collection.meta_artifact

In [None]:
# Display the collection's data artifact (presumably the image folder
# artifact registered above — TODO confirm).
collection.data_artifact

In [None]:
# Show a summary of the collection record.
collection.describe()