# Register RxRx1 images

The images are located at: https://console.cloud.google.com/storage/browser/rxrx1-europe-west4

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import lnschema_lamin1 as ln1
import pandas as pd

ln.settings.verbosity = "hint"

In [None]:
ln.track()

## Register all image files

In [None]:
%%time

# Lower the log verbosity while File records are built from the bucket
# directory, then put it back to the level the notebook uses everywhere else.
ln.settings.verbosity = "error"
try:
    files = ln.File.from_dir("gs://rxrx1-europe-west4/images")
finally:
    # Restore in a finally block so a failed scan does not leave the rest of
    # the session stuck at "error" verbosity.
    ln.settings.verbosity = "hint"

In [None]:
%%time

# Save all File records created from the image directory in a single call.
ln.save(files)

## Link each image file to metadata records

### Match file key with its metadata

In [None]:
# Look up the File registered under the key "rxrx1.parquet" and load it.
# NOTE(review): `.one()` presumably requires exactly one match, and the loaded
# object is used as a pandas DataFrame by the cells below — confirm.
meta_file = ln.File.filter(key="rxrx1.parquet").one()
meta = meta_file.load()

In [None]:
# Turn plate values into "Plate<value>" strings: this is the form used in the
# image keys and in the label-name lookup further down.
# NOTE: not idempotent — running this cell twice prepends "Plate" twice.
meta["plate"] = "Plate" + meta["plate"].astype(str)

In [None]:
# For every metadata row, build the keys of its six image files.
# Each image has 6 channels, stored as separate files w1 - w6.
files_list = [
    [
        f"images/{row.dataset}/{row.experiment}/{row.plate}/{row.well}_s{row.site}_w{channel}.png"
        for channel in range(1, 7)
    ]
    for _, row in meta.iterrows()
]
meta["files"] = files_list

In [None]:
# After exploding, each row describes a single file; its key is then swapped
# for the id of the matching File record.
meta = meta.explode("files")
file_id_by_key = dict(ln.File.objects.values_list("key", "id"))
meta["files"] = meta["files"].map(file_id_by_key)

In [None]:
meta.head()

### Create records of link tables

Replace record names with ids:

In [None]:
def name_mapper(registry):
    """Return a ``{name: id}`` lookup built from every record of ``registry``.

    ``registry`` must expose ``objects.values_list("name", "id")`` yielding
    ``(name, id)`` pairs. If several pairs share a name, the last one wins.
    """
    name_id_pairs = registry.objects.values_list("name", "id")
    return {name: record_id for name, record_id in name_id_pairs}

In [None]:
# Replace the label names in each metadata column with the id of the matching
# registry record. Series.map with a dict leaves NaN where a name has no entry.

# Cell lines: standardize the names through the CellLine registry first, then
# look up their ids.
meta["cell_type"] = lb.CellLine.standardize(meta["cell_type"])
meta["cell_type"] = meta["cell_type"].map(name_mapper(lb.CellLine))

# dataset, plate, site and well_type are all looked up in ULabel: fetch the
# name -> id mapping once instead of querying the whole table once per column.
ulabel_ids = name_mapper(ln.ULabel)

meta["dataset"] = meta["dataset"].map(ulabel_ids)
meta["experiment"] = meta["experiment"].map(name_mapper(ln1.Experiment))
meta["plate"] = meta["plate"].map(ulabel_ids)
meta["well"] = meta["well"].map(name_mapper(ln1.Well))
# Site values are looked up under the name "Site<value>".
# NOTE: not idempotent — re-running this cell prefixes the mapped values again.
meta["site"] = "Site" + meta["site"].astype(str)
meta["site"] = meta["site"].map(ulabel_ids)
meta["well_type"] = meta["well_type"].map(ulabel_ids)
meta["sirna"] = meta["sirna"].map(name_mapper(ln1.Treatment))

# Every file gets the same experimental factor: fill the column with its name,
# then swap the name for the record id like the other columns.
meta["experimentalfactor_id"] = "high content screen"
meta["experimentalfactor_id"] = meta["experimentalfactor_id"].map(
    name_mapper(lb.ExperimentalFactor)
)

In [None]:
# Index the table by file id, so that iterating a column with .items() yields
# (file_id, label_id) pairs.
meta = meta.rename(columns={"files": "file_id"})
meta = meta.set_index("file_id")

In [None]:
meta.head()

In [None]:
meta.shape

### Link obs and external labels

In [None]:
def link_records(registry, iterable, field_name, **kwargs):
    """Bulk-create one link record per ``(file_id, label_id)`` pair.

    Args:
        registry: Link model. It is called with keyword fields to build each
            record and must provide ``objects.bulk_create``.
        iterable: Object whose ``items()`` yields ``(file_id, label_id)``
            pairs, e.g. a pandas Series indexed by file id.
        field_name: Keyword under which ``label_id`` is passed to ``registry``.
        **kwargs: Extra fields set identically on every record.
    """
    links = [
        registry(**{field_name: target_id, "file_id": source_id}, **kwargs)
        for source_id, target_id in iterable.items()
    ]
    # One bulk insert for all records instead of one save per record.
    registry.objects.bulk_create(links)

In [None]:
# (link model, metadata column, foreign-key field) for every kind of label,
# processed in this order.
link_specs = [
    (ln.File.cell_lines.through, "cell_type", "cellline_id"),
    (ln.File.ulabels.through, "dataset", "ulabel_id"),
    (ln.File.experiments.through, "experiment", "experiment_id"),
    (ln.File.ulabels.through, "plate", "ulabel_id"),
    (ln.File.wells.through, "well", "well_id"),
    (ln.File.ulabels.through, "site", "ulabel_id"),
    (ln.File.ulabels.through, "well_type", "ulabel_id"),
    (ln.File.treatments.through, "sirna", "treatment_id"),
    (
        ln.File.experimental_factors.through,
        "experimentalfactor_id",
        "experimentalfactor_id",
    ),
]
for through_model, column, fk_field in link_specs:
    link_records(through_model, meta[column], fk_field)

### Link feature sets

In [None]:
ln.FeatureSet.filter().df()

In [None]:
# obs feature set: link every file to it under the "well_meta" slot.
# The id is hard-coded; presumably it was read off the FeatureSet table
# displayed in the previous cell — TODO confirm.
meta["featureset_id"] = "4ueOrr0AS9GwslD3HC8u"
link_records(
    registry=ln.File.feature_sets.through,
    iterable=meta["featureset_id"],
    field_name="feature_set_id",
    slot="well_meta",
)

In [None]:
# external feature set: link every file to it under the "external" slot.
# This overwrites the "featureset_id" column with a second hard-coded id.
meta["featureset_id"] = "dgPotT98Z6N9EP1YJDNu"
link_records(
    registry=ln.File.feature_sets.through,
    iterable=meta["featureset_id"],
    field_name="feature_set_id",
    slot="external",
)

## Inspect linked features and labels

In [None]:
# Take the last File record of an unfiltered query as a sample to inspect.
file = ln.File.filter().last()

In [None]:
file

In [None]:
file.features

In [None]:
file.labels