In [1]:
import importlib.resources as pkg_resources
from typing import Optional
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go

import deep_paint

In [2]:
# The RxRx2 metadata can be downloaded directly from https://www.rxrx.ai/rxrx2.
# It is read here from the "results/metadata" directory located one level above
# the installed deep_paint package.
package_root = pkg_resources.files(deep_paint)
metadata_path = package_root.joinpath("..", "results", "metadata").resolve()
metadata_csv = os.path.join(metadata_path, "metadata.csv")
rxrx2_metadata = pd.read_csv(metadata_csv)

In [3]:
# Read the table of significant treatments from the same metadata directory.
# Later cells use its "treatment" and "class" columns.
significant_csv = os.path.join(metadata_path, "significant_treatments.csv")
significant_df = pd.read_csv(significant_csv)

In [4]:
# Keep only rows whose treatment (factor) is listed in the significant-treatments
# table, i.e. factors that underwent significant changes in HUVEC cells
is_significant = rxrx2_metadata["treatment"].isin(significant_df["treatment"].values)
rxrx2_metadata = rxrx2_metadata[is_significant].reset_index(drop=True)

# Attach each treatment's class, looked up from the significant-treatments table
treatment_mapping = dict(zip(significant_df["treatment"], significant_df["class"]))
rxrx2_metadata["treatment_class"] = rxrx2_metadata["treatment"].map(treatment_mapping)

# Build the directory column as <experiment>/Plate_<plate>/<well>_s<site>,
# joined with the path separator of the current OS
plate_part = "Plate_" + rxrx2_metadata["plate"].astype(str)
site_part = rxrx2_metadata["well"] + "_s" + rxrx2_metadata["site"].astype(str)
rxrx2_metadata["directory"] = rxrx2_metadata["experiment"] + os.sep + plate_part + os.sep + site_part

# Restrict the table to the treatment classes of interest
treatment_classes = ["Cytokine", "Growth factor", "Toxin", "Untreated"]
in_selected_class = rxrx2_metadata["treatment_class"].isin(treatment_classes)
rxrx2_metadata = rxrx2_metadata[in_selected_class].reset_index(drop=True)

## Binary Metadata

In [5]:
# Work on an independent copy so the shared metadata frame is left untouched
binary_metadata = rxrx2_metadata.copy(deep=True)

# Every row starts in the negative class (label 0 / "low_conc")
binary_metadata["label"] = 0
binary_metadata["class"] = "low_conc"

# For each treatment other than "EMPTY", flag the rows dosed at one of that
# treatment's three largest distinct concentrations
is_high_conc = pd.Series(False, index=binary_metadata.index)
treated_rows = binary_metadata[binary_metadata["treatment"] != "EMPTY"]
for name, rows in treated_rows.groupby("treatment"):
    top_doses = sorted(rows["treatment_conc"].unique())[-3:]
    same_treatment = binary_metadata["treatment"] == name
    is_high_conc |= same_treatment & binary_metadata["treatment_conc"].isin(top_doses)

# Flagged rows become the positive class (label 1 / "high_conc")
binary_metadata.loc[is_high_conc, "label"] = 1
binary_metadata.loc[is_high_conc, "class"] = "high_conc"

# Order rows by treatment, then by concentration, and renumber the index
sort_keys = ["treatment", "treatment_conc"]
binary_metadata = binary_metadata.sort_values(by=sort_keys).reset_index(drop=True)

# Write the binary-task metadata to the current working directory
binary_metadata.to_csv("binary_rxrx2.csv", index=False)

## Multiclass Metadata

In [6]:
# Work on an independent copy so the shared metadata frame is left untouched
multiclass_metadata = rxrx2_metadata.copy(deep=True)

# Keep only the highest doses per treatment by dropping, for every treatment
# other than "EMPTY", the rows dosed at its three lowest distinct concentrations.
# NOTE(review): this leaves exactly the three highest doses only when a treatment
# has six distinct concentrations -- confirm this holds for the input metadata.
# The rows to drop are collected in a single mask and removed once after the loop,
# instead of re-filtering (and copying) the whole frame on every iteration.
drop_mask = pd.Series(False, index=multiclass_metadata.index)
for treatment, group in multiclass_metadata[multiclass_metadata["treatment"] != "EMPTY"].groupby("treatment"):
    concs = sorted(group["treatment_conc"].unique())
    lowest_3_concs = concs[:3]
    drop_mask |= (multiclass_metadata["treatment"] == treatment) & (
        multiclass_metadata["treatment_conc"].isin(lowest_3_concs)
    )

# Remove the flagged rows and renumber the index (no in-place mutation)
multiclass_metadata = multiclass_metadata[~drop_mask].reset_index(drop=True)

# Combine cytokines and growth factors into one class; other classes are kept as-is
multiclass_metadata["class"] = multiclass_metadata["treatment_class"]
is_cytokine_or_gf = multiclass_metadata["treatment_class"].isin(["Cytokine", "Growth factor"])
multiclass_metadata.loc[is_cytokine_or_gf, "class"] = "Cytokine-GF"

# Add numeric labels (classes missing from the mapping become NaN via .map)
multilabel_mapping = {"Cytokine-GF": 0, "Toxin": 1, "Untreated": 2}
multiclass_metadata["label"] = multiclass_metadata["class"].map(multilabel_mapping)

# Order rows by treatment, then by concentration, and renumber the index
multiclass_metadata = multiclass_metadata.sort_values(by=["treatment", "treatment_conc"]).reset_index(drop=True)

# Write the multiclass-task metadata to the current working directory
multiclass_metadata.to_csv("multiclass_rxrx2.csv", index=False)