# Number of active readouts

Compute the number of active compounds for morphology (broken down by distance type and representation), cell count, MTT, and LDH. 

In [None]:
import polars as pl


# Root folders of the snakemake pipeline, relative to this notebook.
output_dir = "../../1_snakemake/outputs"
input_dir = "../../1_snakemake/inputs"

# Every representation keeps its curve-fitting results under the same
# "<representation>/mad_featselect/curves" sub-path of the output folder.
cellprofiler_dir, cpcnn_dir, dino_dir = (
    f"{output_dir}/{representation}/mad_featselect/curves"
    for representation in ("cellprofiler", "cpcnn", "dino")
)

In [2]:
# Total compound number: distinct Metadata_Compound values in the DINO
# cell-count POD table (left as a bare expression so the notebook shows it).
pl.read_parquet(f"{dino_dir}/ccpods.parquet").get_column("Metadata_Compound").n_unique()

1085

In [14]:
# Per-compound lookup used to convert PODs.
# For every non-DMSO compound keep its lowest Metadata_Concentration, then
# derive Metadata_Conc_Shift = |log10(Metadata_Min_Conc / 3)|.
# NOTE(review): the abs() makes the shift positive whichever side of 1 the
# ratio falls on - confirm this is intended for the full concentration range.
conc_conv = (
    pl.read_parquet(f"{input_dir}/metadata/metadata.parquet")
    .filter(pl.col("Metadata_Compound") != "DMSO")
    .group_by("Metadata_Compound")
    .agg(pl.col("Metadata_Concentration").min().alias("Metadata_Min_Conc"))
    .with_columns(
        (pl.col("Metadata_Min_Conc") / 3).log10().abs().alias("Metadata_Conc_Shift")
    )
)

In [30]:
def conv_POD_um(pod_df, conv_df, pod_col, conv_pod_nm):
    """Add a back-transformed POD column to ``pod_df``.

    ``pod_df`` is joined to ``conv_df`` on ``Metadata_Compound`` using the
    default join strategy, so the columns of ``conv_df`` are carried along
    in the result. The new column is computed as
    ``10 ** (pod_col - Metadata_Conc_Shift)``.

    Args:
        pod_df: Frame holding ``Metadata_Compound`` and the ``pod_col`` column.
        conv_df: Frame holding ``Metadata_Compound`` and ``Metadata_Conc_Shift``.
        pod_col: Name of the column with the POD to convert.
        conv_pod_nm: Name given to the converted column.

    Returns:
        The joined frame with ``conv_pod_nm`` appended.

    NOTE(review): the function name suggests the result is in uM; the unit
    is not established by this code - confirm against the metadata.
    """
    shifted_pod = pl.col(pod_col) - pl.col("Metadata_Conc_Shift")
    return pod_df.join(conv_df, on="Metadata_Compound").with_columns(
        (10 ** shifted_pod).alias(conv_pod_nm)
    )
    

## Cell count and biochemical assays

In [33]:
# Compile cell count and biochemical assay readouts
def _load_assay_pods(assay):
    """Load the passing PODs of one assay from the DINO folder and convert them.

    Reads ``<assay>pods.parquet``, keeps rows where ``all.pass`` is True,
    renames ``bmd`` to ``Metadata_<assay>_POD`` and appends the converted
    ``Metadata_<assay>_POD_um`` column via ``conv_POD_um``.
    """
    pod_col = f"Metadata_{assay}_POD"
    passing = (
        pl.read_parquet(f"{dino_dir}/{assay}pods.parquet")
        # NOTE(review): compared to the boolean True here, whereas the bmds
        # tables are compared to the string "true" - confirm the dtypes.
        .filter(pl.col("all.pass") == True)  # noqa: E712
        .select(["Metadata_Compound", "bmd", "adv.incr"])
        .rename({"bmd": pod_col})
    )
    return conv_POD_um(passing, conc_conv, pod_col, f"{pod_col}_um")


cc = _load_assay_pods("cc")
ldh = _load_assay_pods("ldh")
mtt = _load_assay_pods("mtt")

In [27]:

def _by_direction(pods, flag):
    """Rows of ``pods`` whose ``adv.incr`` value equals the string ``flag``."""
    return pods.filter(pl.col("adv.incr") == flag)


# Cell count: totals per direction, plus the compounds flagged as increasing.
print(f"cell count: {cc.height} total")
print(f"cell count: {_by_direction(cc, 'true').height} increase")
print(_by_direction(cc, "true").select("Metadata_Compound"))
print(f"cell count: {_by_direction(cc, 'false').height} decrease")

# LDH: totals per direction.
print(f"LDH: {ldh.height} total")
print(f"LDH: {_by_direction(ldh, 'true').height} increase")
print(f"LDH: {_by_direction(ldh, 'false').height} decrease")

# MTT: totals per direction, plus the compounds flagged as increasing.
print(f"MTT: {mtt.height} total")
print(f"MTT: {_by_direction(mtt, 'true').height} increase")
print(_by_direction(mtt, "true").select("Metadata_Compound"))
print(f"MTT: {_by_direction(mtt, 'false').height} decrease")

cell count: 221 total
cell count: 3 increase
shape: (3, 1)
┌───────────────────────────┐
│ Metadata_Compound         │
│ ---                       │
│ str                       │
╞═══════════════════════════╡
│ Alectinib (Hydrochloride) │
│ Baloxavir marboxil        │
│ Oritavancin (diphosphate) │
└───────────────────────────┘
cell count: 218 decrease
LDH: 144 total
LDH: 143 increase
LDH: 1 decrease
MTT: 430 total
MTT: 10 increase
shape: (10, 1)
┌─────────────────────────────────┐
│ Metadata_Compound               │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ Compound_211c050e               │
│ Tiratricol                      │
│ Tiaprofenic acid                │
│ Benzarone                       │
│ Estradiol                       │
│ Tolcapone                       │
│ 6-Mercaptopurine                │
│ 2-Ethyl-2-(hydroxymethyl)propa… │
│ 2-Ethylanthracene-9,10-dione    │
│ Atazanavir                      │
└─────────────────────────────────┘

In [28]:
# Compounds counted as cytotoxic: cell count decrease, LDH increase or
# MTT decrease (adv.incr is compared as the strings 'true' / 'false').
cc_cmpds = cc.filter(pl.col("adv.incr") == "false").get_column("Metadata_Compound").to_list()
ldh_cmpds = ldh.filter(pl.col("adv.incr") == "true").get_column("Metadata_Compound").to_list()
mtt_cmpds = mtt.filter(pl.col("adv.incr") == "false").get_column("Metadata_Compound").to_list()

# Union across the three assays so each compound is counted once.
cytotox_cmpds = list(set(cc_cmpds).union(ldh_cmpds, mtt_cmpds))

print(f"Cytotox cmpds: {len(cytotox_cmpds)}")

Cytotox cmpds: 429


In [29]:
# Compounds with a passing POD in any of the three assays, whatever the
# direction of the response.
cc_cmpds = cc.get_column("Metadata_Compound").to_list()
ldh_cmpds = ldh.get_column("Metadata_Compound").to_list()
mtt_cmpds = mtt.get_column("Metadata_Compound").to_list()

# Union across the three assays so each compound is counted once.
active_cmpds = list(set(cc_cmpds).union(ldh_cmpds, mtt_cmpds))

print(f"Active cmpds: {len(active_cmpds)}")

Active cmpds: 438


## Morphology

In [46]:
# cellprofiler
# Load the per-curve BMDs, attach the residual-SD cut-off (3 x SDctrl) and
# the converted bmd_um column.
cellprofiler_bmds = conv_POD_um(
    pl.read_parquet(f"{cellprofiler_dir}/bmds.parquet").with_columns(
        (pl.col("SDctrl") * 3).alias("SDres_thresh")
    ),
    conc_conv,
    "bmd",
    "bmd_um",
)

# Quality filter shared by the gmd and cmd subsets below: the curve passed
# all checks and its residual SD is under the cut-off.
cellprofiler_qc = (pl.col("all.pass") == "true") & (pl.col("SDres") < pl.col("SDres_thresh"))

# gmd: rows whose gene.id is "gmd"
cellprofiler_gmd = cellprofiler_bmds.filter((pl.col("gene.id") == "gmd") & cellprofiler_qc)
# .shape of the unique Series is a 1-tuple, so the count prints as "(N,)"
cellprofiler_gmd_num = cellprofiler_gmd.get_column("Metadata_Compound").unique().shape
print(f"cellprofiler gmd #: {cellprofiler_gmd_num}")
print(f"cellprofiler log-scale gmd median: {cellprofiler_gmd.get_column('bmd').median()}")
print(f"cellprofiler um gmd median: {cellprofiler_gmd.get_column('bmd_um').median()}")

# cmd: every other gene.id, collapsed to the lowest bmd / bmd_um per compound
cellprofiler_cmd = (
    cellprofiler_bmds.filter((pl.col("gene.id") != "gmd") & cellprofiler_qc)
    .group_by("Metadata_Compound")
    .agg(pl.col("bmd", "bmd_um").min())
)
cellprofiler_cmd_num = cellprofiler_cmd.get_column("Metadata_Compound").unique().shape
print(f"cellprofiler cmd #: {cellprofiler_cmd_num}")
print(f"cellprofiler log-scale cmd median: {cellprofiler_cmd.get_column('bmd').median()}")
print(f"cellprofiler um cmd median: {cellprofiler_cmd.get_column('bmd_um').median()}")

# cmd & gmd: read from the separate pods.parquet table
cellprofiler_all = conv_POD_um(
    pl.read_parquet(f"{cellprofiler_dir}/pods.parquet"), conc_conv, "bmd", "bmd_um"
)
print(f"cellprofiler gmd & cmd #: {cellprofiler_all.get_column('Metadata_Compound').n_unique()}")
print(f"cellprofiler um gen bioac median: {cellprofiler_all.get_column('bmd_um').median()}")

cellprofiler gmd #: (172,)
cellprofiler log-scale gmd median: 3.5222091934258897
cellprofiler um gmd median: 50.6408064359295
cellprofiler cmd #: (598,)
cellprofiler log-scale cmd median: 2.951186789189215
cellprofiler um cmd median: 12.856598464772283
cellprofiler gmd & cmd #: 607
cellprofiler um gen bioac median: 13.050927719515423


In [44]:
# CPCNN
# Load the per-curve BMDs, attach the residual-SD cut-off (3 x SDctrl) and
# the converted bmd_um column.
cpcnn_bmds = conv_POD_um(
    pl.read_parquet(f"{cpcnn_dir}/bmds.parquet").with_columns(
        (pl.col("SDctrl") * 3).alias("SDres_thresh")
    ),
    conc_conv,
    "bmd",
    "bmd_um",
)

# gmd: rows whose gene.id is "gmd" that passed all checks and whose
# residual SD is under the cut-off
cpcnn_gmd = cpcnn_bmds.filter(
    (pl.col("gene.id") == "gmd")
    & ((pl.col("all.pass") == "true") & (pl.col("SDres") < pl.col("SDres_thresh")))
)
# .shape of the unique Series is a 1-tuple, so the count prints as "(N,)"
cpcnn_gmd_num = cpcnn_gmd.get_column("Metadata_Compound").unique().shape

print(f"cpcnn gmd #: {cpcnn_gmd_num}")
print(f"cpcnn log-scale gmd median: {cpcnn_gmd.get_column('bmd').median()}")
print(f"cpcnn um gmd median: {cpcnn_gmd.get_column('bmd_um').median()}")

cpcnn gmd #: (538,)
cpcnn log-scale gmd median: 3.239297233623755
cpcnn um gmd median: 26.319270777215756


In [47]:
# DINO
# Load the per-curve BMDs, attach the residual-SD cut-off (3 x SDctrl) and
# the converted bmd_um column.
dino_bmds = conv_POD_um(
    pl.read_parquet(f"{dino_dir}/bmds.parquet").with_columns(
        (pl.col("SDctrl") * 3).alias("SDres_thresh")
    ),
    conc_conv,
    "bmd",
    "bmd_um",
)

# Quality filter shared by the gmd and cmd subsets below: the curve passed
# all checks and its residual SD is under the cut-off.
dino_qc = (pl.col("all.pass") == "true") & (pl.col("SDres") < pl.col("SDres_thresh"))

# gmd: rows whose gene.id is "gmd"
dino_gmd = dino_bmds.filter((pl.col("gene.id") == "gmd") & dino_qc)
# .shape of the unique Series is a 1-tuple, so the count prints as "(N,)"
dino_gmd_num = dino_gmd.get_column("Metadata_Compound").unique().shape
print(f"dino gmd #: {dino_gmd_num}")
print(f"dino log-scale gmd median: {dino_gmd.get_column('bmd').median()}")
print(f"dino um gmd median: {dino_gmd.get_column('bmd_um').median()}")

# cmd: every other gene.id, collapsed to the lowest bmd / bmd_um per compound
dino_cmd = (
    dino_bmds.filter((pl.col("gene.id") != "gmd") & dino_qc)
    .group_by("Metadata_Compound")
    .agg(pl.col("bmd", "bmd_um").min())
)
dino_cmd_num = dino_cmd.get_column("Metadata_Compound").unique().shape
print(f"dino cmd #: {dino_cmd_num}")
print(f"dino log-scale cmd median: {dino_cmd.get_column('bmd').median()}")
print(f"dino um cmd median: {dino_cmd.get_column('bmd_um').median()}")

# cmd & gmd: read from the separate pods.parquet table
dino_all = conv_POD_um(
    pl.read_parquet(f"{dino_dir}/pods.parquet"), conc_conv, "bmd", "bmd_um"
)
print(f"Dino gmd & cmd #: {dino_all.get_column('Metadata_Compound').n_unique()}")
print(f"Dino um gen bioac median: {dino_all.get_column('bmd_um').median()}")

dino gmd #: (546,)
dino log-scale gmd median: 3.00183062346817
dino um gmd median: 15.10327685892894
dino cmd #: (624,)
dino log-scale cmd median: 3.162274437178345
dino um cmd median: 21.811475458278654
Dino gmd & cmd #: 642
Dino um gen bioac median: 17.22436693027669
