# Format tables as CSV for SI

Most results produced by Snakemake are in parquet format. We want nicely formatted CSVs to include in the Supplementary Information.

- PODs for LDH, MT, and cell counts
- PODs for each representation (one CSV containing all PODs that pass QA/QC, labelled by category; another CSV with the lowest POD across all of them)
- AUROC for DINO and "all" profiles


In [1]:
import polars as pl

input_dir = "../../1_snakemake/inputs"

In [9]:
# Get info to convert PODs
# Per-compound lookup table: lowest tested concentration and the shift
# derived from it, which conv_POD_um subtracts from each POD.
_metadata = pl.read_parquet(f"{input_dir}/metadata/metadata.parquet")

# DMSO rows are left out before taking the per-compound minimum.
_treated = _metadata.filter(pl.col("Metadata_Compound") != "DMSO")
_min_conc = pl.col("Metadata_Concentration").min().alias("Metadata_Min_Conc")

conc_conv = (
    _treated
    .select(["Metadata_Compound", "Metadata_Concentration"])
    .group_by("Metadata_Compound")
    .agg([_min_conc])
)

# Shift = |log10(minimum concentration / 3)|
_conc_shift = (pl.col("Metadata_Min_Conc") / 3).log10().abs()
conc_conv = conc_conv.with_columns(_conc_shift.alias("Metadata_Conc_Shift"))

def conv_POD_um(pod_df, conv_df, pod_col, conv_pod_nm):
    """Add a back-transformed copy of one POD column.

    ``conv_df`` is joined onto ``pod_df`` by ``Metadata_Compound`` and the
    value ``10 ** (pod_col - Metadata_Conc_Shift)`` is stored in a new column
    called ``conv_pod_nm``. The two helper columns brought in by the join
    (``Metadata_Min_Conc`` and ``Metadata_Conc_Shift``) are removed again
    before returning.

    Args:
        pod_df: Table holding ``Metadata_Compound`` and the column ``pod_col``.
        conv_df: Table holding ``Metadata_Compound``, ``Metadata_Min_Conc``
            and ``Metadata_Conc_Shift``.
        pod_col: Name of the column to convert.
        conv_pod_nm: Name of the column that receives the converted values.

    Returns:
        The joined table with the extra ``conv_pod_nm`` column.
    """
    helper_cols = ["Metadata_Min_Conc", "Metadata_Conc_Shift"]

    # Undo the per-compound shift first, then leave log10 space.
    unshifted = pl.col(pod_col) - pl.col("Metadata_Conc_Shift")

    joined = pod_df.join(conv_df, on="Metadata_Compound")
    converted = joined.with_columns((10 ** unshifted).alias(conv_pod_nm))

    return converted.drop(helper_cols)

def conv_whole_df(pod_df, conv_df):
    """Convert the ``bmd``/``bmdl``/``bmdu`` columns and tidy the table for export.

    Each of the three columns is passed through ``conv_POD_um``; the result is
    then reduced to the identifier columns plus the three converted columns,
    and the identifier columns get their export names.

    Args:
        pod_df: Table with ``Metadata_OASIS_ID``, ``Metadata_Compound``,
            ``gene.id``, ``bmd``, ``bmdl`` and ``bmdu``.
        conv_df: Conversion table handed on to ``conv_POD_um``.

    Returns:
        Table with the columns ``OASIS_ID``, ``Compound_name``,
        ``Assay_Endpoint``, ``POD_um``, ``POD_um_l`` and ``POD_um_u``.
    """
    # source column -> converted column, in output order
    converted_names = {"bmd": "POD_um", "bmdl": "POD_um_l", "bmdu": "POD_um_u"}
    for source_col, target_col in converted_names.items():
        pod_df = conv_POD_um(pod_df, conv_df, source_col, target_col)

    export_names = {
        "Metadata_OASIS_ID": "OASIS_ID",
        "Metadata_Compound": "Compound_name",
        "gene.id": "Assay_Endpoint",
    }
    kept_cols = [*export_names, *converted_names.values()]

    return pod_df.select(kept_cols).rename(export_names)

In [40]:
# Distinct (OASIS ID, compound name) pairs, taken from the DINO profiles.
_dino_profiles = pl.read_parquet(
    "../../1_snakemake/outputs/dino/mad_featselect/profiles/mad_featselect.parquet"
)
oasis_id = _dino_profiles.select(["Metadata_OASIS_ID", "Metadata_Compound"]).unique()

In [41]:
# Cytotox assays
def _load_cytotox_pods(parquet_path):
    """Read one cytotox POD table, keep the passing rows, and attach OASIS IDs."""
    # In these tables "all.pass" is compared against the string "true".
    is_passing = pl.col("all.pass") == "true"
    passing = pl.read_parquet(parquet_path).filter(is_passing)
    return passing.join(oasis_id, on="Metadata_Compound", how="left")


cc = _load_cytotox_pods(
    "../../1_snakemake/outputs/cellprofiler/mad_featselect/curves/ccpods.parquet"
)
ldh = _load_cytotox_pods(
    "../../1_snakemake/outputs/cellprofiler/mad_featselect/curves/ldhpods.parquet"
)
mt = _load_cytotox_pods(
    "../../1_snakemake/outputs/cellprofiler/mad_featselect/curves/mttpods.parquet"
)

# Convert concentration to um
cc, ldh, mt = (conv_whole_df(_pods, conc_conv) for _pods in (cc, ldh, mt))

In [42]:
# Cell Painting assays
def _load_cellpainting_pods(bmds_path):
    """Load, QC-filter, and convert the BMD table of one representation.

    The same steps are applied to every representation, so they live in one
    place instead of being repeated per table:

    1. keep rows where ``all.pass`` is true and ``SDres < 3 * SDctrl``;
    2. attach the OASIS ID by compound name (left join on ``oasis_id``);
    3. convert the PODs and rename columns with ``conv_whole_df``;
    4. add ``Bioactivity_POD``, which is true on the row(s) holding the
       lowest ``POD_um`` of each compound.

    Args:
        bmds_path: Path to the representation's ``bmds.parquet`` file.

    Returns:
        The export-ready table with the extra ``Bioactivity_POD`` column.
    """
    passes_qc = (
        # Explicit comparison kept so the filter behaves exactly as before.
        (pl.col("all.pass") == True)  # noqa: E712
        & (pl.col("SDres") < 3 * pl.col("SDctrl"))
    )
    passing = (
        pl.read_parquet(bmds_path)
        .filter(passes_qc)
        .join(oasis_id, on="Metadata_Compound", how="left")
    )

    pods = conv_whole_df(passing, conc_conv)

    lowest_per_compound = pl.col("POD_um").min().over("Compound_name")
    return pods.with_columns(
        (pl.col("POD_um") == lowest_per_compound).alias("Bioactivity_POD")
    )


# cellprofiler
cellprofiler_all = _load_cellpainting_pods(
    "../../1_snakemake/outputs/cellprofiler/mad_featselect/curves/bmds.parquet"
)

# cp-cnn
cpcnn_all = _load_cellpainting_pods(
    "../../1_snakemake/outputs/cpcnn/mad_featselect/curves/bmds.parquet"
)

# dino
dino_all = _load_cellpainting_pods(
    "../../1_snakemake/outputs/dino/mad_featselect/curves/bmds.parquet"
)

In [18]:
# write out results
# NOTE(review): the last cell of this notebook writes these same six files again.
_pod_tables = [
    (cc, "../compiled_results/SI_tables/cellcount_pods.csv"),
    (mt, "../compiled_results/SI_tables/mt_pods.csv"),
    (ldh, "../compiled_results/SI_tables/ldh_pods.csv"),
    (cellprofiler_all, "../compiled_results/SI_tables/cellpainting_cellprofiler_pods.csv"),
    (cpcnn_all, "../compiled_results/SI_tables/cellpainting_cpcnn_pods.csv"),
    (dino_all, "../compiled_results/SI_tables/cellpainting_dino_pods.csv"),
]
for _table, _csv_path in _pod_tables:
    _table.write_csv(_csv_path)

In [47]:
# Table of conventionally toxic compounds
# One row per compound in `oasis_id`, with a Yes/No flag per assay and a final
# flag that is "Yes" only when every assay flag is "Yes".
def _hit_flags(pods, hit_col):
    """Return the distinct compounds of ``pods`` with ``hit_col`` set to "Yes".

    The compound list is de-duplicated so that the left join below cannot
    multiply rows when a compound has more than one passing row in ``pods``.
    """
    return (
        pods.select(["Compound_name", "OASIS_ID"])
        .unique()
        .with_columns(pl.lit("Yes").alias(hit_col))
    )


cc_cmpds = _hit_flags(cc, "Cell_count_hit")
mt_cmpds = _hit_flags(mt, "MT_hit")
ldh_cmpds = _hit_flags(ldh, "LDH_hit")
# Cell Painting hits are taken from the DINO table only.
morph_cmpds = _hit_flags(dino_all, "Cell_Painting_hit")

oasis_map = oasis_id.rename({
    "Metadata_OASIS_ID": "OASIS_ID",
    "Metadata_Compound": "Compound_name"
})

hit_summary = oasis_map
for _hit_col, _hits in [
    ("Cell_count_hit", cc_cmpds),
    ("MT_hit", mt_cmpds),
    ("LDH_hit", ldh_cmpds),
    ("Cell_Painting_hit", morph_cmpds),
]:
    # After the left join the flag is "Yes" for hits and null otherwise;
    # nulls become "No".
    hit_summary = hit_summary.join(
        _hits, on=["Compound_name", "OASIS_ID"], how="left"
    ).with_columns(pl.col(_hit_col).fill_null(pl.lit("No")))

hit_summary = hit_summary.with_columns(
    pl.when(
        (pl.col("Cell_count_hit") == "Yes") &
        (pl.col("MT_hit") == "Yes") &
        (pl.col("LDH_hit") == "Yes") &
        (pl.col("Cell_Painting_hit") == "Yes")
    ).then(pl.lit("Yes")).otherwise(pl.lit("No")).alias("Hit_in_all_assays")
)

In [49]:
# write out results
_si_tables = [
    (cc, "../compiled_results/SI_tables/cellcount_pods.csv"),
    (mt, "../compiled_results/SI_tables/mt_pods.csv"),
    (ldh, "../compiled_results/SI_tables/ldh_pods.csv"),
    (cellprofiler_all, "../compiled_results/SI_tables/cellpainting_cellprofiler_pods.csv"),
    (cpcnn_all, "../compiled_results/SI_tables/cellpainting_cpcnn_pods.csv"),
    (dino_all, "../compiled_results/SI_tables/cellpainting_dino_pods.csv"),
    (hit_summary, "../compiled_results/SI_tables/hit_summary.csv"),
]
for _table, _csv_path in _si_tables:
    _table.write_csv(_csv_path)