# Format tables as CSV for SI

Most results produced by Snakemake are in parquet format. We want nicely formatted CSVs to include in the Supplementary Information.

- PODs for LDH, MT, and cell counts
- PODs for each representation (one CSV with all PODs passing QA/QC, labelled by category, and another CSV with the lowest POD across all categories)
- AUROC for DINO and "all" profiles


In [1]:
import polars as pl

# Relative path to the pipeline input folder; the metadata parquet is read from here.
input_dir = "../../1_snakemake/inputs"

In [2]:
# Get info to convert PODs
# For every compound except DMSO, take the minimum Metadata_Concentration and
# derive the shift |log10(min_conc / 3)| that conv_POD_um subtracts from a POD.
# NOTE(review): the divisor 3 is presumably the dilution factor - confirm.
conc_conv = (
    pl.read_parquet(f"{input_dir}/metadata/metadata.parquet")
    .filter(pl.col("Metadata_Compound") != "DMSO")
    .select(["Metadata_Compound", "Metadata_Concentration"])
    .group_by("Metadata_Compound")
    .agg([pl.col("Metadata_Concentration").min().alias("Metadata_Min_Conc")])
    .with_columns(
        (pl.col("Metadata_Min_Conc") / 3).log10().abs().alias("Metadata_Conc_Shift")
    )
)

def conv_POD_um(pod_df, conv_df, pod_col, conv_pod_nm):
    """Add a converted copy of one POD column to ``pod_df``.

    ``pod_df`` is joined to ``conv_df`` on ``Metadata_Compound`` and the new
    column is computed as ``10 ** (pod_col - Metadata_Conc_Shift)``. The
    ``Metadata_Min_Conc`` and ``Metadata_Conc_Shift`` columns brought in by
    the join are dropped again before returning.

    NOTE(review): the ``_um`` suffix presumably means micromolar - confirm
    the unit of ``Metadata_Concentration``.

    Args:
        pod_df: Frame holding ``Metadata_Compound`` and ``pod_col``.
        conv_df: Frame holding ``Metadata_Compound``, ``Metadata_Min_Conc``
            and ``Metadata_Conc_Shift``.
        pod_col: Name of the column to convert.
        conv_pod_nm: Name given to the converted column.

    Returns:
        The joined frame with ``conv_pod_nm`` added. The join uses polars'
        default strategy, so compounds absent from ``conv_df`` are not kept.
    """
    helper_cols = ["Metadata_Min_Conc", "Metadata_Conc_Shift"]
    converted = (
        10 ** (pl.col(pod_col) - pl.col("Metadata_Conc_Shift"))
    ).alias(conv_pod_nm)

    joined = pod_df.join(conv_df, on="Metadata_Compound")
    return joined.with_columns(converted).drop(helper_cols)

def conv_whole_df(pod_df, conv_df):
    """Convert the three POD columns and reduce the frame to the SI layout.

    ``bmd``, ``bmdl`` and ``bmdu`` are each converted with ``conv_POD_um``
    into ``POD_um``, ``POD_um_l`` and ``POD_um_u``. The result keeps only the
    identifier columns plus those three converted columns, with the
    identifiers renamed for the published tables.

    Args:
        pod_df: Frame with ``Metadata_OASIS_ID``, ``Metadata_Compound``,
            ``gene.id``, ``bmd``, ``bmdl`` and ``bmdu``.
        conv_df: Conversion table passed through to ``conv_POD_um``.

    Returns:
        Frame with columns ``OASIS_ID``, ``Compound_name``,
        ``Assay_Endpoint``, ``POD_um``, ``POD_um_l``, ``POD_um_u``.
    """
    # Source column -> converted column; dict order fixes the output order.
    converted_names = {"bmd": "POD_um", "bmdl": "POD_um_l", "bmdu": "POD_um_u"}
    for source_col, target_col in converted_names.items():
        pod_df = conv_POD_um(pod_df, conv_df, source_col, target_col)

    si_names = {
        "Metadata_OASIS_ID": "OASIS_ID",
        "Metadata_Compound": "Compound_name",
        "gene.id": "Assay_Endpoint",
    }
    kept_cols = [*si_names, *converted_names.values()]

    return pod_df.select(kept_cols).rename(si_names)

In [17]:
# Cytotox assays
# Each table is read, restricted to rows whose "all.pass" flag equals "true",
# and then converted to um in one step.
# NOTE(review): "all.pass" is compared with the string "true" here, while the
# Cell Painting cell compares it with the boolean True - confirm the dtypes.

cc = conv_whole_df(
    pl.read_parquet(
        "../../1_snakemake/outputs/cellprofiler/mad_featselect/curves/ccpods.parquet"
    ).filter(pl.col("all.pass") == "true"),
    conc_conv,
)
ldh = conv_whole_df(
    pl.read_parquet(
        "../../1_snakemake/outputs/cellprofiler/mad_featselect/curves/ldhpods.parquet"
    ).filter(pl.col("all.pass") == "true"),
    conc_conv,
)
mt = conv_whole_df(
    pl.read_parquet(
        "../../1_snakemake/outputs/cellprofiler/mad_featselect/curves/mttpods.parquet"
    ).filter(pl.col("all.pass") == "true"),
    conc_conv,
)

In [3]:
# Compound name <-> OASIS ID lookup, with both columns renamed to the
# Metadata_* names that the POD tables are joined and selected on.
oasis_id = pl.read_csv(
    "../../1_snakemake/inputs/annotations/seal_input/v5_oasis_03Sept2024_simple.csv"
).select(
    [
        pl.col("OASIS_ID").alias("Metadata_OASIS_ID"),
        pl.col("PREFERRED_NAME").alias("Metadata_Compound"),
    ]
)

In [8]:
# Cell Painting assays


def _load_cellpainting_pods(feat_type):
    """Load, filter and format the Cell Painting PODs for one representation.

    Reads ``bmds.parquet`` for ``feat_type``, keeps rows where ``all.pass`` is
    True and ``SDres`` is below three times ``SDctrl``, attaches the OASIS ID
    by joining ``oasis_id`` on ``Metadata_Compound``, and converts the POD
    columns with ``conv_whole_df``.

    Args:
        feat_type: Folder name of the representation under the pipeline
            outputs (``"cellprofiler"``, ``"cpcnn"`` or ``"dino"``).

    Returns:
        The converted frame plus a boolean ``Bioactivity_POD`` column that is
        True on rows whose ``POD_um`` equals the minimum ``POD_um`` of their
        ``Compound_name``.
    """
    bmds = pl.read_parquet(
        f"../../1_snakemake/outputs/{feat_type}/mad_featselect/curves/bmds.parquet"
    ).filter(
        # Explicit comparison kept on purpose: this builds a polars
        # expression, it is not a Python truthiness test.
        (pl.col("all.pass") == True)  # noqa: E712
        & (pl.col("SDres") < 3 * pl.col("SDctrl"))
    ).join(oasis_id, on="Metadata_Compound")

    pods = conv_whole_df(bmds, conc_conv)

    # Flag the lowest POD per compound.
    return pods.with_columns(
        (pl.col("POD_um") == pl.col("POD_um").min().over("Compound_name")).alias("Bioactivity_POD")
    )


# cellprofiler
cellprofiler_all = _load_cellpainting_pods("cellprofiler")

# cp-cnn
cpcnn_all = _load_cellpainting_pods("cpcnn")

# dino
dino_all = _load_cellpainting_pods("dino")

In [18]:
# write out results
# (frame, destination) pairs, written in the listed order.
si_tables = [
    (cc, "../compiled_results/SI_tables/cellcount_pods.csv"),
    (mt, "../compiled_results/SI_tables/mt_pods.csv"),
    (ldh, "../compiled_results/SI_tables/ldh_pods.csv"),
    (cellprofiler_all, "../compiled_results/SI_tables/cellpainting_cellprofiler_pods.csv"),
    (cpcnn_all, "../compiled_results/SI_tables/cellpainting_cpcnn_pods.csv"),
    (dino_all, "../compiled_results/SI_tables/cellpainting_dino_pods.csv"),
]

for table, csv_path in si_tables:
    table.write_csv(csv_path)

In [None]:
# AUROC scores
# Read via a relative path like the other cells, so the notebook does not
# depend on one machine's absolute mount point.
# NOTE(review): assumes ../compiled_results (the folder the SI tables are
# written under) is the repository's 2_downstream_analysis/compiled_results.
metrics = pl.read_parquet("../compiled_results/compiled_toxcast_cellbased_metrics.parquet")

In [15]:
# Display the rows with AggType "all", Model_type "Actual" and Feat_type "dino".
metrics.filter(
    (pl.col("Metadata_AggType") == "all")
    & (pl.col("Model_type") == "Actual")
    & (pl.col("Feat_type") == "dino")
)

Metadata_AggType,Metadata_Label,Model_type,AUROC,PRAUC,Metadata_Count_0,Metadata_Count_1,Feat_type
str,str,str,f64,f64,i32,i32,str
"""all""","""APR_HepG2_p53Act_24hr""","""Actual""",0.458861,0.030466,158,6,"""dino"""
"""all""","""LTEA_HepaRG_CCND1""","""Actual""",0.566667,0.036393,162,5,"""dino"""
"""all""","""ATG_EGR_CIS""","""Actual""",0.668553,0.129464,279,19,"""dino"""
"""all""","""OT_ER_ERaERa_1440""","""Actual""",0.297477,0.036006,191,11,"""dino"""
"""all""","""ATG_PPRE_CIS""","""Actual""",0.517504,0.158391,251,47,"""dino"""
…,…,…,…,…,…,…,…
"""all""","""TOX21_MMP_ratio""","""Actual""",0.584151,0.162444,568,84,"""dino"""
"""all""","""OT_ER_ERaERb_0480""","""Actual""",0.447415,0.06068,187,15,"""dino"""
"""all""","""ATG_Pax6_CIS""","""Actual""",0.715426,0.156119,282,16,"""dino"""
"""all""","""LTEA_HepaRG_GADD45A""","""Actual""",0.553763,0.082525,155,12,"""dino"""
