In [1]:
import polars as pl 
import numpy as np

In [5]:
# Base directory (relative path) under which print_pod_stats looks for each
# feature type's pods.parquet files.
output_path = "../../1_snakemake/outputs"

In [3]:
def common_elements_in_lists(*lists):
    """Return the elements that occur in every one of the given lists.

    Args:
        *lists: Any number of iterables whose items can be placed in a set.

    Returns:
        A list containing each shared element once. The ordering is whatever
        set iteration produces, so callers should not depend on it. When
        called with no arguments, an empty list is returned.
    """
    if len(lists) == 0:
        return []

    first, *others = lists
    # set.intersection takes any number of iterables (including zero), so a
    # single call covers both the one-list and the many-list case.
    return list(set(first).intersection(*others))

def print_pod_stats(feat_type: str):
    """Print POD counts and median PODs for one feature type.

    Reads ``curves/pods.parquet`` from four sub-directories of
    ``{output_path}/{feat_type}`` (``mad_featselect``, ``mad_int_featselect``,
    ``mad_featselect_log10`` and ``mad_featselect_ap``) and prints, in order:

    1. the number of compounds present in all four tables,
    2. the row count of each table,
    3. the median ``bmd`` of each table, restricted to those shared
       compounds and rounded to two decimals.

    Args:
        feat_type: Name of the feature-type directory under the module-level
            ``output_path``; also used as the prefix of every printed label.
    """
    # Label suffix -> sub-directory that holds that variant's POD table.
    variant_dirs = {
        "": "mad_featselect",
        "_int": "mad_int_featselect",
        "_log10": "mad_featselect_log10",
        "_ap": "mad_featselect_ap",
    }

    # Load every table first so all reads happen before anything is printed.
    tables = {}
    for suffix, subdir in variant_dirs.items():
        parquet_file = f"{output_path}/{feat_type}/{subdir}/curves/pods.parquet"
        tables[suffix] = pl.read_parquet(parquet_file).select(
            ["Metadata_Compound", "bmd"]
        )

    compound_lists = [
        tbl.select("Metadata_Compound").to_series().to_list()
        for tbl in tables.values()
    ]
    shared_cmpds = common_elements_in_lists(*compound_lists)
    print(len(shared_cmpds))

    for suffix, tbl in tables.items():
        print(f"{feat_type}{suffix} num PODs: {tbl.shape[0]}")

    # Medians are taken over the shared compounds only, so the four values
    # are computed on the same set of compounds.
    for suffix, tbl in tables.items():
        in_shared = tbl.filter(pl.col("Metadata_Compound").is_in(shared_cmpds))
        median_bmd = in_shared.select("bmd").median().item()
        print(f"{feat_type}{suffix} median POD: {np.round(median_bmd, 2)}")

In [6]:
# Print POD counts and medians for the "cellprofiler" feature type.
print_pod_stats("cellprofiler")

187
cellprofiler num PODs: 564
cellprofiler_int num PODs: 606
cellprofiler_log10 num PODs: 564
cellprofiler_ap num PODs: 432
cellprofiler median POD: 2.82
cellprofiler_int median POD: 2.69
cellprofiler_log10 median POD: 2.84
cellprofiler_ap median POD: 2.76


In [7]:
# Print POD counts and medians for the "dino" feature type.
print_pod_stats("dino")

431
dino num PODs: 642
dino_int num PODs: 644
dino_log10 num PODs: 614
dino_ap num PODs: 480
dino median POD: 2.89
dino_int median POD: 2.89
dino_log10 median POD: 2.59
dino_ap median POD: 2.89
