# Compute metrics and combine across datasets

In [1]:
import polars as pl 
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Root directory under which each feature type's prediction parquet files are read
# (as "<output_dir>/<feat_type>/mad_featselect/classifier_results/...").
output_dir = "../1_snakemake/outputs"
# Directory that the combined per-outcome metric tables are written to.
results_dir = "./compiled_results"

## Shared functions

In [3]:
def compute_metrics(y_pred, y_actual, y_prob):
    """Return ``(AUROC, PRAUC)`` for one set of true labels and scores.

    Args:
        y_pred: Predicted labels. Accepted for signature compatibility but
            not used by either metric.
        y_actual: True labels.
        y_prob: Predicted scores passed to the scikit-learn metric functions.

    Returns:
        A 2-tuple ``(auroc, prauc)``. Each entry is ``None`` when the
        corresponding scikit-learn call raises ``ValueError``.
    """
    def _or_none(scorer):
        # A metric that cannot be computed is reported as missing, not fatal.
        try:
            return scorer()
        except ValueError:
            return None

    def _pr_auc():
        # Area under the precision-recall curve, integrated over recall.
        precision, recall, _ = precision_recall_curve(y_actual, y_prob)
        return auc(recall, precision)

    auroc = _or_none(lambda: roc_auc_score(y_actual, y_prob))
    prauc = _or_none(_pr_auc)
    return auroc, prauc


def process_metrics(pred: pl.DataFrame):
    """Compute AUROC and PRAUC per (Metadata_AggType, Metadata_Label) group.

    The ``y_pred``, ``y_actual`` and ``y_prob`` columns of ``pred`` are
    collected into lists per group, scored with ``compute_metrics``, and the
    resulting ``AUROC`` / ``PRAUC`` columns are joined with the distinct
    ``Metadata_Count_0`` / ``Metadata_Count_1`` rows for the same keys.

    Args:
        pred: Prediction table containing the key, count and ``y_*`` columns
            named above.

    Returns:
        A DataFrame with the two key columns, ``AUROC``, ``PRAUC`` and the
        two count columns.
    """
    keys = ['Metadata_AggType', 'Metadata_Label']
    list_cols = ['y_pred_list', 'y_actual_list', 'y_prob_list']

    # Gather every prediction of a group into one list-valued row.
    per_group = pred.group_by(keys).agg([
        pl.col('y_pred').alias('y_pred_list'),
        pl.col('y_actual').alias('y_actual_list'),
        pl.col('y_prob').alias('y_prob_list'),
    ])

    # Score each group; the (auroc, prauc) pair lands in a temporary column.
    scored = per_group.with_columns([
        pl.struct(list_cols).map_elements(
            lambda row: compute_metrics(row['y_pred_list'], row['y_actual_list'], row['y_prob_list'])
        ).alias('metrics')
    ])

    # Split the pair into its own columns, then discard the intermediates.
    scored = scored.with_columns([
        pl.col('metrics').map_elements(lambda pair: pair[0]).alias('AUROC'),
        pl.col('metrics').map_elements(lambda pair: pair[1]).alias('PRAUC'),
    ])
    scored = scored.drop(list_cols + ['metrics'])

    # Distinct class-count rows, attached back onto the metrics by group key.
    counts = pred.select(keys + ["Metadata_Count_0", "Metadata_Count_1"]).unique()
    return scored.join(counts, on=keys)

## Seal outcomes

In [5]:
# Score the seal predictions of each feature type and tag rows with their source
cpcnn = process_metrics(
    pl.read_parquet(f"{output_dir}/cpcnn/mad_featselect/classifier_results/seal_binary_predictions.parquet")
).with_columns(pl.lit("cpcnn").alias("Feat_type"))

dino = process_metrics(
    pl.read_parquet(f"{output_dir}/dino/mad_featselect/classifier_results/seal_binary_predictions.parquet")
).with_columns(pl.lit("dino").alias("Feat_type"))

cellprofiler = process_metrics(
    pl.read_parquet(f"{output_dir}/cellprofiler/mad_featselect/classifier_results/seal_binary_predictions.parquet")
).with_columns(pl.lit("cellprofiler").alias("Feat_type"))

# Stack the three tables and save the combined seal metrics
all_results = pl.concat(
    [cpcnn, dino, cellprofiler],
    how="vertical",
)
all_results.write_parquet(f"{results_dir}/compiled_seal_metrics.parquet")

  result = grouped.with_columns([
  result = grouped.with_columns([
  result = grouped.with_columns([


## Motive outcomes

In [6]:
# Score the motive predictions of each feature type and tag rows with their source
cpcnn = process_metrics(
    pl.read_parquet(f"{output_dir}/cpcnn/mad_featselect/classifier_results/motive_binary_predictions.parquet")
).with_columns(pl.lit("cpcnn").alias("Feat_type"))

dino = process_metrics(
    pl.read_parquet(f"{output_dir}/dino/mad_featselect/classifier_results/motive_binary_predictions.parquet")
).with_columns(pl.lit("dino").alias("Feat_type"))

cellprofiler = process_metrics(
    pl.read_parquet(f"{output_dir}/cellprofiler/mad_featselect/classifier_results/motive_binary_predictions.parquet")
).with_columns(pl.lit("cellprofiler").alias("Feat_type"))

# Stack the three tables and save the combined motive metrics
all_results = pl.concat(
    [cpcnn, dino, cellprofiler],
    how="vertical",
)
all_results.write_parquet(f"{results_dir}/compiled_motive_metrics.parquet")

  result = grouped.with_columns([
  result = grouped.with_columns([
  result = grouped.with_columns([
