# Compute metrics and combine across datasets

In [2]:
import polars as pl 
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve, auc

# Root directory the per-feature-set prediction parquet files are read from.
output_dir = "../1_snakemake/outputs"
# Directory the compiled metrics parquet files are written to.
results_dir = "./compiled_results"

## Shared functions

In [3]:
def compute_metrics(y_pred, y_actual, y_prob):
    """Score one set of predictions against the true labels.

    Args:
        y_pred: Predicted labels, used for the F1 score.
        y_actual: True labels.
        y_prob: Predicted scores/probabilities, used for the two AUC metrics.

    Returns:
        A ``(f1, auroc, prauc)`` tuple: the macro-averaged F1 score, the ROC
        AUC, and the area under the precision-recall curve. Either AUC is
        ``None`` when scikit-learn raises ``ValueError`` while computing it.
    """
    macro_f1 = f1_score(y_actual, y_pred, average='macro')

    # Both AUCs default to None and are only filled in when scikit-learn
    # can compute them without raising ValueError.
    roc_area = None
    try:
        roc_area = roc_auc_score(y_actual, y_prob)
    except ValueError:
        pass

    pr_area = None
    try:
        pr_precision, pr_recall, _thresholds = precision_recall_curve(y_actual, y_prob)
        pr_area = auc(pr_recall, pr_precision)
    except ValueError:
        pass

    return macro_f1, roc_area, pr_area

## Seal outcomes

In [3]:
# CPCNN
# Seal binary predictions for the CPCNN features.
pred = pl.read_parquet(f"{output_dir}/cpcnn/mad_featselect/classifier_results/seal_binary_predictions.parquet")

# Distinct count columns per (agg type, label); re-attached to the metrics at the end.
class_balance = pred.select(
    "Metadata_AggType", "Metadata_Label", "Metadata_Count_0", "Metadata_Count_1"
).unique()

# Gather all predictions of a group into list columns so each group is scored as a whole.
grouped = pred.group_by('Metadata_AggType', 'Metadata_Label').agg(
    y_pred_list=pl.col('y_pred'),
    y_actual_list=pl.col('y_actual'),
    y_prob_list=pl.col('y_prob'),
)

# Score every group; compute_metrics returns (f1, auroc, prauc).
result = grouped.with_columns(
    metrics=pl.struct('y_pred_list', 'y_actual_list', 'y_prob_list').map_elements(
        lambda row: compute_metrics(row['y_pred_list'], row['y_actual_list'], row['y_prob_list'])
    )
)

# Unpack the three metrics into their own named columns.
result = result.with_columns(
    F1_Score=pl.col('metrics').map_elements(lambda m: m[0]),
    AUROC=pl.col('metrics').map_elements(lambda m: m[1]),
    PRAUC=pl.col('metrics').map_elements(lambda m: m[2]),
)

# Keep only the group keys and metrics, then add the count columns back.
result = result.drop(['y_pred_list', 'y_actual_list', 'y_prob_list', 'metrics'])
cpcnn = result.join(class_balance, on=['Metadata_AggType', 'Metadata_Label'])

  result = grouped.with_columns([


In [4]:
# Dino
# Seal binary predictions for the Dino features.
pred = pl.read_parquet(f"{output_dir}/dino/mad_featselect/classifier_results/seal_binary_predictions.parquet")
# Distinct count columns per (agg type, label); joined back onto the metrics below.
class_balance = pred.select(["Metadata_AggType", "Metadata_Label", "Metadata_Count_0", "Metadata_Count_1"]).unique()

# Collect all predictions of a group into list columns so each group is scored as a whole.
grouped = pred.group_by(['Metadata_AggType', 'Metadata_Label']).agg([
    pl.col('y_pred').alias('y_pred_list'),
    pl.col('y_actual').alias('y_actual_list'),
    pl.col('y_prob').alias('y_prob_list'),
])

# compute_metrics returns (f1, auroc, prauc) for each group.
result = grouped.with_columns([
    pl.struct(['y_pred_list', 'y_actual_list', 'y_prob_list']).map_elements(
        lambda s: compute_metrics(s['y_pred_list'], s['y_actual_list'], s['y_prob_list'])
    ).alias('metrics')
])

# Unpack by position: 0 = F1, 1 = AUROC, 2 = PRAUC.
# PRAUC must read index 2; reading index 1 would silently duplicate AUROC.
result = result.with_columns([
    pl.col('metrics').map_elements(lambda s: s[0]).alias('F1_Score'),
    pl.col('metrics').map_elements(lambda s: s[1]).alias('AUROC'),
    pl.col('metrics').map_elements(lambda s: s[2]).alias('PRAUC'),
])

# Keep only the group keys and metrics, then add the count columns back.
result = result.drop(['y_pred_list', 'y_actual_list', 'y_prob_list', 'metrics'])
dino = result.join(class_balance, on=['Metadata_AggType', 'Metadata_Label'])

  result = grouped.with_columns([


In [5]:
# CellProfiler
# Seal binary predictions for the CellProfiler features.
pred = pl.read_parquet(f"{output_dir}/cellprofiler/mad_featselect/classifier_results/seal_binary_predictions.parquet")
# Distinct count columns per (agg type, label); joined back onto the metrics below.
class_balance = pred.select(["Metadata_AggType", "Metadata_Label", "Metadata_Count_0", "Metadata_Count_1"]).unique()

# Collect all predictions of a group into list columns so each group is scored as a whole.
grouped = pred.group_by(['Metadata_AggType', 'Metadata_Label']).agg([
    pl.col('y_pred').alias('y_pred_list'),
    pl.col('y_actual').alias('y_actual_list'),
    pl.col('y_prob').alias('y_prob_list'),
])

# compute_metrics returns (f1, auroc, prauc) for each group.
result = grouped.with_columns([
    pl.struct(['y_pred_list', 'y_actual_list', 'y_prob_list']).map_elements(
        lambda s: compute_metrics(s['y_pred_list'], s['y_actual_list'], s['y_prob_list'])
    ).alias('metrics')
])

# Unpack by position: 0 = F1, 1 = AUROC, 2 = PRAUC.
# PRAUC must read index 2; reading index 1 would silently duplicate AUROC.
result = result.with_columns([
    pl.col('metrics').map_elements(lambda s: s[0]).alias('F1_Score'),
    pl.col('metrics').map_elements(lambda s: s[1]).alias('AUROC'),
    pl.col('metrics').map_elements(lambda s: s[2]).alias('PRAUC'),
])

# Keep only the group keys and metrics, then add the count columns back.
result = result.drop(['y_pred_list', 'y_actual_list', 'y_prob_list', 'metrics'])
cellprofiler = result.join(class_balance, on=['Metadata_AggType', 'Metadata_Label'])

  result = grouped.with_columns([


In [6]:
# Combine
# Tag each metrics table with the feature set it came from.
cpcnn = cpcnn.with_columns(Feat_type=pl.lit("cpcnn"))
dino = dino.with_columns(Feat_type=pl.lit("dino"))
cellprofiler = cellprofiler.with_columns(Feat_type=pl.lit("cellprofiler"))

# Stack the three tables row-wise and save the compiled seal metrics.
all_results = pl.concat(
    [cpcnn, dino, cellprofiler],
    how="vertical",
)
all_results.write_parquet(f"{results_dir}/compiled_seal_metrics.parquet")

## Motive outcomes

In [4]:
# CPCNN
# Motive binary predictions for the CPCNN features.
pred = pl.read_parquet(f"{output_dir}/cpcnn/mad_featselect/classifier_results/motive_binary_predictions.parquet")

# Distinct count columns per (agg type, label); re-attached to the metrics at the end.
class_balance = pred.select(
    "Metadata_AggType", "Metadata_Label", "Metadata_Count_0", "Metadata_Count_1"
).unique()

# Gather all predictions of a group into list columns so each group is scored as a whole.
grouped = pred.group_by('Metadata_AggType', 'Metadata_Label').agg(
    y_pred_list=pl.col('y_pred'),
    y_actual_list=pl.col('y_actual'),
    y_prob_list=pl.col('y_prob'),
)

# Score every group; compute_metrics returns (f1, auroc, prauc).
result = grouped.with_columns(
    metrics=pl.struct('y_pred_list', 'y_actual_list', 'y_prob_list').map_elements(
        lambda row: compute_metrics(row['y_pred_list'], row['y_actual_list'], row['y_prob_list'])
    )
)

# Unpack the three metrics into their own named columns.
result = result.with_columns(
    F1_Score=pl.col('metrics').map_elements(lambda m: m[0]),
    AUROC=pl.col('metrics').map_elements(lambda m: m[1]),
    PRAUC=pl.col('metrics').map_elements(lambda m: m[2]),
)

# Keep only the group keys and metrics, then add the count columns back.
result = result.drop(['y_pred_list', 'y_actual_list', 'y_prob_list', 'metrics'])
cpcnn = result.join(class_balance, on=['Metadata_AggType', 'Metadata_Label'])

  result = grouped.with_columns([


In [5]:
## Dino
# Motive binary predictions for the Dino features.
pred = pl.read_parquet(f"{output_dir}/dino/mad_featselect/classifier_results/motive_binary_predictions.parquet")
# Distinct count columns per (agg type, label); joined back onto the metrics below.
class_balance = pred.select(["Metadata_AggType", "Metadata_Label", "Metadata_Count_0", "Metadata_Count_1"]).unique()

# Collect all predictions of a group into list columns so each group is scored as a whole.
grouped = pred.group_by(['Metadata_AggType', 'Metadata_Label']).agg([
    pl.col('y_pred').alias('y_pred_list'),
    pl.col('y_actual').alias('y_actual_list'),
    pl.col('y_prob').alias('y_prob_list'),
])

# compute_metrics returns (f1, auroc, prauc) for each group.
result = grouped.with_columns([
    pl.struct(['y_pred_list', 'y_actual_list', 'y_prob_list']).map_elements(
        lambda s: compute_metrics(s['y_pred_list'], s['y_actual_list'], s['y_prob_list'])
    ).alias('metrics')
])

# Unpack by position: 0 = F1, 1 = AUROC, 2 = PRAUC.
# PRAUC must read index 2; reading index 1 would silently duplicate AUROC.
result = result.with_columns([
    pl.col('metrics').map_elements(lambda s: s[0]).alias('F1_Score'),
    pl.col('metrics').map_elements(lambda s: s[1]).alias('AUROC'),
    pl.col('metrics').map_elements(lambda s: s[2]).alias('PRAUC'),
])

# Keep only the group keys and metrics, then add the count columns back.
result = result.drop(['y_pred_list', 'y_actual_list', 'y_prob_list', 'metrics'])
dino = result.join(class_balance, on=['Metadata_AggType', 'Metadata_Label'])

  result = grouped.with_columns([


In [None]:
## CellProfiler

In [6]:
# Combine
# Tag each metrics table with the feature set it came from.
cpcnn = cpcnn.with_columns(Feat_type=pl.lit("cpcnn"))
dino = dino.with_columns(Feat_type=pl.lit("dino"))

# NOTE: CellProfiler is deliberately left out here because its motive cell
# above is empty. To include it later, tag it with Feat_type "cellprofiler"
# and append it to the list passed to pl.concat.
all_results = pl.concat(
    [cpcnn, dino],
    how="vertical",
)
all_results.write_parquet(f"{results_dir}/compiled_motive_metrics.parquet")