In [7]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import os

from sklearn.metrics import f1_score, recall_score


In [2]:
def get_threshold(targets,probs,recall,step=0.001):
    """Pick the decision threshold whose recall is closest to a target recall.

    Candidate thresholds are ``np.arange(0, 1, step)``. A sample counts as
    predicted positive when ``probs > thresh`` (strictly greater), and the
    recall at a threshold is the fraction of samples with ``targets == 1``
    that are predicted positive. When several thresholds are equally close
    to ``recall``, the smallest one is returned.

    Args:
        targets: array-like of binary labels; entries equal to 1 (or True)
            form the positive class.
        probs: array-like of scores, one per entry of ``targets``.
        recall: the recall value to match as closely as possible.
        step: spacing of the candidate threshold grid (default 0.001).

    Returns:
        The selected threshold, taken from the candidate grid.

    Raises:
        ValueError: if ``recall`` is NaN, or if ``targets`` and ``probs``
            do not contain the same number of elements.
    """
    # A NaN target can never be "closest" to anything; fail loudly instead
    # of silently returning an arbitrary threshold.
    if np.isnan(recall):
        raise ValueError("recall must be a number, got NaN")

    targets = np.asarray(targets).reshape(-1)
    probs = np.asarray(probs).reshape(-1)
    if targets.shape != probs.shape:
        raise ValueError(
            f"targets and probs must have the same number of elements, "
            f"got {targets.size} and {probs.size}"
        )

    # Recall only depends on the scores of the positive samples, so pull
    # them out once instead of re-scanning every sample per threshold.
    pos_probs = probs[targets == 1]
    n_pos = pos_probs.size

    thresholds = np.arange(0,1,step)
    if n_pos == 0:
        # No positive samples: recall is taken as 0 at every threshold.
        recalls = np.zeros(len(thresholds))
    else:
        true_pos = np.array([np.count_nonzero(pos_probs > t) for t in thresholds])
        recalls = true_pos / n_pos

    # argmin returns the first minimum, i.e. the smallest threshold on ties.
    return thresholds[np.argmin(np.abs(recalls - recall))]

In [26]:
# ---- Evaluation settings -------------------------------------------------
modes = ["erm", "true_subclass_gdro"]
seeds = [101, 102, 103, 104, 105]
split = "val"
recall = 0.55  # target recall handed to get_threshold for every run

for mode in modes:
    # One F1 value per seed, for each evaluation subset.
    overall_f1s, robust_f1s, tubes_f1s, notubes_f1s = [], [], [], []

    for seed in seeds:
        # NOTE(review): absolute, machine-specific path; torch.load unpickles
        # the file, so only load outputs from a trusted source.
        result_dir = f"/media/nvme_data/gas_results/pmx/{mode}/seed_{seed}"
        outputs_dir = os.path.join(result_dir, "outputs.pt")
        outputs = torch.load(outputs_dir)[split]

        probs, targets = outputs["probs"], outputs["targets"]
        subclass_labels = outputs["true_subclass"]

        # NOTE(review): the threshold is chosen on the same split that is
        # scored below.
        thresh = get_threshold(targets, probs, recall)
        preds = probs > thresh

        # Presumably subclass_labels == 1 marks samples with a chest tube
        # (inferred from the mask names and report labels) -- TODO confirm.
        mask = np.logical_not(subclass_labels)               # subclass label is 0
        neg_mask = np.logical_not(targets)                   # target is 0
        postube_mask = np.logical_and(subclass_labels, targets)
        posnotube_mask = np.logical_and(mask, targets)
        mask1 = np.logical_or(neg_mask, postube_mask)        # negatives + positives with subclass 1
        mask2 = np.logical_or(neg_mask, posnotube_mask)      # negatives + positives with subclass 0

        overall_f1s.append(f1_score(targets, preds))
        for _f1_list, _subset_mask in zip(
            (robust_f1s, tubes_f1s, notubes_f1s),
            (mask, mask1, mask2),
        ):
            _f1_list.append(f1_score(targets[_subset_mask], preds[_subset_mask]))

    overall_f1s, robust_f1s, tubes_f1s, notubes_f1s = (
        np.array(_f1_list)
        for _f1_list in (overall_f1s, robust_f1s, tubes_f1s, notubes_f1s)
    )

    # Report mean +/- 1.96 * std / sqrt(n_seeds), in percent.
    # NOTE(review): ndarray.std() defaults to ddof=0 (population std); with
    # only len(seeds) runs this understates the spread -- confirm intended.
    _sqrt_n = np.sqrt(len(seeds))

    print(f"\n{mode.upper()} on {split}\n")
    for _label, _f1_arr in (
        ("Overall F1-score", overall_f1s),
        ("Robust F1-score", robust_f1s),
        ("Pmx w/ Tubes F1-score", tubes_f1s),
        ("Pmx w/o Tubes F1-score", notubes_f1s),
    ):
        print(f"{_label}: {100*_f1_arr.mean():.1f} +/- {100*1.96*_f1_arr.std()/_sqrt_n:.1f}")
    print()  # blank line after the last metric
    print("="*20)


ERM on val

Overall F1-score: 60.8 +/- 3.6
Robust F1-score: 36.5 +/- 5.8
Pmx w/ Tubes F1-score: 63.5 +/- 4.6
Pmx w/o Tubes F1-score: 27.3 +/- 6.4


TRUE_SUBCLASS_GDRO on val

Overall F1-score: 57.4 +/- 4.8
Robust F1-score: 31.4 +/- 2.7
Pmx w/ Tubes F1-score: 58.0 +/- 6.5
Pmx w/o Tubes F1-score: 26.7 +/- 4.8

