In [None]:
import tensorflow as tf

# Print every record stored in one TensorBoard event file.
events_file = "./lightning_logs_private/scaffle/version_1/events.out.tfevents.1687079795.ip-10-0-0-10.eu-west-1.compute.internal.16796.82"
event_records = tf.compat.v1.train.summary_iterator(events_file)
for summary in event_records:
    print(summary)

In [45]:
from pathlib import Path
from typing import Dict

import torch
from torchmetrics import Metric
from copy import deepcopy
from typing import Any, Dict, Optional, Union
import numpy as np
import os
import torch
from torch import Tensor
from torch.nn import ModuleList

from torchmetrics.metric import Metric
from torchmetrics.utilities import apply_to_collection

class TopkAccuracy(Metric):
    """Top-k accuracy accumulated over calls to ``update``.

    For every column (dim 1) the ``k`` rows (dim 0) with the highest class-1
    score are selected; the column is a hit when at least one selected row has
    a non-zero target. ``compute`` returns hits divided by the number of
    columns that contain a non-zero target at all.

    Shapes are not checked. The indexing below assumes ``scores`` is
    ``(rows, columns, classes >= 2)`` and ``target`` / ``mask`` are
    ``(rows, columns)`` — TODO confirm against the caller.
    """

    def __init__(self, k, dist_sync_on_step=False):
        """
        Args:
            k: number of highest-scoring rows inspected per column.
            dist_sync_on_step: forwarded unchanged to ``Metric.__init__``.
        """
        # `add_state` registers each accumulator the metric needs; `dist_reduce_fx`
        # names the reduction used to merge that state across processes.
        super().__init__(dist_sync_on_step=dist_sync_on_step)

        self.add_state("tp_acc", default=torch.tensor(0).float(), dist_reduce_fx="sum")
        self.add_state("total_acc", default=torch.tensor(0).float(), dist_reduce_fx="sum")
        self.k = k
        self.name = f"TopkAccuracy_{k}"

    def update(self, preds: torch.Tensor, target: torch.Tensor, mask: torch.Tensor, scores: torch.Tensor):
        """Accumulate hits and eligible columns for one batch.

        Args:
            preds: accepted for interface compatibility; it does not influence the metric.
            target: non-zero where a row is a correct answer.
            mask: zero for rows that must be ignored.
            scores: per-row class scores; only class index 1 is ranked.
        """
        # NOTE(review): masked rows get score 0, so they can outrank valid rows whose
        # score is negative (e.g. log-probabilities). Harmless if scores are
        # probabilities — confirm. Their targets are zeroed, so they never count as hits.
        scores = scores * mask.float().unsqueeze(-1)
        target = target * mask
        scores = scores[:, :, 1]

        # topk raises when asked for more rows than exist; fall back to ranking all of them.
        k = min(self.k, scores.shape[0])
        inds = scores.topk(k, dim=0).indices
        gathered = torch.gather(target, 0, inds).sum(0) > 0

        self.tp_acc += torch.sum(gathered)
        self.total_acc += torch.sum(torch.any(target, dim=0))

    def compute(self):
        """Return hits / eligible columns (denominator floored at 1 to avoid dividing by zero)."""
        return (self.tp_acc.float() / max(self.total_acc, 1))

def bootstrap(preds, target, masks, scores, n_iterations=10000, metrics=None, seed=None):
    """Bootstrap metric values by resampling columns (dim 1) with replacement.

    Each iteration draws ``preds.shape[1]`` column indices with replacement,
    feeds the resampled tensors to every metric, records ``compute()`` and
    resets the metrics.

    Args:
        preds, target, masks, scores: tensors indexed as ``x[:, ids]``; they are
            passed to ``metric.update(preds, target, masks, scores=scores)``.
        n_iterations: number of bootstrap replicates (default 10000, the
            previously hard-coded value).
        metrics: objects exposing ``update`` / ``compute`` / ``reset``. Defaults
            to ``TopkAccuracy`` for k = 1, 3, 5. Passed-in metrics are expected
            to start in a reset state.
        seed: optional integer; when given, the resampling is reproducible.
            When ``None`` the draws come from numpy's global random state, as before.

    Returns:
        Tensor of shape ``(len(metrics), n_iterations)``.

    Raises:
        ValueError: if ``n_iterations`` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")
    if metrics is None:
        metrics = [TopkAccuracy(1), TopkAccuracy(3), TopkAccuracy(5)]

    # The numpy module itself exposes `choice`, so it can stand in for a generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_columns = preds.shape[1]

    bootstraped_metrics = []
    for _ in range(n_iterations):
        bootstraped_ids = torch.as_tensor(
            rng.choice(n_columns, n_columns, replace=True), dtype=torch.long
        )
        bd_preds = preds[:, bootstraped_ids]
        bd_targets = target[:, bootstraped_ids]
        bd_masks = masks[:, bootstraped_ids]
        bd_scores = scores[:, bootstraped_ids]

        for m in metrics:
            m.update(bd_preds, bd_targets, bd_masks, scores=bd_scores)
        bootstraped_metrics.append(torch.stack([m.compute() for m in metrics], dim=0))
        # Clear the accumulators so the next replicate starts from zero.
        for m in metrics:
            m.reset()

    return torch.stack(bootstraped_metrics, dim=0).T
# Display names, matched by position to the sorted run directories processed below.
# 'deep_analyze' is intentionally left out of the list.
model_names = ['bert_fine', 'bert_our', 'bert_caching', 'scaffle_bert', 'scaffle_only']

# Total two-sided tail mass of the reported interval: bounds are taken at
# quantile / 2 and 1 - quantile / 2 of the bootstrap distribution.
quantile = 1e-4

# model name -> bootstrap metric values, filled in by the evaluation loop.
model_preds = dict()

def pretty_print(output_dict):
    """Print each entry of ``output_dict`` rounded to three decimals.

    Entries whose key contains ``'quantile'`` are treated as a (lower, upper)
    pair and printed as an interval; every other value is printed as a single
    number. Values must expose ``.item()``.
    """
    for name, value in output_dict.items():
        if 'quantile' not in name:
            print(f"{name}:", round(value.item(), 3))
            continue
        lower = round(value[0].item(), 3)
        upper = round(value[1].item(), 3)
        print(f"{name}: (", lower, ',', upper, ')')


# Bootstrap the saved predictions of every run directory and print, per top-k metric,
# the mean, standard deviation and percentile interval of the bootstrap distribution.
run_dirs = sorted(Path("/home/centos/bug_ml/new/training/l_logs_private/").glob("*/*"))

# Display names are assigned by position in the sorted listing. Without this check an
# extra directory only surfaces as an IndexError after the earlier runs were bootstrapped.
if len(run_dirs) > len(model_names):
    raise IndexError(
        f"found {len(run_dirs)} run directories but only {len(model_names)} model names: "
        f"{[str(p) for p in run_dirs]}"
    )

for run_idx, logs_save_path in enumerate(run_dirs):
    # NOTE(review): torch.load unpickles its input — only load files from trusted runs.
    preds_vals = torch.load(os.path.join(logs_save_path, 'preds.pt'), map_location=torch.device('cpu'))
    targets_vals = torch.load(os.path.join(logs_save_path, 'targets_vals.pt'), map_location=torch.device('cpu'))
    masks_vals = torch.load(os.path.join(logs_save_path, 'masks_vals.pt'), map_location=torch.device('cpu'))
    scores_vals = torch.load(os.path.join(logs_save_path, 'scores_vals.pt'), map_location=torch.device('cpu'))

    model_name = model_names[run_idx]
    computed_vals = bootstrap(preds_vals, targets_vals, masks_vals, scores_vals)
    # Kept for the significance tests run in a later cell.
    model_preds[model_name] = computed_vals.clone()

    # Row order of `computed_vals` follows the metric order used inside `bootstrap`.
    for metric_idx, m in enumerate([TopkAccuracy(1), TopkAccuracy(3), TopkAccuracy(5)]):
        metric_vals = computed_vals[metric_idx, :]
        prefix = f"{model_name}_{m.name}"
        output_dict = {}
        output_dict[prefix + "_mean"] = metric_vals.mean(dim=0)
        output_dict[prefix + "_std"] = metric_vals.std(dim=0)
        output_dict[prefix + "_quantile"] = (
            torch.quantile(metric_vals, quantile / 2, interpolation='lower'),
            torch.quantile(metric_vals, 1 - quantile / 2, interpolation='lower'),
        )
        pretty_print(output_dict)
    

bert_fine_TopkAccuracy_1_mean: 0.509
bert_fine_TopkAccuracy_1_std: 0.016
bert_fine_TopkAccuracy_1_quantile: ( 0.441 , 0.569 )
bert_fine_TopkAccuracy_3_mean: 0.695
bert_fine_TopkAccuracy_3_std: 0.015
bert_fine_TopkAccuracy_3_quantile: ( 0.638 , 0.747 )
bert_fine_TopkAccuracy_5_mean: 0.803
bert_fine_TopkAccuracy_5_std: 0.013
bert_fine_TopkAccuracy_5_quantile: ( 0.757 , 0.852 )
bert_our_TopkAccuracy_1_mean: 0.519
bert_our_TopkAccuracy_1_std: 0.016
bert_our_TopkAccuracy_1_quantile: ( 0.458 , 0.577 )
bert_our_TopkAccuracy_3_mean: 0.731
bert_our_TopkAccuracy_3_std: 0.014
bert_our_TopkAccuracy_3_quantile: ( 0.676 , 0.784 )
bert_our_TopkAccuracy_5_mean: 0.819
bert_our_TopkAccuracy_5_std: 0.012
bert_our_TopkAccuracy_5_quantile: ( 0.775 , 0.862 )
bert_caching_TopkAccuracy_1_mean: 0.48
bert_caching_TopkAccuracy_1_std: 0.016
bert_caching_TopkAccuracy_1_quantile: ( 0.423 , 0.533 )
bert_caching_TopkAccuracy_3_mean: 0.678
bert_caching_TopkAccuracy_3_std: 0.015
bert_caching_TopkAccuracy_3_quantile: ( 

In [46]:
import scipy.stats as ss

# Two-sample t-tests (equal variances assumed) between the bootstrap distributions
# of selected model pairs, one line per metric and pair.
# NOTE(review): the test treats bootstrap replicates as independent observations, so the
# statistic grows with the number of replicates — confirm this is the intended comparison.
comparisons = [
    # (label printed for the first model, key of the first model, key of the second model)
    ("bert_our", "bert_our", "bert_fine"),
    ("bert_caching", "bert_caching", "bert_fine"),
    ("bert_caching", "bert_caching", "bert_our"),
    ("scaffle", "scaffle_only", "bert_our"),
    ("scaffle", "scaffle_only", "scaffle_bert"),
]

for i, m in enumerate([TopkAccuracy(1), TopkAccuracy(3), TopkAccuracy(5)]):
    for label_a, key_a, key_b in comparisons:
        result = ss.ttest_ind(a=model_preds[key_a][i, :], b=model_preds[key_b][i, :], equal_var=True)
        print(f"Metric: {m.name}, T-test between {label_a} and {key_b}:", result)



Metric: TopkAccuracy_1, T-test between bert_our and bert_fine: Ttest_indResult(statistic=45.30423719681878, pvalue=0.0)
Metric: TopkAccuracy_1, T-test between bert_caching and bert_fine: Ttest_indResult(statistic=-130.4606141005818, pvalue=0.0)
Metric: TopkAccuracy_1, T-test between bert_caching and bert_our: Ttest_indResult(statistic=-175.67499247956457, pvalue=0.0)
Metric: TopkAccuracy_1, T-test between scaffle and bert_our: Ttest_indResult(statistic=-457.0323261501074, pvalue=0.0)
Metric: TopkAccuracy_1, T-test between scaffle and scaffle_bert: Ttest_indResult(statistic=-104.31055527897986, pvalue=0.0)
Metric: TopkAccuracy_3, T-test between bert_our and bert_fine: Ttest_indResult(statistic=175.46935148277979, pvalue=0.0)
Metric: TopkAccuracy_3, T-test between bert_caching and bert_fine: Ttest_indResult(statistic=-82.39267408542895, pvalue=0.0)
Metric: TopkAccuracy_3, T-test between bert_caching and bert_our: Ttest_indResult(statistic=-256.88911279102376, pvalue=0.0)
Metric: TopkAccu

In [17]:
# Candidate display names, one per run directory printed below.
model_names = ['bert', 'bert', 'bert_caching', 'deep_analyze', 'scaffle_bert', 'scaffle_only']

# Show the run directories in the sorted order used to match them to names.
logs_root = Path("/home/centos/bug_ml/new/training/l_logs_private/")
for logs_save_path in sorted(logs_root.glob("*/*")):
    print(logs_save_path)

/home/centos/bug_ml/new/training/l_logs_private/bert/version_0
/home/centos/bug_ml/new/training/l_logs_private/bert/version_1
/home/centos/bug_ml/new/training/l_logs_private/bert_caching/version_0
/home/centos/bug_ml/new/training/l_logs_private/deep_analyze/version_1
/home/centos/bug_ml/new/training/l_logs_private/scaffle/scaffle_bert
/home/centos/bug_ml/new/training/l_logs_private/scaffle/scaffle_only


In [None]:
def eval_diff_predictions(logdir, our_best_model='', k=1):
    """Print the chunks on which other models' top-k picks differ from a reference model's.

    Every entry of ``logdir`` names one model. For an entry called ``<name>`` the
    predictions are loaded from the file ``<name>_preds`` and the chunk lengths from
    ``preds_lens``.
    NOTE(review): both files are resolved against the current working directory, not
    ``logdir`` — confirm that is where they are saved.

    Predictions are split along dim 0 at the cumulative chunk lengths. Within each
    chunk the ``k`` rows with the highest class-1 score are selected per column, and
    the chunk is reported when the selected row sets differ between the two models.
    Assumes predictions are ``(rows, columns, classes >= 2)`` — TODO confirm.

    Args:
        logdir: directory whose entries are named after the models.
        our_best_model: name of the reference model.
        k: number of top-scoring rows compared per column (capped at the chunk size).

    Raises:
        KeyError: if ``our_best_model`` is not the name of an entry in ``logdir``.
    """
    import glob  # local import: not provided by the notebook's visible import cell

    models_preds = {}
    for model_dir in glob.glob(logdir + "/*"):
        model_name = os.path.basename(model_dir)
        # NOTE(review): torch.load unpickles its input — only load trusted files.
        models_preds[model_name] = torch.load(model_name + '_preds')

    if our_best_model not in models_preds:
        raise KeyError(
            f"reference model {our_best_model!r} not found in {logdir!r}; "
            f"available: {sorted(models_preds)}"
        )

    # Cumulative lengths are the split points between consecutive chunks.
    chunk_lens = torch.as_tensor(torch.load('preds_lens'), dtype=torch.long)
    boundaries = torch.cumsum(chunk_lens, dim=0).tolist()
    our_chunks = torch.tensor_split(models_preds[our_best_model], boundaries, dim=0)

    for model_name, other_preds in models_preds.items():
        if model_name == our_best_model:
            continue
        other_chunks = torch.tensor_split(other_preds, boundaries, dim=0)
        for i, (our_chunk, other_chunk) in enumerate(zip(our_chunks, other_chunks)):
            n_rows = min(our_chunk.shape[0], other_chunk.shape[0])
            if n_rows == 0:
                # The final boundary usually equals the total length, leaving an empty tail.
                continue
            top = min(k, n_rows)

            our_inds = our_chunk[:, :, 1].topk(top, dim=0).indices
            inds = other_chunk[:, :, 1].topk(top, dim=0).indices

            # Compare as sets: the order inside the top-k does not matter.
            same_picks = torch.equal(
                torch.sort(our_inds, dim=0).values, torch.sort(inds, dim=0).values
            )
            if not same_picks:
                print(f"Found diff at : {i}")
                print("Our prediction: ", our_inds)
                print("Their prediction: ", inds)
                print("_________________________")

# Compare every model found under the log directory against 'bert_caching'.
eval_diff_predictions(
    logdir="/home/centos/bug_ml/new/training/l_logs_private/",
    our_best_model='bert_caching',
)