In [11]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, hamming_loss, precision_score, recall_score, ndcg_score
import numpy as np
import torch

In [12]:
%load_ext autoreload
%autoreload 2

# from utils.DataClass import ProcessDNNTSP, ProcessSFCNTSP,ProcessGP, DataClass, ProcessLANET, ProcessTCMBN
from utils.DataClass import DataClass, ProcessFile, ProcessGP

from utils.Analysis import ModelComparison, Metric, ExperimentInfo
from utils.Analysis import avg_size_of_pred_set,  plot_label_distribution, kl_divergence, plot_probas_distribution, get_label_frequencies 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def phr(predictions, ground_truth, top_n=10, k=None):
    """
    Calculate the hit rate at top-n (PHR) metric.

    A row counts as a hit when at least one of its ``top_n`` highest-scored
    items is marked relevant (== 1) in ``ground_truth``. The result is the
    fraction of rows that are hits.

    Parameters:
    - predictions (array-like): A 2D array where each row contains scores for items.
    - ground_truth (array-like): A 2D binary array (multi-hot encoded) of the same
                                 shape, where each row indicates the actual relevance
                                 of items (1 if relevant, 0 otherwise).
    - top_n (int): Number of top-scored items to consider for checking hits.
    - k (int, optional): Alias for ``top_n``; takes precedence when given. It lets the
                         function be configured with the same ``k=`` keyword that is
                         used for ``ndcg_score``.

    Returns:
    - float: PHR metric value in [0, 1]; 0.0 when the input is empty.

    Raises:
    - ValueError: If the inputs are not 2D arrays of identical shape, or if
                  ``ground_truth`` contains values other than 0 and 1 (which is
                  what happens when the two arguments are passed in swapped order).
    """
    if k is not None:
        top_n = k

    scores = np.asarray(predictions)
    relevance = np.asarray(ground_truth)

    # Nothing to rank: report "no hits" instead of dividing by zero.
    if scores.size == 0:
        return 0.0

    if scores.ndim != 2 or scores.shape != relevance.shape:
        raise ValueError(
            "predictions and ground_truth must be 2D arrays of the same shape, "
            f"got {scores.shape} and {relevance.shape}"
        )

    # Scores passed as ground truth would silently produce a near-zero metric,
    # so reject anything that is not a multi-hot encoding.
    if not np.isin(relevance, (0, 1)).all():
        raise ValueError(
            "ground_truth must be a binary multi-hot array; "
            "check that predictions and ground_truth were not swapped"
        )

    # Indices of the top_n highest scores per row, best first.
    top_indices = np.argsort(scores, axis=1)[:, ::-1][:, :top_n]
    hit_per_row = (np.take_along_axis(relevance, top_indices, axis=1) == 1).any(axis=1)

    return float(hit_per_row.mean())


In [33]:
# Define here the lists of metrics you want to compute.
# metric_list            - metrics computed in the multilabel setting.
# metric_list_for_labels - metrics computed in the binary classification setting;
#                          this list is used to build the per-label metrics.
#
# Each entry is a Metric(...) that receives:
# 1. The metric function object.
# 2. 'pred_labels' or 'probas', depending on what the metric works with.
#    Under the hood, the library uses its own strategy for selecting thresholds
#    instead of the usual fixed 0.5 cut-off when turning probabilities into labels.
# 3. name (optional) - how to name the metric when displaying it on charts.
#    If you don't pass it, the default name is built as metric_func + str(kwargs).
# 4. Any further keyword arguments you want the metric itself to receive.
#    For example, average=<...> passed to the initialiser is substituted into
#    the metric function when the corresponding metric is called.


# First metric list handed to evaluate_and_save; both entries consume thresholded
# labels ('pred_labels'). average/zero_division are forwarded to f1_score.
metric_list_for_set_sizes = [
    Metric(
        f1_score,
        'pred_labels',
        name='weighted f1',
        average='weighted',
        zero_division=1.0,
    ),
    Metric(
        hamming_loss,
        'pred_labels',
    ),
]


def roc_auc(y_true, y_pred, **kwargs):
    """
    ROC AUC restricted to the label columns on which it is defined.

    A label column only has a ROC curve when both classes occur in it. Columns
    whose targets are all negative or all positive are dropped from both
    ``y_true`` and ``y_pred`` before delegating to ``roc_auc_score``.

    Parameters:
    - y_true: 2D multi-hot target array (samples x labels).
              NOTE(review): assumes entries are 0/1 so that a column sum of 0
              means "no positives" and a sum of n_samples means "no negatives".
    - y_pred: 2D array of scores with the same column layout as ``y_true``.
    - **kwargs: forwarded unchanged to ``roc_auc_score`` (e.g. ``average=``).

    Returns:
    - The value returned by ``roc_auc_score`` on the remaining columns.
    """
    positives_per_task = y_true.sum(axis=0)
    n_samples = y_true.shape[0]

    # Keep only tasks that contain at least one positive AND one negative sample.
    has_both_classes = (positives_per_task != 0) & (positives_per_task != n_samples)
    scorable_tasks = np.where(has_both_classes)[0]

    return roc_auc_score(
        y_true=y_true[:, scorable_tasks],
        y_score=y_pred[:, scorable_tasks],
        **kwargs,
    )

def _phr_at_k(y_true, y_score, k=10):
    """
    Adapter that exposes ``phr`` with the calling convention of the other metrics.

    ``phr`` is declared as ``phr(predictions, ground_truth, top_n=...)``, whereas every
    other metric in ``metric_list`` takes the targets first and (for ranking metrics)
    a ``k=`` cut-off. Binding by keyword here removes both mismatches.
    NOTE(review): assumes Metric calls the function as func(y_true, y_score, **kwargs),
    which is the only form compatible with the sklearn metrics listed below.
    """
    return phr(predictions=y_score, ground_truth=y_true, top_n=k)


# Metrics computed in the multilabel setting. Keyword arguments after `name`
# are forwarded to the metric function when it is called.
metric_list = [
    Metric(hamming_loss, 'pred_labels', name='hamming loss'),
    Metric(ndcg_score, 'probas', name='ndcg 10', k=10),
    Metric(ndcg_score, 'probas', name='ndcg 30', k=30),
    Metric(_phr_at_k, 'probas', name='PHR 10', k=10),
    Metric(_phr_at_k, 'probas', name='PHR 30', k=30),
    Metric(f1_score, 'pred_labels', name='micro f1', average='micro'),
    Metric(f1_score, 'pred_labels', name='macro f1', average='macro'),
    Metric(f1_score, 'pred_labels', name='weighted f1', average='weighted'),
    Metric(roc_auc, 'probas', name='micro ROC AUC', average='micro'),
    Metric(roc_auc, 'probas', name='macro ROC AUC', average='macro'),
    Metric(roc_auc, 'probas', name='weighted ROC AUC', average='weighted'),
]



# Metrics for the binary (per-label) setting. No `name` is given, so each entry
# falls back to the default display name.
metric_list_for_labels = [
    Metric(
        accuracy_score,
        'pred_labels',
    ),
    Metric(
        precision_score,
        'pred_labels',
    ),
]

In [34]:
def eval_dataset(
    dataset_name,
    models_root='/app/All_models/model_pred_and_gt',
    gp_root='/app/MyGP_topfreq/model_pred_and_gt',
) -> ModelComparison:
    """
    Load every model's saved output for one dataset and bundle it for comparison.

    Parameters:
    - dataset_name (str): Dataset identifier handed to each loader
                          (e.g. 'DC_preprocessed').
    - models_root (str): Directory holding one sub-directory per file-based model
                         (SFCNTSP, DNNTSP, LANET, TCMBN).
    - gp_root (str): Directory holding the GPTopFreq output, read via ProcessGP.

    Returns:
    - ModelComparison: comparison object built from the five models, in the order
                       SFCNTSP, DNNTSP, LANET, GPTopFreq, TCMBN.
    """
    # (loader class, directory, model name), kept in the order the models are
    # passed to ModelComparison.
    model_specs = [
        (ProcessFile, f'{models_root}/SFCNTSP', 'SFCNTSP'),
        (ProcessFile, f'{models_root}/DNNTSP', 'DNNTSP'),
        (ProcessFile, f'{models_root}/LANET', 'LANET'),
        (ProcessGP, gp_root, 'GPTopFreq'),
        (ProcessFile, f'{models_root}/TCMBN', 'TCMBN'),
    ]

    model_data = [
        DataClass(loader(dataset_name, model_dir, model_name))
        for loader, model_dir, model_name in model_specs
    ]

    print('Running tests')

    test_comp = ModelComparison(*model_data)

    print('Done')

    return test_comp

## Updated Example

In [3]:
# Custom loader classes that load each model's output and preprocess it into the required format.
# They are created according to the rules described in the README.

# Worked example on the 'DC_preprocessed' dataset: for each model, read its saved
# output with a loader and wrap the result in a DataClass.
sfcntsp = ProcessFile('DC_preprocessed', '/app/All_models/model_pred_and_gt/SFCNTSP', 'SFCNTSP')

sfcntsp = DataClass(sfcntsp)

dnntsp = ProcessFile('DC_preprocessed', '/app/All_models/model_pred_and_gt/DNNTSP', 'DNNTSP')
dnntsp = DataClass(dnntsp)

lanet = ProcessFile('DC_preprocessed', '/app/All_models/model_pred_and_gt/LANET', 'LANET')
lanet = DataClass(lanet)

# NOTE(review): eval_dataset() passes a third argument ('GPTopFreq') to ProcessGP,
# while this call passes only two - confirm which form ProcessGP expects.
gp = ProcessGP('DC_preprocessed', '/app/MyGP_topfreq/model_pred_and_gt')
gp = DataClass(gp)

tcmbn = ProcessFile('DC_preprocessed', '/app/All_models/model_pred_and_gt/TCMBN', 'TCMBN')
tcmbn = DataClass(tcmbn)


# Pass to the ModelComparison initialiser the DataClass objects of the different
# models, all evaluated on the same dataset, that you want to compare.
# During initialisation it checks that the data comes from the same dataset
# and prints the corresponding messages.
test_comp = ModelComparison(sfcntsp, dnntsp, lanet, gp, tcmbn)

# Call evaluate_and_save on the ModelComparison object: it plots all the graphs,
# calculates the metrics and saves them. The metrics listed in the arrays defined
# above are the ones used for the calculation.
# There are also the named parameters `show` and `save`:
# - show controls whether the graphs are displayed (if False, no graphs are output),
#   which is useful to avoid cluttering the cell output;
# - save controls whether the results are saved.

test_comp.evaluate_and_save(metric_list_for_set_sizes, metric_list_for_labels, metric_list, figsize=(12,8),show=True)


## Dunnhumby Carbo analysis 

In [None]:
# Load all five models' outputs for the Dunnhumby Carbo dataset and build the comparison.
test_comp = eval_dataset("DC_preprocessed")

In [None]:
# Evaluate the Dunnhumby Carbo comparison with the configured metric lists;
# show=False means no graphs are output in the cell.
test_comp.evaluate_and_save(
    metric_list_for_set_sizes,
    metric_list_for_labels,
    metric_list,
    figsize=(12, 8),
    show=False,
)

## Mimic3

In [None]:
# Load all five models' outputs for the Mimic3 dataset and build the comparison.
test_comp = eval_dataset("mimic3_preprocessed")

In [None]:
# Evaluate the Mimic3 comparison with the configured metric lists;
# show=False means no graphs are output in the cell.
test_comp.evaluate_and_save(
    metric_list_for_set_sizes,
    metric_list_for_labels,
    metric_list,
    figsize=(12, 8),
    show=False,
)

## Synthea

In [None]:
# Load all five models' outputs for the Synthea dataset and build the comparison.
test_comp = eval_dataset('synthea_preprocessed')

In [None]:
# Evaluate the Synthea comparison with the configured metric lists;
# show=False means no graphs are output in the cell.
test_comp.evaluate_and_save(
    metric_list_for_set_sizes,
    metric_list_for_labels,
    metric_list,
    figsize=(12, 8),
    show=False,
)

## Instacart

In [None]:
# Load all five models' outputs for the Instacart dataset and build the comparison.
test_comp = eval_dataset("instacart_preprocessed")

In [None]:
# Evaluate the Instacart comparison with the configured metric lists;
# show=False means no graphs are output in the cell.
test_comp.evaluate_and_save(
    metric_list_for_set_sizes,
    metric_list_for_labels,
    metric_list,
    figsize=(12, 8),
    show=False,
)