# Environment

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import divexplorer 
import pandas as pd
pd.set_option('max_colwidth', None)
import os
import numpy as np
from copy import deepcopy
from sklearn.metrics import accuracy_score, f1_score

from utils_analysis import plotMultipleSV, plotShapleyValue

import warnings
warnings.filterwarnings('ignore')

# Util Functions

In [None]:
## Function for sorting data cohorts
def sortItemset(x, abbreviations={}):
    """Render an itemset as a single, alphabetically ordered, comma-separated string.

    Args:
        x (iterable of str): the items of the itemset.
        abbreviations (dict): {text to find: replacement}; every pair is applied to the
            joined string with str.replace, one after the other, in the dict order.

    Returns:
        str: the sorted items joined by ", ", after the replacements.
    """
    label = ", ".join(sorted(x))
    for full_text, short_text in abbreviations.items():
        label = label.replace(full_text, short_text)
    return label

In [None]:
def attributes_in_itemset(itemset, attributes, alls = True):
    """Tell whether the attributes of an itemset belong to a given list of attributes.

    Every item is expected in the "attribute=value" form: the attribute is the text
    that precedes the first "=".

    Args:
        itemset (frozenset): the itemset to check
        attributes (list): the attributes of interest
        alls (bool): If True, EVERY attribute of the itemset must be in `attributes`.
        If False, AT LEAST ONE attribute of the itemset must be in `attributes`.

    Returns:
        bool: True if the itemset is admitted, False otherwise
    """
    # The empty itemset (i.e., info of the entire dataset) is never admitted
    # when some attributes are requested
    if itemset == frozenset() and attributes:
        return False

    itemset_attributes = (item.split("=")[0] for item in itemset)

    if alls:
        # Rejected as soon as one attribute is outside the admitted ones
        return all(attr_i in attributes for attr_i in itemset_attributes)

    # Admitted as soon as one attribute is among the admitted ones
    return any(attr_i in attributes for attr_i in itemset_attributes)
    
def filter_itemset_df_by_attributes(df: pd.DataFrame, attributes: list, alls = True, itemset_col_name: str = "itemsets") -> pd.DataFrame:
    """Keep the rows whose itemset is admitted by `attributes_in_itemset`.

    Args:
        df (pd.DataFrame): the input itemsets (with their info).
        attributes (list): the attributes of interest
        alls (bool): If True, EVERY attribute of the itemset must be in `attributes`.
        If False, AT LEAST ONE attribute of the itemset must be in `attributes`.
        itemset_col_name (str) : the name of the itemset column, "itemsets" as default

    Returns:
        pd.DataFrame: the admitted rows of `df` (with their info)
    """

    def _is_admitted(itemset):
        return attributes_in_itemset(itemset, attributes, alls = alls)

    admitted_rows = df[itemset_col_name].apply(_is_admitted)
    return df.loc[admitted_rows]

In [None]:
## Define abbreviations for plot and visualization
## NOTE(review): abbreviateDict is imported but not used in the cells visible here.
from divexplorer.FP_Divergence import abbreviateDict
## {text to find: shorter text}. sortItemset applies every pair with str.replace,
## one after the other, in the order the keys are written below.
## NOTE(review): the key 'Self-reported fluency level=native' has no space before "=",
## while the FSC column is listed as 'Self-reported fluency level ' (trailing space);
## confirm that the key matches the item names actually generated.
abbreviations = {'Self-reported fluency level=native': 'fluency=native', \
                  'total_silence':'tot_silence', 'location': 'loc', \
                  'Current language used for work/school=English (United States)': 'lang=EN_US', \
                  'ageRange': 'age', \
                  'speakerId' : 'spkID', \
                  'First Language spoken=English (United States)':  'lang=EN_US', \
                  'trimmed': 'trim', \
                  'total_': 'tot_', \
                  'speed_rate_word':'speakRate', \
                  'speed_rate_char':'speakCharRate', \
                  'change language': 'change lang', \
                  'duration': 'dur'}

## Separate (shallow) copy of the mapping; no entry is changed in this cell
abbreviations_shorter = abbreviations.copy()

# Define targets

In [None]:
## Target for DivExplorer: 
# 'prediction' is 1 if predicted_intent == original_intent, 0 otherwise
target_col = 'prediction' 
## Metric requested from getFrequentPatternDivergence
target_metric = 'd_posr'
## Name given to that metric after the column renaming; also passed to FP_Divergence
target_div = 'd_accuracy'
## Name of the t-value column associated with the metric
t_value_col = 't_value_tp_fn'

In [None]:
## Columns for visualization
## Columns kept (after the renaming below) in every frequent-pattern table
show_cols = ['support', 'itemsets', '#errors', '#corrects', 'accuracy', \
                'd_accuracy', 't_value', 'support_count', 'length']
## {column name returned by DivExplorer: name used in this notebook}
## NOTE(review): the t-value key is written literally and equals t_value_col as set for FSC/ITALIC.
remapped_cols = {'tn': '#errors', 'tp': '#corrects', 'posr': 'accuracy', \
                target_metric: target_div, 't_value_tp_fn': 't_value'}

# FSC

## Retrieve Data and Compute Divergence

In [None]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [None]:
## Columns of the df file that we are going to analyze 
## NOTE(review): 'Self-reported fluency level ' ends with a space; presumably it mirrors
## the header of the CSV file - confirm.
demo_cols = ['Self-reported fluency level ', 'First Language spoken',
       'Current language used for work/school', 'gender', 'ageRange']

## 'object' and 'location' get their "none_object"/"none_location" values renamed to
## "none" in the loading loop
slot_cols = ['action', 'object', 'location']

## Columns whose discretized values are relabelled "low"/"medium"/"high" in the loading loop
signal_cols = ['total_silence', 'total_duration', 'trimmed_duration', 
       'n_words', 'speed_rate_word', 'speed_rate_word_trimmed'] 

## All the attributes passed to discretize and then explored with DivExplorer
input_cols = demo_cols + signal_cols + slot_cols

In [None]:
approach = "divexplorer"
## Minimum support passed to getFrequentPatternDivergence
min_sup = 0.03
## Used in the results path (folder "k_{k}") and, in the following cells,
## as the number of itemsets that are shown
k = 2

## Configurations to analyze: one results folder each
configs = [
    "fsc_original",
    "fsc_csi",
    "fsc_cm",
    "fsc_knn",
    "fsc_random", 
    "fsc_supervised_oracle",
    "fsc_metadata_oracle",
    "fsc_all"
    ] 

## Outputs of the loop, keyed by configuration name
FP_fm_dict = {}          # frequent-pattern table (renamed and rounded columns)
fp_divergence_dict = {}  # FP_Divergence object built on that table
df_dict = {}             # discretized dataframe given to DivExplorer

## Project helper: imported once, before the loop
from divergence_utils import discretize

for config in configs:

    print(config)

    ## "fsc_original" and "fsc_all" have no "k_{k}" sub-folder
    path_parts = [os.getcwd(), "results", "fsc", config]
    if config not in ("fsc_original", "fsc_all"):
        path_parts.append(f"k_{k}")
    input_file_divexplorer = os.path.join(*path_parts, "42", "df_test.csv")
    df = pd.read_csv(input_file_divexplorer, index_col=0)

    ## Add SpeakerID information if it is present in the df
    ## (third "/"-separated field of the index)
    if "speakerId" in input_cols:
        df['speakerId'] = df.index.map(lambda x: x.split("/")[2])

    ## Discretize the dataframe
    df_discretized = discretize(
        df[input_cols+[target_col]],
        bins=3,
        attributes=input_cols,
        strategy="quantile", 
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    ## The bin labels are recognized from their shape: "<=..." / "(...]" / ">..."
    replace_values = {}

    for signal_col in signal_cols:

        for v in df_discretized[signal_col].unique():
            if v.startswith("<="):
                replace_values[v] = "low"
            elif v.startswith(">"):
                replace_values[v] = "high"
            elif v.startswith("(") and v.endswith("]"):
                replace_values[v] = "medium"
            else:
                raise ValueError(v)

        ## Assign the result back: an in-place replace on `df[col]` acts on a temporary
        ## Series and does not update the dataframe when pandas Copy-on-Write is active
        df_discretized[signal_col] = df_discretized[signal_col].replace(replace_values)

    df_discretized.loc[df_discretized["location"]=="none_location", "location"] = "none"
    df_discretized.loc[df_discretized["object"]=="none_object", "object"] = "none"

    ## Create dict of Divergence df
    df_dict[config] = df_discretized
    fp_diver = FP_DivergenceExplorer(
        df_discretized, 
        true_class_name=target_col, 
        class_map={"P":1, "N":0}
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup, 
        metrics=[target_metric]
        )
    ## Notebook column names, then only the columns of interest
    FP_fm = FP_fm.rename(columns=remapped_cols)[show_cols].copy()
    FP_fm['accuracy'] = FP_fm['accuracy'].round(5)
    FP_fm['d_accuracy'] = FP_fm['d_accuracy'].round(5)
    FP_fm['t_value'] = FP_fm['t_value'].round(2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

In [None]:
## Compute the accuracy of the models
for config in configs:
    df_config = df_dict[config]
    ## Share of rows whose 'prediction' flag is set
    prediction = df_config['prediction'].sum() / len(df_config)
    print(f"Accuracy of {config}:", round(100*prediction,3))
    print("----------")

## Divergence wav2vec 2.0 base

In [None]:
## Value forwarded as `th_redundancy` to every getDivergence call of this section
th_redundancy = None

### Original

In [None]:
## Compute the divergence for wav2vec 2.0 base
config = 'fsc_original'
fp_divergence_i = fp_divergence_dict[config]

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first k rows of the ranking)
pr_bot = FPdiv.head(k).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### Random

In [None]:
## Compute the divergence for wav2vec 2.0 base w/ random samples
config = 'fsc_random'
fp_divergence_i = fp_divergence_dict[config]

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first k rows of the ranking)
pr_bot = FPdiv.head(k).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base w/ random samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### CM

In [None]:
## Compute the divergence for wav2vec 2.0 base w/ CM samples
config = 'fsc_cm'
fp_divergence_i = fp_divergence_dict[config]

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first k rows of the ranking)
pr_bot = FPdiv.head(k).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base rebalanced w/ CM samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### CSI

In [None]:
## Compute the divergence for wav2vec 2.0 base w/ CSI samples
config = 'fsc_csi'
fp_divergence_i = fp_divergence_dict[config]

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first k rows of the ranking)
pr_bot = FPdiv.head(k).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base w/ CSI samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### KNN

In [None]:
## Compute the divergence for wav2vec 2.0 base w/ KNN
config = 'fsc_knn'
fp_divergence_i = fp_divergence_dict[config]

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first k rows of the ranking)
pr_bot = FPdiv.head(k).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base w/ KNN:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### Supervised Oracle

In [None]:
## Compute the divergence for wav2vec 2.0 base w/ Supervised Oracle
config = 'fsc_supervised_oracle'
fp_divergence_i = fp_divergence_dict[config]

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first k rows of the ranking)
pr_bot = FPdiv.head(k).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base w/ Supervised Oracle:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### Metadata Oracle

In [None]:
## Compute the divergence for wav2vec 2.0 base w/ Metadata Oracle
config = 'fsc_metadata_oracle'
fp_divergence_i = fp_divergence_dict[config]

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first k rows of the ranking)
pr_bot = FPdiv.head(k).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base w/ Metadata Oracle:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### All

In [None]:
## Compute the divergence for wav2vec 2.0 base w/ all the samples
config = 'fsc_all'
fp_divergence_i = fp_divergence_dict[config]

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first k rows of the ranking)
pr_bot = FPdiv.head(k).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base w/ all the samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

# ITALIC

## Retrieve Data and Compute Divergence

In [None]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [None]:
## Columns of the df file that we are going to analyze 
## NOTE: these assignments replace the column lists defined for the FSC section
demo_cols = ['gender', 'age', 'region', 'nationality', 'lisp', 'education']

## 'action' and 'scenario' are derived from the 'intent' column in the loading loop
slot_cols = ['action', 'scenario']

rec_set_cols = ['environment', 'device', 'field']

## Columns whose discretized values are relabelled "low"/"medium"/"high" in the loading loop
signal_cols = ['total_silence', 'total_duration', 'trimmed_duration', 
'n_words', 'speed_rate_word', 'speed_rate_word_trimmed'] 

## All the attributes passed to discretize and then explored with DivExplorer
input_cols = demo_cols + slot_cols + rec_set_cols + signal_cols 

In [None]:
approach = "divexplorer"
## Minimum support passed to getFrequentPatternDivergence
min_sup = 0.03
## Used in the results path (folder "k_{k}") and, in the following cells,
## for the summary over the first k itemsets
k = 2

## Configurations to analyze: one results folder each
configs = [
    "italic_original",
    "italic_csi",
    "italic_cm",
    "italic_knn",
    "italic_random", 
    "italic_supervised_oracle",
    "italic_metadata_oracle",
    "italic_all"
    ] 

## Outputs of the loop, keyed by configuration name
FP_fm_dict = {}          # frequent-pattern table (renamed and rounded columns)
fp_divergence_dict = {}  # FP_Divergence object built on that table
df_dict = {}             # discretized dataframe given to DivExplorer

## Project helper: imported once, before the loop
from divergence_utils import discretize

for config in configs:

    print(config)

    ## "italic_original" and "italic_all" have no "k_{k}" sub-folder
    path_parts = [os.getcwd(), "results", "italic", config]
    if config not in ("italic_original", "italic_all"):
        path_parts.append(f"k_{k}")
    input_file_divexplorer = os.path.join(*path_parts, "42", "df_test.csv")

    df = pd.read_csv(input_file_divexplorer, index_col=0)
    ## The intent is split on "_": first field -> action, second field -> scenario
    df['action'] = df['intent'].apply(lambda x: x.split("_")[0])
    df['scenario'] = df['intent'].apply(lambda x: x.split("_")[1])

    ## Discretize the dataframe
    df_discretized = discretize(
        df[input_cols+[target_col]],
        bins=3,
        attributes=input_cols,
        strategy="quantile", 
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    ## The bin labels are recognized from their shape: "<=..." / "(...]" / ">..."
    replace_values = {}

    for signal_col in signal_cols:

        for v in df_discretized[signal_col].unique():
            if v.startswith("<="):
                replace_values[v] = "low"
            elif v.startswith(">"):
                replace_values[v] = "high"
            elif v.startswith("(") and v.endswith("]"):
                replace_values[v] = "medium"
            else:
                raise ValueError(v)

        ## Assign the result back: an in-place replace on `df[col]` acts on a temporary
        ## Series and does not update the dataframe when pandas Copy-on-Write is active
        df_discretized[signal_col] = df_discretized[signal_col].replace(replace_values)

    ## Create dict of Divergence df
    df_dict[config] = df_discretized
    fp_diver = FP_DivergenceExplorer(
        df_discretized, 
        true_class_name=target_col, 
        class_map={"P":1, "N":0}
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup, 
        metrics=[target_metric]
        )
    ## Notebook column names, then only the columns of interest
    FP_fm = FP_fm.rename(columns=remapped_cols)[show_cols].copy()
    FP_fm['accuracy'] = FP_fm['accuracy'].round(5)
    FP_fm['d_accuracy'] = FP_fm['d_accuracy'].round(5)
    FP_fm['t_value'] = FP_fm['t_value'].round(2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

## Divergence XLSR300m

In [None]:
## Value forwarded as `th_redundancy` to every getDivergence call of this section
th_redundancy = None 

### Original

In [None]:
## Compute the divergence for XLSR300m
config = 'italic_original'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in the table (the summary below uses k)
n = 2

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first n rows of the ranking)
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for XLSR300m:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### Random

In [None]:
## Compute the divergence for XLSR300m w/ random samples
config = 'italic_random'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in the table (the summary below uses k)
n = 2

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first n rows of the ranking)
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for XLSR300m w/ random samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### CM

In [None]:
## Compute the divergence for XLSR300m w/ CM samples
config = 'italic_cm'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in the table (the summary below uses k)
n = 2

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first n rows of the ranking)
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for XLSR300m w/ CM samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### CSI

In [None]:
## Compute the divergence for XLSR300m w/ CSI samples
config = 'italic_csi'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in the table (the summary below uses k)
n = 2

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first n rows of the ranking)
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for XLSR300m w/ CSI samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### KNN

In [None]:
## Compute the divergence for XLSR300m w/ KNN samples
config = 'italic_knn'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in the table (the summary below uses k)
n = 2

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first n rows of the ranking)
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for XLSR300m w/ KNN samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### Supervised Oracle

In [None]:
## Compute the divergence for XLSR300m w/ Supervised Oracle
config = 'italic_supervised_oracle'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in the table (the summary below uses k)
n = 2

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first n rows of the ranking)
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for XLSR300m w/ Supervised Oracle:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### Metadata Oracle

In [None]:
## Compute the divergence for XLSR300m w/ Metadata Oracle
config = 'italic_metadata_oracle'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in the table (the summary below uses k)
n = 2

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first n rows of the ranking)
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for XLSR300m w/ Metadata Oracle:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

### All 

In [None]:
## Compute the divergence for XLSR300m w/ all the samples
config = 'italic_all'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in the table (the summary below uses k)
n = 2

## Rank the itemsets once: the same ranking feeds the table and the summary below.
## The output of getDivergence is reversed, so that the rows read first are the ones
## this analysis treats as the most negatively divergent.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]

## Retrieve Most Negatively Divergent Itemsets (first n rows of the ranking)
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
## Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for XLSR300m w/ all the samples:
## average over the first k itemsets of the ranking whose d_accuracy is below zero
pr = FPdiv[FPdiv['d_accuracy'] < 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
print(f"Mean negative accuracy top {k}:",   round(100*pr['accuracy'].mean(), 3))

# LibriSpeech

## Retrieve Data and Compute Divergence

In [None]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [None]:
## Target for DivExplorer: 'WER'
## NOTE: these assignments replace the values set for the FSC/ITALIC sections; the cells
## of those sections that read them need the earlier values to be restored first.
target_col = 'WER' 
## Metric requested from getFrequentPatternDivergence
target_metric = 'd_outcome'
## Name given to that metric after the column renaming ('d_WER'); also passed to FP_Divergence
target_div = f'd_{target_col}'
## Name of the t-value column, renamed to 't_value' by remapped_cols
t_value_col = 't_value_outcome'
## NOTE(review): not used in the cells visible here
printable_columns = ['support', 'itemsets','WER', 'd_WER', 't_value']

In [None]:
## Columns for visualization
## {column name returned by DivExplorer: name used in this notebook}
remapped_cols = { "outcome": target_col, "d_outcome": target_div, t_value_col: 't_value'}
## Columns kept (after the renaming above) in every frequent-pattern table
show_cols = ['support', 'itemsets', target_col, target_div, 'support_count', 'length', 't_value']

In [None]:
## Columns of the df file that we are going to analyze 
## NOTE: these assignments replace the column lists defined for the previous sections
demo_cols = ['gender']

## Columns whose discretized values are relabelled "low"/"medium"/"high" in the loading loop
signal_cols = ['total_silence', 'total_duration', 'n_pauses', 'n_words', 'speed_rate_word'] 

## All the attributes passed to discretize and then explored with DivExplorer
input_cols = demo_cols + signal_cols 

In [None]:
approach = "divexplorer"
## Minimum support passed to getFrequentPatternDivergence
min_sup = 0.03
## Used in the results path (folder "k_{k}") and, in the following cells,
## for the summary over the first k itemsets
k = 2

## Configurations to analyze: one results folder each
configs = [
    "librispeech_original",
    "librispeech_csi",
    "librispeech_cm",
    "librispeech_knn",
    "librispeech_random", 
    "librispeech_supervised_oracle",
    "librispeech_metadata_oracle",
    "librispeech_all"
    ] 

## Outputs of the loop, keyed by configuration name
FP_fm_dict = {}          # frequent-pattern table (renamed and rounded columns)
fp_divergence_dict = {}  # FP_Divergence object built on that table
df_dict = {}             # discretized dataframe given to DivExplorer

## Project helper: imported once, before the loop
from divergence_utils import discretize

for config in configs:

    print(config)

    ## "librispeech_original" and "librispeech_all" have no "k_{k}" sub-folder
    path_parts = [os.getcwd(), "results", "librispeech", config]
    if config not in ("librispeech_original", "librispeech_all"):
        path_parts.append(f"k_{k}")
    input_file_divexplorer = os.path.join(*path_parts, "42", "df_test.csv")

    df = pd.read_csv(input_file_divexplorer, index_col=0)

    ## Discretize the dataframe
    df_discretized = discretize(
        df[input_cols+[target_col]],
        bins=3,
        attributes=input_cols,
        strategy="quantile", 
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    ## The bin labels are recognized from their shape: "<=..." / "(...]" / ">..."
    replace_values = {}

    for signal_col in signal_cols:

        for v in df_discretized[signal_col].unique():
            if v.startswith("<="):
                replace_values[v] = "low"
            elif v.startswith(">"):
                replace_values[v] = "high"
            elif v.startswith("(") and v.endswith("]"):
                replace_values[v] = "medium"
            else:
                raise ValueError(v)

        ## Assign the result back: an in-place replace on `df[col]` acts on a temporary
        ## Series and does not update the dataframe when pandas Copy-on-Write is active
        df_discretized[signal_col] = df_discretized[signal_col].replace(replace_values)

    ## Create dict of Divergence df
    df_dict[config] = df_discretized

    ## Here the target is passed as `target_name` (WER value), not as a class column
    fp_diver = FP_DivergenceExplorer(
        df_discretized, 
        target_name=target_col
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup, 
        metrics=[target_metric]
        )

    ## Notebook column names, then only the columns of interest
    FP_fm = FP_fm.rename(columns=remapped_cols)[show_cols].copy()
    FP_fm['WER'] = FP_fm['WER'].round(5)
    FP_fm['d_WER'] = FP_fm['d_WER'].round(5)
    FP_fm['t_value'] = FP_fm['t_value'].round(2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

In [None]:
## Compute WER for each config
from jiwer import wer

## {configuration name: WER in percent}, read by the following divergence cells
wers = {}

for config in configs:

    print(config)
    
    ## Every configuration but "librispeech_original" and "librispeech_all"
    ## stores its results under a "k_{k}" sub-folder
    k_folder = [] if config in ("librispeech_original", "librispeech_all") else [f"k_{k}"]
    input_file_divexplorer = os.path.join(
        os.getcwd(), "results", "librispeech", config, *k_folder, "42", "df_test.csv")

    df = pd.read_csv(input_file_divexplorer, index_col=0)

    ## Reference texts against the transcriptions stored in the same file
    ground_truth = df['text'].tolist()
    hypothesis = df['transcription'].tolist()
    WER = 100 * wer(ground_truth, hypothesis)
    wers[config] = WER

    print(f"WER of {config}:", round(WER, 3))
    print("-------------")

## Divergence Whisper base

In [None]:
# Shared redundancy threshold, forwarded as getDivergence(th_redundancy=...) by
# every divergence cell below.
# NOTE(review): None presumably disables redundancy pruning — confirm against FP_Divergence.
th_redundancy = None 

### Original

In [None]:
## Compute the divergence for whisper base
config = 'librispeech_original'
fp_divergence_i = fp_divergence_dict[config]

# Number of leading itemsets shown in the table
n = 2

## Retrieve Most Negatively Divergent Itemsets
# Computed once and reused for both the table and the top-k summary below.
# NOTE(review): head(n) assumes getDivergence returns rows ranked by divergence — confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
# Rescale WER to a percentage so it matches the scale of wers[config]
pr_bot["WER"] = (pr_bot["WER"] * 100).round(3)
# The displayed d_WER is the gap to the overall WER of this config
pr_bot["d_WER"] = (pr_bot["WER"] - wers[config]).round(3)

## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "WER", "d_WER", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for whisper base
# Keep itemsets with positive d_WER, then take the first k of them.
# NOTE(review): k also names the k_<k> results folder in the loading cells —
# confirm it is meant to double as the top-k cutoff here.
pr = FPdiv[FPdiv['d_WER'] > 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_WER'].mean(), 3))
# This figure is the mean WER of the same itemsets, so it is labelled as WER
print(f"Mean negative WER top {k}:",   round(100*pr['WER'].mean(), 3))

### Random

In [None]:
## Compute the divergence for whisper base w/ random samples
config = 'librispeech_random'
fp_divergence_i = fp_divergence_dict[config]

# Number of leading itemsets shown in the table
n = 2

## Retrieve Most Negatively Divergent Itemsets
# Computed once and reused for both the table and the top-k summary below.
# NOTE(review): head(n) assumes getDivergence returns rows ranked by divergence — confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
# Rescale WER to a percentage so it matches the scale of wers[config]
pr_bot["WER"] = (pr_bot["WER"] * 100).round(3)
# The displayed d_WER is the gap to the overall WER of this config
pr_bot["d_WER"] = (pr_bot["WER"] - wers[config]).round(3)

## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "WER", "d_WER", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for whisper base w/ random samples
# Keep itemsets with positive d_WER, then take the first k of them.
# NOTE(review): k also names the k_<k> results folder in the loading cells —
# confirm it is meant to double as the top-k cutoff here.
pr = FPdiv[FPdiv['d_WER'] > 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_WER'].mean(), 3))
# This figure is the mean WER of the same itemsets, so it is labelled as WER
print(f"Mean negative WER top {k}:",   round(100*pr['WER'].mean(), 3))

### CM

In [None]:
## Compute the divergence for whisper base w/ CM samples
config = 'librispeech_cm'
fp_divergence_i = fp_divergence_dict[config]

# Number of leading itemsets shown in the table
n = 2

## Retrieve Most Negatively Divergent Itemsets
# Computed once and reused for both the table and the top-k summary below.
# NOTE(review): head(n) assumes getDivergence returns rows ranked by divergence — confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
# Rescale WER to a percentage so it matches the scale of wers[config]
pr_bot["WER"] = (pr_bot["WER"] * 100).round(3)
# The displayed d_WER is the gap to the overall WER of this config
pr_bot["d_WER"] = (pr_bot["WER"] - wers[config]).round(3)

## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "WER", "d_WER", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for whisper base w/ CM samples
# Keep itemsets with positive d_WER, then take the first k of them.
# NOTE(review): k also names the k_<k> results folder in the loading cells —
# confirm it is meant to double as the top-k cutoff here.
pr = FPdiv[FPdiv['d_WER'] > 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_WER'].mean(), 3))
# This figure is the mean WER of the same itemsets, so it is labelled as WER
print(f"Mean negative WER top {k}:",   round(100*pr['WER'].mean(), 3))

### CSI

In [None]:
## Compute the divergence for whisper base w/ CSI samples
config = 'librispeech_csi'
fp_divergence_i = fp_divergence_dict[config]

# Number of leading itemsets shown in the table
n = 2

## Retrieve Most Negatively Divergent Itemsets
# Computed once and reused for both the table and the top-k summary below.
# NOTE(review): head(n) assumes getDivergence returns rows ranked by divergence — confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
# Rescale WER to a percentage so it matches the scale of wers[config]
pr_bot["WER"] = (pr_bot["WER"] * 100).round(3)
# The displayed d_WER is the gap to the overall WER of this config
pr_bot["d_WER"] = (pr_bot["WER"] - wers[config]).round(3)

## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "WER", "d_WER", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for whisper base w/ CSI samples
# Keep itemsets with positive d_WER, then take the first k of them.
# NOTE(review): k also names the k_<k> results folder in the loading cells —
# confirm it is meant to double as the top-k cutoff here.
pr = FPdiv[FPdiv['d_WER'] > 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_WER'].mean(), 3))
# This figure is the mean WER of the same itemsets, so it is labelled as WER
print(f"Mean negative WER top {k}:",   round(100*pr['WER'].mean(), 3))

### KNN

In [None]:
## Compute the divergence for whisper base w/ KNN samples
config = 'librispeech_knn'
fp_divergence_i = fp_divergence_dict[config]

# Number of leading itemsets shown in the table
n = 2

## Retrieve Most Negatively Divergent Itemsets
# Computed once and reused for both the table and the top-k summary below.
# NOTE(review): head(n) assumes getDivergence returns rows ranked by divergence — confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
# Rescale WER to a percentage so it matches the scale of wers[config]
pr_bot["WER"] = (pr_bot["WER"] * 100).round(3)
# The displayed d_WER is the gap to the overall WER of this config
pr_bot["d_WER"] = (pr_bot["WER"] - wers[config]).round(3)

## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "WER", "d_WER", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for whisper base w/ KNN samples
# Keep itemsets with positive d_WER, then take the first k of them.
# NOTE(review): k also names the k_<k> results folder in the loading cells —
# confirm it is meant to double as the top-k cutoff here.
pr = FPdiv[FPdiv['d_WER'] > 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_WER'].mean(), 3))
# This figure is the mean WER of the same itemsets, so it is labelled as WER
print(f"Mean negative WER top {k}:",   round(100*pr['WER'].mean(), 3))

### Supervised Oracle

In [None]:
## Compute the divergence for whisper base w/ Supervised Oracle
config = 'librispeech_supervised_oracle'
fp_divergence_i = fp_divergence_dict[config]

# Number of leading itemsets shown in the table
n = 2

## Retrieve Most Negatively Divergent Itemsets
# Computed once and reused for both the table and the top-k summary below.
# NOTE(review): head(n) assumes getDivergence returns rows ranked by divergence — confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
# Rescale WER to a percentage so it matches the scale of wers[config]
pr_bot["WER"] = (pr_bot["WER"] * 100).round(3)
# The displayed d_WER is the gap to the overall WER of this config
pr_bot["d_WER"] = (pr_bot["WER"] - wers[config]).round(3)

## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "WER", "d_WER", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for whisper base w/ Supervised Oracle
# Keep itemsets with positive d_WER, then take the first k of them.
# NOTE(review): k also names the k_<k> results folder in the loading cells —
# confirm it is meant to double as the top-k cutoff here.
pr = FPdiv[FPdiv['d_WER'] > 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_WER'].mean(), 3))
# This figure is the mean WER of the same itemsets, so it is labelled as WER
print(f"Mean negative WER top {k}:",   round(100*pr['WER'].mean(), 3))

### Metadata Oracle

In [None]:
## Compute the divergence for whisper base w/ Metadata Oracle
config = 'librispeech_metadata_oracle'
fp_divergence_i = fp_divergence_dict[config]

# Number of leading itemsets shown in the table
n = 2

## Retrieve Most Negatively Divergent Itemsets
# Computed once and reused for both the table and the top-k summary below.
# NOTE(review): head(n) assumes getDivergence returns rows ranked by divergence — confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
# Rescale WER to a percentage so it matches the scale of wers[config]
pr_bot["WER"] = (pr_bot["WER"] * 100).round(3)
# The displayed d_WER is the gap to the overall WER of this config
pr_bot["d_WER"] = (pr_bot["WER"] - wers[config]).round(3)

## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "WER", "d_WER", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for whisper base w/ Metadata Oracle
# Keep itemsets with positive d_WER, then take the first k of them.
# NOTE(review): k also names the k_<k> results folder in the loading cells —
# confirm it is meant to double as the top-k cutoff here.
pr = FPdiv[FPdiv['d_WER'] > 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_WER'].mean(), 3))
# This figure is the mean WER of the same itemsets, so it is labelled as WER
print(f"Mean negative WER top {k}:",   round(100*pr['WER'].mean(), 3))

### All 

In [None]:
## Compute the divergence for whisper base w/ all the samples
config = 'librispeech_all'
fp_divergence_i = fp_divergence_dict[config]

# Number of leading itemsets shown in the table
n = 2

## Retrieve Most Negatively Divergent Itemsets
# Computed once and reused for both the table and the top-k summary below.
# NOTE(review): head(n) assumes getDivergence returns rows ranked by divergence — confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
# Rescale WER to a percentage so it matches the scale of wers[config]
pr_bot["WER"] = (pr_bot["WER"] * 100).round(3)
# The displayed d_WER is the gap to the overall WER of this config
pr_bot["d_WER"] = (pr_bot["WER"] - wers[config]).round(3)

## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "WER", "d_WER", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for whisper base w/ all the samples
# Keep itemsets with positive d_WER, then take the first k of them.
# NOTE(review): k also names the k_<k> results folder in the loading cells —
# confirm it is meant to double as the top-k cutoff here.
pr = FPdiv[FPdiv['d_WER'] > 0].head(k).copy()
print(f"Mean negative divergence top {k}:", round(100*pr['d_WER'].mean(), 3))
# This figure is the mean WER of the same itemsets, so it is labelled as WER
print(f"Mean negative WER top {k}:",   round(100*pr['WER'].mean(), 3))