# Environment

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import divexplorer 
import pandas as pd
pd.set_option('max_colwidth', None)
import os
import numpy as np
from copy import deepcopy

from utils_analysis import plotMultipleSV, plotShapleyValue

import warnings
warnings.filterwarnings('ignore')

# Util Functions

In [None]:
## Function for sorting data cohorts
def sortItemset(x, abbreviations=None):
    """Render an itemset as a sorted, comma-separated string.

    Args:
        x (iterable of str): the items of the itemset (e.g. a frozenset).
        abbreviations (dict, optional): substring -> replacement pairs, applied
            to the joined string in dictionary order. Defaults to None, i.e.
            no abbreviation is applied.

    Returns:
        str: the items sorted lexicographically and joined with ", ", with
        every abbreviation applied.
    """
    # A None default (instead of {}) avoids sharing one mutable dict across calls
    label = ", ".join(sorted(x))
    for full_text, short_text in (abbreviations or {}).items():
        label = label.replace(full_text, short_text)
    return label

In [None]:
def attributes_in_itemset(itemset, attributes, alls = True):
    """ Tell whether the attributes of an itemset match a list of attributes

    The attribute of an item is the text that precedes its first "=".

    Args:
        itemset (frozenset): the itemset to check
        attributes (list): list of attributes of interest
        alls (bool): If True, EVERY attribute of the itemset must be among the input attributes.
        If False, AT LEAST one attribute of the itemset must be among the input attributes.

    Returns:
        bool: True if the itemset satisfies the selected condition. The empty
        itemset is always rejected when attributes is non-empty.
    """
    # The empty itemset describes the entire dataset: never select it
    if itemset == frozenset() and attributes:
        return False

    # Lazy membership checks: all()/any() stop at the first decisive item
    matches = (item.split("=")[0] in attributes for item in itemset)
    return all(matches) if alls else any(matches)
    
def filter_itemset_df_by_attributes(df: pd.DataFrame, attributes: list, alls = True, itemset_col_name: str = "itemsets") -> pd.DataFrame:
    """Keep the rows whose itemset involves the attributes in the input list (all or at least one)

    Args:
        df (pd.DataFrame): the input itemsets (with their info).
        attributes (list): list of attributes of interest
        alls (bool): If True, EVERY attribute of an itemset must be among the input attributes.
        If False, AT LEAST one attribute of an itemset must be among the input attributes.
        itemset_col_name (str) : the name of the itemset column, "itemsets" as default

    Returns:
        pd.DataFrame: the rows of df whose itemset passes attributes_in_itemset
    """

    def _is_selected(itemset):
        return attributes_in_itemset(itemset, attributes, alls = alls)

    # Boolean mask over the rows, evaluated itemset by itemset
    selected = df[itemset_col_name].apply(_is_selected)
    return df.loc[selected]

In [None]:
## Define abbreviations for plot and visualization
from divexplorer.FP_Divergence import abbreviateDict
# Substring -> shorter label. sortItemset applies the pairs in insertion order,
# so the order of the keys below is significant (e.g. 'total_silence' is
# handled before the more generic 'total_').
# NOTE(review): demo_cols lists 'Self-reported fluency level ' with a trailing
# space; if items are rendered as "column=value", the first key may never match -- confirm.
abbreviations = {
    'Self-reported fluency level=native': 'fluency=native',
    'total_silence': 'tot_silence',
    'location': 'loc',
    'Current language used for work/school=English (United States)': 'lang=EN_US',
    'ageRange': 'age',
    'speakerId': 'spkID',
    'First Language spoken=English (United States)': 'lang=EN_US',
    'trimmed': 'trim',
    'total_': 'tot_',
    'speed_rate_word': 'speakRate',
    'speed_rate_char': 'speakCharRate',
    'change language': 'change lang',
    'duration': 'dur',
}

# Independent shallow copy of the same mapping
abbreviations_shorter = dict(abbreviations)

# Define targets

In [None]:
## DivExplorer target configuration
# 'prediction' holds 1 when predicted_intent == original_intent and 0 otherwise
target_col = 'prediction'
# Metric requested from DivExplorer, and the column name it is renamed to
target_metric, target_div = 'd_posr', 'd_accuracy'
# Name of the t-value column
t_value_col = 't_value_tp_fn'

In [None]:
## Columns kept (after renaming) in the divergence tables
show_cols = [
    'support', 'itemsets', '#errors', '#corrects', 'accuracy',
    'd_accuracy', 't_value', 'support_count', 'length',
]
## Renaming applied to the DivExplorer output columns
remapped_cols = {
    'tn': '#errors',
    'tp': '#corrects',
    'posr': 'accuracy',
    target_metric: target_div,
    't_value_tp_fn': 't_value',
}

# FSC

## Retrieve Data and Compute Divergence

In [None]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [None]:
## Columns of the df file that we are going to analyze

# Demographic metadata.
# NOTE(review): the trailing space in 'Self-reported fluency level ' is kept
# as is -- presumably it matches the CSV header; confirm.
demo_cols = ['Self-reported fluency level ', 'First Language spoken',
       'Current language used for work/school', 'gender', 'ageRange']

# Slots of the intent
slot_cols = ['action', 'object', 'location']

# Signal-level metadata. An earlier, immediately overwritten assignment also
# listed 'trimmed_duration' and 'speed_rate_word_trimmed'; that dead store
# has been removed.
signal_cols = ['total_silence', 'total_duration', 'n_words', 'speed_rate_word']

# Every attribute handed to the discretization and to DivExplorer
input_cols = demo_cols + signal_cols + slot_cols

In [None]:
# Imported once here rather than at every iteration of the loop below
from divergence_utils import discretize

approach = "divexplorer" # "divexplorer" or "clustering"

## Define the minimum support threshold for data subgroups
min_sup = 0.03

## One results folder is read per configuration
configs = [
    "original",
    "contrastive_intents",
    "contrastive_subgroups",
    "contrastive_subgroups_errors",
    "contrastive_subgroups_errors_star",
    "clues",
    "augmentation",
    "adversarial",
    "acquisition",
    ]

## Per-config outputs: divergence table, FP_Divergence object, discretized df
FP_fm_dict = {}
fp_divergence_dict = {}
df_dict = {}


def _range_label(v):
    """Map a discretized bin label to "low", "medium" or "high".

    Args:
        v (str): a bin label, expected as "<=...", ">..." or "(...]".

    Returns:
        str: "low", "high" or "medium" respectively.

    Raises:
        ValueError: if the label has none of the expected shapes.
    """
    if v.startswith("<="):
        return "low"
    if v.startswith(">"):
        return "high"
    if v.startswith("(") and v.endswith("]"):
        return "medium"
    raise ValueError(v)


for config in configs:

    print(config)

    ## NOTE(review): "42" is presumably the seed of the run -- confirm
    input_file_divexplorer = os.path.join(
        os.getcwd(), "results", "fsc", config, "42", "df_test.csv")
    df = pd.read_csv(input_file_divexplorer, index_col=0)

    ## Add SpeakerID information if it is present in the df
    if "speakerId" in input_cols:
        df['speakerId'] = df.index.map(lambda x: x.split("/")[2])

    ## Discretize the dataframe
    df_discretized = discretize(
        df[input_cols+[target_col]],
        bins=3,
        attributes=input_cols,
        strategy="quantile",
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    replace_values = {}

    for col in signal_cols:
        replace_values.update(
            {v: _range_label(v) for v in df_discretized[col].unique()}
        )
        ## Assign the result back instead of calling replace(..., inplace=True)
        ## on the column selection: that form is a chained in-place update,
        ## which pandas warns about (warnings are silenced in this notebook)
        ## and which leaves df_discretized unchanged under Copy-on-Write.
        df_discretized[col] = df_discretized[col].replace(replace_values)

    df_discretized.loc[df_discretized["location"]=="none_location", "location"] = "none"
    df_discretized.loc[df_discretized["object"]=="none_object", "object"] = "none"

    ## Create dict of Divergence df
    df_dict[config] = df_discretized
    fp_diver = FP_DivergenceExplorer(
        df_discretized,
        true_class_name=target_col,
        class_map={"P":1, "N":0}
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup,
        metrics=[target_metric]
        )
    FP_fm.rename(
        columns=remapped_cols,
        inplace=True
        )
    ## Keep the display columns only and round the numeric ones
    FP_fm = FP_fm[show_cols].copy()
    FP_fm['accuracy'] = round(FP_fm['accuracy'], 5)
    FP_fm['d_accuracy'] = round(FP_fm['d_accuracy'], 5)
    FP_fm['t_value'] = round(FP_fm['t_value'], 2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

## Divergence wav2vec 2.0 base

In [None]:
# Redundancy threshold forwarded to every getDivergence(th_redundancy=...) call below.
# NOTE(review): None presumably disables redundancy pruning -- confirm against divexplorer.
th_redundancy = None

### wav2vec 2.0 Original

In [None]:
## Divergence report for wav2vec 2.0 base (config 'original')
config = 'original'
fp_divergence_i = fp_divergence_dict[config]

n = 2
top_ks = (5, 10, 20, 50)
table_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]

## The n most negatively divergent itemsets
## (reversed ranking; presumably getDivergence ranks by decreasing d_accuracy -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    ## Shown as percentages
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
## Keep the table columns only and abbreviate the itemset names
pr_l_bot = pr_bot[table_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Mean negative divergence over the first k and over all negative subgroups
## NOTE(review): the count uses <= 0 while the means use < 0 -- confirm this is intended
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative = FPdiv[FPdiv['d_accuracy'] < 0]
for k in top_ks:
    pr = negative.head(k).copy()
    print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))

## The n most positively divergent itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[table_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Mean positive divergence over the first k and over all positive subgroups
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
print("Total positive subgroups: ", len(FPdiv[FPdiv['d_accuracy'] > 0]))
positive = FPdiv[FPdiv['d_accuracy'] > 0]
for k in top_ks:
    pr = positive.head(k).copy()
    print(f"Mean positive divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = positive.copy()
print("Mean positive divergence all:", round(100*pr['d_accuracy'].mean(), 3))
## Mean divergence over every subgroup
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### wav2vec 2.0 w/ Contrastive Intents ($L_i$)

In [None]:
## Divergence report for wav2vec 2.0 base w/ contrastive learning on intents
config = 'contrastive_intents'
fp_divergence_i = fp_divergence_dict[config]

n = 2
top_ks = (5, 10, 20, 50)
table_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]

## The n most negatively divergent itemsets
## (reversed ranking; presumably getDivergence ranks by decreasing d_accuracy -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    ## Shown as percentages
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
## Keep the table columns only and abbreviate the itemset names
pr_l_bot = pr_bot[table_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Mean negative divergence over the first k and over all negative subgroups
## NOTE(review): the count uses <= 0 while the means use < 0 -- confirm this is intended
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative = FPdiv[FPdiv['d_accuracy'] < 0]
for k in top_ks:
    pr = negative.head(k).copy()
    print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))

## The n most positively divergent itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[table_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Mean positive divergence over the first k and over all positive subgroups
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
print("Total positive subgroups: ", len(FPdiv[FPdiv['d_accuracy'] > 0]))
positive = FPdiv[FPdiv['d_accuracy'] > 0]
for k in top_ks:
    pr = positive.head(k).copy()
    print(f"Mean positive divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = positive.copy()
print("Mean positive divergence all:", round(100*pr['d_accuracy'].mean(), 3))
## Mean divergence over every subgroup
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### wav2vec 2.0 w/ Contrastive Subgroups ($L_s$)

In [None]:
## Divergence report for wav2vec 2.0 base w/ L_i + L_s
## NOTE(review): the section heading mentions $L_s$ only -- confirm which losses this config uses
config = 'contrastive_subgroups'
fp_divergence_i = fp_divergence_dict[config]

n = 2
top_ks = (5, 10, 20, 50)
table_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]

## The n most negatively divergent itemsets
## (reversed ranking; presumably getDivergence ranks by decreasing d_accuracy -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    ## Shown as percentages
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
## Keep the table columns only and abbreviate the itemset names
pr_l_bot = pr_bot[table_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Mean negative divergence over the first k and over all negative subgroups
## NOTE(review): the count uses <= 0 while the means use < 0 -- confirm this is intended
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative = FPdiv[FPdiv['d_accuracy'] < 0]
for k in top_ks:
    pr = negative.head(k).copy()
    print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))

## The n most positively divergent itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[table_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Mean positive divergence over the first k and over all positive subgroups
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
print("Total positive subgroups: ", len(FPdiv[FPdiv['d_accuracy'] > 0]))
positive = FPdiv[FPdiv['d_accuracy'] > 0]
for k in top_ks:
    pr = positive.head(k).copy()
    print(f"Mean positive divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = positive.copy()
print("Mean positive divergence all:", round(100*pr['d_accuracy'].mean(), 3))
## Mean divergence over every subgroup
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### wav2vec 2.0 w/ Contrastive Subgroups + Errors ($L_s$ + $L_e$)

In [None]:
## Divergence report for wav2vec 2.0 base w/ L_s + L_e
config = 'contrastive_subgroups_errors'
fp_divergence_i = fp_divergence_dict[config]

n = 2
top_ks = (5, 10, 20, 50)
table_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]

## The n most negatively divergent itemsets
## (reversed ranking; presumably getDivergence ranks by decreasing d_accuracy -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    ## Shown as percentages
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
## Keep the table columns only and abbreviate the itemset names
pr_l_bot = pr_bot[table_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Mean negative divergence over the first k and over all negative subgroups
## NOTE(review): the count uses <= 0 while the means use < 0 -- confirm this is intended
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative = FPdiv[FPdiv['d_accuracy'] < 0]
for k in top_ks:
    pr = negative.head(k).copy()
    print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))

## The n most positively divergent itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[table_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Mean positive divergence over the first k and over all positive subgroups
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
print("Total positive subgroups: ", len(FPdiv[FPdiv['d_accuracy'] > 0]))
positive = FPdiv[FPdiv['d_accuracy'] > 0]
for k in top_ks:
    pr = positive.head(k).copy()
    print(f"Mean positive divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = positive.copy()
print("Mean positive divergence all:", round(100*pr['d_accuracy'].mean(), 3))
## Mean divergence over every subgroup
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### wav2vec 2.0 w/ Contrastive Subgroups + Errors* ($L_s$ + $L_e^*$)

In [None]:
## Divergence report for wav2vec 2.0 base w/ L_s + L_e*
config = 'contrastive_subgroups_errors_star'
fp_divergence_i = fp_divergence_dict[config]

n = 2
top_ks = (5, 10, 20, 50)
table_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]

## The n most negatively divergent itemsets
## (reversed ranking; presumably getDivergence ranks by decreasing d_accuracy -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    ## Shown as percentages
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
## Keep the table columns only and abbreviate the itemset names
pr_l_bot = pr_bot[table_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Mean negative divergence over the first k and over all negative subgroups
## NOTE(review): the count uses <= 0 while the means use < 0 -- confirm this is intended
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative = FPdiv[FPdiv['d_accuracy'] < 0]
for k in top_ks:
    pr = negative.head(k).copy()
    print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))

## The n most positively divergent itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[table_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Mean positive divergence over the first k and over all positive subgroups
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
print("Total positive subgroups: ", len(FPdiv[FPdiv['d_accuracy'] > 0]))
positive = FPdiv[FPdiv['d_accuracy'] > 0]
for k in top_ks:
    pr = positive.head(k).copy()
    print(f"Mean positive divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = positive.copy()
print("Mean positive divergence all:", round(100*pr['d_accuracy'].mean(), 3))
## Mean divergence over every subgroup
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### wav2vec 2.0 w/ CLUES ($L_i$ + $L_s$ + $L_e$)

In [None]:
## Divergence report for wav2vec 2.0 base w/ CLUES
config = 'clues'
fp_divergence_i = fp_divergence_dict[config]

n = 2
top_ks = (5, 10, 20, 50)
table_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]

## The n most negatively divergent itemsets
## (reversed ranking; presumably getDivergence ranks by decreasing d_accuracy -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    ## Shown as percentages
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
## Keep the table columns only and abbreviate the itemset names
pr_l_bot = pr_bot[table_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Mean negative divergence over the first k and over all negative subgroups
## NOTE(review): the count uses <= 0 while the means use < 0 -- confirm this is intended
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative = FPdiv[FPdiv['d_accuracy'] < 0]
for k in top_ks:
    pr = negative.head(k).copy()
    print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))

## The n most positively divergent itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[table_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Mean positive divergence over the first k and over all positive subgroups
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
print("Total positive subgroups: ", len(FPdiv[FPdiv['d_accuracy'] > 0]))
positive = FPdiv[FPdiv['d_accuracy'] > 0]
for k in top_ks:
    pr = positive.head(k).copy()
    print(f"Mean positive divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = positive.copy()
print("Mean positive divergence all:", round(100*pr['d_accuracy'].mean(), 3))
## Mean divergence over every subgroup
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### wav2vec 2.0 w/ Data Augmentation

In [None]:
## Divergence report for wav2vec 2.0 w/ data augmentation
config = 'augmentation'
fp_divergence_i = fp_divergence_dict[config]

n = 2
top_ks = (5, 10, 20, 50)
table_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]

## The n most negatively divergent itemsets
## (reversed ranking; presumably getDivergence ranks by decreasing d_accuracy -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    ## Shown as percentages
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
## Keep the table columns only and abbreviate the itemset names
pr_l_bot = pr_bot[table_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Mean negative divergence over the first k and over all negative subgroups
## NOTE(review): the count uses <= 0 while the means use < 0 -- confirm this is intended
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative = FPdiv[FPdiv['d_accuracy'] < 0]
for k in top_ks:
    pr = negative.head(k).copy()
    print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))

## The n most positively divergent itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[table_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Mean positive divergence over the first k and over all positive subgroups
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
print("Total positive subgroups: ", len(FPdiv[FPdiv['d_accuracy'] > 0]))
positive = FPdiv[FPdiv['d_accuracy'] > 0]
for k in top_ks:
    pr = positive.head(k).copy()
    print(f"Mean positive divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = positive.copy()
print("Mean positive divergence all:", round(100*pr['d_accuracy'].mean(), 3))
## Mean divergence over every subgroup
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### wav2vec 2.0 w/ Adversarial

In [None]:
## Divergence report for wav2vec 2.0 base w/ adversarial loss
config = 'adversarial'
fp_divergence_i = fp_divergence_dict[config]

n = 2
top_ks = (5, 10, 20, 50)
table_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]

## The n most negatively divergent itemsets
## (reversed ranking; presumably getDivergence ranks by decreasing d_accuracy -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    ## Shown as percentages
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
## Keep the table columns only and abbreviate the itemset names
pr_l_bot = pr_bot[table_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Mean negative divergence over the first k and over all negative subgroups
## NOTE(review): the count uses <= 0 while the means use < 0 -- confirm this is intended
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative = FPdiv[FPdiv['d_accuracy'] < 0]
for k in top_ks:
    pr = negative.head(k).copy()
    print(f"Mean negative divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))

## The n most positively divergent itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[table_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Mean positive divergence over the first k and over all positive subgroups
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
print("Total positive subgroups: ", len(FPdiv[FPdiv['d_accuracy'] > 0]))
positive = FPdiv[FPdiv['d_accuracy'] > 0]
for k in top_ks:
    pr = positive.head(k).copy()
    print(f"Mean positive divergence top {k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = positive.copy()
print("Mean positive divergence all:", round(100*pr['d_accuracy'].mean(), 3))
## Mean divergence over every subgroup
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### wav2vec 2.0 w/ Acquisition

In [None]:
## Compute the divergence for wav2vec 2.0 base w/ targeted acquisition
config = 'acquisition'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in each of the two ranking tables
n = 2
## Columns kept for better visualization
view_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
## Group sizes of the "top k" mean divergence summaries
top_ks = (5, 10, 20, 50)

## Compute the divergence table once and reuse it in every step of this cell
## (the same call used to be repeated four times with identical arguments)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
## Same rows in reverse order, used for the negative-divergence side
FPdiv_rev = FPdiv[::-1]

## Retrieve Most Negatively Divergent Itemsets
pr_bot = FPdiv_rev.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
## Accuracy and divergence are shown as percentages
for pct_col in ("accuracy", "d_accuracy"):
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
pr_l_bot = pr_bot[view_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence
## NOTE(review): the count uses `<= 0`, so itemsets with zero divergence are
## counted as negative, while the means below use the strict `< 0` -- confirm
print("Total negative subgroups: ", len(FPdiv_rev[FPdiv_rev['d_accuracy'] <= 0]))
FPdiv_neg = FPdiv_rev[FPdiv_rev['d_accuracy'] < 0]
for top_k in top_ks:
    mean_div = FPdiv_neg.head(top_k)['d_accuracy'].mean()
    print(f"Mean negative divergence top {top_k}:", round(100*mean_div, 3))
print("Mean negative divergence all:", round(100*FPdiv_neg['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[view_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean positive divergence
FPdiv_pos = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(FPdiv_pos))
for top_k in top_ks:
    mean_div = FPdiv_pos.head(top_k)['d_accuracy'].mean()
    print(f"Mean positive divergence top {top_k}:", round(100*mean_div, 3))
print("Mean positive divergence all:", round(100*FPdiv_pos['d_accuracy'].mean(), 3))

## Mean divergence over all the itemsets; `pr` ends up holding the full table
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

# ITALIC

## Retrieve Data and Compute Divergence

In [None]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [None]:
## Columns of the df file that we are going to analyze, split into four groups;
## their concatenation (input_cols) is what gets discretized and mined below
demo_cols = [
    'gender',
    'age',
    'region',
    'nationality',
    'lisp',
    'education',
]

slot_cols = [
    'action',
    'scenario',
]

rec_set_cols = [
    'environment',
    'device',
    'field',
]

signal_cols = [
    'total_silence',
    'total_duration',
    'trimmed_duration',
    'n_words',
    'speed_rate_word',
    'speed_rate_word_trimmed',
]

## Group order is kept: demo, slot, recording set, signal
input_cols = [*demo_cols, *slot_cols, *rec_set_cols, *signal_cols]

In [None]:
## For every ITALIC configuration: load the test dataframe, discretize it, mine
## the frequent patterns and store the results in three dicts keyed by config
## (df_dict, FP_fm_dict, fp_divergence_dict)
from divergence_utils import discretize

approach = "divexplorer" 
## Minimum support passed to getFrequentPatternDivergence
min_sup = 0.03

configs = [
    "original", 
    "contrastive_intents",
    "contrastive_subgroups",
    "contrastive_subgroups_errors",
    "contrastive_subgroups_errors_star",
    "clues",
    "augmentation",
    "adversarial",
    "acquisition",
    ]

FP_fm_dict = {}
fp_divergence_dict = {}
df_dict = {}

for config in configs:

    print(config)

    ## Read csv file
    input_file_divexplorer = os.path.join(
        os.getcwd(), "results", "italic", config, "42", "df_test.csv")
    df = pd.read_csv(input_file_divexplorer, index_col=0)

    ## create scenario and action columns from intent
    ## NOTE(review): split("_")[1] keeps only the second token, so an intent with
    ## more than one underscore loses the rest of its action -- confirm intended
    df["scenario"] = df["intent"].apply(lambda x: x.split("_")[0])
    df["action"] = df["intent"].apply(lambda x: x.split("_")[1])

    ## Discretize the dataframe
    df_discretized = discretize(
        df[input_cols+[target_col]],
        bins=3,
        attributes=input_cols,
        strategy="quantile", 
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    ## The mapping depends only on the textual form of the bin label, so one
    ## dict is shared by all the signal columns of this config
    replace_values = {}

    for signal_col in signal_cols:

        for v in df_discretized[signal_col].unique():
            if v.startswith("<="):
                replace_values[v] = "low"
            elif v.startswith(">"):
                replace_values[v] = "high"
            elif v.startswith("(") and v.endswith("]"):
                replace_values[v] = "medium"
            else:
                ## Unexpected label format: fail loudly instead of leaving it as is
                raise ValueError(v)

        ## Assign the result back: replace(inplace=True) on a selected column is
        ## chained assignment and does not reliably update the dataframe
        df_discretized[signal_col] = df_discretized[signal_col].replace(replace_values)

    ## Create dict of Divergence df
    df_dict[config] = df_discretized
    fp_diver = FP_DivergenceExplorer(
        df_discretized, 
        true_class_name=target_col, 
        class_map={"P":1, "N":0}
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup, 
        metrics=[target_metric]
        )
    FP_fm.rename(
        columns=remapped_cols, 
        inplace=True
        )
    FP_fm = FP_fm[show_cols].copy()
    ## Round the reported metrics before storing the table
    FP_fm['accuracy'] = FP_fm['accuracy'].round(5)
    FP_fm['d_accuracy'] = FP_fm['d_accuracy'].round(5)
    FP_fm['t_value'] = FP_fm['t_value'].round(2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

In [None]:
## Compute the accuracy of the models: share of rows of each discretized
## dataframe whose 'prediction' entry adds to the sum, printed as a percentage
for config in configs:
    df_config = df_dict[config]
    n_rows = len(df_config)
    prediction = df_config['prediction'].sum() / n_rows
    print(f"Accuracy of {config}:", round(100*prediction,3))
    print("----------")

## Divergence XLSR

In [None]:
## Value forwarded as `th_redundancy` to every getDivergence call of the cells below
## NOTE(review): None presumably disables the redundancy pruning -- confirm against divexplorer
th_redundancy = None

### XLSR Original

In [None]:
## Compute the divergence for XLSR
config = 'original'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in each of the two ranking tables
n = 2
## Columns kept for better visualization
view_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
## Group sizes of the "top k" mean divergence summaries
top_ks = (5, 10, 20, 50)

## Compute the divergence table once and reuse it in every step of this cell
## (the same call used to be repeated four times with identical arguments)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
## Same rows in reverse order, used for the negative-divergence side
FPdiv_rev = FPdiv[::-1]

## Retrieve Most Negatively Divergent Itemsets
pr_bot = FPdiv_rev.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
## Accuracy and divergence are shown as percentages
for pct_col in ("accuracy", "d_accuracy"):
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
pr_l_bot = pr_bot[view_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence
## NOTE(review): the count uses `<= 0`, so itemsets with zero divergence are
## counted as negative, while the means below use the strict `< 0` -- confirm
print("Total negative subgroups: ", len(FPdiv_rev[FPdiv_rev['d_accuracy'] <= 0]))
FPdiv_neg = FPdiv_rev[FPdiv_rev['d_accuracy'] < 0]
for top_k in top_ks:
    mean_div = FPdiv_neg.head(top_k)['d_accuracy'].mean()
    print(f"Mean negative divergence top {top_k}:", round(100*mean_div, 3))
print("Mean negative divergence all:", round(100*FPdiv_neg['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[view_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean positive divergence
FPdiv_pos = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(FPdiv_pos))
for top_k in top_ks:
    mean_div = FPdiv_pos.head(top_k)['d_accuracy'].mean()
    print(f"Mean positive divergence top {top_k}:", round(100*mean_div, 3))
print("Mean positive divergence all:", round(100*FPdiv_pos['d_accuracy'].mean(), 3))

## Mean divergence over all the itemsets; `pr` ends up holding the full table
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### XLSR w/ Contrastive Intents ($L_i$)

In [None]:
## Compute the divergence for XLSR w/ contrastive learning on intents
config = 'contrastive_intents'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in each of the two ranking tables
n = 2
## Columns kept for better visualization
view_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
## Group sizes of the "top k" mean divergence summaries
top_ks = (5, 10, 20, 50)

## Compute the divergence table once and reuse it in every step of this cell
## (the same call used to be repeated four times with identical arguments)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
## Same rows in reverse order, used for the negative-divergence side
FPdiv_rev = FPdiv[::-1]

## Retrieve Most Negatively Divergent Itemsets
pr_bot = FPdiv_rev.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
## Accuracy and divergence are shown as percentages
for pct_col in ("accuracy", "d_accuracy"):
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
pr_l_bot = pr_bot[view_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence
## NOTE(review): the count uses `<= 0`, so itemsets with zero divergence are
## counted as negative, while the means below use the strict `< 0` -- confirm
print("Total negative subgroups: ", len(FPdiv_rev[FPdiv_rev['d_accuracy'] <= 0]))
FPdiv_neg = FPdiv_rev[FPdiv_rev['d_accuracy'] < 0]
for top_k in top_ks:
    mean_div = FPdiv_neg.head(top_k)['d_accuracy'].mean()
    print(f"Mean negative divergence top {top_k}:", round(100*mean_div, 3))
print("Mean negative divergence all:", round(100*FPdiv_neg['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[view_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean positive divergence
FPdiv_pos = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(FPdiv_pos))
for top_k in top_ks:
    mean_div = FPdiv_pos.head(top_k)['d_accuracy'].mean()
    print(f"Mean positive divergence top {top_k}:", round(100*mean_div, 3))
print("Mean positive divergence all:", round(100*FPdiv_pos['d_accuracy'].mean(), 3))

## Mean divergence over all the itemsets; `pr` ends up holding the full table
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### XLSR w/ Contrastive Subgroups ($L_s$)

In [None]:
## Compute the divergence for XLSR w/ L_i + L_s
## NOTE(review): the section header of this cell mentions L_s only -- confirm
## which losses the 'contrastive_subgroups' config was trained with
config = 'contrastive_subgroups'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in each of the two ranking tables
n = 2
## Columns kept for better visualization
view_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
## Group sizes of the "top k" mean divergence summaries
top_ks = (5, 10, 20, 50)

## Compute the divergence table once and reuse it in every step of this cell
## (the same call used to be repeated four times with identical arguments)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
## Same rows in reverse order, used for the negative-divergence side
FPdiv_rev = FPdiv[::-1]

## Retrieve Most Negatively Divergent Itemsets
pr_bot = FPdiv_rev.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
## Accuracy and divergence are shown as percentages
for pct_col in ("accuracy", "d_accuracy"):
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
pr_l_bot = pr_bot[view_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence
## NOTE(review): the count uses `<= 0`, so itemsets with zero divergence are
## counted as negative, while the means below use the strict `< 0` -- confirm
print("Total negative subgroups: ", len(FPdiv_rev[FPdiv_rev['d_accuracy'] <= 0]))
FPdiv_neg = FPdiv_rev[FPdiv_rev['d_accuracy'] < 0]
for top_k in top_ks:
    mean_div = FPdiv_neg.head(top_k)['d_accuracy'].mean()
    print(f"Mean negative divergence top {top_k}:", round(100*mean_div, 3))
print("Mean negative divergence all:", round(100*FPdiv_neg['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[view_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean positive divergence
FPdiv_pos = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(FPdiv_pos))
for top_k in top_ks:
    mean_div = FPdiv_pos.head(top_k)['d_accuracy'].mean()
    print(f"Mean positive divergence top {top_k}:", round(100*mean_div, 3))
print("Mean positive divergence all:", round(100*FPdiv_pos['d_accuracy'].mean(), 3))

## Mean divergence over all the itemsets; `pr` ends up holding the full table
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### XLSR w/ Contrastive Subgroups + Errors ($L_s$ + $L_e$)

In [None]:
## Compute the divergence for XLSR w/ L_s + L_e
config = 'contrastive_subgroups_errors'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in each of the two ranking tables
n = 2
## Columns kept for better visualization
view_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
## Group sizes of the "top k" mean divergence summaries
top_ks = (5, 10, 20, 50)

## Compute the divergence table once and reuse it in every step of this cell
## (the same call used to be repeated four times with identical arguments)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
## Same rows in reverse order, used for the negative-divergence side
FPdiv_rev = FPdiv[::-1]

## Retrieve Most Negatively Divergent Itemsets
pr_bot = FPdiv_rev.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
## Accuracy and divergence are shown as percentages
for pct_col in ("accuracy", "d_accuracy"):
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
pr_l_bot = pr_bot[view_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence
## NOTE(review): the count uses `<= 0`, so itemsets with zero divergence are
## counted as negative, while the means below use the strict `< 0` -- confirm
print("Total negative subgroups: ", len(FPdiv_rev[FPdiv_rev['d_accuracy'] <= 0]))
FPdiv_neg = FPdiv_rev[FPdiv_rev['d_accuracy'] < 0]
for top_k in top_ks:
    mean_div = FPdiv_neg.head(top_k)['d_accuracy'].mean()
    print(f"Mean negative divergence top {top_k}:", round(100*mean_div, 3))
print("Mean negative divergence all:", round(100*FPdiv_neg['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[view_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean positive divergence
FPdiv_pos = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(FPdiv_pos))
for top_k in top_ks:
    mean_div = FPdiv_pos.head(top_k)['d_accuracy'].mean()
    print(f"Mean positive divergence top {top_k}:", round(100*mean_div, 3))
print("Mean positive divergence all:", round(100*FPdiv_pos['d_accuracy'].mean(), 3))

## Mean divergence over all the itemsets; `pr` ends up holding the full table
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### XLSR w/ Contrastive Subgroups + Errors* ($L_s$ + $L_e^*$)

In [None]:
## Compute the divergence for XLSR w/ L_s + L_e*
config = 'contrastive_subgroups_errors_star'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in each of the two ranking tables
n = 2
## Columns kept for better visualization
view_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
## Group sizes of the "top k" mean divergence summaries
top_ks = (5, 10, 20, 50)

## Compute the divergence table once and reuse it in every step of this cell
## (the same call used to be repeated four times with identical arguments)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
## Same rows in reverse order, used for the negative-divergence side
FPdiv_rev = FPdiv[::-1]

## Retrieve Most Negatively Divergent Itemsets
pr_bot = FPdiv_rev.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
## Accuracy and divergence are shown as percentages
for pct_col in ("accuracy", "d_accuracy"):
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
pr_l_bot = pr_bot[view_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence
## NOTE(review): the count uses `<= 0`, so itemsets with zero divergence are
## counted as negative, while the means below use the strict `< 0` -- confirm
print("Total negative subgroups: ", len(FPdiv_rev[FPdiv_rev['d_accuracy'] <= 0]))
FPdiv_neg = FPdiv_rev[FPdiv_rev['d_accuracy'] < 0]
for top_k in top_ks:
    mean_div = FPdiv_neg.head(top_k)['d_accuracy'].mean()
    print(f"Mean negative divergence top {top_k}:", round(100*mean_div, 3))
print("Mean negative divergence all:", round(100*FPdiv_neg['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[view_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean positive divergence
FPdiv_pos = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(FPdiv_pos))
for top_k in top_ks:
    mean_div = FPdiv_pos.head(top_k)['d_accuracy'].mean()
    print(f"Mean positive divergence top {top_k}:", round(100*mean_div, 3))
print("Mean positive divergence all:", round(100*FPdiv_pos['d_accuracy'].mean(), 3))

## Mean divergence over all the itemsets; `pr` ends up holding the full table
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### XLSR w/ CLUES ($L_i$ + $L_s$ + $L_e$)

In [None]:
## Compute the divergence for XLSR w/ CLUES
config = 'clues'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in each of the two ranking tables
n = 2
## Columns kept for better visualization
view_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
## Group sizes of the "top k" mean divergence summaries
top_ks = (5, 10, 20, 50)

## Compute the divergence table once and reuse it in every step of this cell
## (the same call used to be repeated four times with identical arguments)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
## Same rows in reverse order, used for the negative-divergence side
FPdiv_rev = FPdiv[::-1]

## Retrieve Most Negatively Divergent Itemsets
pr_bot = FPdiv_rev.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
## Accuracy and divergence are shown as percentages
for pct_col in ("accuracy", "d_accuracy"):
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
pr_l_bot = pr_bot[view_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence
## NOTE(review): the count uses `<= 0`, so itemsets with zero divergence are
## counted as negative, while the means below use the strict `< 0` -- confirm
print("Total negative subgroups: ", len(FPdiv_rev[FPdiv_rev['d_accuracy'] <= 0]))
FPdiv_neg = FPdiv_rev[FPdiv_rev['d_accuracy'] < 0]
for top_k in top_ks:
    mean_div = FPdiv_neg.head(top_k)['d_accuracy'].mean()
    print(f"Mean negative divergence top {top_k}:", round(100*mean_div, 3))
print("Mean negative divergence all:", round(100*FPdiv_neg['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[view_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean positive divergence
FPdiv_pos = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(FPdiv_pos))
for top_k in top_ks:
    mean_div = FPdiv_pos.head(top_k)['d_accuracy'].mean()
    print(f"Mean positive divergence top {top_k}:", round(100*mean_div, 3))
print("Mean positive divergence all:", round(100*FPdiv_pos['d_accuracy'].mean(), 3))

## Mean divergence over all the itemsets; `pr` ends up holding the full table
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### XLSR w/ Data Augmentation

In [None]:
## Compute the divergence for XLSR w/ data augmentation
config = 'augmentation'
fp_divergence_i = fp_divergence_dict[config]

## Number of itemsets shown in each of the two ranking tables
n = 2
## Columns kept for better visualization
view_cols = ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
## Group sizes of the "top k" mean divergence summaries
top_ks = (5, 10, 20, 50)

## Compute the divergence table once and reuse it in every step of this cell
## (the same call used to be repeated four times with identical arguments)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
## Same rows in reverse order, used for the negative-divergence side
FPdiv_rev = FPdiv[::-1]

## Retrieve Most Negatively Divergent Itemsets
pr_bot = FPdiv_rev.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
## Accuracy and divergence are shown as percentages
for pct_col in ("accuracy", "d_accuracy"):
    pr_bot[pct_col] = (pr_bot[pct_col]*100).round(3)
pr_l_bot = pr_bot[view_cols].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence
## NOTE(review): the count uses `<= 0`, so itemsets with zero divergence are
## counted as negative, while the means below use the strict `< 0` -- confirm
print("Total negative subgroups: ", len(FPdiv_rev[FPdiv_rev['d_accuracy'] <= 0]))
FPdiv_neg = FPdiv_rev[FPdiv_rev['d_accuracy'] < 0]
for top_k in top_ks:
    mean_div = FPdiv_neg.head(top_k)['d_accuracy'].mean()
    print(f"Mean negative divergence top {top_k}:", round(100*mean_div, 3))
print("Mean negative divergence all:", round(100*FPdiv_neg['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_top[count_col] = pr_top[count_col].astype(int)
for pct_col in ("accuracy", "d_accuracy"):
    pr_top[pct_col] = (pr_top[pct_col]*100).round(3)
pr_l_top = pr_top[view_cols].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean positive divergence
FPdiv_pos = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(FPdiv_pos))
for top_k in top_ks:
    mean_div = FPdiv_pos.head(top_k)['d_accuracy'].mean()
    print(f"Mean positive divergence top {top_k}:", round(100*mean_div, 3))
print("Mean positive divergence all:", round(100*FPdiv_pos['d_accuracy'].mean(), 3))

## Mean divergence over all the itemsets; `pr` ends up holding the full table
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### XLSR w/ Adversarial

In [None]:
## Compute the divergence for XLSR w/ adversarial loss
config = 'adversarial'
fp_divergence_i = fp_divergence_dict[config]

n = 2

## Retrieve Most Negatively Divergent Itemsets 
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1] 
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization 
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence.
## The ranking is reversed so that head(k) takes rows from its bottom; presumably
## getDivergence sorts by decreasing divergence, so these are the most negative -- confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
## Use the same strict `< 0` filter for the count and for the means: with `<= 0`
## the zero-divergence rows were reported as "negative" and the total did not match
## the rows behind "Mean negative divergence all" (the positive side uses `> 0`).
neg_rows = FPdiv[FPdiv["d_accuracy"] < 0]
print("Total negative subgroups: ", len(neg_rows))
## Mean divergence (as a percentage, 3 decimals) over the first k negative rows
for top_k in (5, 10, 20, 50):
    pr = neg_rows.head(top_k).copy()
    print(f"Mean negative divergence top {top_k}:", round(100 * pr["d_accuracy"].mean(), 3))
## Same statistic over every negative row
pr = neg_rows.copy()
print("Mean negative divergence all:", round(100 * pr["d_accuracy"].mean(), 3))

## Table of the first n subgroups of the divergence ranking
## (presumably the most positively divergent ones, assuming getDivergence
## sorts by decreasing divergence -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
## Error/correct counts as integers, support with 2 decimals
pr_top = pr_top.astype({"#errors": int, "#corrects": int})
pr_top["support"] = pr_top["support"].round(2)
## Accuracy and its divergence as percentages with 3 decimals
pr_top[["accuracy", "d_accuracy"]] = (pr_top[["accuracy", "d_accuracy"]] * 100).round(3)
## Keep the columns of interest and render each itemset as a sorted, abbreviated string
pr_l_top = pr_top[
    ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
].copy()
pr_l_top["itemsets"] = pr_l_top["itemsets"].map(
    lambda items: sortItemset(items, abbreviations)
)
display(pr_l_top)

## Summary of the positively divergent subgroups (d_accuracy > 0).
## NOTE(review): the "top k" slices take the first k matching rows, so they assume
## getDivergence returns rows by decreasing divergence -- confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pos_rows = FPdiv[FPdiv["d_accuracy"] > 0]
print("Total positive subgroups: ", len(pos_rows))
## Mean divergence (as a percentage, 3 decimals) over the first k positive rows
for top_k in (5, 10, 20, 50):
    pr = pos_rows.head(top_k).copy()
    print(f"Mean positive divergence top {top_k}:", round(100 * pr["d_accuracy"].mean(), 3))
## Same statistic over every positive row
pr = pos_rows.copy()
print("Mean positive divergence all:", round(100 * pr["d_accuracy"].mean(), 3))
## ... and over all retrieved subgroups, regardless of sign
pr = FPdiv.copy()
print("\nMean divergence all:", round(100 * pr["d_accuracy"].mean(), 3))

### XLSR w/ Acquisition

In [None]:
## XLSR trained with targeted acquisition: select its divergence results
config = 'acquisition'
fp_divergence_i = fp_divergence_dict[config]

## Number of subgroups listed in each table of this cell
n = 2

## Table of the n subgroups at the bottom of the divergence ranking.
## The rows are reversed before taking head(n); presumably getDivergence sorts by
## decreasing divergence, making these the most negatively divergent -- confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
## Error/correct counts as integers, support with 2 decimals
pr_bot = pr_bot.astype({"#errors": int, "#corrects": int})
pr_bot["support"] = pr_bot["support"].round(2)
## Accuracy and its divergence as percentages with 3 decimals
pr_bot[["accuracy", "d_accuracy"]] = (pr_bot[["accuracy", "d_accuracy"]] * 100).round(3)
## Keep the columns of interest and render each itemset as a sorted, abbreviated string
pr_l_bot = pr_bot[
    ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
].copy()
pr_l_bot["itemsets"] = pr_l_bot["itemsets"].map(
    lambda items: sortItemset(items, abbreviations)
)
display(pr_l_bot)

## Compute the mean negative divergence.
## The ranking is reversed so that head(k) takes rows from its bottom; presumably
## getDivergence sorts by decreasing divergence, so these are the most negative -- confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
## Use the same strict `< 0` filter for the count and for the means: with `<= 0`
## the zero-divergence rows were reported as "negative" and the total did not match
## the rows behind "Mean negative divergence all" (the positive side uses `> 0`).
neg_rows = FPdiv[FPdiv["d_accuracy"] < 0]
print("Total negative subgroups: ", len(neg_rows))
## Mean divergence (as a percentage, 3 decimals) over the first k negative rows
for top_k in (5, 10, 20, 50):
    pr = neg_rows.head(top_k).copy()
    print(f"Mean negative divergence top {top_k}:", round(100 * pr["d_accuracy"].mean(), 3))
## Same statistic over every negative row
pr = neg_rows.copy()
print("Mean negative divergence all:", round(100 * pr["d_accuracy"].mean(), 3))

## Table of the first n subgroups of the divergence ranking
## (presumably the most positively divergent ones, assuming getDivergence
## sorts by decreasing divergence -- confirm)
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pr_top = FPdiv.head(n).copy()
## Error/correct counts as integers, support with 2 decimals
pr_top = pr_top.astype({"#errors": int, "#corrects": int})
pr_top["support"] = pr_top["support"].round(2)
## Accuracy and its divergence as percentages with 3 decimals
pr_top[["accuracy", "d_accuracy"]] = (pr_top[["accuracy", "d_accuracy"]] * 100).round(3)
## Keep the columns of interest and render each itemset as a sorted, abbreviated string
pr_l_top = pr_top[
    ["itemsets", "support", "accuracy", "d_accuracy", "t_value"]
].copy()
pr_l_top["itemsets"] = pr_l_top["itemsets"].map(
    lambda items: sortItemset(items, abbreviations)
)
display(pr_l_top)

## Summary of the positively divergent subgroups (d_accuracy > 0).
## NOTE(review): the "top k" slices take the first k matching rows, so they assume
## getDivergence returns rows by decreasing divergence -- confirm.
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)
pos_rows = FPdiv[FPdiv["d_accuracy"] > 0]
print("Total positive subgroups: ", len(pos_rows))
## Mean divergence (as a percentage, 3 decimals) over the first k positive rows
for top_k in (5, 10, 20, 50):
    pr = pos_rows.head(top_k).copy()
    print(f"Mean positive divergence top {top_k}:", round(100 * pr["d_accuracy"].mean(), 3))
## Same statistic over every positive row
pr = pos_rows.copy()
print("Mean positive divergence all:", round(100 * pr["d_accuracy"].mean(), 3))
## ... and over all retrieved subgroups, regardless of sign
pr = FPdiv.copy()
print("\nMean divergence all:", round(100 * pr["d_accuracy"].mean(), 3))