# Environment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import divexplorer 
import pandas as pd
pd.set_option('max_colwidth', None)
import os
import numpy as np
from copy import deepcopy
from sklearn.metrics import accuracy_score, f1_score

from utils_analysis import plotMultipleSV, plotShapleyValue

import warnings
warnings.filterwarnings('ignore')

# Util Functions

In [3]:
## Function for sorting data cohorts
def sortItemset(x, abbreviations={}):
    """Render an itemset as a sorted, comma-separated string.

    Args:
        x (iterable of str): the items of the itemset
        abbreviations (dict): substrings to shorten, as {long form: short form}.
        The replacements are applied to the joined string, in the dict's iteration order.

    Returns:
        str: the sorted items joined by ", ", with the abbreviations applied
    """
    label = ", ".join(sorted(x))
    for long_form, short_form in abbreviations.items():
        label = label.replace(long_form, short_form)
    return label

In [4]:
def attributes_in_itemset(itemset, attributes, alls = True):
    """ Check if the attributes of an itemset are among the input attributes (all or at least one)

    Every item is expected in the form "attribute=value"; only the text before
    the first "=" is compared with the input attributes.

    Args:
        itemset (frozenset): the itemset
        attributes (list): list of attributes of interest
        alls (bool): If True, check if ALL attributes of the itemset are input attributes.
        If False, check if AT LEAST one attribute of the itemset is an input attribute.

    Returns:
        bool: True if the itemset is admitted, False otherwise
    """
    # Avoid returning the empty itemset (i.e., info of entire dataset)
    if itemset == frozenset() and attributes:
        return False

    # Lazily evaluated, so all()/any() stop at the first decisive item
    admitted = (item.split("=")[0] in attributes for item in itemset)
    return all(admitted) if alls else any(admitted)
    
def filter_itemset_df_by_attributes(df: pd.DataFrame, attributes: list, alls = True, itemset_col_name: str = "itemsets") -> pd.DataFrame:
    """Keep the rows whose itemset involves the input attributes (all or at least one)

    Args:
        df (pd.DataFrame): the input itemsets (with their info).
        attributes (list): list of attributes of interest
        alls (bool): If True, keep an itemset only if ALL its attributes are input attributes.
        If False, keep it if AT LEAST one of its attributes is an input attribute.
        itemset_col_name (str) : the name of the itemset column, "itemsets" as default

    Returns:
        pd.DataFrame: the rows of df whose itemset is admitted
    """

    def _is_admitted(itemset):
        return attributes_in_itemset(itemset, attributes, alls = alls)

    keep_mask = df[itemset_col_name].apply(_is_admitted)
    return df.loc[keep_mask]

In [5]:
## Define abbreviations for plot and visualization
from divexplorer.FP_Divergence import abbreviateDict
# Long attribute/value names -> short labels for plots and tables.
# sortItemset applies the replacements in insertion order, so the order of the
# entries matters (e.g. 'total_silence' is handled before the generic 'total_').
abbreviations = {
    'Self-reported fluency level=native': 'fluency=native',
    'total_silence': 'tot_silence',
    'location': 'loc',
    'Current language used for work/school=English (United States)': 'lang=EN_US',
    'ageRange': 'age',
    'speakerId': 'spkID',
    'First Language spoken=English (United States)': 'lang=EN_US',
    'trimmed': 'trim',
    'total_': 'tot_',
    'speed_rate_word': 'speakRate',
    'speed_rate_char': 'speakCharRate',
    'change language': 'change lang',
    'duration': 'dur',
}

# Independent shallow copy of the mapping
abbreviations_shorter = dict(abbreviations)

# Define targets

In [6]:
## Target for DivExplorer:
# 'prediction' is 1 if predicted_intent == original_intent, 0 otherwise
target_col = "prediction"
# Metric requested from DivExplorer ...
target_metric = "d_posr"
# ... and the name it is given in the displayed tables
target_div = "d_accuracy"
t_value_col = "t_value_tp_fn"

In [7]:
## Columns for visualization
# Columns kept (in this order) from the renamed DivExplorer output
show_cols = [
    "support", "itemsets", "#errors", "#corrects", "accuracy",
    "d_accuracy", "t_value", "support_count", "length",
]
# DivExplorer column name -> displayed column name
remapped_cols = {
    "tn": "#errors",
    "tp": "#corrects",
    "posr": "accuracy",
    target_metric: target_div,
    "t_value_tp_fn": "t_value",
}

# FSC

## Metadata

In [8]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [9]:
## Columns of the df file that we are going to analyze
# NOTE(review): 'Self-reported fluency level ' ends with a space, presumably to
# mirror the CSV header - confirm before normalizing it.
demo_cols = [
    'Self-reported fluency level ',
    'First Language spoken',
    'Current language used for work/school',
    'gender',
    'ageRange',
]

slot_cols = ['action', 'object', 'location']

# Columns whose discretized ranges are later relabeled as "low"/"medium"/"high"
signal_cols = [
    'total_silence',
    'total_duration',
    'trimmed_duration',
    'n_words',
    'speed_rate_word',
    'speed_rate_word_trimmed',
]

input_cols = [*demo_cols, *signal_cols, *slot_cols]

## Data Acquisition

In [None]:
from divergence_utils import discretize

approach = "divexplorer" # "divexplorer" or "clustering"

## Define the minimum support threshold for data subgroups
if approach == "divexplorer":
    min_sup = 0.03
elif approach == "clustering":
    min_sup = 0.000001
    num_clusters = 20
else:
    # Fail here rather than on an undefined `min_sup` further down
    raise ValueError(f"Unknown approach: {approach!r}")


configs = [
    "fsc_original",
    ]

# Per-config results: divergence table, FP_Divergence object, discretized dataframe
FP_fm_dict = {}
fp_divergence_dict = {}
df_dict = {}

for config in configs:

    print(config)

    input_file_divexplorer = os.path.join(
        os.getcwd(), config, "0", "predictions_5.csv")

    df = pd.read_csv(input_file_divexplorer, index_col=0)

    ## Add SpeakerID information if it is present in the df
    if "speakerId" in input_cols:
        # The speaker id is taken from the third "/"-separated part of the index
        df['speakerId'] = df.index.map(lambda x: x.split("/")[2])

    if approach == 'divexplorer':

        ## Discretize the dataframe
        df_discretized = discretize(
            df[input_cols+[target_col]],
            bins=3,
            attributes=input_cols,
            strategy="quantile",
            round_v = 2,
            min_distinct=5,
        )

        ## Replace values with ranges: "low", "medium", "high"
        replace_values = {}

        for col in signal_cols:

            for v in df_discretized[col].unique():
                if "<=" == v[0:2]:
                    replace_values[v] = "low"
                elif ">" == v[0]:
                    replace_values[v] = "high"
                elif "("  == v[0] and "]"  == v[-1]:
                    replace_values[v] = "medium"
                else:
                    raise ValueError(v)

            # Assign the result back: an in-place replace on `df[col]` acts on a
            # temporary object and is silently lost under pandas Copy-on-Write.
            df_discretized[col] = df_discretized[col].replace(replace_values)

    elif approach == 'clustering':

        # Explicit copy so that later column assignments never touch `df`
        df_discretized = df[[f'speech_cluster_id_{k}' for k in [num_clusters]] + [target_col]].copy()

    ## Create dict of Divergence df
    df_dict[config] = df_discretized
    fp_diver = FP_DivergenceExplorer(
        df_discretized,
        true_class_name=target_col,
        class_map={"P":1, "N":0}
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup,
        metrics=[target_metric]
        )
    FP_fm.rename(
        columns=remapped_cols,
        inplace=True
        )
    FP_fm = FP_fm[show_cols].copy()
    FP_fm['accuracy'] = round(FP_fm['accuracy'], 5)
    FP_fm['d_accuracy'] = round(FP_fm['d_accuracy'], 5)
    FP_fm['t_value'] = round(FP_fm['t_value'], 2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

In [None]:
## Discretize the dataframe
from divergence_utils import discretize

# Pool of candidate samples (presumably the held-out 20% of the training data, going by the file name)
df_train_rest = pd.read_csv("data/fsc/train_data_20.csv")

if approach == 'divexplorer':
    df_discretized_rest = discretize(
        df_train_rest[input_cols],
        bins=3,
        attributes=input_cols,
        strategy="quantile",
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    replace_values = {}

    for col in signal_cols:

        for v in df_discretized_rest[col].unique():
            if "<=" == v[0:2]:
                replace_values[v] = "low"
            elif ">" == v[0]:
                replace_values[v] = "high"
            elif "("  == v[0] and "]"  == v[-1]:
                replace_values[v] = "medium"
            else:
                raise ValueError(v)

        # Assign the result back: an in-place replace on `df[col]` acts on a
        # temporary object and is silently lost under pandas Copy-on-Write.
        df_discretized_rest[col] = df_discretized_rest[col].replace(replace_values)

elif approach == 'clustering':

    # Explicit copy: a "subgID" column is added to this frame later on
    df_discretized_rest = df_train_rest[[f'speech_cluster_id_{k}' for k in [num_clusters]]].copy()

In [None]:
from tqdm import tqdm

RANDOM = False
th_redundancy = 0.15
NUM_SUBGROUPS = [2,3,4,5]

# Make sure the output folder exists, so that `to_csv` cannot fail on a fresh checkout
os.makedirs("data/fsc/new_data", exist_ok=True)

if RANDOM:

    # NOTE(review): the shuffle below is unseeded, so the random baselines differ between runs.
    # for num_samples in [176, 463, 388, 599, 547, 829, 600, 1027]:
    for num_samples in [226, 406, 382, 874, 422, 1046, 509, 1276]:

        df_train_rest = pd.read_csv("data/fsc/train_data_20.csv")
        df_train_rest = df_train_rest.sample(frac=1).reset_index(drop=True)
        df_train_rest = df_train_rest.head(num_samples)
        print("Total number of samples in to be added: ", len(df_train_rest))

        df_train = pd.read_csv("data/fsc/train_data_80.csv")
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        df_train = pd.concat([df_train, df_train_rest], ignore_index=True)
        df_train.to_csv(f"data/fsc/new_data/train_data_random_{num_samples}.csv", index=False)
        print("----------------------------------")

else:

    for NS in NUM_SUBGROUPS:

        print("Number of problematic subgroups: ", NS)

        # NOTE: `config` is the loop variable left behind by the data-acquisition cell
        fp_divergence_i = fp_divergence_dict[config]
        FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
        pr_bot = FPdiv.head(NS).copy()
        # Iterate over the rows actually returned: there may be fewer than NS subgroups
        itemsets = []
        for itemset in pr_bot.itemsets.values:
            if approach == 'divexplorer':
                itemsets.append(list(itemset))
            elif approach == 'clustering':
                itemsets.append(list(itemset)[0])

        ## Create a column in the df, and assign a class to each sample:
        # - 1 if the sample is in the most divergent itemset
        # - 2 if the sample is in the second most divergent itemset
        # - 3 if the sample is in the third most divergent itemset
        # - ...
        # - 0 otherwise
        # A sample matching several itemsets keeps the id of the first one it matches.
        subg_ids = np.zeros(len(df_discretized_rest), dtype=int)

        if approach == 'divexplorer':
            for value, itemset in enumerate(itemsets):
                in_subgroup = np.ones(len(df_discretized_rest), dtype=bool)
                for item in itemset:
                    # Split on the first "=" only: values such as "<=0.5" contain "=" themselves
                    k, v = item.split("=", 1)
                    # Item values are strings, so compare the column as strings
                    in_subgroup &= (df_discretized_rest[k].astype(str) == v).to_numpy()
                subg_ids[in_subgroup & (subg_ids == 0)] = value+1
        elif approach == 'clustering':
            for value, itemset in enumerate(itemsets):
                k, v = itemset.split("=", 1)
                in_subgroup = (df_discretized_rest[k] == int(v)).to_numpy()
                subg_ids[in_subgroup & (subg_ids == 0)] = value+1

        df_discretized_rest["subgID"] = subg_ids

        ## Keep in df_train_rest only the elements with subgID != 0
        df_train_rest = pd.read_csv("data/fsc/train_data_20.csv")
        df_train_rest = df_train_rest.loc[df_discretized_rest["subgID"]!=0]
        print("Total number of samples in to be added: ", len(df_train_rest))

        ## Append the selected samples to df_train
        df_train = pd.read_csv("data/fsc/train_data_80.csv")
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        df_train = pd.concat([df_train, df_train_rest], ignore_index=True)
        df_train.to_csv(f"data/fsc/new_data/train_data_{approach}_k{NS}.csv", index=False)
        print("----------------------------------")

## Divergence wav2vec 2.0

### Retrieve Data and Compute Divergence 

In [None]:
from divergence_utils import discretize

min_sup = 0.03

configs = [
    "fsc_original",
    "fsc_divexplorer_test-k_2",
    "fsc_divexplorer_test-k_3",
    "fsc_divexplorer_test-k_4",
    "fsc_divexplorer_test-k_5",
    "fsc_clustering_test-k_2",
    "fsc_clustering_test-k_3",
    "fsc_clustering_test-k_4",
    "fsc_clustering_test-k_5",
    "fsc_random_test-226",
    "fsc_random_test-382",
    "fsc_random_test-406",
    "fsc_random_test-422",
    "fsc_random_test-509",
    "fsc_random_test-874",
    "fsc_random_test-1046",
    "fsc_random_test-1276"
    ]

# Per-config results: divergence table, FP_Divergence object, discretized dataframe
FP_fm_dict = {}
fp_divergence_dict = {}
df_dict = {}

for config in configs:

    print(config)

    if "divexplorer" in config or "clustering" in config or "random" in config:
        # "<experiment>-<variant>" is stored under <experiment>/<variant>/0/df_test.csv
        folders = config.split("-")
        input_file_divexplorer = os.path.join(
            os.getcwd(), folders[0], folders[1], "0", "df_test.csv")
    else:
        input_file_divexplorer = os.path.join(
            os.getcwd(), config, "0", "df_test.csv")

    df = pd.read_csv(input_file_divexplorer, index_col=0)

    ## Add SpeakerID information if it is present in the df
    if "speakerId" in input_cols:
        # The speaker id is taken from the third "/"-separated part of the index
        df['speakerId'] = df.index.map(lambda x: x.split("/")[2])

    ## Discretize the dataframe
    df_discretized = discretize(
        df[input_cols+[target_col]],
        bins=3,
        attributes=input_cols,
        strategy="quantile",
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    replace_values = {}

    for col in signal_cols:

        for v in df_discretized[col].unique():
            if "<=" == v[0:2]:
                replace_values[v] = "low"
            elif ">" == v[0]:
                replace_values[v] = "high"
            elif "("  == v[0] and "]"  == v[-1]:
                replace_values[v] = "medium"
            else:
                raise ValueError(v)

        # Assign the result back: an in-place replace on `df[col]` acts on a
        # temporary object and is silently lost under pandas Copy-on-Write.
        df_discretized[col] = df_discretized[col].replace(replace_values)

    ## Create dict of Divergence df
    df_dict[config] = df_discretized
    fp_diver = FP_DivergenceExplorer(
        df_discretized,
        true_class_name=target_col,
        class_map={"P":1, "N":0}
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup,
        metrics=[target_metric]
        )
    FP_fm.rename(
        columns=remapped_cols,
        inplace=True
        )
    FP_fm = FP_fm[show_cols].copy()
    FP_fm['accuracy'] = round(FP_fm['accuracy'], 5)
    FP_fm['d_accuracy'] = round(FP_fm['d_accuracy'], 5)
    FP_fm['t_value'] = round(FP_fm['t_value'], 2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

### Original

In [None]:
## Compute the divergence for wav2vec 2.0 base
config = 'fsc_original'
th_redundancy = None
n = 2
fp_divergence_i = fp_divergence_dict[config]

## Retrieve Most Negatively Divergent Itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for ratio_col in ("accuracy", "d_accuracy"):
    # Shown as percentages
    pr_bot[ratio_col] = (pr_bot[ratio_col]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda itemset: sortItemset(itemset, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
# NOTE(review): this count uses `<= 0` while the means below use `< 0`, so
# zero-divergence subgroups are counted as "negative" here - confirm intent.
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative_div = FPdiv[FPdiv['d_accuracy'] < 0]
for top_k in (10, 20, 50):
    pr = negative_div.head(top_k).copy()
    print(f"Mean negative divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative_div.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### Random

In [None]:
## Compute the divergence for wav2vec 2.0 base rebalanced w/ random boosting
config = 'fsc_random_test-874'
th_redundancy = None
n = 2
fp_divergence_i = fp_divergence_dict[config]

## Retrieve Most Negatively Divergent Itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for ratio_col in ("accuracy", "d_accuracy"):
    # Shown as percentages
    pr_bot[ratio_col] = (pr_bot[ratio_col]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda itemset: sortItemset(itemset, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base rebalanced w/ random boosting
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
# NOTE(review): this count uses `<= 0` while the means below use `< 0`, so
# zero-divergence subgroups are counted as "negative" here - confirm intent.
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
negative_div = FPdiv[FPdiv['d_accuracy'] < 0]
for top_k in (10, 20, 50):
    pr = negative_div.head(top_k).copy()
    print(f"Mean negative divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative_div.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### DivExplorer

In [None]:
## Compute the divergence for wav2vec 2.0 base rebalanced w/ DivExplorer
config = 'fsc_divexplorer_test-k_2'
th_redundancy = None
n = 2
fp_divergence_i = fp_divergence_dict[config]

## Retrieve Most Negatively Divergent Itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for ratio_col in ("accuracy", "d_accuracy"):
    # Shown as percentages
    pr_bot[ratio_col] = (pr_bot[ratio_col]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda itemset: sortItemset(itemset, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base rebalanced w/ DivExplorer
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
negative_div = FPdiv[FPdiv['d_accuracy'] < 0]
for top_k in (10, 20, 50):
    pr = negative_div.head(top_k).copy()
    print(f"Mean negative divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative_div.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))
print("Total subgroups: ", len(FPdiv))

### Clustering

In [None]:
## Compute the divergence for wav2vec 2.0 base rebalanced w/ Clustering
config = 'fsc_clustering_test-k_2'
th_redundancy = None
n = 2
fp_divergence_i = fp_divergence_dict[config]

## Retrieve Most Negatively Divergent Itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
for count_col in ("#errors", "#corrects"):
    pr_bot[count_col] = pr_bot[count_col].astype(int)
for ratio_col in ("accuracy", "d_accuracy"):
    # Shown as percentages
    pr_bot[ratio_col] = (pr_bot[ratio_col]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda itemset: sortItemset(itemset, abbreviations))
display(pr_l_bot)

## Compute the mean negative divergence for wav2vec 2.0 base rebalanced w/ Clustering
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
negative_div = FPdiv[FPdiv['d_accuracy'] < 0]
for top_k in (10, 20, 50):
    pr = negative_div.head(top_k).copy()
    print(f"Mean negative divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative_div.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))
pr = FPdiv.copy()
print("\nMean divergence all:", round(100*pr['d_accuracy'].mean(), 3))
print("Total subgroups: ", len(FPdiv))

# ITALIC

## Metadata

In [8]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [9]:
## Columns of the df file that we are going to analyze
# NOTE: these names are reassigned here for the ITALIC section
demo_cols = [
    'gender',
    'age',
    'region',
    'nationality',
    'lisp',
    'education',
]

slot_cols = ['action', 'scenario']

rec_set_cols = ['environment', 'device', 'field']

# Columns whose discretized ranges are later relabeled as "low"/"medium"/"high"
signal_cols = [
    'total_silence',
    'total_duration',
    'trimmed_duration',
    'n_words',
    'speed_rate_word',
    'speed_rate_word_trimmed',
]

input_cols = [*demo_cols, *slot_cols, *rec_set_cols, *signal_cols]

## Data Acquisition

In [None]:
from divergence_utils import discretize

approach = "divexplorer" # "divexplorer" or "clustering"

## Define the minimum support threshold for data subgroups
if approach == "divexplorer":
    min_sup = 0.03
elif approach == "clustering":
    min_sup = 0.000001
    num_clusters = 10
else:
    # Fail here rather than on an undefined `min_sup` further down
    raise ValueError(f"Unknown approach: {approach!r}")

configs = [
    "italic_original",
    ]

# Per-config results: divergence table, FP_Divergence object, discretized dataframe
FP_fm_dict = {}
fp_divergence_dict = {}
df_dict = {}

for config in configs:

    print(config)

    input_file_divexplorer = os.path.join(
        os.getcwd(), config, "0", "predictions_7.csv")

    df = pd.read_csv(input_file_divexplorer, index_col=0)
    # NOTE(review): takes the first two "_"-separated parts of the intent; which
    # part is the action and which the scenario should be confirmed against the data.
    df['action'] = df['intent'].apply(lambda x: x.split("_")[0])
    df['scenario'] = df['intent'].apply(lambda x: x.split("_")[1])

    if approach == 'divexplorer':

        ## Discretize the dataframe
        df_discretized = discretize(
            df[input_cols+[target_col]],
            bins=3,
            attributes=input_cols,
            strategy="quantile",
            round_v = 2,
            min_distinct=5,
        )

        ## Replace values with ranges: "low", "medium", "high"
        replace_values = {}

        for col in signal_cols:

            for v in df_discretized[col].unique():
                if "<=" == v[0:2]:
                    replace_values[v] = "low"
                elif ">" == v[0]:
                    replace_values[v] = "high"
                elif "("  == v[0] and "]"  == v[-1]:
                    replace_values[v] = "medium"
                else:
                    raise ValueError(v)

            # Assign the result back: an in-place replace on `df[col]` acts on a
            # temporary object and is silently lost under pandas Copy-on-Write.
            df_discretized[col] = df_discretized[col].replace(replace_values)

    elif approach == 'clustering':

        # Explicit copy so that later column assignments never touch `df`
        df_discretized = df[[f'speech_cluster_id_{k}' for k in [num_clusters]] + [target_col]].copy()

    ## Create dict of Divergence df
    df_dict[config] = df_discretized
    fp_diver = FP_DivergenceExplorer(
        df_discretized,
        true_class_name=target_col,
        class_map={"P":1, "N":0}
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup,
        metrics=[target_metric]
        )
    FP_fm.rename(
        columns=remapped_cols,
        inplace=True
        )
    FP_fm = FP_fm[show_cols].copy()
    FP_fm['accuracy'] = round(FP_fm['accuracy'], 5)
    FP_fm['d_accuracy'] = round(FP_fm['d_accuracy'], 5)
    FP_fm['t_value'] = round(FP_fm['t_value'], 2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

In [None]:
## Discretize the dataframe
from divergence_utils import discretize

# Pool of candidate samples (presumably the held-out 20% of the training data, going by the file name)
df_train_rest = pd.read_csv("data/italic/train_data_20.csv")
# NOTE(review): takes the first two "_"-separated parts of the intent; which
# part is the action and which the scenario should be confirmed against the data.
df_train_rest['action'] = df_train_rest['intent'].apply(lambda x: x.split("_")[0])
df_train_rest['scenario'] = df_train_rest['intent'].apply(lambda x: x.split("_")[1])

if approach == 'divexplorer':
    df_discretized_rest = discretize(
        df_train_rest[input_cols],
        bins=3,
        attributes=input_cols,
        strategy="quantile",
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    replace_values = {}

    for col in signal_cols:

        for v in df_discretized_rest[col].unique():
            if "<=" == v[0:2]:
                replace_values[v] = "low"
            elif ">" == v[0]:
                replace_values[v] = "high"
            elif "("  == v[0] and "]"  == v[-1]:
                replace_values[v] = "medium"
            else:
                raise ValueError(v)

        # Assign the result back: an in-place replace on `df[col]` acts on a
        # temporary object and is silently lost under pandas Copy-on-Write.
        df_discretized_rest[col] = df_discretized_rest[col].replace(replace_values)

elif approach == 'clustering':

    # Explicit copy: a "subgID" column is added to this frame later on
    df_discretized_rest = df_train_rest[[f'speech_cluster_id_{k}' for k in [num_clusters]]].copy()

In [None]:
from tqdm import tqdm

RANDOM = False
NUM_SUBGROUPS = [2,3,4,5]
th_redundancy = 0.04

# Make sure the output folder exists, so that `to_csv` cannot fail on a fresh checkout
os.makedirs("data/italic/new_data", exist_ok=True)

if RANDOM:

    # NOTE(review): the shuffle below is unseeded, so the random baselines differ between runs.
    for num_samples in [154, 252, 383, 540, 548, 604, 945, 1035]:

        df_train_rest = pd.read_csv("data/italic/train_data_20.csv")
        df_train_rest = df_train_rest.sample(frac=1).reset_index(drop=True)
        df_train_rest = df_train_rest.head(num_samples)
        print("Total number of samples in to be added: ", len(df_train_rest))

        df_train = pd.read_csv("data/italic/train_data_80.csv")
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        df_train = pd.concat([df_train, df_train_rest], ignore_index=True)
        df_train.to_csv(f"data/italic/new_data/train_data_random_k{num_samples}.csv", index=False)
        print("----------------------------------")

else:

    for NS in NUM_SUBGROUPS:

        print("Number of problematic subgroups: ", NS)

        # NOTE: `config` is the loop variable left behind by the data-acquisition cell
        fp_divergence_i = fp_divergence_dict[config]
        FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
        pr_bot = FPdiv.head(NS).copy()
        # Iterate over the rows actually returned: there may be fewer than NS subgroups
        itemsets = []
        for itemset in pr_bot.itemsets.values:
            if approach == 'divexplorer':
                itemsets.append(list(itemset))
            elif approach == 'clustering':
                itemsets.append(list(itemset)[0])

        ## Create a column in the df, and assign a class to each sample:
        # - 1 if the sample is in the most divergent itemset
        # - 2 if the sample is in the second most divergent itemset
        # - 3 if the sample is in the third most divergent itemset
        # - ...
        # - 0 otherwise
        # A sample matching several itemsets keeps the id of the first one it matches.
        subg_ids = np.zeros(len(df_discretized_rest), dtype=int)

        if approach == 'divexplorer':
            for value, itemset in enumerate(itemsets):
                in_subgroup = np.ones(len(df_discretized_rest), dtype=bool)
                for item in itemset:
                    # Split on the first "=" only: values such as "<=0.5" contain "=" themselves
                    k, v = item.split("=", 1)
                    # Item values are strings, so compare the column as strings
                    in_subgroup &= (df_discretized_rest[k].astype(str) == v).to_numpy()
                subg_ids[in_subgroup & (subg_ids == 0)] = value+1
        elif approach == 'clustering':
            for value, itemset in enumerate(itemsets):
                k, v = itemset.split("=", 1)
                in_subgroup = (df_discretized_rest[k] == int(v)).to_numpy()
                subg_ids[in_subgroup & (subg_ids == 0)] = value+1

        df_discretized_rest["subgID"] = subg_ids

        ## Keep in df_train_rest only the elements with subgID != 0
        df_train_rest = pd.read_csv("data/italic/train_data_20.csv")
        df_train_rest = df_train_rest.loc[df_discretized_rest["subgID"]!=0]
        print("Total number of samples in to be added: ", len(df_train_rest))

        ## Append the selected samples to df_train
        df_train = pd.read_csv("data/italic/train_data_80.csv")
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        df_train = pd.concat([df_train, df_train_rest], ignore_index=True)
        # The file name used to end with an undefined `limit` variable (NameError);
        # it now follows the FSC naming scheme. NOTE(review): restore a suffix here
        # if downstream code expects one.
        df_train.to_csv(f"data/italic/new_data/train_data_{approach}_k{NS}.csv", index=False)
        print("----------------------------------")

## Divergence XLSR300m

In [None]:
from divergence_utils import discretize

min_sup = 0.03

configs = [
    "xlsr_300_original",
    "xlsr_300_random",
    "xlsr_300_divexplorer",
    "xlsr_300_clustering",
    ]

# Per-config results: divergence table, FP_Divergence object, discretized dataframe
FP_fm_dict = {}
fp_divergence_dict = {}
df_dict = {}

for config in configs:

    print(config)

    if "divexplorer" in config or "clustering" in config or "random" in config:
        # "<experiment>-<variant>" is stored under <experiment>/<variant>/0/df_test.csv.
        # Names without a "-" (such as the ones listed above) have no variant folder;
        # indexing folders[1] unconditionally raised IndexError for them.
        folders = config.split("-")
        input_file_divexplorer = os.path.join(
            os.getcwd(), *folders[:2], "0", "df_test.csv")
    else:
        input_file_divexplorer = os.path.join(
            os.getcwd(), config, "0", "df_test.csv")

    df = pd.read_csv(input_file_divexplorer, index_col=0)
    # NOTE(review): takes the first two "_"-separated parts of the intent; which
    # part is the action and which the scenario should be confirmed against the data.
    df['action'] = df['intent'].apply(lambda x: x.split("_")[0])
    df['scenario'] = df['intent'].apply(lambda x: x.split("_")[1])

    ## Discretize the dataframe
    df_discretized = discretize(
        df[input_cols+[target_col]],
        bins=3,
        attributes=input_cols,
        strategy="quantile",
        round_v = 2,
        min_distinct=5,
    )

    ## Replace values with ranges: "low", "medium", "high"
    replace_values = {}

    for col in signal_cols:

        for v in df_discretized[col].unique():
            if "<=" == v[0:2]:
                replace_values[v] = "low"
            elif ">" == v[0]:
                replace_values[v] = "high"
            elif "("  == v[0] and "]"  == v[-1]:
                replace_values[v] = "medium"
            else:
                raise ValueError(v)

        # Assign the result back: an in-place replace on `df[col]` acts on a
        # temporary object and is silently lost under pandas Copy-on-Write.
        df_discretized[col] = df_discretized[col].replace(replace_values)

    ## Create dict of Divergence df
    df_dict[config] = df_discretized
    fp_diver = FP_DivergenceExplorer(
        df_discretized,
        true_class_name=target_col,
        class_map={"P":1, "N":0}
        )
    FP_fm = fp_diver.getFrequentPatternDivergence(
        min_support=min_sup,
        metrics=[target_metric]
        )
    FP_fm.rename(
        columns=remapped_cols,
        inplace=True
        )
    FP_fm = FP_fm[show_cols].copy()
    FP_fm['accuracy'] = round(FP_fm['accuracy'], 5)
    FP_fm['d_accuracy'] = round(FP_fm['d_accuracy'], 5)
    FP_fm['t_value'] = round(FP_fm['t_value'], 2)
    FP_fm_dict[config] = FP_fm
    fp_divergence_dict[config] = FP_Divergence(FP_fm, target_div)

### Original

In [None]:
## Compute the divergence for XLSR300m
# Must match a key of `fp_divergence_dict`: the config is named "xlsr_300_original"
# (the former 'xlsr300_original' raised a KeyError).
config = 'xlsr_300_original'
fp_divergence_i = fp_divergence_dict[config]
th_redundancy = None

n = 2

## Retrieve Most Negatively Divergent Itemsets
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
# Accuracy and divergence are shown as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[[ "itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean divergence for XLSR300m
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1]
# NOTE(review): this count uses `<= 0` while the means below use `< 0`, so
# zero-divergence subgroups are counted as "negative" here - confirm intent.
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
# The following means only cover subgroups with negative divergence; the labels
# say so explicitly (two different statistics used to be printed as "Mean divergence all:").
negative_div = FPdiv[FPdiv['d_accuracy'] < 0]
for top_k in (10, 20, 50):
    pr = negative_div.head(top_k).copy()
    print(f"Mean negative divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = negative_div.copy()
print("Mean negative divergence all:", round(100*pr['d_accuracy'].mean(), 3))
pr = FPdiv.copy()
print("Mean divergence all:", round(100*pr['d_accuracy'].mean(), 3))

### Random

In [None]:
## Compute the divergence for XLSR300m rebalanced with Random Boosting
config = 'xlsr_300_random'
fp_divergence_i = fp_divergence_dict[config]
th_redundancy = None

# Number of subgroups shown in each summary table
n = 2
# Cut-offs at which the mean divergence is reported
top_ks = (5, 10, 20, 50)

# The ranking is computed once and reused for both directions (the original cell
# issued the identical getDivergence call four times).
divergence_ranked = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

## Retrieve Most Negatively Divergent Itemsets
# Reversed so that head() starts from the end of the ranking -- presumably the
# most negative divergence; confirm the sort order returned by getDivergence.
FPdiv = divergence_ranked[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
# Accuracy figures are displayed as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean divergence (in percentage points) of the negatively divergent subgroups
neg_subgroups = FPdiv[FPdiv['d_accuracy'] < 0]
# NOTE(review): the count uses `<= 0` (zero-divergence subgroups are included)
# whereas the means below use the strict `< 0` filter -- confirm this is intended.
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
for top_k in top_ks:
    pr = neg_subgroups.head(top_k).copy()
    print(f"Mean divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = neg_subgroups.copy()
print("Mean divergence all negative:", round(100*pr['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
FPdiv = divergence_ranked
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
pr_top["#errors"] = pr_top["#errors"].astype(int)
pr_top["#corrects"] = pr_top["#corrects"].astype(int)
pr_top["accuracy"] = (pr_top["accuracy"]*100).round(3)
pr_top["d_accuracy"] = (pr_top["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_top = pr_top[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean divergence (in percentage points) of the positively divergent subgroups
pos_subgroups = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(pos_subgroups))
for top_k in top_ks:
    pr = pos_subgroups.head(top_k).copy()
    print(f"Mean divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
# The summaries over all negative, all positive and all subgroups used to share
# the label "Mean divergence all:"; the labels now say which one is reported.
pr = pos_subgroups.copy()
print("Mean divergence all positive:", round(100*pr['d_accuracy'].mean(), 3))
pr = FPdiv.copy()
print("Mean divergence all subgroups:", round(100*pr['d_accuracy'].mean(), 3))

### DivExplorer

In [None]:
## Compute the divergence for XLSR300m rebalanced with DivExplorer
config = 'xlsr_300_divexplorer'
fp_divergence_i = fp_divergence_dict[config]
th_redundancy = None

# Number of subgroups shown in each summary table
n = 2
# Cut-offs at which the mean divergence is reported
top_ks = (5, 10, 20, 50)

# The ranking is computed once and reused for both directions (the original cell
# issued the identical getDivergence call four times).
divergence_ranked = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

## Retrieve Most Negatively Divergent Itemsets
# Reversed so that head() starts from the end of the ranking -- presumably the
# most negative divergence; confirm the sort order returned by getDivergence.
FPdiv = divergence_ranked[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
# Accuracy figures are displayed as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean divergence (in percentage points) of the negatively divergent subgroups
neg_subgroups = FPdiv[FPdiv['d_accuracy'] < 0]
# NOTE(review): the count uses `<= 0` (zero-divergence subgroups are included)
# whereas the means below use the strict `< 0` filter -- confirm this is intended.
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
for top_k in top_ks:
    pr = neg_subgroups.head(top_k).copy()
    print(f"Mean divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = neg_subgroups.copy()
print("Mean divergence all negative:", round(100*pr['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
FPdiv = divergence_ranked
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
pr_top["#errors"] = pr_top["#errors"].astype(int)
pr_top["#corrects"] = pr_top["#corrects"].astype(int)
pr_top["accuracy"] = (pr_top["accuracy"]*100).round(3)
pr_top["d_accuracy"] = (pr_top["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_top = pr_top[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean divergence (in percentage points) of the positively divergent subgroups
pos_subgroups = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(pos_subgroups))
for top_k in top_ks:
    pr = pos_subgroups.head(top_k).copy()
    print(f"Mean divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
# The summaries over all negative, all positive and all subgroups used to share
# the label "Mean divergence all:"; the labels now say which one is reported.
pr = pos_subgroups.copy()
print("Mean divergence all positive:", round(100*pr['d_accuracy'].mean(), 3))
pr = FPdiv.copy()
print("Mean divergence all subgroups:", round(100*pr['d_accuracy'].mean(), 3))

### Clustering

In [None]:
## Compute the divergence for XLSR300m rebalanced with Clustering
config = 'xlsr_300_clustering'
fp_divergence_i = fp_divergence_dict[config]
th_redundancy = None

# Number of subgroups shown in each summary table
n = 2
# Cut-offs at which the mean divergence is reported
top_ks = (5, 10, 20, 50)

# The ranking is computed once and reused for both directions (the original cell
# issued the identical getDivergence call four times).
divergence_ranked = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)

## Retrieve Most Negatively Divergent Itemsets
# Reversed so that head() starts from the end of the ranking -- presumably the
# most negative divergence; confirm the sort order returned by getDivergence.
FPdiv = divergence_ranked[::-1]
pr_bot = FPdiv.head(n).copy()
pr_bot["support"] = pr_bot["support"].round(2)
pr_bot["#errors"] = pr_bot["#errors"].astype(int)
pr_bot["#corrects"] = pr_bot["#corrects"].astype(int)
# Accuracy figures are displayed as percentages
pr_bot["accuracy"] = (pr_bot["accuracy"]*100).round(3)
pr_bot["d_accuracy"] = (pr_bot["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_bot = pr_bot[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_bot['itemsets'] = pr_l_bot['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_bot)

## Compute the mean divergence (in percentage points) of the negatively divergent subgroups
neg_subgroups = FPdiv[FPdiv['d_accuracy'] < 0]
# NOTE(review): the count uses `<= 0` (zero-divergence subgroups are included)
# whereas the means below use the strict `< 0` filter -- confirm this is intended.
print("Total negative subgroups: ", len(FPdiv[FPdiv['d_accuracy'] <= 0]))
for top_k in top_ks:
    pr = neg_subgroups.head(top_k).copy()
    print(f"Mean divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
pr = neg_subgroups.copy()
print("Mean divergence all negative:", round(100*pr['d_accuracy'].mean(), 3))

## Retrieve Most Positively Divergent Itemsets
FPdiv = divergence_ranked
pr_top = FPdiv.head(n).copy()
pr_top["support"] = pr_top["support"].round(2)
pr_top["#errors"] = pr_top["#errors"].astype(int)
pr_top["#corrects"] = pr_top["#corrects"].astype(int)
pr_top["accuracy"] = (pr_top["accuracy"]*100).round(3)
pr_top["d_accuracy"] = (pr_top["d_accuracy"]*100).round(3)
## Choose columns for better visualization
pr_l_top = pr_top[["itemsets", "support", "accuracy", "d_accuracy", "t_value"]].copy()
pr_l_top['itemsets'] = pr_l_top['itemsets'].apply(lambda x: sortItemset(x, abbreviations))
display(pr_l_top)

## Compute the mean divergence (in percentage points) of the positively divergent subgroups
pos_subgroups = FPdiv[FPdiv['d_accuracy'] > 0]
print("Total positive subgroups: ", len(pos_subgroups))
for top_k in top_ks:
    pr = pos_subgroups.head(top_k).copy()
    print(f"Mean divergence top {top_k}:", round(100*pr['d_accuracy'].mean(), 3))
# The summaries over all negative, all positive and all subgroups used to share
# the label "Mean divergence all:"; the labels now say which one is reported.
pr = pos_subgroups.copy()
print("Mean divergence all positive:", round(100*pr['d_accuracy'].mean(), 3))
pr = FPdiv.copy()
print("Mean divergence all subgroups:", round(100*pr['d_accuracy'].mean(), 3))