# Configuring the Environment

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import librosa
from datasets import load_dataset, Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from jiwer import wer

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd
import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
import divexplorer 
import pandas as pd
pd.set_option('max_colwidth', None)
import os
import numpy as np

from utils_analysis import filter_itemset_df_by_attributes, slice_by_itemset

from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [None]:
## Set device: first CUDA GPU when one is visible, CPU otherwise
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [None]:
## Seed every random number generator used below (PyTorch CPU, PyTorch CUDA, NumPy)
SEED = 42
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    _seed_fn(SEED)

# Loading pretrained features

In [None]:
## Load dataset: the three LibriSpeech "clean" splits, in train / validation / test order
dataset_train, dataset_valid, dataset_test = (
    load_dataset("librispeech_asr", "clean", split=split_name)
    for split_name in ("train.360", "validation", "test")
)

In [None]:
# Number of utterances per split (train, valid, test)
tuple(len(split) for split in (dataset_train, dataset_valid, dataset_test))

In [None]:
# Number of distinct speakers per split (train, valid, test)
tuple(len(set(split['speaker_id'])) for split in (dataset_train, dataset_valid, dataset_test))

In [None]:
## Load hidden states and logits
# Six pre-computed objects are read per split from pretrained/librispeech/:
# avg/last hidden states, concatenated logits, sequence lengths, transcriptions and WERs.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load files
# that come from a trusted source.
# NOTE(review): later cells index these objects positionally together with the
# HF datasets, so they are presumably stored in dataset order -- confirm.
print("Loading train features...")
avg_hidden_states_train = torch.load('pretrained/librispeech/avg_hidden_states_train.pt')
last_hidden_states_train = torch.load('pretrained/librispeech/last_hidden_states_train.pt')
logits_concatenation_train = torch.load('pretrained/librispeech/logits_concatenation_train.pt')
sequence_lengths_train = torch.load('pretrained/librispeech/sequence_lengths_train.pt')
transcriptions_train = torch.load('pretrained/librispeech/transcriptions_train.pt')
wers_train = torch.load('pretrained/librispeech/wers_train.pt')

# Same six objects for the validation split
print("Loading valid features...")
avg_hidden_states_valid = torch.load('pretrained/librispeech/avg_hidden_states_valid.pt')
last_hidden_states_valid = torch.load('pretrained/librispeech/last_hidden_states_valid.pt')
logits_concatenation_valid = torch.load('pretrained/librispeech/logits_concatenation_valid.pt')
sequence_lengths_valid = torch.load('pretrained/librispeech/sequence_lengths_valid.pt')
transcriptions_valid = torch.load('pretrained/librispeech/transcriptions_valid.pt')
wers_valid = torch.load('pretrained/librispeech/wers_valid.pt')

# Same six objects for the test split
print("Loading test features...")
avg_hidden_states_test = torch.load('pretrained/librispeech/avg_hidden_states_test.pt')
last_hidden_states_test = torch.load('pretrained/librispeech/last_hidden_states_test.pt')
logits_concatenation_test = torch.load('pretrained/librispeech/logits_concatenation_test.pt')
sequence_lengths_test = torch.load('pretrained/librispeech/sequence_lengths_test.pt')
transcriptions_test = torch.load('pretrained/librispeech/transcriptions_test.pt')
wers_test = torch.load('pretrained/librispeech/wers_test.pt')

# Prediction

In [None]:
# Per-utterance correctness label: 1 when the stored transcription equals the
# reference text exactly, 0 otherwise.
prediction_train, prediction_valid, prediction_test = (
    (np.array(split["text"]) == np.array(hypotheses)).astype(int)
    for split, hypotheses in (
        (dataset_train, transcriptions_train),
        (dataset_valid, transcriptions_valid),
        (dataset_test, transcriptions_test),
    )
)

# Confidence Model 

In [None]:
## Confidence model
class ConfidenceModel(nn.Module):
    """Three-layer MLP whose outputs are squashed by a sigmoid.

    Each of the two hidden stages applies Linear -> GELU -> Dropout(0.1) ->
    LayerNorm; the single ``norm`` module (and therefore its parameters) is
    shared by both stages. The final Linear is followed by a sigmoid, so every
    output lies in [0, 1].

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        output_size: number of output units.
    """

    def __init__(self, input_size=768, hidden_size=500, output_size=1):
        super().__init__()
        # Sub-modules are registered in this exact order (state_dict layout).
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.GELU()  # attribute is called `relu` but holds a GELU
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)
        self.norm = nn.LayerNorm(hidden_size)

        # Kaiming-normal weights and zero biases for every linear layer
        for layer in (self.linear1, self.linear2, self.linear3):
            nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return the sigmoid-activated outputs for the batch ``x``."""
        for hidden_layer in (self.linear1, self.linear2):
            x = self.norm(self.dropout(self.relu(hidden_layer(x))))
        return self.sigmoid(self.linear3(x))

In [None]:
## Train, valid and test
def train(model, inputs, labels, criterion, optimizer):
    """Run a single optimisation step on one batch.

    The model is switched to training mode, gradients are cleared, the inputs
    are cast to float32 for the forward pass, and the optimizer is stepped
    once on the resulting loss.

    Returns:
        tuple: (model outputs for the batch, loss as a Python float).
    """
    model.train()
    optimizer.zero_grad()
    predictions = model(inputs.float())
    batch_loss = criterion(predictions, labels)
    batch_loss.backward()
    optimizer.step()
    return predictions, batch_loss.item()

def val(model, inputs, labels, criterion):
    """Evaluate the model on one batch without updating it.

    The model is put in eval mode and the forward pass runs under
    ``torch.no_grad()``, so no autograd graph is built for the (potentially
    very large) validation batch.

    Returns:
        tuple: (model outputs, detached from autograd; loss as a Python float).
    """
    model.eval()
    with torch.no_grad():
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
    return outputs, loss.item()

def test(model, inputs, labels=None, criterion=None):
    model.eval()
    if labels is None and criterion is None:
        outputs = model(inputs.float())
        return outputs
    else:
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        return outputs, loss.item()

In [None]:
from torchsummary import summary

# Build a throw-away model with the default 768-dim input and print its layer summary
model = ConfidenceModel(input_size=768, hidden_size=100, output_size=1).to(device)
summary(model, input_size=(768,))

# Problem Setup

In [None]:
# Hyper-parameters shared by the cells below
HIDDEN_SIZE = 100  # hidden_size of every ConfidenceModel built in the training cells
BATCH_SIZE = 4096  # mini-batch size of the pretraining loop
NUM_SUBGROUPS = 2  # number of most-divergent itemsets kept; classifiers get NUM_SUBGROUPS+1 classes (0 = none)
EPOCHS = 10000  # upper bound on epochs; both training loops may stop earlier
MIN_SUP = 0.05  # min_support passed to getFrequentPatternDivergence
PRETRAIN = True  # fine-tune from the pretrained confidence model instead of a freshly initialised one

# DivExplorer

## Prepare df

In [None]:
def _read_speaker_genders(path):
    """Parse a LibriSpeech SPEAKERS.TXT file into {speaker_id (str): gender}.

    Data rows are pipe-separated ("ID | SEX | SUBSET | MINUTES | NAME"); the
    first two fields are used. Blank lines and ';' comment/header lines are
    skipped instead of being parsed as speakers.
    NOTE(review): the pipe-separated layout is the published LibriSpeech
    format -- confirm the local file matches it.
    """
    genders = {}
    with open(path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith(';'):
                continue
            fields = [field.strip() for field in line.split('|')]
            if len(fields) < 2:
                # Not a data row: nothing to extract
                continue
            genders[fields[0]] = fields[1]
    return genders


# Map every speaker id to its gender
speakers = _read_speaker_genders('data/librispeech/SPEAKERS.TXT')

# Attach a per-utterance "gender" column to each split (speaker ids are looked up as strings)
gender_train = [speakers[str(sID)] for sID in dataset_train["speaker_id"]]
dataset_train = dataset_train.add_column("gender", gender_train)

gender_valid = [speakers[str(sID)] for sID in dataset_valid["speaker_id"]]
dataset_valid = dataset_valid.add_column("gender", gender_valid)

gender_test = [speakers[str(sID)] for sID in dataset_test["speaker_id"]]
dataset_test = dataset_test.add_column("gender", gender_test)

In [None]:
def _metadata_frame(csv_path, genders, wers, utterance_ids):
    """Read one split's speech-metadata CSV and attach gender, WER and id columns."""
    frame = pd.read_csv(csv_path)
    frame['gender'] = genders
    frame['WER'] = wers
    frame['id'] = utterance_ids
    return frame


# One metadata frame per split; the added columns are assigned positionally
df_train = _metadata_frame(
    'data/librispeech/speech_metadata_train.csv', gender_train, wers_train, dataset_train['id']
)
df_valid = _metadata_frame(
    'data/librispeech/speech_metadata_valid.csv', gender_valid, wers_valid, dataset_valid['id']
)
df_test = _metadata_frame(
    'data/librispeech/speech_metadata_test.csv', gender_test, wers_test, dataset_test['id']
)

In [None]:
## Divide the training set into training (80%) and heldout (20%)
dataset_train_1 = dataset_train.train_test_split(test_size=0.2, seed=42)

## Select the metadata rows of each part by utterance id (original row order is kept)
_is_train_row = df_train['id'].isin(dataset_train_1['train']['id'])
_is_heldout_row = df_train['id'].isin(dataset_train_1['test']['id'])
df_train_new = df_train[_is_train_row].reset_index(drop=True)
df_heldout = df_train[_is_heldout_row].reset_index(drop=True)

len(df_train_new), len(df_heldout)

## Utils

In [None]:
## Define abbreviations for plot and visualization
from divexplorer.FP_Divergence import abbreviateDict
# Long name -> short name, applied as plain substring replacements in this
# order by sortItemset.
abbreviations = {
    "total_silence": "tot_silence",
    "speaker_id": "spkID",
    "trimmed": "trim",
    "total_": "tot_",
    "speed_rate_word_trimmed": "speakRate_trim",
    "trim_duration": "trim_dur",
    "speed_rate_word": "speakRate",
    "speed_rate_char": "speakCharRate",
    "duration": "dur",
}

# Independent copy with the same entries
abbreviations_shorter = dict(abbreviations)

## Function for sorting data cohorts
def sortItemset(x, abbreviations=None):
    """Render an itemset as a sorted, comma-separated, optionally abbreviated string.

    Args:
        x: iterable of item strings (e.g. a frozenset of "attribute=value").
        abbreviations: optional mapping long -> short; every key is replaced
            by its value as a plain substring, in the mapping's iteration
            order. ``None`` (the default) means no replacement.

    Returns:
        str: the items sorted and joined with ", ".
    """
    label = ", ".join(sorted(x))
    # `or {}` avoids a shared mutable default argument
    for long_name, short_name in (abbreviations or {}).items():
        label = label.replace(long_name, short_name)
    return label

def attributes_in_itemset(itemset, attributes, alls = True):
    """Tell whether an itemset is made of (or touches) the given attributes.

    Args:
        itemset (frozenset): items of the form "attribute=value".
        attributes (list): attribute names of interest.
        alls (bool): if True, every item's attribute must be in ``attributes``;
            if False, one matching item is enough.

    Returns:
        bool: the result of the check. The empty itemset (the whole dataset)
        is rejected whenever ``attributes`` is non-empty.
    """
    # The empty itemset describes the entire dataset: never report it for a real filter
    if itemset == frozenset() and attributes:
        return False

    # Attribute name = text before the first "="; evaluated lazily, item by item
    attribute_names = (item.split("=")[0] for item in itemset)
    if alls:
        return all(name in attributes for name in attribute_names)
    return any(name in attributes for name in attribute_names)
    
def filter_itemset_df_by_attributes(df: pd.DataFrame, attributes: list, alls = True, itemset_col_name: str = "itemsets") -> pd.DataFrame:
    """Keep the rows whose itemset passes ``attributes_in_itemset``.

    Args:
        df (pd.DataFrame): itemsets together with their statistics.
        attributes (list): attribute names of interest.
        alls (bool): if True, all attributes of an itemset must be in
            ``attributes``; if False, at least one must be.
        itemset_col_name (str): name of the itemset column ("itemsets" by default).

    Returns:
        pd.DataFrame: the selected rows of ``df``.
    """

    def _is_admitted(itemset):
        return attributes_in_itemset(itemset, attributes, alls=alls)

    keep_rows = df[itemset_col_name].apply(_is_admitted)
    return df.loc[keep_rows]

In [None]:
## Target for DivExplorer: 'WER'
target_col = 'WER'
target_metric = 'd_outcome'
target_div = 'd_' + target_col
t_value_col = 't_value_outcome'
# Columns shown when printing a divergence table
printable_columns = ['support', 'itemsets', target_col, target_div, 't_value']

In [None]:
## Columns for visualization
# Generic DivExplorer column names -> target-specific names
remapped_cols = {"outcome": target_col, "d_outcome": target_div, t_value_col: 't_value'}
show_cols = ['support', 'itemsets', target_col, target_div, 'support_count', 'length', 't_value']

## Columns of the df file that we are going to analyze
demo_cols = ['gender']
signal_cols = ['total_silence', 'total_duration', 'n_words', 'speed_rate_word']
input_cols = [*demo_cols, *signal_cols]

## Train

In [None]:
## Discretize the dataframe
from divergence_utils import discretize

df_discretized = discretize(
    df_train_new[input_cols+[target_col]],
    bins=3,
    attributes=input_cols,
    strategy="quantile",
    round_v = 2,
    min_distinct=5,
)


def _interval_to_level(interval):
    """Map a discretized interval label to "low" / "medium" / "high".

    "<=..." is the lowest bin, ">..." the highest and "(a, b]" the middle one;
    any other label raises ValueError.
    """
    if interval.startswith("<="):
        return "low"
    if interval.startswith(">"):
        return "high"
    if interval.startswith("(") and interval.endswith("]"):
        return "medium"
    raise ValueError(interval)


## Replace values with ranges: "low", "medium", "high"
replace_values = {}

for col in signal_cols:
    replace_values.update({v: _interval_to_level(v) for v in df_discretized[col].unique()})
    # Assign the result back: an in-place replace on `df[col]` is chained
    # assignment and leaves the frame untouched under pandas Copy-on-Write.
    df_discretized[col] = df_discretized[col].replace(replace_values)

## Create dict of Divergence df
fp_diver = FP_DivergenceExplorer(df_discretized, target_name=target_col)
FP_fm = fp_diver.getFrequentPatternDivergence(min_support=MIN_SUP, metrics=[target_metric])
FP_fm.rename(columns=remapped_cols, inplace=True)
FP_fm = FP_fm[show_cols].copy()
FP_fm['WER'] = round(FP_fm['WER'], 5)
FP_fm['d_WER'] = round(FP_fm['d_WER'], 5)
FP_fm['t_value'] = round(FP_fm['t_value'], 2)
fp_divergence = FP_Divergence(FP_fm, target_div)

In [None]:
## Compute the divergence for Wav2Vec2-Base (result order reversed)
FPdiv = fp_divergence.getDivergence(th_redundancy=0.001)[::-1]

## Retrieve Most Divergent Itemsets
from copy import deepcopy
pr = FPdiv.head(NUM_SUBGROUPS).copy()
pr["support"] = pr["support"].round(2)
# Express WER and its divergence as percentages
for _pct_col in ("WER", "d_WER"):
    pr[_pct_col] = (pr[_pct_col] * 100).round(3)
display(pr)

In [None]:
## Create a column in the df, and assign a class to each sample:
# - 1 if the sample is in the most divergent itemset
# - 2 if the sample is in the second most divergent itemset
# - 3 if the sample is in the third most divergent itemset
# - ...
# - 0 otherwise
# A sample matching several itemsets keeps the id of the first one it matches.

df_discretized["subgID"] = 0

# One list of "attribute=value" items per selected itemset, in the order of `pr`
itemsets = [list(itemset) for itemset in pr.itemsets.values[:NUM_SUBGROUPS]]

# Vectorised assignment: one boolean mask per itemset instead of a per-row .loc
# loop, so it does not depend on the frame having a 0..n-1 index.
for subgroup_id, itemset in enumerate(itemsets, start=1):
    in_subgroup = np.ones(len(df_discretized), dtype=bool)
    for item in itemset:
        attribute, value = item.split("=", 1)
        in_subgroup &= (df_discretized[attribute] == value).to_numpy()
    unassigned = (df_discretized["subgID"] == 0).to_numpy()
    df_discretized.loc[in_subgroup & unassigned, "subgID"] = subgroup_id

# Number of samples per class (0 .. NUM_SUBGROUPS)
for i in range(0,NUM_SUBGROUPS+1):
    print(len(df_discretized.loc[df_discretized["subgID"]==i]))

## Valid

In [None]:
## Discretize the dataframe
from divergence_utils import discretize

df_discretized_valid = discretize(
    df_valid[input_cols+[target_col]],
    bins=3,
    attributes=input_cols,
    strategy="quantile",
    round_v = 2,
    min_distinct=5,
)


def _interval_to_level(interval):
    """Map a discretized interval label to "low" / "medium" / "high".

    "<=..." is the lowest bin, ">..." the highest and "(a, b]" the middle one;
    any other label raises ValueError.
    """
    if interval.startswith("<="):
        return "low"
    if interval.startswith(">"):
        return "high"
    if interval.startswith("(") and interval.endswith("]"):
        return "medium"
    raise ValueError(interval)


## Replace values with ranges: "low", "medium", "high"
replace_values = {}

for col in signal_cols:
    replace_values.update({v: _interval_to_level(v) for v in df_discretized_valid[col].unique()})
    # Assign the result back: an in-place replace on `df[col]` is chained
    # assignment and leaves the frame untouched under pandas Copy-on-Write.
    df_discretized_valid[col] = df_discretized_valid[col].replace(replace_values)

In [None]:
## Create a column in the df, and assign a class to each sample:
# - 1 if the sample is in the most divergent itemset
# - 2 if the sample is in the second most divergent itemset
# - 3 if the sample is in the third most divergent itemset
# - ...
# - 0 otherwise
# A sample matching several itemsets keeps the id of the first one it matches.

df_discretized_valid["subgID"] = 0

# Vectorised assignment: one boolean mask per itemset instead of a per-row .loc
# loop, so it does not depend on the frame having a 0..n-1 index.
for subgroup_id, itemset in enumerate(itemsets, start=1):
    in_subgroup = np.ones(len(df_discretized_valid), dtype=bool)
    for item in itemset:
        attribute, value = item.split("=", 1)
        in_subgroup &= (df_discretized_valid[attribute] == value).to_numpy()
    unassigned = (df_discretized_valid["subgID"] == 0).to_numpy()
    df_discretized_valid.loc[in_subgroup & unassigned, "subgID"] = subgroup_id

# Number of samples per class (0 .. NUM_SUBGROUPS)
for i in range(0,NUM_SUBGROUPS+1):
    print(len(df_discretized_valid.loc[df_discretized_valid["subgID"]==i]))

# CM Pretraining and Finetuning

In [None]:
# Signal-level metadata columns appended to the model inputs
_cm_feature_cols = ['total_silence', 'n_words', 'speed_rate_word']
df_cm = df_train_new[_cm_feature_cols]
df_cm_valid = df_valid[_cm_feature_cols]

## Pretraining the CM

In [None]:
# df_train_new holds the rows of df_train whose id fell in the random 80%
# split, NOT the first 80% of the rows. The pre-computed train features and
# labels are therefore selected with the same id mask rather than sliced with
# [:len(df_train_new)], which paired features with the wrong metadata.
# NOTE(review): assumes the *_train feature objects follow the row order of
# df_train / dataset_train, as the original positional slicing already did.
_train_ids = set(dataset_train_1['train']['id'])
_train_mask = torch.tensor(df_train['id'].isin(_train_ids).to_numpy())

X_train = torch.cat((
    torch.tensor(logits_concatenation_train)[_train_mask],
    torch.tensor(sequence_lengths_train)[_train_mask].unsqueeze(dim=1),
    torch.tensor(last_hidden_states_train)[_train_mask].squeeze(),
    torch.tensor(df_cm['total_silence'].to_numpy()).unsqueeze(1),
    torch.tensor(df_cm['n_words'].to_numpy()).unsqueeze(1),
    torch.tensor(df_cm['speed_rate_word'].to_numpy()).unsqueeze(1),
    ), dim=1)
# 1 = transcription exactly correct, 0 = otherwise
y_train = torch.tensor(prediction_train)[_train_mask].unsqueeze(1)

# The validation split is used in full, so positional order is already consistent
X_val = torch.cat((
    torch.tensor(logits_concatenation_valid),
    torch.tensor(sequence_lengths_valid).unsqueeze(dim=1),
    torch.tensor(last_hidden_states_valid).squeeze(),
    torch.tensor(df_cm_valid['total_silence'].to_numpy()).unsqueeze(1),
    torch.tensor(df_cm_valid['n_words'].to_numpy()).unsqueeze(1),
    torch.tensor(df_cm_valid['speed_rate_word'].to_numpy()).unsqueeze(1),
    ), dim=1)
y_val = torch.tensor(prediction_valid).unsqueeze(1)

In [None]:
import copy

seeds = [1, 10, 42]

# Pretrain the binary confidence model once per seed.
# NOTE: every seed writes to the same checkpoint path, so the file on disk (and
# `best_model` after the loop) belongs to the last seed.
for seed in seeds:

    SEED = seed
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)

    best_auc = 0
    best_acc = 0
    best_output = 0
    best_model = 0
    best_epoch = 0

    ## Create model
    model = ConfidenceModel(
        input_size=X_train.shape[1],
        hidden_size=HIDDEN_SIZE,
        output_size=1
        ).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.NAdam(model.parameters(), lr=0.005) #NAdam

    ## Train model
    train_losses = []
    val_losses = []
    val_aucs = []

    # The validation tensors never change: convert and move them once per seed
    X_val_device = X_val.float().to(device)
    y_val_device = y_val.float().to(device)

    for epoch in range(EPOCHS):

        ## Train in batches (samples are visited in their stored order)
        for i in range(0, len(X_train), BATCH_SIZE):
            train_output, train_loss = train(
                model,
                X_train[i:i+BATCH_SIZE].float().to(device),
                y_train[i:i+BATCH_SIZE].float().to(device),
                criterion,
                optimizer
                )
        # Only the loss of the last mini-batch of the epoch is recorded
        train_losses.append(train_loss)

        val_scores, val_loss = val(
            model,
            X_val_device,
            y_val_device,
            criterion
            )
        val_losses.append(val_loss)
        val_scores = val_scores.detach().cpu()
        val_output = (val_scores > 0.5).float()
        val_acc = accuracy_score(y_val, val_output.numpy())
        # AUC is a ranking metric: it needs the sigmoid scores, not the 0/1 decisions
        val_auc = roc_auc_score(y_val, val_scores.numpy())
        val_aucs.append(val_auc)

        if val_auc > best_auc:
            best_auc = val_auc
            best_acc = val_acc
            best_output = val_output
            # Snapshot the weights: `best_model = model` would only alias the
            # network that keeps training, so the "best" model would be the last one.
            best_model = copy.deepcopy(model)
            best_epoch = epoch

        # After 1000 epochs, stop once the validation loss rose twice in a row
        if epoch > 1000:
            if val_losses[-1] > val_losses[-2] and val_losses[-2] > val_losses[-3]:
                break

    ## Print metrics
    print("Best epoch: ", best_epoch)
    print("Val accuracy: ", round(best_acc*100, 2), "%")
    print("Val AUC: ", round(best_auc, 2))

    ## Save model
    torch.save(best_model, 'cm_pt_ft/librispeech/confidence_model_pt.pt')
    print("Model saved!")

## Challenging Subgroups Prediction

In [None]:
## Subgroup-id targets (0 = no subgroup) for the train and validation samples
y_train_subs = torch.tensor(df_discretized['subgID'].to_numpy())
y_val_subs = torch.tensor(df_discretized_valid['subgID'].to_numpy())

In [None]:
import copy

seeds = [1, 10, 42]

# Fine-tune (or train from scratch) the subgroup classifier once per seed.
# NOTE: every seed writes to the same checkpoint path, so the file on disk
# belongs to the last seed.
# NOTE(review): ConfidenceModel applies a sigmoid to its outputs and
# CrossEntropyLoss is then computed on those squashed values -- confirm this is intended.
for seed in seeds:

    SEED = seed
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)

    best_f1macro = 0
    best_acc = 0
    best_output = 0
    best_epoch = 0
    best_finetuned = None
    train_losses = []
    val_losses = []

    if PRETRAIN:
        # Work on a copy so every seed starts from the same pretrained weights;
        # reusing `best_model` itself would keep training one network across seeds.
        model = copy.deepcopy(best_model)
        # Swap the binary head for a (NUM_SUBGROUPS + 1)-way head
        model.linear3 = nn.Linear(
            HIDDEN_SIZE,
            NUM_SUBGROUPS+1
            ).to(device)
    else:
        model = ConfidenceModel(
            input_size=X_train.shape[1],
            hidden_size=HIDDEN_SIZE,
            output_size=NUM_SUBGROUPS+1
            ).to(device)

    ## Criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.NAdam(model.parameters(), lr=0.001)

    # Full-batch training: convert and move the tensors to the device once, not every epoch
    X_train_device = X_train.float().to(device)
    y_train_subs_device = y_train_subs.to(device)
    X_val_device = X_val.float().to(device)
    y_val_subs_device = y_val_subs.to(device)

    ## Train and validate model
    for epoch in range(EPOCHS):
        train_output, train_loss = train(
            model,
            X_train_device,
            y_train_subs_device,
            criterion,
            optimizer
            )
        val_output, val_loss = val(
            model,
            X_val_device,
            y_val_subs_device,
            criterion
            )
        val_output = val_output.cpu().detach().numpy()
        val_output = np.argmax(val_output, axis=1)
        val_acc = accuracy_score(y_val_subs, val_output)
        val_f1 = f1_score(y_val_subs, val_output, average='macro')
        if val_f1 > best_f1macro:
            best_f1macro = val_f1
            best_acc = val_acc
            best_output = val_output
            best_epoch = epoch
            # Keep the weights that produced the reported best metrics
            best_finetuned = copy.deepcopy(model)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # After 5000 epochs, stop when the loss is not below either of the two previous epochs
        if epoch > 5000 and val_loss >= val_losses[-2] and val_loss >= val_losses[-3]:
            break
        if epoch % 500 == 0:
            print(f'Epoch: {epoch} | Train Loss: {train_loss} | Val Loss: {val_loss}')

    ## Print best accuracy and F1 macro
    print("Best Epoch: ", best_epoch)
    print("Best Accuracy: ", best_acc)
    print("Best F1 Macro: ", best_f1macro)
    print("--------------------\n")

    ## Save model (the best-F1 snapshot; the last model if F1 never rose above 0)
    torch.save(
        best_finetuned if best_finetuned is not None else model,
        'cm_pt_ft/librispeech/confidence_model_ft.pt'
        )
    print("Model saved!")

# Select New Data

## Discretize

In [None]:
## Discretize the dataframe
from divergence_utils import discretize

df_discretized_heldout = discretize(
    df_heldout[input_cols+[target_col]],
    bins=3,
    attributes=input_cols,
    strategy="quantile",
    round_v = 2,
    min_distinct=5,
)


def _interval_to_level(interval):
    """Map a discretized interval label to "low" / "medium" / "high".

    "<=..." is the lowest bin, ">..." the highest and "(a, b]" the middle one;
    any other label raises ValueError.
    """
    if interval.startswith("<="):
        return "low"
    if interval.startswith(">"):
        return "high"
    if interval.startswith("(") and interval.endswith("]"):
        return "medium"
    raise ValueError(interval)


## Replace values with ranges: "low", "medium", "high"
replace_values = {}

for col in signal_cols:
    replace_values.update({v: _interval_to_level(v) for v in df_discretized_heldout[col].unique()})
    # Assign the result back: an in-place replace on `df[col]` is chained
    # assignment and leaves the frame untouched under pandas Copy-on-Write.
    df_discretized_heldout[col] = df_discretized_heldout[col].replace(replace_values)

## Predict Challenging Subgroup IDs

In [None]:
# df_heldout holds the rows of df_train whose id fell in the random 20% split,
# NOT the last 20% of the rows. The pre-computed features are therefore
# selected with the same id mask rather than sliced with [len(df_train_new):].
# NOTE(review): assumes the *_train feature objects follow the row order of
# df_train / dataset_train, as the original positional slicing already did.
_heldout_ids = set(dataset_train_1['test']['id'])
_heldout_mask = torch.tensor(df_train['id'].isin(_heldout_ids).to_numpy())

X_train_heldout = torch.cat((
    torch.tensor(logits_concatenation_train)[_heldout_mask],
    torch.tensor(sequence_lengths_train)[_heldout_mask].unsqueeze(dim=1),
    torch.tensor(last_hidden_states_train)[_heldout_mask].squeeze(),
    torch.tensor(df_heldout['total_silence'].to_numpy()).unsqueeze(1),
    torch.tensor(df_heldout['n_words'].to_numpy()).unsqueeze(1),
    torch.tensor(df_heldout['speed_rate_word'].to_numpy()).unsqueeze(1),
    ), dim=1)
# 1 = transcription exactly correct, 0 = otherwise
y_train_heldout = torch.tensor(prediction_train)[_heldout_mask].unsqueeze(1)

In [None]:
# Load the fine-tuned subgroup classifier directly onto the active device, so a
# checkpoint saved on a GPU can also be used on a CPU-only machine.
# NOTE(review): the file is a fully pickled nn.Module; only load trusted
# checkpoints. PyTorch releases whose torch.load defaults to weights_only=True
# will need weights_only=False for this kind of file.
model = torch.load('cm_pt_ft/librispeech/confidence_model_ft.pt', map_location=device)
model = model.to(device)

# Predicted subgroup id (0 = none) for every held-out sample
train_left_out_output = test(
    model,
    X_train_heldout.to(device),
    )
train_left_out_output = train_left_out_output.cpu().detach().numpy()
train_left_out_output = np.argmax(train_left_out_output, axis=1)

In [None]:
# Keep the rows of df_heldout whose predicted subgroup id is different from 0
df_heldout['subgID'] = train_left_out_output
print(len(df_heldout))

_is_divergent = df_heldout['subgID'] != 0
divergent_samples = df_heldout.loc[_is_divergent]
# Remember how many samples were selected; the random baseline is capped to this size
num_samples = len(divergent_samples)
print(num_samples)

In [None]:
ids = list(divergent_samples['id'])

## Save ids as txt, one id per line
with open('divergent_samples_librispeech_csi.txt', 'w') as f:
    f.writelines("%s\n" % item for item in ids)

# Random Baseline

In [None]:
## Random baseline: assign each sample a random subgroup id in [0, NUM_SUBGROUPS]
random_pred = np.random.randint(low=0, high=NUM_SUBGROUPS + 1, size=len(X_train_heldout))

In [None]:
# Keep the rows of df_heldout whose random subgroup id is different from 0
df_heldout['subgID'] = random_pred
print(len(df_heldout))

_is_divergent = df_heldout['subgID'] != 0
divergent_samples = df_heldout.loc[_is_divergent]
print(len(divergent_samples))

# Shuffle with a fixed seed, then keep at most num_samples rows
divergent_samples = (
    divergent_samples
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .iloc[:num_samples]
)
print(len(divergent_samples))

In [None]:
ids = list(divergent_samples['id'])

## Save ids as txt, one id per line
with open('divergent_samples_librispeech_random.txt', 'w') as f:
    f.writelines("%s\n" % item for item in ids)

# KNN Baseline

In [None]:
## KNN baseline that assigns each sample to the most frequent class among its k nearest neighbors
from sklearn.neighbors import KNeighborsClassifier

SEED = 1
best_k, best_acc, best_f1 = 0, 0, 0

# Try k = 2..9 and keep the first k with the highest validation accuracy
for k in range(2, 10):

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train_subs)

    knn_pred = knn.predict(X_val)
    acc = accuracy_score(y_val_subs, knn_pred)
    f1 = f1_score(y_val_subs, knn_pred, average='macro')

    if not acc > best_acc:
        continue
    best_k, best_acc, best_f1 = k, acc, f1

print("Best K: ", best_k)
print("Accuracy: ", best_acc)
print("F1 Macro: ", best_f1)

In [None]:
# df_heldout holds the rows of df_train whose id fell in the random 20% split,
# NOT the last 20% of the rows. The pre-computed features are therefore
# selected with the same id mask rather than sliced with [len(df_train_new):].
# NOTE(review): assumes the *_train feature objects follow the row order of
# df_train / dataset_train, as the original positional slicing already did.
_heldout_ids = set(dataset_train_1['test']['id'])
_heldout_mask = torch.tensor(df_train['id'].isin(_heldout_ids).to_numpy())

X_train_heldout = torch.cat((
    torch.tensor(logits_concatenation_train)[_heldout_mask],
    torch.tensor(sequence_lengths_train)[_heldout_mask].unsqueeze(dim=1),
    torch.tensor(last_hidden_states_train)[_heldout_mask].squeeze(),
    torch.tensor(df_heldout['total_silence'].to_numpy()).unsqueeze(1),
    torch.tensor(df_heldout['n_words'].to_numpy()).unsqueeze(1),
    torch.tensor(df_heldout['speed_rate_word'].to_numpy()).unsqueeze(1),
    ), dim=1)
# 1 = transcription exactly correct, 0 = otherwise
y_train_heldout = torch.tensor(prediction_train)[_heldout_mask].unsqueeze(1)

In [None]:
# Refit with the selected k on the training set and label the held-out samples
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train_subs)
knn_pred = knn.predict(X_train_heldout)

In [None]:
# Keep the rows of df_heldout whose KNN-predicted subgroup id is different from 0
df_heldout['subgID'] = knn_pred
print(len(df_heldout))

_is_divergent = df_heldout['subgID'] != 0
divergent_samples = df_heldout.loc[_is_divergent]
print(len(divergent_samples))

In [None]:
ids = list(divergent_samples['id'])

## Save ids as txt, one id per line
with open('divergent_samples_librispeech_knn.txt', 'w') as f:
    f.writelines("%s\n" % item for item in ids)

# CM Baseline

In [None]:
# df_heldout holds the rows of df_train whose id fell in the random 20% split,
# NOT the last 20% of the rows. The pre-computed features are therefore
# selected with the same id mask rather than sliced with [len(df_train_new):].
# NOTE(review): assumes the *_train feature objects follow the row order of
# df_train / dataset_train, as the original positional slicing already did.
_heldout_ids = set(dataset_train_1['test']['id'])
_heldout_mask = torch.tensor(df_train['id'].isin(_heldout_ids).to_numpy())

X_train_heldout = torch.cat((
    torch.tensor(logits_concatenation_train)[_heldout_mask],
    torch.tensor(sequence_lengths_train)[_heldout_mask].unsqueeze(dim=1),
    torch.tensor(last_hidden_states_train)[_heldout_mask].squeeze(),
    torch.tensor(df_heldout['total_silence'].to_numpy()).unsqueeze(1),
    torch.tensor(df_heldout['n_words'].to_numpy()).unsqueeze(1),
    torch.tensor(df_heldout['speed_rate_word'].to_numpy()).unsqueeze(1),
    ), dim=1)
# 1 = transcription exactly correct, 0 = otherwise
y_train_heldout = torch.tensor(prediction_train)[_heldout_mask].unsqueeze(1)

In [None]:
# Load the pretrained (binary) confidence model directly onto the active device,
# so a checkpoint saved on a GPU can also be used on a CPU-only machine.
# NOTE(review): the file is a fully pickled nn.Module; only load trusted
# checkpoints. PyTorch releases whose torch.load defaults to weights_only=True
# will need weights_only=False for this kind of file.
cm_model = torch.load('cm_pt_ft/librispeech/confidence_model_pt.pt', map_location=device)
cm_model = cm_model.to(device)

In [None]:
train_left_out_output = test(
    cm_model,
    X_train_heldout.to(device),
    )
train_left_out_output = train_left_out_output.cpu().detach().numpy()

if train_left_out_output.ndim == 2 and train_left_out_output.shape[1] == 1:
    # The pretrained confidence model has a single sigmoid output trained on
    # "transcription exactly correct" labels, so argmax over that one column
    # would always be 0 and select nothing. Flag a sample (id 1) when the model
    # predicts an incorrect transcription, i.e. confidence below 0.5.
    train_left_out_output = (train_left_out_output[:, 0] < 0.5).astype(int)
else:
    # Multi-class head: keep the most likely class
    train_left_out_output = np.argmax(train_left_out_output, axis=1)

In [None]:
# Keep the rows of df_heldout for which train_left_out_output is different from 0
df_heldout['subgID'] = train_left_out_output
print(len(df_heldout))

_is_divergent = df_heldout['subgID'] != 0
divergent_samples = df_heldout.loc[_is_divergent]
print(len(divergent_samples))

In [None]:
ids = list(divergent_samples['id'])

## Save ids as txt, one id per line
with open('divergent_samples_librispeech_cm.txt', 'w') as f:
    f.writelines("%s\n" % item for item in ids)

# Supervised Oracle

In [None]:
from transformers import pipeline
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from datasets import load_dataset, Audio
import evaluate
from tqdm import tqdm

# Word- and character-error-rate metrics from the `evaluate` library
wer_metric, cer_metric = (evaluate.load(metric_name) for metric_name in ("wer", "cer"))

def is_target_text_in_range(ref):
    """Return True when the stripped reference is neither empty nor the
    "ignore time segment in scoring" marker."""
    cleaned = ref.strip()
    return cleaned not in ("", "ignore time segment in scoring")

def get_text(sample):
    """Return the value stored under the sample's "utt" key."""
    utterance = sample["utt"]
    return utterance

# Text normaliser applied to both references and predictions
whisper_norm = BasicTextNormalizer()

def normalise(batch):
    """Store the normalised text of ``batch`` under "norm_text" and return the batch."""
    raw_text = get_text(batch)
    batch["norm_text"] = whisper_norm(raw_text)
    return batch

def data(dataset):
    """Yield, for each sample, its "audio" mapping extended with a
    "reference" entry holding the sample's "norm_text"."""
    for sample in dataset:
        payload = dict(sample["audio"])
        payload["reference"] = sample["norm_text"]
        yield payload

# Transcribe the held-out samples with Whisper and report corpus-level WER / CER.
batch_size = 32
whisper_asr = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# NOTE(review): `args` is not defined anywhere in the visible notebook, so this
# statement raises NameError as written. It also forces language prompt ids on
# an English-only checkpoint -- confirm whether it should be removed.
whisper_asr.model.config.forced_decoder_ids = (
    whisper_asr.tokenizer.get_decoder_prompt_ids(language=args.language, task="transcribe")
    )

# NOTE(review): assumes df_heldout has an "audio" column and the "utt" column
# read by get_text -- neither is created in the visible cells; verify.
dataset = Dataset.from_pandas(df_heldout)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset = dataset.map(normalise)
# Rows with an empty / "ignore" reference are dropped here, so `dataset` may be
# shorter than df_heldout afterwards.
dataset = dataset.filter(is_target_text_in_range, input_columns=["norm_text"])

predictions = []
references = []

# run streamed inference
for out in tqdm(whisper_asr(data(dataset), batch_size=batch_size)):
    predictions.append(whisper_norm(out["text"]))
    references.append(out["reference"][0])

# NOTE(review): rebinding `wer` shadows the `wer` function imported from jiwer
# at the top of the notebook; from here on `wer` is a single rounded percentage.
wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
cer = cer_metric.compute(references=references, predictions=predictions)
cer = round(100 * cer, 2)

print("WER:", wer)
print("CER:", cer)

In [None]:
## Take only the samples for which the prediction is not correct, i.e., the WER is not 0
# NOTE(review): `wer` is the single corpus-level value computed in the previous
# cell, so this broadcasts one number to every row and the filter below keeps
# either all rows or none. A per-utterance WER is presumably intended -- the
# fix needs per-sample scores aligned with df_heldout.
df_heldout['wer'] = wer

# NOTE(review): df_heldout is rebound to the filtered frame without resetting
# its index; every later cell that reads df_heldout sees this filtered version.
df_heldout = df_heldout.loc[df_heldout['wer']!=0]
print(len(df_heldout))

## Save the ids
ids = list(df_heldout['id'])
with open('divergent_samples_librispeech_supervised_oracle.txt', 'w') as f:
    for item in ids:
        f.write("%s\n" % item)

# Clustering Baseline

In [None]:
# Keep only the speech-cluster assignment column(s) of the held-out frame.
# NOTE(review): `num_clusters` is not defined in the visible notebook, and no
# visible cell creates a `speech_cluster_id_*` column on df_heldout -- verify.
df_discretized_rest = df_heldout[[f'speech_cluster_id_{k}' for k in [num_clusters]]]

In [None]:
print("Number of problematic subgroups: ", NUM_SUBGROUPS)

# NOTE(review): `fp_divergence_dict`, `config` and `th_redundancy` are not
# defined anywhere in the visible notebook; this cell cannot run as written.
fp_divergence_i = fp_divergence_dict[config]
FPdiv = fp_divergence_i.getDivergence(th_redundancy=th_redundancy)[::-1] 
pr_bot = FPdiv.head(NUM_SUBGROUPS).copy()
# NOTE(review): this rebinds the global `itemsets` to a list of single
# "attribute=value" strings (first item of each itemset). The Metadata Oracle
# cell further down iterates `itemsets` expecting lists of items, so it must
# not run after this cell without restoring `itemsets`.
itemsets = []
for i in range(NUM_SUBGROUPS):
    itemsets.append(list(pr_bot.itemsets.values[i])[0])

## Create a column in the df, and assign a class to each sample:
# - 1 if the sample is in the most divergent itemset
# - 2 if the sample is in the second most divergent itemset
# - 3 if the sample is in the third most divergent itemset
# - ...
# - 0 otherwise
# NOTE(review): df_discretized_rest is a column selection of df_heldout, so the
# assignments below write into a derived frame; the .loc[i, ...] lookups also
# assume a 0..n-1 index, which no longer holds once df_heldout has been
# filtered without a reset.
df_discretized_rest["subgID"] = 0
for i in range(0, len(df_discretized_rest)):
    for value,itemset in enumerate(itemsets):
        k, v = itemset.split("=")
        # Cluster ids are compared as integers
        if df_discretized_rest.loc[i, k] == int(v):
            # First matching itemset wins
            if df_discretized_rest.loc[i, "subgID"] == 0:
                df_discretized_rest.loc[i, "subgID"] = value+1
            else:
                continue
        else:
            continue

## Take only the ones different from 0
df_discretized_heldout = df_discretized_rest.loc[df_discretized_rest["subgID"]!=0]

## Save the ids
# NOTE(review): df_discretized_rest was built from the cluster column(s) only,
# so no 'id' column is selected into it in the visible code -- this lookup
# looks like it raises KeyError; confirm.
ids = list(df_discretized_heldout['id'])
with open('divergent_samples_librispeech_clustering.txt', 'w') as f:
    for item in ids:
        f.write("%s\n" % item)

# Metadata Oracle 

In [None]:
## Discretize the dataframe
from divergence_utils import discretize

df_discretized_heldout = discretize(
    df_heldout[input_cols+[target_col]+['id']],
    bins=3,
    attributes=input_cols,
    strategy="quantile",
    round_v = 2,
    min_distinct=5,
)


def _interval_to_level(interval):
    """Map a discretized interval label to "low" / "medium" / "high".

    "<=..." is the lowest bin, ">..." the highest and "(a, b]" the middle one;
    any other label raises ValueError.
    """
    if interval.startswith("<="):
        return "low"
    if interval.startswith(">"):
        return "high"
    if interval.startswith("(") and interval.endswith("]"):
        return "medium"
    raise ValueError(interval)


## Replace values with ranges: "low", "medium", "high"
replace_values = {}

for col in signal_cols:
    replace_values.update({v: _interval_to_level(v) for v in df_discretized_heldout[col].unique()})
    # Assign the result back: an in-place replace on `df[col]` is chained
    # assignment and leaves the frame untouched under pandas Copy-on-Write.
    df_discretized_heldout[col] = df_discretized_heldout[col].replace(replace_values)

In [None]:
## Create a column in the df, and assign a class to each sample:
# - 1 if the sample is in the most divergent itemset
# - 2 if the sample is in the second most divergent itemset
# - 3 if the sample is in the third most divergent itemset
# - ...
# - 0 otherwise
# A sample matching several itemsets keeps the id of the first one it matches.

df_discretized_heldout["subgID"] = 0

# Vectorised assignment: one boolean mask per itemset instead of a per-row .loc
# loop, so it does not depend on the frame having a 0..n-1 index.
for subgroup_id, itemset in enumerate(itemsets, start=1):
    in_subgroup = np.ones(len(df_discretized_heldout), dtype=bool)
    for item in itemset:
        attribute, value = item.split("=", 1)
        in_subgroup &= (df_discretized_heldout[attribute] == value).to_numpy()
    unassigned = (df_discretized_heldout["subgID"] == 0).to_numpy()
    df_discretized_heldout.loc[in_subgroup & unassigned, "subgID"] = subgroup_id

# Number of samples per class (0 .. NUM_SUBGROUPS)
for i in range(0,NUM_SUBGROUPS+1):
    print(len(df_discretized_heldout.loc[df_discretized_heldout["subgID"]==i]))

In [None]:
## Take only the ones different from 0
_in_some_subgroup = df_discretized_heldout["subgID"] != 0
df_discretized_heldout = df_discretized_heldout.loc[_in_some_subgroup]

## Save the ids, one per line
ids = list(df_discretized_heldout['id'])
with open('divergent_samples_librispeech_metadata_oracle.txt', 'w') as f:
    f.writelines("%s\n" % item for item in ids)