# Configuring the Environment

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import librosa
from datasets import load_dataset, Dataset
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd
import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
import divexplorer 
import pandas as pd
pd.set_option('max_colwidth', None)
import os
import numpy as np

from utils_analysis import filter_itemset_df_by_attributes, slice_by_itemset

from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence

In [None]:
## Set device
# Prefer the GPU with index 1 when the machine has more than one GPU; fall
# back to the first GPU on single-GPU machines (where "cuda:1" would be an
# invalid device) and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Seed the torch CPU generator, the torch CUDA generator and NumPy's global
# generator with one shared value.
SEED = 42
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    _seed_fn(SEED)

# FSC Dataset - Inference and Evaluation

## Utils

In [None]:
def map_to_array(example, audio_col = 'path'):
    """Decode the audio file of one example and attach the waveform to it.

    Args:
        example: mapping that holds the audio file path under ``audio_col``.
            It is modified in place.
        audio_col: key of ``example`` that contains the audio file path.

    Returns:
        The same ``example`` with an extra ``"speech"`` entry holding the
        waveform returned by ``librosa.load`` (mono, resampled to 16 kHz).
    """
    audio_path = example[audio_col]
    # librosa.load returns (waveform, sampling_rate); only the waveform is kept.
    waveform = librosa.load(audio_path, sr=16000, mono=True)[0]
    example["speech"] = waveform
    return example

In [None]:
def preprocess_function(examples):
    """Run the module-level ``feature_extractor`` on raw audio.

    Args:
        examples: the audio passed as first argument to the feature extractor
            (the notebook passes the waveform of a single utterance).

    Returns:
        The feature extractor output, requested as PyTorch tensors with
        padding enabled and using the extractor's own sampling rate.
    """
    extractor_options = {
        "sampling_rate": feature_extractor.sampling_rate,
        "padding": True,
        "return_tensors": "pt",
    }
    return feature_extractor(examples, **extractor_options)

## Model

In [None]:
## Load model
# The classifier and its feature extractor are loaded from the same checkpoint;
# only the classifier is moved to the selected device.
checkpoint = "superb/wav2vec2-base-superb-ic"
model = Wav2Vec2ForSequenceClassification.from_pretrained(checkpoint)
model = model.to(device)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

## Train and Valid

In [None]:
## Load and preprocess dataset
# Read the train and validation metadata and stack them (train rows first)
# into a single frame with a fresh 0..n-1 index.
_split_frames = [
    pd.read_csv(csv_path)
    for csv_path in ('data/train_data.csv', 'data/valid_data.csv')
]
df_train, df_valid = _split_frames
df = pd.concat(_split_frames, ignore_index=True)
len(df)

In [None]:
# Wrap the metadata frame in a `datasets.Dataset` and decode every referenced
# audio file into a "speech" field.
dataset = Dataset.from_pandas(df).map(
    lambda example: map_to_array(example, audio_col='path')
)

In [None]:
## Inference
# One forward pass per utterance. For each utterance we keep the last entry of
# `outputs.hidden_states` and the classification logits.
hidden_states_concatenation = []
logits_concatenation = []

with torch.no_grad():
    for i in tqdm(range(len(dataset))):
        inputs = preprocess_function(dataset[i]["speech"]).to(device)
        # Ask for the hidden states explicitly instead of relying on the
        # checkpoint configuration to return them.
        outputs = model(**inputs, output_hidden_states=True)
        # Move results to the CPU right away: keeping one tensor per utterance
        # on the GPU accumulates device memory, and the saved files would
        # otherwise be tied to the GPU they were produced on.
        hidden_states_concatenation.append(outputs.hidden_states[-1].cpu())
        logits_concatenation.append(outputs.logits.cpu())

In [None]:
# Cache the inference outputs on disk. Create the target directory first:
# torch.save fails when 'pretrained/' does not exist yet.
os.makedirs('pretrained', exist_ok=True)
torch.save(hidden_states_concatenation, 'pretrained/hidden_states.pt')
torch.save(logits_concatenation, 'pretrained/logits.pt')

In [None]:
## Load hidden states and logits
# map_location='cpu' lets the cached tensors be loaded on machines that do not
# have the GPU they were saved from; every later use moves them to the CPU anyway.
hidden_states_concatenation = torch.load('pretrained/hidden_states.pt', map_location='cpu')
logits_concatenation = torch.load('pretrained/logits.pt', map_location='cpu')

### Intent Accuracy

In [None]:
# Action slot: argmax over the first 6 logits of every utterance, mapped to a
# label name through the model configuration.
action_ids = [
    torch.argmax(sample_logits.detach().cpu()[:, :6], dim=-1).item()
    for sample_logits in logits_concatenation
]
action_labels = [model.config.id2label[_id] for _id in action_ids]

action_gt = df['action'].tolist()

action_accuracy = accuracy_score(action_gt, action_labels)
print("Action accuracy: ", round(action_accuracy*100, 2), "%")

In [None]:
# Object slot: argmax over logits 6..19; the +6 offset converts the position
# inside the slice back to an index of `id2label`.
object_ids = [
    torch.argmax(sample_logits.detach().cpu()[:, 6:20], dim=-1).item()
    for sample_logits in logits_concatenation
]
object_labels = [model.config.id2label[_id + 6] for _id in object_ids]

# The ground truth spells a missing object as 'none'; it is renamed to
# 'none_object' before comparing with the predicted label names
# (presumably the model's label for this case - verify against id2label).
object_gt = [f'{x}_object' if x=='none' else x for x in df['object'].values]

# Fixed the misspelled "Obejct" in the printed message.
print("Object accuracy: ", round(accuracy_score(object_gt, object_labels)*100, 2), "%")

In [None]:
# Location slot: argmax over logits 20..23; the +20 offset converts the
# position inside the slice back to an index of `id2label`.
location_ids = [
    torch.argmax(sample_logits.detach().cpu()[:, 20:24], dim=-1).item()
    for sample_logits in logits_concatenation
]
location_labels = [model.config.id2label[_id + 20] for _id in location_ids]

# 'none' in the ground truth is renamed to 'none_location' before comparison.
location_gt = [f'{x}_location' if x=='none' else x for x in df['location'].values]

location_accuracy = accuracy_score(location_gt, location_labels)
print("Location accuracy: ", round(location_accuracy*100, 2), "%")

In [None]:
## Save predictions
# An intent is the space-separated (action, object, location) triple; a
# prediction counts as correct only when the whole triple matches.
intents_predicted = [
    " ".join(slots) for slots in zip(action_labels, object_labels, location_labels)
]
intents_gt = [" ".join(slots) for slots in zip(action_gt, object_gt, location_gt)]

# 1 where the full intent is correct, 0 otherwise.
is_correct = (np.asarray(intents_predicted) == np.asarray(intents_gt)).astype(int)
df['prediction'] = is_correct
print("Accuracy: ", round(np.mean(is_correct)*100,2), "%")

In [None]:
## Save hidden states
# Store one NumPy array per utterance (size-1 axes squeezed out) and convert
# each array to Python-float precision.
_hidden_arrays = [
    hs.detach().cpu().numpy().squeeze() for hs in hidden_states_concatenation
]
df['hidden_states'] = _hidden_arrays
df['hidden_states'] = df['hidden_states'].apply(lambda arr: arr.astype(float))

In [None]:
## Save action, object and location predictions 
# Split the logits of every utterance into the three slot ranges
# (action: 0..5, object: 6..19, location: 20..23) and store each range in its
# own column.
for _column, _slot in (
    ('predicted_action', slice(None, 6)),
    ('predicted_object', slice(6, 20)),
    ('predicted_location', slice(20, 24)),
):
    df[_column] = [l[:, _slot].detach().cpu().numpy().squeeze() for l in logits_concatenation]

In [None]:
# Write the enriched train+valid frame to a CSV file in the working directory.
# NOTE(review): the array-valued columns added above are presumably written as
# their string representation - confirm this is what downstream readers expect.
output_folder = 'fsc_train_valid.csv'
df.to_csv(output_folder, index=False)

## Test

In [None]:
## Load and preprocess dataset
# Read the test-split metadata; the trailing expression displays its row count.
df_test = pd.read_csv('data/test_data.csv')
len(df_test)

In [None]:
# Wrap the test metadata in a `datasets.Dataset` and decode every referenced
# audio file into a "speech" field.
dataset_test = Dataset.from_pandas(df_test).map(
    lambda example: map_to_array(example, audio_col='path')
)

In [None]:
## Inference
# One forward pass per test utterance. For each utterance we keep the last
# entry of `outputs.hidden_states` and the classification logits.
hidden_states_concatenation_test = []
logits_concatenation_test = []

with torch.no_grad():
    for i in tqdm(range(len(dataset_test))):
        inputs = preprocess_function(dataset_test[i]["speech"]).to(device)
        # Ask for the hidden states explicitly instead of relying on the
        # checkpoint configuration to return them.
        outputs = model(**inputs, output_hidden_states=True)
        # Move results to the CPU right away so they do not accumulate in GPU
        # memory and the saved files are not tied to a specific GPU.
        hidden_states_concatenation_test.append(outputs.hidden_states[-1].cpu())
        logits_concatenation_test.append(outputs.logits.cpu())

In [None]:
# Cache the test inference outputs on disk. Create the target directory first:
# torch.save fails when 'pretrained/' does not exist yet.
os.makedirs('pretrained', exist_ok=True)
torch.save(hidden_states_concatenation_test, 'pretrained/hidden_states_test.pt')
torch.save(logits_concatenation_test, 'pretrained/logits_test.pt')

In [None]:
## Load hidden states and logits
# map_location='cpu' lets the cached tensors be loaded on machines that do not
# have the GPU they were saved from; every later use moves them to the CPU anyway.
hidden_states_concatenation_test = torch.load('pretrained/hidden_states_test.pt', map_location='cpu')
logits_concatenation_test = torch.load('pretrained/logits_test.pt', map_location='cpu')

### Intent Accuracy

In [None]:
# Action slot on the test split: argmax over the first 6 logits of every
# utterance, mapped to a label name through the model configuration.
action_ids = [
    torch.argmax(sample_logits.detach().cpu()[:, :6], dim=-1).item()
    for sample_logits in logits_concatenation_test
]
action_labels = [model.config.id2label[_id] for _id in action_ids]

action_gt = df_test['action'].tolist()

action_accuracy = accuracy_score(action_gt, action_labels)
print("Action accuracy: ", round(action_accuracy*100, 2), "%")

In [None]:
# Object slot on the test split: argmax over logits 6..19; the +6 offset
# converts the position inside the slice back to an index of `id2label`.
object_ids = [
    torch.argmax(sample_logits.detach().cpu()[:, 6:20], dim=-1).item()
    for sample_logits in logits_concatenation_test
]
object_labels = [model.config.id2label[_id + 6] for _id in object_ids]

# 'none' in the ground truth is renamed to 'none_object' before comparison
# (presumably the model's label for this case - verify against id2label).
object_gt = [f'{x}_object' if x=='none' else x for x in df_test['object'].values]

# Fixed the misspelled "Obejct" in the printed message.
print("Object accuracy: ", round(accuracy_score(object_gt, object_labels)*100, 2), "%")

In [None]:
# Location slot on the test split: argmax over logits 20..23; the +20 offset
# converts the position inside the slice back to an index of `id2label`.
location_ids = [
    torch.argmax(sample_logits.detach().cpu()[:, 20:24], dim=-1).item()
    for sample_logits in logits_concatenation_test
]
location_labels = [model.config.id2label[_id + 20] for _id in location_ids]

# 'none' in the ground truth is renamed to 'none_location' before comparison.
location_gt = [f'{x}_location' if x=='none' else x for x in df_test['location'].values]

location_accuracy = accuracy_score(location_gt, location_labels)
print("Location accuracy: ", round(location_accuracy*100, 2), "%")

In [None]:
## Save predictions
# An intent is the space-separated (action, object, location) triple; a
# prediction counts as correct only when the whole triple matches.
intents_predicted = [
    " ".join(slots) for slots in zip(action_labels, object_labels, location_labels)
]
intents_gt = [" ".join(slots) for slots in zip(action_gt, object_gt, location_gt)]

# 1 where the full intent is correct, 0 otherwise.
is_correct = (np.asarray(intents_predicted) == np.asarray(intents_gt)).astype(int)
df_test['prediction'] = is_correct
print("Accuracy: ", round(np.mean(is_correct)*100,2), "%")

In [None]:
## Save hidden states
# Store one NumPy array per test utterance (size-1 axes squeezed out) and
# convert each array to Python-float precision.
_hidden_arrays_test = [
    hs.detach().cpu().numpy().squeeze() for hs in hidden_states_concatenation_test
]
df_test['hidden_states'] = _hidden_arrays_test
df_test['hidden_states'] = df_test['hidden_states'].apply(lambda arr: arr.astype(float))

In [None]:
## Save action, object and location predictions 
# Split the logits of every test utterance into the three slot ranges
# (action: 0..5, object: 6..19, location: 20..23), one column per range.
for _column, _slot in (
    ('predicted_action', slice(None, 6)),
    ('predicted_object', slice(6, 20)),
    ('predicted_location', slice(20, 24)),
):
    df_test[_column] = [l[:, _slot].detach().cpu().numpy().squeeze() for l in logits_concatenation_test]

In [None]:
# Write the enriched test frame to a CSV file in the working directory.
# NOTE(review): the array-valued columns added above are presumably written as
# their string representation - confirm this is what downstream readers expect.
output_folder = 'fsc_test.csv'
df_test.to_csv(output_folder, index=False)

# Loading Pretrained Features

In [None]:
## Load model
# The classifier and its feature extractor are loaded from the same checkpoint;
# only the classifier is moved to the selected device.
checkpoint = "superb/wav2vec2-base-superb-ic"
model = Wav2Vec2ForSequenceClassification.from_pretrained(checkpoint)
model = model.to(device)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

In [None]:
## Load and preprocess dataset
# Rebuild the train+valid frame (train rows first) from the metadata files.
df_train = pd.read_csv('data/train_data.csv')
df_valid = pd.read_csv('data/valid_data.csv')
df = pd.concat([df_train, df_valid], ignore_index=True)

## Load hidden states and logits
# map_location='cpu' lets the cached tensors be loaded on machines that do not
# have the GPU they were saved from.
hidden_states_concatenation = torch.load('pretrained/hidden_states.pt', map_location='cpu')
logits_concatenation = torch.load('pretrained/logits.pt', map_location='cpu')

# Action slot: logits 0..5.
action_ids = [
    torch.argmax(l.detach().cpu()[:, :6], dim=-1).item() for l in logits_concatenation
]
action_labels = [model.config.id2label[_id] for _id in action_ids]
action_gt = list(df['action'].values)

# Object slot: logits 6..19 (+6 maps the slice position back to id2label).
object_ids = [
    torch.argmax(l.detach().cpu()[:, 6:20], dim=-1).item() for l in logits_concatenation
]
object_labels = [model.config.id2label[_id + 6] for _id in object_ids]
# 'none' is renamed to 'none_object' before comparing with predicted labels.
object_gt = [f'{x}_object' if x=='none' else x for x in df['object'].values]

# Location slot: logits 20..23 (+20 maps the slice position back to id2label).
location_ids = [
    torch.argmax(l.detach().cpu()[:, 20:24], dim=-1).item() for l in logits_concatenation
]
location_labels = [model.config.id2label[_id + 20] for _id in location_ids]
# 'none' is renamed to 'none_location' before comparing with predicted labels.
location_gt = [f'{x}_location' if x=='none' else x for x in df['location'].values]

## Save predictions
# 'prediction' is 1 when the full (action, object, location) triple is correct.
intents_predicted = [ action_labels[i]  + " " + object_labels[i] + " " + location_labels[i] for i in range(0, len(df))]
intents_gt = [ action_gt[i]  + " " + object_gt[i] + " " + location_gt[i] for i in range(0, len(df))]
is_correct = (np.array(intents_predicted) == np.array(intents_gt)).astype(int)
df['prediction'] = is_correct

## Save hidden states
df['hidden_states'] = [hs.detach().cpu().numpy().squeeze() for hs in hidden_states_concatenation]
df['hidden_states'] = df['hidden_states'].apply(lambda x: x.astype(float))

## Save action, object and location predictions 
df['predicted_action'] = [l[:, :6].detach().cpu().numpy().squeeze() for l in logits_concatenation]
df['predicted_object'] = [l[:, 6:20].detach().cpu().numpy().squeeze() for l in logits_concatenation]
df['predicted_location'] = [l[:, 20:24].detach().cpu().numpy().squeeze() for l in logits_concatenation]

In [None]:
## Load and preprocess dataset
df_test = pd.read_csv('data/test_data.csv')

## Load hidden states and logits
# map_location='cpu' lets the cached tensors be loaded on machines that do not
# have the GPU they were saved from.
hidden_states_concatenation_test = torch.load('pretrained/hidden_states_test.pt', map_location='cpu')
logits_concatenation_test = torch.load('pretrained/logits_test.pt', map_location='cpu')

# Action slot: logits 0..5.
action_ids = [
    torch.argmax(l.detach().cpu()[:, :6], dim=-1).item() for l in logits_concatenation_test
]
action_labels = [model.config.id2label[_id] for _id in action_ids]
action_gt = list(df_test['action'].values)

# Object slot: logits 6..19 (+6 maps the slice position back to id2label).
object_ids = [
    torch.argmax(l.detach().cpu()[:, 6:20], dim=-1).item() for l in logits_concatenation_test
]
object_labels = [model.config.id2label[_id + 6] for _id in object_ids]
# 'none' is renamed to 'none_object' before comparing with predicted labels.
object_gt = [f'{x}_object' if x=='none' else x for x in df_test['object'].values]

# Location slot: logits 20..23 (+20 maps the slice position back to id2label).
location_ids = [
    torch.argmax(l.detach().cpu()[:, 20:24], dim=-1).item() for l in logits_concatenation_test
]
location_labels = [model.config.id2label[_id + 20] for _id in location_ids]
# 'none' is renamed to 'none_location' before comparing with predicted labels.
location_gt = [f'{x}_location' if x=='none' else x for x in df_test['location'].values]

## Save predictions
# 'prediction' is 1 when the full (action, object, location) triple is correct.
intents_predicted = [ action_labels[i]  + " " + object_labels[i] + " " + location_labels[i] for i in range(0, len(df_test))]
intents_gt = [ action_gt[i]  + " " + object_gt[i] + " " + location_gt[i] for i in range(0, len(df_test))]
is_correct = (np.array(intents_predicted) == np.array(intents_gt)).astype(int)
df_test['prediction'] = is_correct

## Save hidden states
df_test['hidden_states'] = [hs.detach().cpu().numpy().squeeze() for hs in hidden_states_concatenation_test]
df_test['hidden_states'] = df_test['hidden_states'].apply(lambda x: x.astype(float))

## Save action, object and location predictions 
df_test['predicted_action'] = [l[:, :6].detach().cpu().numpy().squeeze() for l in logits_concatenation_test]
df_test['predicted_object'] = [l[:, 6:20].detach().cpu().numpy().squeeze() for l in logits_concatenation_test]
df_test['predicted_location'] = [l[:, 20:24].detach().cpu().numpy().squeeze() for l in logits_concatenation_test]

# Confidence Model 

In [None]:
## Confidence model
class ConfidenceModel(nn.Module):
    """Three-layer perceptron whose output passes through a sigmoid.

    Each of the two hidden layers is followed by GELU, dropout (p=0.1) and a
    LayerNorm; the *same* LayerNorm instance is applied after both hidden
    layers. The attribute holding the activation is called ``relu`` although
    the module is ``nn.GELU``.

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        output_size: number of outputs of the final linear layer.
    """

    def __init__(self, input_size=768, hidden_size=1000, output_size=1):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.GELU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)
        self.norm = nn.LayerNorm(hidden_size)
        # Re-initialise every linear layer: Kaiming-normal weights, zero biases.
        linear_layers = (layer for layer in self.modules() if isinstance(layer, nn.Linear))
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return ``sigmoid(linear3(h))`` where ``h`` is the hidden representation of ``x``."""
        for hidden_layer in (self.linear1, self.linear2):
            x = self.norm(self.dropout(self.relu(hidden_layer(x))))
        return self.sigmoid(self.linear3(x))

In [None]:
## Train, valid and test
def train(model, inputs, labels, criterion, optimizer):
    """Perform one optimisation step on a single batch.

    Args:
        model: module to optimise; it is switched to training mode.
        inputs: input tensor, cast to float before the forward pass.
        labels: targets passed to ``criterion`` unchanged.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer whose gradients are reset and which is stepped once.

    Returns:
        Tuple ``(outputs, loss)`` with the model outputs for the batch and the
        loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    predictions = model(inputs.float())
    batch_loss = criterion(predictions, labels)
    batch_loss.backward()
    optimizer.step()
    return predictions, batch_loss.item()

def val(model, inputs, labels, criterion):
    """Evaluate ``model`` on one batch without updating it.

    Args:
        model: module to evaluate; it is switched to evaluation mode.
        inputs: input tensor, cast to float before the forward pass.
        labels: targets passed to ``criterion`` unchanged.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(outputs, loss)`` with the model outputs and the loss as a
        Python float. The outputs carry no autograd history.
    """
    model.eval()
    # No backward pass follows, so skip building the autograd graph; this
    # avoids holding activations for the whole validation set in memory.
    with torch.no_grad():
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
    return outputs, loss.item()

def test(model, inputs, labels=None, criterion=None):
    """Run ``model`` in evaluation mode, optionally computing a loss.

    Args:
        model: module to evaluate; it is switched to evaluation mode.
        inputs: input tensor, cast to float before the forward pass.
        labels: optional targets for the loss.
        criterion: optional loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        ``outputs`` alone when neither ``labels`` nor ``criterion`` is given,
        otherwise the tuple ``(outputs, loss)`` with the loss as a Python
        float. The outputs carry no autograd history.

    Raises:
        ValueError: if only one of ``labels`` and ``criterion`` is provided.
    """
    model.eval()
    # Passing only one of the two used to fail later with an opaque error
    # (e.g. calling ``None``); report the misuse explicitly instead.
    if (labels is None) != (criterion is None):
        raise ValueError("labels and criterion must be provided together")
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(inputs.float())
        if labels is None:
            return outputs
        loss = criterion(outputs, labels)
    return outputs, loss.item()

# Problem Setup

In [None]:
# Experiment configuration shared by the cells below.
HIDDEN_SIZE = 1000       # width of the confidence model's hidden layers
BATCH_SIZE = 4096        # mini-batch size used when pretraining the confidence model
NUM_SUBGROUPS = 2        # number of most divergent subgroups turned into classes
EPOCHS = 10000           # upper bound on training epochs (early stopping applies)
MIN_SUP = 0.03           # minimum support passed to the frequent-pattern mining
TH_REDUNDANCY = 0.05     # redundancy threshold passed to getDivergence
PRETRAIN = True          # reuse the pretrained confidence model for subgroup prediction
# The subgroup-labelling cells read BINARY, which was never defined. When True
# they collapse every subgroup id into the single class 1; False keeps one
# class per subgroup, matching the NUM_SUBGROUPS+1 outputs used later.
BINARY = False

# Problematic Subgroups Identification - Step 2, DivExplorer

## Utils

In [None]:
## Define abbreviations for plot and visualization
from divexplorer.FP_Divergence import abbreviateDict

# Substring -> shorter replacement. sortItemset applies the replacements in
# insertion order, so 'total_silence' is rewritten before the generic 'total_'
# prefix is stripped.
abbreviations = {
    'Self-reported fluency level=native': 'fluency=native',
    'total_silence': 'tot_silence',
    'location': 'loc',
    'Current language used for work/school=English (United States)': 'lang=EN_US',
    'speakerId': 'spkID',
    'First Language spoken=English (United States)': 'lang=EN_US',
    'trimmed': 'trim',
    'total_': '',
    'speed_rate_word': 'speakRate',
    'speed_rate_char': 'speakCharRate',
    'change language': 'change lang',
    'duration': 'dur',
}

# Independent shallow copy of the mapping above.
abbreviations_shorter = dict(abbreviations)

## Function for sorting data cohorts
def sortItemset(x, abbreviations=None):
    """Render an itemset as a sorted, comma-separated string.

    Args:
        x: iterable of item strings (e.g. a frozenset of "attribute=value").
        abbreviations: optional mapping ``substring -> replacement``. The
            replacements are applied to the joined string in the mapping's
            iteration order, so earlier entries take effect before later ones.
            ``None`` (the default) applies no replacement.

    Returns:
        The items sorted alphabetically and joined with ", ", after the
        abbreviations have been applied.
    """
    # None instead of a mutable ``{}`` default avoids sharing one dict object
    # across calls.
    if abbreviations is None:
        abbreviations = {}
    rendered = ", ".join(sorted(x))
    for full_text, short_text in abbreviations.items():
        rendered = rendered.replace(full_text, short_text)
    return rendered

def attributes_in_itemset(itemset, attributes, alls = True):
    """Tell whether the attributes of an itemset belong to a given list.

    The attribute of an item is the text before its first "=".

    Args:
        itemset (frozenset): items of the form "attribute=value".
        attributes (list): attributes of interest.
        alls (bool): if True, every attribute of the itemset must be in
            ``attributes``; if False, at least one attribute must be.

    Returns:
        bool: the outcome of the check. The empty itemset (which describes
        the entire dataset) is rejected whenever ``attributes`` is non-empty.
    """
    # Avoid returning the empty itemset (i.e., info of entire dataset)
    if itemset == frozenset() and attributes:
        return False

    # Lazily evaluated membership checks, one per item of the itemset.
    attribute_admitted = (item.split("=")[0] in attributes for item in itemset)
    return all(attribute_admitted) if alls else any(attribute_admitted)
    
def filter_itemset_df_by_attributes(df: pd.DataFrame, attributes: list, alls = True, itemset_col_name: str = "itemsets") -> pd.DataFrame:
    """Keep the rows whose itemset involves the given attributes.

    Args:
        df (pd.DataFrame): the input itemsets (with their info).
        attributes (list): attributes of interest.
        alls (bool): if True, keep itemsets whose attributes are ALL in
            ``attributes``; if False, keep itemsets with AT LEAST one
            attribute in ``attributes``.
        itemset_col_name (str): name of the itemset column, "itemsets" by default.

    Returns:
        pd.DataFrame: the rows of ``df`` that pass the check.
    """

    def _is_admitted(itemset):
        return attributes_in_itemset(itemset, attributes, alls = alls)

    keep_row = df[itemset_col_name].apply(_is_admitted)
    return df.loc[keep_row]

In [None]:
## Target for DivExplorer: 
# 'prediction' is 1 if predicted_intent == original_intent, 0 otherwise
target_col = 'prediction' 
# Metric name requested from the frequent-pattern divergence computation.
target_metric = 'd_posr'
# Column name the metric is renamed to for display (see remapped_cols).
target_div = 'd_accuracy'
# NOTE(review): t_value_col is not referenced elsewhere in this notebook;
# remapped_cols spells the same column name out literally.
t_value_col = 't_value_tp_fn'

In [None]:
## Columns for visualization
# Columns kept (and their order) in the divergence table.
show_cols = [
    'support', 'itemsets', '#errors', '#corrects', 'accuracy',
    'd_accuracy', 't_value', 'support_count', 'length',
]
# Raw column names produced by the divergence computation -> display names.
remapped_cols = dict([
    ('tn', '#errors'),
    ('tp', '#corrects'),
    ('posr', 'accuracy'),
    (target_metric, target_div),
    ('t_value_tp_fn', 't_value'),
])

## Columns of the df file that we are going to analyze 
demo_cols = ['gender', 'ageRange']

slot_cols = ['action', 'object', 'location']

signal_cols = [
    'total_silence', 'total_duration', 'trimmed_duration',
    'n_words', 'speed_rate_word', 'speed_rate_word_trimmed',
]

# Attributes fed to the discretization and the subgroup mining.
input_cols = [*demo_cols, *signal_cols, *slot_cols]

In [None]:
# select the columns of interest
# The same column selection is applied to the train+valid frame and to the
# test frame. Note the trailing space in 'Self-reported fluency level '.
_divexpl_columns = [
    'path', 'transcription',
    'action', 'object', 'location',
    'prediction',
    'speakerId', 'gender', 'ageRange', 'Self-reported fluency level ', 'First Language spoken','Current language used for work/school',
    'total_silence', 'total_duration', 'trimmed_duration', 'n_words', 'speed_rate_word', 'speed_rate_word_trimmed',
]

df_divexpl = df[_divexpl_columns]

df_test_divexpl = df_test[_divexpl_columns]

## Train

In [None]:
## Add SpeakerID information if it is present in the df
# NOTE(review): "speakerId" is not part of input_cols as defined in this
# notebook, so this branch does not run here.
if "speakerId" in input_cols:
    df_divexpl['speakerId'] = df_divexpl.index.map(lambda x: x.split("/")[2])

## Discretize the dataframe
from util_discretization import discretize

df_discretized = discretize(
    df_divexpl[input_cols+[target_col]],
    bins=3,
    attributes=input_cols,
    strategy="quantile", 
    round_v = 2,
    min_distinct=5,
)

## Replace values with ranges: "low", "medium", "high"
# A bin label is classified by its textual form: "<=..." is the lowest bin,
# ">..." the highest and "(...]" the one in between.
replace_values = {}

for signal_col in signal_cols:

    for v in df_discretized[signal_col].unique():
        if "<=" == v[0:2]:
            replace_values[v] = "low"
        elif ">" == v[0]:
            replace_values[v] = "high"
        elif "("  == v[0] and "]"  == v[-1]:
            replace_values[v] = "medium"
        else:
            raise ValueError(v)

    # Assign the result back instead of calling replace(..., inplace=True) on
    # the selected column: the in-place form may operate on a temporary copy
    # and leave the frame unchanged.
    df_discretized[signal_col] = df_discretized[signal_col].replace(replace_values)
    
df_discretized.loc[df_discretized["location"]=="none_location", "location"] = "none"
df_discretized.loc[df_discretized["object"]=="none_object", "object"] = "none"

## Create dict of Divergence df
fp_diver = FP_DivergenceExplorer(df_discretized, true_class_name=target_col, class_map={"P":1, "N":0})
FP_fm = fp_diver.getFrequentPatternDivergence(min_support=MIN_SUP, metrics=[target_metric])
FP_fm.rename(columns=remapped_cols, inplace=True)
# Keep the display columns and round the reported figures.
FP_fm = FP_fm[show_cols].round({'accuracy': 5, 'd_accuracy': 5, 't_value': 2})
fp_divergence = FP_Divergence(FP_fm, target_div)

In [None]:
## Compute the divergence for Wav2Vec2-Base
# The row order returned by getDivergence is reversed.
FPdiv = fp_divergence.getDivergence(th_redundancy=TH_REDUNDANCY)[::-1]

## Retrieve Most Divergent Itemsets 
from copy import deepcopy
# Display copy: counts as integers, accuracy figures as percentages.
pr = FPdiv.assign(**{
    "support": FPdiv["support"].round(2),
    "#errors": FPdiv["#errors"].astype(int),
    "#corrects": FPdiv["#corrects"].astype(int),
    "accuracy": (FPdiv["accuracy"]*100).round(3),
    "d_accuracy": (FPdiv["d_accuracy"]*100).round(3),
})
pr.head(NUM_SUBGROUPS)

In [None]:
## Create a column in the df, and assign a class to each sample:
# - 1 if the sample is in the most divergent itemset
# - 2 if the sample is in the second most divergent itemset
# - 3 if the sample is in the third most divergent itemset
# - ...
# - 0 otherwise

df_discretized["subgID"] = 0
# The first NUM_SUBGROUPS rows of `pr` define the subgroups.
itemsets = [list(itemset) for itemset in pr.itemsets.values[:NUM_SUBGROUPS]]
for subgroup_id, itemset in enumerate(itemsets, start=1):
    # Rows that satisfy every "attribute=value" item of the itemset. The
    # column-wise comparison replaces the former row-by-row .loc loop, which
    # was slow and only valid for a 0..n-1 index.
    in_subgroup = pd.Series(True, index=df_discretized.index)
    for item in itemset:
        # Split on the first "=" only, so values that contain "=" stay intact.
        attribute, value = item.split("=", 1)
        in_subgroup &= df_discretized[attribute] == value
    # A sample keeps the id of the first (most divergent) subgroup it matches.
    unassigned = df_discretized["subgID"] == 0
    df_discretized.loc[in_subgroup & unassigned, "subgID"] = subgroup_id
# BINARY must be defined before this cell runs; when True, every subgroup id
# is collapsed into the single class 1.
if BINARY:
    df_discretized.loc[df_discretized["subgID"]!=0, "subgID"] = 1
# Report the number of samples per class.
for i in range(0,NUM_SUBGROUPS+1):
    print(len(df_discretized.loc[df_discretized["subgID"]==i]))
df_discretized.to_csv("df_discretized.csv", index=False)

## Test

In [None]:
## Add SpeakerID information if it is present in the df
# NOTE(review): "speakerId" is not part of input_cols as defined in this
# notebook, so this branch does not run here.
if "speakerId" in input_cols:
    df_test_divexpl['speakerId'] = df_test_divexpl.index.map(lambda x: x.split("/")[2])

## Discretize the dataframe
from util_discretization import discretize

df_discretized_test = discretize(
    df_test_divexpl[input_cols+[target_col]],
    bins=3,
    attributes=input_cols,
    strategy="quantile", 
    round_v = 2,
    min_distinct=5,
)

## Replace values with ranges: "low", "medium", "high"
# A bin label is classified by its textual form: "<=..." is the lowest bin,
# ">..." the highest and "(...]" the one in between.
replace_values = {}

for signal_col in signal_cols:

    for v in df_discretized_test[signal_col].unique():
        if "<=" == v[0:2]:
            replace_values[v] = "low"
        elif ">" == v[0]:
            replace_values[v] = "high"
        elif "("  == v[0] and "]"  == v[-1]:
            replace_values[v] = "medium"
        else:
            raise ValueError(v)

    # Assign the result back instead of calling replace(..., inplace=True) on
    # the selected column: the in-place form may operate on a temporary copy
    # and leave the frame unchanged.
    df_discretized_test[signal_col] = df_discretized_test[signal_col].replace(replace_values)
    
# Build the masks from the test frame itself; they were previously computed on
# the training frame (df_discretized), whose rows do not correspond to these.
df_discretized_test.loc[df_discretized_test["location"]=="none_location", "location"] = "none"
df_discretized_test.loc[df_discretized_test["object"]=="none_object", "object"] = "none"

In [None]:
## Create a column in the df, and assign a class to each sample:
# - 1 if the sample is in the most divergent itemset
# - 2 if the sample is in the second most divergent itemset
# - 3 if the sample is in the third most divergent itemset
# - ...
# - 0 otherwise
# `itemsets` is the list built from the training divergence table.
df_discretized_test["subgID"] = 0
for subgroup_id, itemset in enumerate(itemsets, start=1):
    # Rows that satisfy every "attribute=value" item of the itemset. The
    # column-wise comparison replaces the former row-by-row .loc loop, which
    # was slow and only valid for a 0..n-1 index.
    in_subgroup = pd.Series(True, index=df_discretized_test.index)
    for item in itemset:
        # Split on the first "=" only, so values that contain "=" stay intact.
        attribute, value = item.split("=", 1)
        in_subgroup &= df_discretized_test[attribute] == value
    # A sample keeps the id of the first (most divergent) subgroup it matches.
    unassigned = df_discretized_test["subgID"] == 0
    df_discretized_test.loc[in_subgroup & unassigned, "subgID"] = subgroup_id
# BINARY must be defined before this cell runs; when True, every subgroup id
# is collapsed into the single class 1.
if BINARY:
    df_discretized_test.loc[df_discretized_test["subgID"]!=0, "subgID"] = 1
# Report the number of samples per class.
for i in range(0,NUM_SUBGROUPS+1):
    print(len(df_discretized_test.loc[df_discretized_test["subgID"]==i]))
df_discretized_test.to_csv("df_discretized_test.csv", index=False)

# Full pipeline - Steps 1 & 3

In [None]:
# Columns used by the confidence model: the correctness target, the per-slot
# logits, the hidden states and three signal-level features.
_cm_columns = [
    'prediction',
    'predicted_action', 'predicted_object', 'predicted_location',
    'hidden_states',
    'total_silence', 'n_words', 'speed_rate_word',
]
df_cm = df[_cm_columns]
df_cm_test = df_test[_cm_columns]

## Pretraining the CM

In [None]:
## Create train, val, test split
def _build_features(frame):
    """Assemble the confidence-model input matrix (float32) for one frame.

    Per row, the columns are: the action, object and location logits, the mean
    of the last row of the stored hidden-state array, then total_silence,
    n_words and speed_rate_word.
    """
    def _stacked(column):
        # One array per row -> 2-D tensor. Going through NumPy avoids having
        # torch walk the pandas Series element by element.
        return torch.as_tensor(np.stack(frame[column].to_list())).float()

    def _scalar(column):
        return torch.as_tensor(frame[column].to_numpy()).float().unsqueeze(1)

    last_row_mean = torch.as_tensor(
        np.array([np.mean(el[-1]) for el in frame['hidden_states']])
    ).float().unsqueeze(1)

    return torch.cat((
        _stacked('predicted_action'),
        _stacked('predicted_object'),
        _stacked('predicted_location'),
        last_row_mean,
        _scalar('total_silence'),
        _scalar('n_words'),
        _scalar('speed_rate_word'),
    ), dim=1)


X = _build_features(df_cm)
y = torch.tensor(df_cm['prediction'].to_numpy()).unsqueeze(1)

# df is the concatenation [df_train, df_valid], so its first len(df_train)
# rows are the training split. This replaces the hard-coded index 23132.
n_train = len(df_train)
X_train = X[:n_train]
y_train = y[:n_train]
X_val = X[n_train:]
y_val = y[n_train:]

X_test = _build_features(df_cm_test)
y_test = torch.tensor(df_cm_test['prediction'].to_numpy()).unsqueeze(1)

In [None]:
# Re-seed so that model initialisation and training are reproducible.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

best_auc = 0
best_acc = 0
best_output = 0
best_model = 0

## Create model
model = ConfidenceModel(
    input_size=X_train.shape[1],
    hidden_size=HIDDEN_SIZE, 
    output_size=1
    ).to(device)        

criterion = nn.BCELoss()
optimizer = optim.NAdam(model.parameters(), lr=0.005)

## Train model
train_losses = []
val_losses = []
test_losses = []
test_aucs = []
val_aucs = []

for epoch in range(EPOCHS):
        
    ## Train in batches
    for i in range(0, len(X_train), BATCH_SIZE):
        train_output, train_loss = train(
            model, 
            X_train[i:i+BATCH_SIZE].float().to(device), 
            y_train[i:i+BATCH_SIZE].float().to(device), 
            criterion, 
            optimizer
            )
    # Only the loss of the last mini-batch of the epoch is recorded.
    train_losses.append(train_loss)
        
    val_output, val_loss = val(
        model, 
        X_val.float().to(device), 
        y_val.float().to(device),
        criterion
        )
    val_losses.append(val_loss)
    # Keep the continuous scores for the AUC: computing ROC AUC on thresholded
    # 0/1 predictions collapses the curve to a single operating point.
    val_scores = val_output.cpu().detach().numpy().ravel()
    val_output = (val_output > 0.5).float()
    val_acc = accuracy_score(y_val, val_output.cpu().detach().numpy())
    val_auc = roc_auc_score(y_val.numpy().ravel(), val_scores)
    val_aucs.append(val_auc)  

    # Early stopping: the validation loss rose in two consecutive epochs.
    # The length guard prevents an IndexError during the first two epochs.
    if len(val_losses) >= 3 and val_losses[-1] > val_losses[-2] > val_losses[-3]:
        break   

## Test accuracy and AUC
test_output, test_loss = test(
    model, 
    X_test.float().to(device), 
    y_test.float().to(device), 
    criterion
    )
test_scores = test_output.cpu().detach().numpy().ravel()
test_output = (test_output > 0.5).float()
test_acc = accuracy_score(y_test, test_output.cpu().detach().numpy())
test_auc = roc_auc_score(y_test.numpy().ravel(), test_scores)
    
# The model from the last epoch is kept as the "best" one; it is reused by the
# subgroup-prediction stage through `best_model`.
best_auc = test_auc
best_acc = test_acc
best_output = test_output
best_model = model

## Confusion matrix
print("Test accuracy: ", round(best_acc*100, 2), "%")
print("Test AUC: ", round(best_auc, 2))
print("Confusion Matrix: ")
print(confusion_matrix(
    y_test.cpu().detach().numpy(), 
    best_output.cpu().detach().numpy()
    ))

## Problematic Subgroups Prediction

In [None]:
## Create train, val, test split
def _build_features(frame):
    """Assemble the confidence-model input matrix (float32) for one frame.

    Per row, the columns are: the action, object and location logits, the mean
    of the last row of the stored hidden-state array, then total_silence,
    n_words and speed_rate_word.
    """
    def _stacked(column):
        # One array per row -> 2-D tensor. Going through NumPy avoids having
        # torch walk the pandas Series element by element.
        return torch.as_tensor(np.stack(frame[column].to_list())).float()

    def _scalar(column):
        return torch.as_tensor(frame[column].to_numpy()).float().unsqueeze(1)

    last_row_mean = torch.as_tensor(
        np.array([np.mean(el[-1]) for el in frame['hidden_states']])
    ).float().unsqueeze(1)

    return torch.cat((
        _stacked('predicted_action'),
        _stacked('predicted_object'),
        _stacked('predicted_location'),
        last_row_mean,
        _scalar('total_silence'),
        _scalar('n_words'),
        _scalar('speed_rate_word'),
    ), dim=1)


X = _build_features(df_cm)

# df is the concatenation [df_train, df_valid], so its first len(df_train)
# rows are the training split. This replaces the hard-coded index 23132.
n_train = len(df_train)
X_train = X[:n_train]
X_val = X[n_train:]

X_test = _build_features(df_cm_test)

In [None]:
## Create train, val, test split
# Subgroup ids (0 = no subgroup) are the classification targets.
y_subs = torch.tensor(df_discretized['subgID'].to_numpy())
# The first len(df_train) rows are the training split (df = [train, valid]).
# This replaces the hard-coded index 23132.
n_train = len(df_train)
y_train_subs = y_subs[:n_train]
y_val_subs = y_subs[n_train:]
y_test_subs = torch.tensor(df_discretized_test['subgID'].to_numpy())

In [None]:
# Re-seed so that the new output layer and the training are reproducible.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

best_f1macro = 0
best_acc = 0
best_output = 0
best_epoch = 0
test_losses = []

if PRETRAIN:
    # Reuse the pretrained confidence model and swap its output layer for one
    # with NUM_SUBGROUPS+1 classes. This modifies `best_model` in place.
    best_model.linear3 = nn.Linear(
        HIDDEN_SIZE,
        NUM_SUBGROUPS+1
        ).to(device)
    model = best_model
else:
    model = ConfidenceModel(
        input_size=X_train.shape[1],
        hidden_size=HIDDEN_SIZE,
        output_size=NUM_SUBGROUPS+1
        ).to(device)

## Criterion and optimizer
# NOTE(review): ConfidenceModel.forward ends with a sigmoid, so the
# cross-entropy loss receives values in (0, 1) rather than unbounded logits -
# confirm this is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.NAdam(model.parameters(), lr=0.005)

## Train and validate model
train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    # Full-batch training step followed by a validation pass.
    train_output, train_loss = train(
        model,
        X_train.to(device),
        y_train_subs.to(device),
        criterion,
        optimizer
        )
    val_output, val_loss = val(
        model,
        X_val.to(device),
        y_val_subs.to(device),
        criterion
        )

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Early stopping: the current validation loss is worse than both previous
    # ones. The length guard prevents an IndexError during the first two epochs.
    if len(val_losses) >= 3 and val_loss > val_losses[-2] and val_loss > val_losses[-3]:
        break

test_output, test_loss = test(
    model,
    X_test.to(device),
    y_test_subs.to(device),
    criterion
    )
# Predicted class = index of the largest output.
test_output = test_output.cpu().detach().numpy()
test_output = np.argmax(test_output, axis=1)
test_acc = accuracy_score(y_test_subs, test_output)
test_f1 = f1_score(y_test_subs, test_output, average='macro')
        
best_f1macro = test_f1
best_acc = test_acc
# Fixed: a stray non-ASCII character after `test_output` made this line
# reference an undefined name.
best_output = test_output

print("Test Accuracy: ", best_acc)
print("Test F1 Macro: ", best_f1macro)
print("--------------------\n")

# Random Baseline

In [None]:
# Subgroup ids (0 = no subgroup) are the classification targets.
y_subs = torch.tensor(df_discretized['subgID'].to_numpy())
# The first len(df_train) rows are the training split (df = [train, valid]).
# This replaces the hard-coded index 23132.
n_train = len(df_train)
y_train_subs = y_subs[:n_train]
y_val_subs = y_subs[n_train:]
y_test_subs = torch.tensor(df_discretized_test['subgID'].to_numpy())

In [None]:
## Random baseline: assign random class to each sample

# Note: this rebinds the notebook-wide SEED to 1.
SEED = 1
np.random.seed(SEED)

# Draw one class id in [0, NUM_SUBGROUPS] per test sample.
n_classes = NUM_SUBGROUPS+1
random_pred = np.random.randint(0, n_classes, len(y_test_subs))
random_accuracy = accuracy_score(y_test_subs, random_pred)
random_f1_macro = f1_score(y_test_subs, random_pred, average='macro')
print("Test Accuracy: ", random_accuracy)
print("F1 Macro: ", random_f1_macro)

# Results recorded by the authors for different numbers of subgroups K:

## K = 2
# Test Accuracy:    0.3311363037173741
# F1 Macro:         0.21583683044821356

## K = 3
# Test Accuracy:    0.25046137621935144
# F1 Macro:         0.1365154990425591

## K = 4
# Test Accuracy:    0.1982599525441603
# F1 Macro:         0.11033841939589309

## K = 5    
# Test Accuracy:    0.17189559715264963
# F1 Macro:         0.10379229291419767

In [None]:
## Random baseline: assign each sample to the most frequent class

# Note: this rebinds the notebook-wide SEED to 1.
SEED = 1
np.random.seed(SEED)

# count number of samples for each class
counts = np.array(
    [len(y_test_subs[y_test_subs == class_id]) for class_id in range(NUM_SUBGROUPS+1)],
    dtype=float,
)

# Predict the majority class for every test sample (float-valued, one per sample).
most_frequent_pred = np.full(len(y_test_subs), np.argmax(counts), dtype=float)
majority_accuracy = accuracy_score(y_test_subs, most_frequent_pred)
majority_f1_macro = f1_score(y_test_subs, most_frequent_pred, average='macro')
print("Test Accuracy: ", majority_accuracy)
print("F1 Macro: ", majority_f1_macro)

# Results recorded by the authors for different numbers of subgroups K:

## K = 2
# Test Accuracy:    0.9058792512523068
# F1 Macro:         0.31687185871720386

## K = 3
# Test Accuracy:    0.8797785394147113
# F1 Macro:         0.23401122019635343

## K = 4
# Test Accuracy:    0.8404956498813604
# F1 Macro:         0.18266723965047987

## K = 5
# Test Accuracy:    0.7943580279462167
# F1 Macro:         0.14756587324909393