In [None]:
## Load Dataset
from datasets import load_dataset

# Configuration (locale) of Speech-MASSIVE to work on; also reused below as
# the name of the output folder for the index files.
dataset_name = "de-DE"
dataset = load_dataset("FBK-MT/Speech-MASSIVE", dataset_name)
ds_train, ds_validation = dataset["train"], dataset["validation"]

In [None]:
import pandas as pd
# Materialise both splits as DataFrames for the exploration below.
df_train, df_validation = (pd.DataFrame(split) for split in (ds_train, ds_validation))
df_train

In [None]:
# Speakers present in the training split.
speaker_ids = df_train["speaker_id"].unique()

# Keep only validation rows whose speaker never appears in train;
# the two prints show the validation size before and after filtering.
print(len(df_validation))
df_validation = df_validation.loc[~df_validation["speaker_id"].isin(speaker_ids)]
print(len(df_validation))

In [None]:
# Number of training rows per intent index.
df_train["intent_idx"].value_counts().plot.bar()

In [None]:
# Rows per training speaker (value_counts sorts by descending count).
speakerids = df_train["speaker_id"].value_counts()

# Number of distinct training speakers.
speakerids.shape[0]

In [None]:
# Rows per training speaker as a bar chart.
speakerids.plot.bar()

In [None]:
from utils import set_seed
import random

def get_forget_retain_split(df_train, min_samples_forget=100, ratio=0.025, seed=42, speaker_col='speakerId'):
    """Split ``df_train`` into a forget set and a retain set along speaker boundaries.

    After seeding through ``set_seed(seed)``, whole speakers are drawn at random
    from those having strictly more than ``min_samples_forget`` rows, until the
    rows of the drawn speakers add up to at least ``ratio * len(df_train)``.
    Every row of a drawn speaker goes to the forget set; all other rows go to
    the retain set.

    Args:
        df_train: DataFrame holding one row per sample and a speaker column.
        min_samples_forget: a speaker is a candidate only if it has strictly
            more rows than this value.
        ratio: fraction of ``len(df_train)`` the forget set should reach.
        seed: value passed to ``set_seed`` before drawing.
        speaker_col: name of the column identifying the speaker.

    Returns:
        ``(df_forget, df_retain)``: two disjoint row subsets of ``df_train``
        that together contain every row whose speaker is not missing.

    Warns:
        UserWarning: when the candidate speakers cannot supply enough rows to
        reach the requested ratio (including when there is no candidate at all).
    """
    import warnings

    speakerids = df_train[speaker_col].value_counts()

    set_seed(seed)

    speakers = speakerids[speakerids > min_samples_forget].index.tolist()
    target_samples = len(df_train) * ratio

    total_samples = 0
    speakers_to_sample = []
    already_drawn = set()
    # Stop either when the target is reached or when every candidate was used.
    while total_samples < target_samples and len(already_drawn) < len(speakers):
        speaker = random.choice(speakers)
        if speaker in already_drawn:
            # random.choice draws with replacement. Counting a speaker twice
            # would inflate total_samples without adding any row to the forget
            # set, so redraw instead. The random stream is unchanged, hence a
            # seed that never produced a duplicate yields the same split as before.
            continue
        already_drawn.add(speaker)
        speakers_to_sample.append(speaker)
        total_samples += speakerids.loc[speaker]

    if total_samples < target_samples:
        warnings.warn(
            f"forget set holds {total_samples} samples, fewer than the requested "
            f"{target_samples:.1f}: only {len(speakers)} speaker(s) have more than "
            f"{min_samples_forget} samples",
            stacklevel=2,
        )

    in_forget = df_train[speaker_col].isin(speakers_to_sample)
    df_forget = df_train[in_forget]
    df_retain = df_train[~in_forget]
    return df_forget, df_retain

# Speech-MASSIVE names its speaker column 'speaker_id', which differs from the
# function's default, so pass it explicitly.
speakerl_col = 'speaker_id'
df_forget, df_retain = get_forget_retain_split(df_train, speaker_col=speakerl_col)

# Sanity checks: the two parts cover every training row and share no speaker.
assert len(df_train) == len(df_forget) + len(df_retain)
assert set(df_forget[speakerl_col]).isdisjoint(set(df_retain[speakerl_col]))

In [None]:
# Fraction of the training rows that ended up in the forget set.
df_forget.shape[0] / df_train.shape[0]

In [None]:
# Intent distribution within the forget set.
df_forget["intent_idx"].value_counts().plot.bar()

In [None]:
# Intent distribution within the retain set.
df_retain["intent_idx"].value_counts().plot.bar()

In [None]:
import os 

# Row labels of the forget and retain frames as plain Python lists
# (nothing is written to disk in this cell).
forget_indexes = df_forget.index.to_list()
retain_indexes = df_retain.index.to_list()

In [None]:
import os 

# Row labels of the forget and retain frames, to be persisted one per line.
forget_indexes = df_forget.index.tolist()
retain_indexes = df_retain.index.tolist()

# Output folder is named after the dataset configuration.
os.makedirs(dataset_name, exist_ok=True)

with open(f'{dataset_name}/forget_indexes.txt', 'w') as f:
    f.writelines("%s\n" % item for item in forget_indexes)

with open(f'{dataset_name}/retain_indexes.txt', 'w') as f:
    f.writelines("%s\n" % item for item in retain_indexes)

In [None]:
def _read_index_file(path):
    """Return the integers stored one per line in ``path``, ignoring blank lines."""
    with open(path) as f:
        return [int(line.strip()) for line in f if line.strip()]


def get_forget_retain_datasets(ds_train, data_path):
    """Rebuild the forget and retain subsets of ``ds_train`` from saved index files.

    Reads ``forget_indexes.txt`` and ``retain_indexes.txt`` (one integer per
    line) located at ``data_path`` and passes each list to ``ds_train.select``.

    Args:
        ds_train: object exposing ``select(list_of_int)``.
        data_path: either a directory (with or without a trailing separator)
            containing the two index files, or a string prefix that is
            prepended verbatim to the file names.

    Returns:
        ``(ds_forget, ds_retain)``: the results of the two ``select`` calls.

    Raises:
        FileNotFoundError: if an index file does not exist.
        ValueError: if a non-blank line is not an integer.
    """
    import os

    data_path = os.fspath(data_path)
    if os.path.isdir(data_path):
        # A directory works whether or not the caller added a trailing
        # separator; os.path.join yields the same path as concatenation when
        # the separator is already there.
        forget_file = os.path.join(data_path, 'forget_indexes.txt')
        retain_file = os.path.join(data_path, 'retain_indexes.txt')
    else:
        # Not an existing directory: keep the historical prefix semantics.
        forget_file = data_path + 'forget_indexes.txt'
        retain_file = data_path + 'retain_indexes.txt'

    forget_indexes = _read_index_file(forget_file)
    retain_indexes = _read_index_file(retain_file)

    ds_forget = ds_train.select(forget_indexes)
    ds_retain = ds_train.select(retain_indexes)

    return ds_forget, ds_retain

In [None]:
# Rebuild both subsets from the index files written under the dataset folder.
ds_forget, ds_retain = get_forget_retain_datasets(ds_train, f"{dataset_name}/")

In [None]:
# Display both subsets side by side.
(ds_forget, ds_retain)

In [None]:
import numpy as np

# Shuffle the row labels of the filtered validation frame with a fixed seed,
# then cut the shuffled list in two: first half -> validation, rest -> test.
index_val = df_validation.index.tolist()
np.random.seed(42)
np.random.shuffle(index_val)
val_size = len(index_val) // 2
index_val_new, index_test_new = index_val[:val_size], index_val[val_size:]

# One label per line, in the folder named after the dataset configuration.
with open(f'{dataset_name}/val_indexes.txt', 'w') as f:
    f.writelines("%s\n" % item for item in index_val_new)

with open(f'{dataset_name}/test_indexes.txt', 'w') as f:
    f.writelines("%s\n" % item for item in index_test_new)

In [None]:
# Rows left in validation after removing speakers seen in train.
df_validation.shape[0]