In [None]:
import pandas as pd 

import os

# Load the raw train and test splits.
df_train_path = 'data_slurp/train.csv'
df_train = pd.read_csv(df_train_path, index_col=None)

df_test_path = 'data_slurp/test.csv'
df_test = pd.read_csv(df_test_path, index_col=None)

# Distinct intents present in the training split.
intent_train = df_train['intent'].unique()
print('Intent of train data: ', len(intent_train))

# Distinct intents present in the test split before filtering.
intent_test = df_test['intent'].unique()
print('Intent of test data: ', len(intent_test))

# Keep only the test rows whose intent also occurs in the training split.
df_test = df_test[df_test['intent'].isin(intent_train)]

# Distinct intents left in the test split after filtering.
intent_test = df_test['intent'].unique()
print('Intent of test data: ', len(intent_test))

# Save the filtered test split. The output folder is created first so the
# write does not fail when the folder has not been created yet.
output_path = 'data_slurp_unlearning/test_unlearning.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_test.to_csv(output_path, index=False)

In [None]:
import pandas as pd 

def read_data(df_train_path, df_val_path):
    """Load two CSV splits and add an integer ``label`` column to each.

    The label vocabulary is built from the ``intent`` column of the training
    file only, in order of first appearance. The second file is encoded with
    that same vocabulary.

    Args:
        df_train_path: Path of the CSV used to build the label vocabulary.
        df_val_path: Path of the CSV to encode with the training vocabulary.

    Returns:
        Tuple ``(df_train, df_val, num_labels, label2id, id2label, labels)``:
        the two frames with the extra integer ``label`` column, the number of
        labels, the intent -> id mapping (ids stored as strings), the
        id -> intent mapping (string keys), and the array of unique intents.

    Raises:
        KeyError: If a row holds an intent that is missing from the training
            vocabulary.
    """
    df_train = pd.read_csv(df_train_path, index_col=None)
    df_val = pd.read_csv(df_val_path, index_col=None)

    ## Prepare Labels
    labels = df_train['intent'].unique()
    label2id = {label: str(i) for i, label in enumerate(labels)}
    id2label = {str(i): label for i, label in enumerate(labels)}
    num_labels = len(id2label)

    # Integer view of label2id used for the vectorised encoding below.
    intent_to_int = {label: int(idx) for label, idx in label2id.items()}

    def _encode(frame):
        # Vectorised lookup: no per-row .loc writes, and it works for empty
        # frames and for frames whose index is not 0..n-1.
        encoded = frame['intent'].map(intent_to_int)
        unmapped = encoded.isna()
        if unmapped.any():
            unknown = frame.loc[unmapped, 'intent'].unique().tolist()
            raise KeyError(f"intent(s) not present in the training data: {unknown}")
        return encoded.astype(int)

    ## Train
    df_train['label'] = _encode(df_train)

    ## Validation
    df_val['label'] = _encode(df_val)

    print("Label2Id: ", label2id)
    print("Id2Label: ", id2label)
    print("Num Labels: ", num_labels)

    return df_train, df_val, num_labels, label2id, id2label, labels


# Encode the train and validation splits with the training label vocabulary.
df_train, df_val, num_labels, label2id, id2label, labels = read_data(
    df_train_path="data_slurp_unlearning/train_unlearning.csv",
    df_val_path="data_slurp_unlearning/val_unlearning.csv",
)
print("Num labels: ", num_labels)

# Encode the test split the same way. read_data takes the split to encode as
# its second argument, so the training file is read again here.
df_train, df_test, num_labels, label2id, id2label, labels = read_data(
    df_train_path="data_slurp_unlearning/train_unlearning.csv",
    df_val_path="data_slurp_unlearning/test_unlearning.csv",
)


In [None]:
# Number of rows in each split (train, val, test).
df_train.shape[0], df_val.shape[0], df_test.shape[0]

In [None]:
# Label frequency in the training split.
df_train['label'].value_counts().plot.bar()

In [None]:
# Label frequency in the validation split.
df_val['label'].value_counts().plot.bar()

In [None]:
# Label frequency in the test split.
df_test['label'].value_counts().plot.bar()

In [None]:
# NOTE: `df_identity` is only assigned inside the per-speaker split loop in a
# later cell, so evaluating it here raises NameError on a fresh kernel
# (Restart & Run All). Preview a frame that already exists at this point.
df_test.head()

In [None]:
# Pool the three encoded splits into one frame with a fresh 0..n-1 index.
df = pd.concat((df_train, df_val, df_test), axis=0, ignore_index=True)

# The pooled data is re-split by speaker identity into three datasets
# (training, validation, testing) in the following cell.

# Compare speaker ids as strings, then list them in order of first appearance.
df = df.assign(speaker_id=df['speaker_id'].astype(str))
identities = pd.unique(df['speaker_id'])

# Reset the three splits to empty frames.
df_train, df_val, df_test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
identities

In [None]:
# Greedy speaker-level split. Speakers are visited in the order of
# `identities`; each speaker's rows go entirely to one split, so no speaker is
# shared between train, validation and test.
n_identities = len(identities)
train_target = 0.8 * len(df)  # fill train until it holds at least 80% of the rows
val_target = 0.1 * len(df)    # then fill val until it holds at least 10% of the rows

# Collect the per-speaker frames and concatenate once per split at the end,
# rather than growing a DataFrame with pd.concat on every iteration.
split_frames = {'train': [], 'val': [], 'test': []}
split_sizes = {'train': 0, 'val': 0, 'test': 0}

for identity in identities:
    df_identity = df[df['speaker_id'] == identity]
    n_samples = len(df_identity)
    if split_sizes['train'] < train_target:
        target_split = 'train'
    elif split_sizes['val'] < val_target:
        target_split = 'val'
    else:
        target_split = 'test'
    split_frames[target_split].append(df_identity)
    split_sizes[target_split] += n_samples

# A split that received no speaker becomes an empty frame that still has the
# columns of `df`, so later column access (e.g. `.label`) keeps working.
df_train, df_val, df_test = (
    pd.concat(split_frames[name], ignore_index=True) if split_frames[name] else df.iloc[0:0].copy()
    for name in ('train', 'val', 'test')
)

len(df_train), len(df_val), len(df_test)

In [None]:
# Label frequency in the speaker-disjoint training split.
df_train['label'].value_counts().plot.bar()

In [None]:
# Label frequency in the speaker-disjoint validation split.
df_val['label'].value_counts().plot.bar()

In [None]:
# Label frequency in the speaker-disjoint test split.
df_test['label'].value_counts().plot.bar()

In [None]:
# Sanity check: the three datasets must be pairwise disjoint, i.e. no speaker
# identity may appear in more than one of them.

train_identities = df_train['speaker_id'].unique()
val_identities = df_val['speaker_id'].unique()
test_identities = df_test['speaker_id'].unique()

train_speaker_set = set(train_identities)
assert train_speaker_set.isdisjoint(val_identities)
assert train_speaker_set.isdisjoint(test_identities)

assert set(val_identities).isdisjoint(test_identities)

In [None]:
from utils import set_seed
import random

def get_forget_retain_split(df_train, min_samples_forget=100, ratio=0.025, seed=0, speaker_col='speakerId'):
    """Split ``df_train`` into a forget set and a retain set by speaker.

    Whole speakers are drawn at random until their rows add up to at least
    ``ratio`` of ``df_train``. Only speakers with strictly more than
    ``min_samples_forget`` rows are eligible.

    Args:
        df_train: Frame to split; must contain ``speaker_col``.
        min_samples_forget: A speaker is eligible only if it has more than
            this many rows.
        ratio: Target fraction of the rows of ``df_train`` for the forget set.
        seed: Value passed to ``set_seed`` before sampling (presumably this
            seeds the ``random`` module; verify against ``utils.set_seed``).
        speaker_col: Name of the column holding the speaker identifier.

    Returns:
        Tuple ``(df_forget, df_retain)``: the rows of the selected speakers
        and all remaining rows.

    Notes:
        Each speaker is counted at most once. A repeated draw is skipped
        rather than added to the running total again, so the target is
        actually reached whenever enough eligible speakers exist. If the
        eligible speakers run out first, a warning is issued and the forget
        set holds every eligible speaker.
    """
    import warnings

    speakerids = df_train[speaker_col].value_counts()

    set_seed(seed)

    # Eligible speakers: strictly more than `min_samples_forget` rows.
    speakers = speakerids[speakerids > min_samples_forget].index.tolist()
    target_samples = len(df_train) * ratio

    total_samples = 0
    speakers_to_sample = []
    already_chosen = set()
    # Draw from the full eligible list and skip repeats. This keeps the same
    # sequence of random draws as sampling with replacement, but never counts
    # a speaker's rows twice.
    while total_samples < target_samples and len(already_chosen) < len(speakers):
        speaker = random.choice(speakers)
        if speaker in already_chosen:
            continue
        already_chosen.add(speaker)
        speakers_to_sample.append(speaker)
        total_samples += speakerids[speaker]

    if total_samples < target_samples:
        warnings.warn(
            "Eligible speakers exhausted before reaching the requested ratio: "
            f"selected {total_samples} of the {target_samples} target samples."
        )

    in_forget = df_train[speaker_col].isin(speakers_to_sample)
    df_forget = df_train[in_forget]
    df_retain = df_train[~in_forget]
    return df_forget, df_retain

# Build the forget/retain partition of the training split by speaker.
speakerl_col = 'speaker_id'
df_forget, df_retain = get_forget_retain_split(df_train, speaker_col=speakerl_col)

# The two parts must cover the training split and share no speaker.
assert len(df_train) == len(df_forget) + len(df_retain)
assert len(set(df_forget[speakerl_col]) & set(df_retain[speakerl_col])) == 0

In [None]:
# Fraction of the training rows that ended up in the forget set.
df_forget.shape[0] / df_train.shape[0]

In [None]:
# Number of distinct intents in the forget and in the retain dataset.
df_forget['intent'].nunique(dropna=False), df_retain['intent'].nunique(dropna=False)

In [None]:
# Intent frequency in the forget set.
df_forget['intent'].value_counts().plot.bar()

In [None]:
# Intent frequency in the retain set.
df_retain['intent'].value_counts().plot.bar()

In [None]:
# # save the indexes in a txt file of the forget samples and the retain one 
# forget_indexes = df_forget.index.tolist()
# retain_indexes = df_retain.index.tolist()

# with open('forget_indexes.txt', 'w') as f:
#     for item in forget_indexes:
#         f.write("%s\n" % item)

# with open('retain_indexes.txt', 'w') as f:
#     for item in retain_indexes:
#         f.write("%s\n" % item)

In [None]:
import numpy as np
# Sorted array of the distinct speakers that make up the forget set.
speaker_ids_forget = np.unique(df_forget['speaker_id'].to_numpy())
speaker_ids_forget

In [None]:
# Presumably the output of the previous cell from an earlier run (kept for reference):
# array(['ME-140', 'ME-144', 'MO-463'], dtype=object)

In [None]:
# Drop from validation and test every row spoken by a forget-set speaker.
print("Before forget: ", len(df_val), len(df_test))
val_in_forget = df_val['speaker_id'].isin(speaker_ids_forget)
test_in_forget = df_test['speaker_id'].isin(speaker_ids_forget)
df_val_forget = df_val.loc[~val_in_forget]
df_test_forget = df_test.loc[~test_in_forget]
print("After forget: ", len(df_val_forget), len(df_test_forget))

In [None]:
# Write the forget and retain sets as CSV files into the output folder.
import os
folder = "data_slurp_unlearning"
os.makedirs(folder, exist_ok=True)
for filename, frame in (("forget.csv", df_forget), ("retain.csv", df_retain)):
    frame.to_csv(os.path.join(folder, filename), index=False)
# Exports of the other splits are currently disabled:
# df_val_forget.to_csv(os.path.join(folder, "val_forget.csv"), index=False)
# df_test_forget.to_csv(os.path.join(folder, "test_forget.csv"), index=False)
# df_train.to_csv(os.path.join(folder, "train.csv"), index=False)
# df_val.to_csv(os.path.join(folder, "val.csv"), index=False)
# df_test.to_csv(os.path.join(folder, "test.csv"), index=False)