In [None]:
import torch
import pandas as pd
import numpy as np
import random

def read_data(df_train_path, df_test_path):
    """Load the train/test CSV files and attach an integer ``label`` column.

    The label vocabulary is built from the ``intent`` column of the training
    file only, in order of first appearance. Both frames then receive a
    ``label`` column holding the integer id of each row's intent.

    Args:
        df_train_path: Path of the training CSV; must contain an ``intent`` column.
        df_test_path: Path of the test CSV; must contain an ``intent`` column.

    Returns:
        Tuple ``(df_train, df_test, num_labels, label2id, id2label, labels)``:
        the two frames with the added ``label`` column, the number of distinct
        intents, the intent -> id mapping (ids stored as strings), the
        id (string) -> intent mapping, and the array of distinct training intents.

    Raises:
        KeyError: If a split contains an intent that does not occur in the
            training data, so no id exists for it.
    """
    df_train = pd.read_csv(df_train_path, index_col=None)
    df_test = pd.read_csv(df_test_path, index_col=None)

    ## Prepare Labels
    labels = df_train['intent'].unique()
    # Ids are kept as strings in the returned mappings, as callers receive them.
    label2id = {label: str(i) for i, label in enumerate(labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    num_labels = len(id2label)

    # Integer view of the same mapping, used to fill the ``label`` column
    # without a string -> int round trip.
    label2int = {label: i for i, label in enumerate(labels)}

    ## Train / Validation
    # One vectorised lookup per frame instead of a Python loop over rows.
    for split_name, df in (("train", df_train), ("test", df_test)):
        unseen = df.loc[~df['intent'].isin(labels), 'intent'].unique()
        if len(unseen) > 0:
            raise KeyError(
                f"{split_name} split contains intents not present in the "
                f"training data: {sorted(map(str, unseen))}"
            )
        df['label'] = df['intent'].map(label2int).astype(int)

    return df_train, df_test, num_labels, label2id, id2label, labels

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device: ", device)

# Seed the Python, NumPy and PyTorch random number generators with one shared value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Load both splits together with the intent <-> id lookup tables.
(df_train, df_test, num_labels,
 label2id, id2label, labels) = read_data("data_fsc/train.csv", "data_fsc/test.csv")

In [None]:
# Bar chart of how many training rows carry each label id.
df_train['label'].value_counts().plot.bar()

In [None]:
# Number of training rows per speaker, most frequent speaker first.
speakerids = df_train.speakerId.value_counts()

# Number of distinct speakers that were counted.
len(speakerids)

In [None]:
# Bar chart of the per-speaker row counts.
speakerids.plot.bar()

In [None]:
from utils import set_seed

def get_forget_retain_split(df_train, min_samples_forget=100, ratio=0.025, seed=42, speaker_col='speakerId'):
    """Split ``df_train`` by speaker into a forget set and a retain set.

    Speakers owning more than ``min_samples_forget`` rows are drawn at random,
    each at most once, until the drawn speakers together own at least
    ``ratio * len(df_train)`` rows or no eligible speaker is left. All rows of
    the drawn speakers form the forget set; every other row is retained.

    Args:
        df_train: Frame with one row per sample and a speaker column.
        min_samples_forget: A speaker is eligible only if it has strictly more
            rows than this.
        ratio: Target size of the forget set as a fraction of ``len(df_train)``.
        seed: Value passed to ``set_seed`` before drawing speakers.
        speaker_col: Name of the column holding the speaker identifier.

    Returns:
        Tuple ``(df_forget, df_retain)`` of row subsets of ``df_train`` that
        share no speaker.

    Warns:
        UserWarning: If the eligible speakers do not own enough rows to reach
            the target; the forget set then holds all eligible speakers.
    """
    import warnings

    speakerids = df_train[speaker_col].value_counts()

    # NOTE(review): reproducibility of the draw below presumably relies on
    # set_seed seeding Python's ``random`` module -- confirm in utils.
    set_seed(seed)

    # Draw speakers that have more than `min_samples_forget` samples until
    # `ratio` of the total dataset samples is reached.
    speakers = speakerids[speakerids > min_samples_forget].index.tolist()
    target_samples = len(df_train) * ratio
    total_samples = 0
    speakers_to_sample = []
    already_chosen = set()
    while total_samples < target_samples and len(speakers_to_sample) < len(speakers):
        speaker = random.choice(speakers)
        if speaker in already_chosen:
            # A repeated draw must not be counted twice, otherwise the forget
            # set ends up smaller than the requested ratio.
            continue
        already_chosen.add(speaker)
        speakers_to_sample.append(speaker)
        total_samples += speakerids.loc[speaker]

    if total_samples < target_samples:
        warnings.warn(
            f"Only {total_samples} samples available from speakers with more than "
            f"{min_samples_forget} samples; requested at least {target_samples}."
        )

    # Use the configured speaker column for the masks as well.
    in_forget = df_train[speaker_col].isin(speakers_to_sample)
    df_forget = df_train[in_forget]
    df_retain = df_train[~in_forget]
    return df_forget, df_retain

# Build the split using the function's default settings.
df_forget, df_retain = get_forget_retain_split(df_train)

# Sanity checks: the row counts add up, and no speaker appears on both sides.
assert len(df_train) == len(df_forget) + len(df_retain)
assert set(df_forget['speakerId']).isdisjoint(set(df_retain['speakerId']))

In [None]:
# Fraction of the training rows that ended up in the forget set.
df_forget.shape[0] / df_train.shape[0]

In [None]:
# Bar chart of the label counts inside the forget set.
df_forget['label'].value_counts().plot.bar()

In [None]:
# Bar chart of the label counts inside the retain set.
df_retain['label'].value_counts().plot.bar()

In [None]:
# Write both subsets next to the source data; the row index is not saved.
df_forget.to_csv(path_or_buf='data_fsc/forget.csv', index=False)
df_retain.to_csv(path_or_buf='data_fsc/retain.csv', index=False)