# COMMON

In [12]:
%run 00_common.ipynb

0.27.0


In [2]:
# Tokenize the data
# The preprocess_function function is defined to preprocess the data by tokenizing the inputs and labels
def preprocess_function(examples):
    inputs = [f'{source_lang}: {text}' for text in examples[source_lang]]
    targets = examples[target_lang]
    encoding = tokenizer(inputs, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
    model_inputs = {
        'input_ids': encoding['input_ids'].squeeze(),
        'attention_mask': encoding['attention_mask'].squeeze(),
        'labels': tokenizer(targets, padding=True, truncation=True, return_tensors='pt')['input_ids'].squeeze()
    }
    return model_inputs

# define a data_collator function for batch processing
def data_collator(features):
    batch = {}
    # Pad input_ids and attention_mask to the maximum length within the batch
    max_length = max(len(feature['input_ids']) for feature in features)
    batch['input_ids'] = torch.stack([torch.tensor(feature['input_ids'] + [tokenizer.pad_token_id] * (max_length - len(feature['input_ids']))) for feature in features])
    batch['attention_mask'] = torch.stack([torch.tensor(feature['attention_mask'] + [0] * (max_length - len(feature['attention_mask']))) for feature in features])
    batch['labels'] = torch.stack([torch.tensor(feature['labels'] + [-100] * (max_length - len(feature['labels']))) for feature in features])
    return batch

# READ CSV

In [3]:
dataframes = {}

def load_dataframes(directory='results'):
    dataframes = {}

    for speaker_dir in os.listdir(directory):
        if os.path.isdir(os.path.join(directory, speaker_dir)):
            train_df = None
            test_df = None
            valid_df = None

            for file in os.listdir(os.path.join(directory, speaker_dir)):
                if file.endswith('.csv'):
                    df_name = os.path.splitext(file)[0]
                    df_path = os.path.join(directory, speaker_dir, file)

                    if 'train' in df_name:
                        if train_df is None:
                            train_df = pd.read_csv(df_path)
                        else:
                            train_df = pd.concat([train_df, pd.read_csv(df_path)], ignore_index=True)
                    elif 'test' in df_name:
                        if test_df is None:
                            test_df = pd.read_csv(df_path)
                        else:
                            test_df = pd.concat([test_df, pd.read_csv(df_path)], ignore_index=True)
                    elif 'validation' in df_name:
                        if valid_df is None:
                            valid_df = pd.read_csv(df_path)
                        else:
                            valid_df = pd.concat([valid_df, pd.read_csv(df_path)], ignore_index=True)

            combined_df = pd.concat([train_df, test_df, valid_df], ignore_index=True)

            if combined_df is not None:
                dataframes[speaker_dir] = combined_df

    return dataframes

dataframes = load_dataframes()


# for df_name, df in dataframes.items():
#     print(f'DataFrame: {df_name}')
#     print(df)
#     print()

# SEPERATE CSV BY IF KEEP ALL

In [4]:
dataframes_keep_all = {}
dataframes_no_keep_all = {}

for speaker_dir in os.listdir('results'):
    if os.path.isdir(os.path.join('results', speaker_dir)):
        train_df = None
        test_df = None
        valid_df = None

        for file in os.listdir(os.path.join('results', speaker_dir)):
            if file.endswith('.csv'):
                df_name = os.path.splitext(file)[0]
                df_path = os.path.join('results', speaker_dir, file)
                
                if 'train' in df_name:
                    if train_df is None:
                        train_df = pd.read_csv(df_path)
                    else:
                        train_df = pd.concat([train_df, pd.read_csv(df_path)], ignore_index=True)
                elif 'test' in df_name:
                    if test_df is None:
                        test_df = pd.read_csv(df_path)
                    else:
                        test_df = pd.concat([test_df, pd.read_csv(df_path)], ignore_index=True)
                elif 'validation' in df_name:
                    if valid_df is None:
                        valid_df = pd.read_csv(df_path)
                    else:
                        valid_df = pd.concat([valid_df, pd.read_csv(df_path)], ignore_index=True)
        
        combined_df = pd.concat([train_df, test_df, valid_df], ignore_index=True)
        
        if 'keep_all' in speaker_dir:
            dataframes_keep_all[speaker_dir] = combined_df
        else:
            dataframes_no_keep_all[speaker_dir] = combined_df

In [5]:
print_dataframe_sizes(dataframes_no_keep_all)
print_dataframe_sizes(dataframes_keep_all)

torgo_xlsr_finetune_F01 - Data Size: 14409
torgo_xlsr_finetune_F03 - Data Size: 8568
torgo_xlsr_finetune_F04 - Data Size: 9454
torgo_xlsr_finetune_M01 - Data Size: 6540
torgo_xlsr_finetune_M02 - Data Size: 9790
torgo_xlsr_finetune_M03 - Data Size: 9572
torgo_xlsr_finetune_M04 - Data Size: 11751
torgo_xlsr_finetune_M05 - Data Size: 8853

torgo_xlsr_finetune_F01_keep_all - Data Size: 16082
torgo_xlsr_finetune_F03_keep_all - Data Size: 16082
torgo_xlsr_finetune_F04_keep_all - Data Size: 16082
torgo_xlsr_finetune_M01_keep_all - Data Size: 16082
torgo_xlsr_finetune_M02_keep_all - Data Size: 16082
torgo_xlsr_finetune_M03_keep_all - Data Size: 16082
torgo_xlsr_finetune_M04__keep_all - Data Size: 16082
torgo_xlsr_finetune_M05_keep_all - Data Size: 16082



# SEPERATE SENTENCE AND WORD LEVEL

In [6]:
dataframes_word_keep_all = {}
dataframes_sentence_keep_all = {}

for speaker_dir, combined_df in dataframes_keep_all.items():
    combined_df['word_count'] = combined_df['references'].str.split().apply(len)
    word_df = combined_df[combined_df['word_count'] == 1]
    sentence_df = combined_df[combined_df['word_count'] > 1]
    word_df.drop(columns=['word_count'], inplace=True)
    sentence_df.drop(columns=['word_count'], inplace=True)
    dataframes_word_keep_all[speaker_dir] = word_df
    dataframes_sentence_keep_all[speaker_dir] = sentence_df

dataframes_word_no_keep_all = {}
dataframes_sentence_no_keep_all = {}

for speaker_dir, combined_df in dataframes_no_keep_all.items():
    combined_df['word_count'] = combined_df['references'].str.split().apply(len)
    word_df = combined_df[combined_df['word_count'] == 1]
    sentence_df = combined_df[combined_df['word_count'] > 1]
    word_df.drop(columns=['word_count'], inplace=True)
    sentence_df.drop(columns=['word_count'], inplace=True)
    dataframes_word_no_keep_all[speaker_dir] = word_df
    dataframes_sentence_no_keep_all[speaker_dir] = sentence_df

print_dataframe_sizes(dataframes_word_keep_all)
print_dataframe_sizes(dataframes_sentence_keep_all)
print_dataframe_sizes(dataframes_word_no_keep_all)
print_dataframe_sizes(dataframes_sentence_no_keep_all)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  word_df.drop(columns=['word_count'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_df.drop(columns=['word_count'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  word_df.drop(columns=['word_count'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_df

torgo_xlsr_finetune_F01_keep_all - Data Size: 12129
torgo_xlsr_finetune_F03_keep_all - Data Size: 12129
torgo_xlsr_finetune_F04_keep_all - Data Size: 12129
torgo_xlsr_finetune_M01_keep_all - Data Size: 12115
torgo_xlsr_finetune_M02_keep_all - Data Size: 12129
torgo_xlsr_finetune_M03_keep_all - Data Size: 12129
torgo_xlsr_finetune_M04__keep_all - Data Size: 12129
torgo_xlsr_finetune_M05_keep_all - Data Size: 12129

torgo_xlsr_finetune_F01_keep_all - Data Size: 3953
torgo_xlsr_finetune_F03_keep_all - Data Size: 3953
torgo_xlsr_finetune_F04_keep_all - Data Size: 3953
torgo_xlsr_finetune_M01_keep_all - Data Size: 3967
torgo_xlsr_finetune_M02_keep_all - Data Size: 3953
torgo_xlsr_finetune_M03_keep_all - Data Size: 3953
torgo_xlsr_finetune_M04__keep_all - Data Size: 3953
torgo_xlsr_finetune_M05_keep_all - Data Size: 3953

torgo_xlsr_finetune_F01 - Data Size: 10795
torgo_xlsr_finetune_F03 - Data Size: 6813
torgo_xlsr_finetune_F04 - Data Size: 7482
torgo_xlsr_finetune_M01 - Data Size: 4404
tor

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  word_df.drop(columns=['word_count'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_df.drop(columns=['word_count'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  word_df.drop(columns=['word_count'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_df

# REMOVE SAME COLUMNS

In [20]:


def process_dataframes(dataframes_dict, col_names):
    processed_dataframes = {}

    for df_name, df in dataframes_dict.items():
        # Remove duplicates and drop NaN values
        df = df.drop_duplicates(subset=col_names, keep=False)
        df = df.dropna() 
        processed_dataframes[df_name] = df

    return processed_dataframes

def convert_to_phonemes(data_frame, remove_num=True):
    g2p = G2p()

    for df_name, df in data_frame.items():
        df['predictions_phoneme'] = df['predictions'].apply(lambda x: " ".join(g2p(x)))
        df['references_phoneme'] = df['references'].apply(lambda x: " ".join(g2p(x)))

        if remove_num:
            # Remove numeric values
            df['predictions_phoneme'] = df['predictions_phoneme'].apply(lambda x: re.sub(r'\d+', '', x))
            df['references_phoneme'] = df['references_phoneme'].apply(lambda x: re.sub(r'\d+', '', x))

        df.drop(['predictions', 'references'], axis=1, inplace=True)

    return data_frame

def split_dataframes(dataframes_word):
    # Remove Duplicates
    for df_name, df in dataframes_word.items():
        # Remove rows where columns are the same
        df = df[df['predictions_phoneme'] != df['references_phoneme']]
        
        # Remove rows where columns only differ by the number of spaces
        df = df[df.apply(lambda row: row['predictions_phoneme'].strip() != row['references_phoneme'].strip(), axis=1)]
        
        df.reset_index(drop=True, inplace=True)
        
    ori_train_dataframes_word = {}
    test_dataframes_word = {}
    
    for df_name, df in dataframes_word.items():
        # Train-Test Split
        df = df.drop_duplicates(subset=['predictions_phoneme', 'references_phoneme'], keep=False)
        train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
        
        # Naming variables without pattern
        train_df_name = df_name
        test_df_name = f"{df_name}_test"
        
        ori_train_dataframes_word[train_df_name] = train_df
        test_dataframes_word[test_df_name] = test_df
    
    train_dataframes_word = {}
    val_dataframes_word = {} 
    
    for df_name, df in ori_train_dataframes_word.items():
        df = df.drop_duplicates(subset=['predictions_phoneme', 'references_phoneme'], keep=False)
        train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
        train_df_name = f"{df_name}_train"
        val_df_name = f"{df_name}_val"
        
        train_dataframes_word[train_df_name] = train_df
        val_dataframes_word[val_df_name] = val_df
    print_dataframe_sizes(train_dataframes_word)
    print_dataframe_sizes(val_dataframes_word)
    print_dataframe_sizes(test_dataframes_word)
 
    return train_dataframes_word, val_dataframes_word, test_dataframes_word



In [None]:
dataframes_word_keep_all = process_dataframes(dataframes_word_keep_all, col_names)
dataframes_sentence_keep_all = process_dataframes(dataframes_sentence_keep_all, col_names)
dataframes_word_no_keep_all = process_dataframes(dataframes_word_no_keep_all, col_names)
dataframes_sentence_no_keep_all = process_dataframes(dataframes_sentence_no_keep_all, col_names)

print_dataframe_sizes(dataframes_word_keep_all)
print_dataframe_sizes(dataframes_sentence_keep_all)
print_dataframe_sizes(dataframes_word_no_keep_all)
print_dataframe_sizes(dataframes_sentence_no_keep_all)

# output dont Keep ALL phrases

In [13]:
dataframes_word_no_keep_all = convert_to_phonemes(dataframes_word_no_keep_all)

In [14]:
train_dataframes_word, val_dataframes_word, test_dataframes_word=split_dataframes(dataframes_word_no_keep_all)

torgo_xlsr_finetune_F01_train - Data Size: 228
torgo_xlsr_finetune_F03_train - Data Size: 264
torgo_xlsr_finetune_F04_train - Data Size: 256
torgo_xlsr_finetune_M01_train - Data Size: 271
torgo_xlsr_finetune_M02_train - Data Size: 318
torgo_xlsr_finetune_M03_train - Data Size: 213
torgo_xlsr_finetune_M04_train - Data Size: 333
torgo_xlsr_finetune_M05_train - Data Size: 311

torgo_xlsr_finetune_F01_val - Data Size: 26
torgo_xlsr_finetune_F03_val - Data Size: 30
torgo_xlsr_finetune_F04_val - Data Size: 29
torgo_xlsr_finetune_M01_val - Data Size: 31
torgo_xlsr_finetune_M02_val - Data Size: 36
torgo_xlsr_finetune_M03_val - Data Size: 24
torgo_xlsr_finetune_M04_val - Data Size: 37
torgo_xlsr_finetune_M05_val - Data Size: 35

torgo_xlsr_finetune_F01_test - Data Size: 29
torgo_xlsr_finetune_F03_test - Data Size: 33
torgo_xlsr_finetune_F04_test - Data Size: 32
torgo_xlsr_finetune_M01_test - Data Size: 34
torgo_xlsr_finetune_M02_test - Data Size: 40
torgo_xlsr_finetune_M03_test - Data Size: 27


In [15]:
def calculate_error_rates(dataframes, phoneme_names=['predictions', 'references']):
    for df_name, df in dataframes.items():
        references = df[phoneme_names[1]].tolist()
        hypotheses = df[phoneme_names[0]].tolist()

        wer_score = wer(references, hypotheses)
        cer_score = cer(references, hypotheses)

        print(f'Speaker: {df_name}')
        print(f'Word Error Rate (WER): {wer_score:.2%}')
        print(f'Character Error Rate (CER): {cer_score:.2%}')
        print()

calculate_error_rates(test_dataframes_word, phoneme_names=['predictions_phoneme', 'references_phoneme'])

Speaker: torgo_xlsr_finetune_F01_test
Word Error Rate (WER): 68.32%
Character Error Rate (CER): 55.19%

Speaker: torgo_xlsr_finetune_F03_test
Word Error Rate (WER): 51.64%
Character Error Rate (CER): 39.15%

Speaker: torgo_xlsr_finetune_F04_test
Word Error Rate (WER): 40.54%
Character Error Rate (CER): 31.38%

Speaker: torgo_xlsr_finetune_M01_test
Word Error Rate (WER): 55.64%
Character Error Rate (CER): 43.25%

Speaker: torgo_xlsr_finetune_M02_test
Word Error Rate (WER): 68.92%
Character Error Rate (CER): 54.78%

Speaker: torgo_xlsr_finetune_M03_test
Word Error Rate (WER): 51.55%
Character Error Rate (CER): 42.79%

Speaker: torgo_xlsr_finetune_M04_test
Word Error Rate (WER): 59.24%
Character Error Rate (CER): 46.85%

Speaker: torgo_xlsr_finetune_M05_test
Word Error Rate (WER): 53.62%
Character Error Rate (CER): 40.55%



In [18]:
def save_dataframes_to_csv(train_dataframes, val_dataframes, test_dataframes, torgo_train_type):
    output_folder = f'data/{torgo_train_type}'

    os.makedirs(output_folder, exist_ok=True)

    for df_name, train_df in train_dataframes.items():
        train_df.to_csv(os.path.join(output_folder, f'{df_name}.csv'), index=False)

    for df_name, val_df in val_dataframes.items():
        val_df.to_csv(os.path.join(output_folder, f'{df_name}.csv'), index=False)

    for df_name, test_df in test_dataframes.items():
        test_df.to_csv(os.path.join(output_folder, f'{df_name}.csv'), index=False)

def convert_csv_to_json(csv_path, json_output_path):
    df = pd.read_csv(csv_path)
    json_data = df.to_json(orient='records')

    with open(json_output_path, 'w') as json_file:
        json_file.write(json_data)

TORGO_TRAIN_TYPE=TorgoTrainType.WORD_NO_KEEP.value
save_dataframes_to_csv(train_dataframes_word, 
                       val_dataframes_word, 
                       test_dataframes_word, 
                       TORGO_TRAIN_TYPE)

In [19]:


csv_folder = f'data/{TORGO_TRAIN_TYPE}'
json_folder = f'data/{TORGO_TRAIN_TYPE}'

os.makedirs(json_folder, exist_ok=True)

for csv_file_name in os.listdir(csv_folder):
    if csv_file_name.endswith('.csv'):
        csv_file_path = os.path.join(csv_folder, csv_file_name)
        json_file_name = csv_file_name.replace('.csv', '.json')
        json_output_file_path = os.path.join(json_folder, json_file_name)

        convert_csv_to_json(csv_file_path, json_output_file_path)

# output Keep ALL phrases

In [21]:
TORGO_TRAIN_TYPE=TorgoTrainType.WORD_KEEP.value
dataframes_word_keep_all = convert_to_phonemes(dataframes_word_keep_all)
train_dataframes_word, val_dataframes_word, test_dataframes_word=split_dataframes(dataframes_word_keep_all)
calculate_error_rates(test_dataframes_word, phoneme_names=['predictions_phoneme', 'references_phoneme'])
save_dataframes_to_csv(train_dataframes_word, 
                       val_dataframes_word, 
                       test_dataframes_word, 
                       TORGO_TRAIN_TYPE)
csv_folder = f'data/{TORGO_TRAIN_TYPE}'
json_folder = f'data/{TORGO_TRAIN_TYPE}'

os.makedirs(json_folder, exist_ok=True)

for csv_file_name in os.listdir(csv_folder):
    if csv_file_name.endswith('.csv'):
        csv_file_path = os.path.join(csv_folder, csv_file_name)
        json_file_name = csv_file_name.replace('.csv', '.json')
        json_output_file_path = os.path.join(json_folder, json_file_name)

        convert_csv_to_json(csv_file_path, json_output_file_path)

torgo_xlsr_finetune_F01_keep_all_train - Data Size: 257
torgo_xlsr_finetune_F03_keep_all_train - Data Size: 251
torgo_xlsr_finetune_F04_keep_all_train - Data Size: 225
torgo_xlsr_finetune_M01_keep_all_train - Data Size: 299
torgo_xlsr_finetune_M02_keep_all_train - Data Size: 420
torgo_xlsr_finetune_M03_keep_all_train - Data Size: 226
torgo_xlsr_finetune_M04__keep_all_train - Data Size: 340
torgo_xlsr_finetune_M05_keep_all_train - Data Size: 348

torgo_xlsr_finetune_F01_keep_all_val - Data Size: 29
torgo_xlsr_finetune_F03_keep_all_val - Data Size: 28
torgo_xlsr_finetune_F04_keep_all_val - Data Size: 26
torgo_xlsr_finetune_M01_keep_all_val - Data Size: 34
torgo_xlsr_finetune_M02_keep_all_val - Data Size: 47
torgo_xlsr_finetune_M03_keep_all_val - Data Size: 26
torgo_xlsr_finetune_M04__keep_all_val - Data Size: 38
torgo_xlsr_finetune_M05_keep_all_val - Data Size: 39

torgo_xlsr_finetune_F01_keep_all_test - Data Size: 32
torgo_xlsr_finetune_F03_keep_all_test - Data Size: 32
torgo_xlsr_finet