In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides pandas SettingWithCopyWarning and
# library deprecation notices raised by later cells — consider narrowing it to
# the specific categories that are known to be noisy.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_metric
import os
import sys
sys.path.append("..")
from src.utils.text_processing import clean_text

In [3]:
# Notebook-wide WER metric object; the helper functions below call its
# `.compute(predictions=..., references=...)` method.
# NOTE(review): `datasets.load_metric` is reported as deprecated in recent
# `datasets` releases in favour of the separate `evaluate` package — confirm
# against the pinned `datasets` version before upgrading.
wer_metric = load_metric("wer")

In [4]:
# Handy functions
def transform_raw_df(df_raw, mapping_df, split_df, metric, columns=None, fmt="intron_whisper"):
    """Join raw model predictions with dataset metadata and score every row.

    Args:
        df_raw: DataFrame holding at least the ``audio_paths`` and
            ``hypothesis`` columns. It is not modified.
        mapping_df: DataFrame joined on ``audio_paths``; it must provide the
            ``idx`` column used by the second join.
        split_df: DataFrame joined on ``["idx", "audio_paths"]``; it supplies
            the metadata columns selected through ``columns``.
        metric: Object exposing ``compute(predictions=[...], references=[...])``;
            it is called once per row.
        columns: Names of the columns to keep after the joins. Must include
            ``"transcript"``, which is copied into ``reference`` and then
            dropped. ``None`` is treated as an empty list.
        fmt: Format tag. Only validated to be non-empty; it is not used
            otherwise.

    Returns:
        A new DataFrame with the requested columns (minus ``transcript``)
        followed by ``reference``, ``prediction`` and the per-row ``wer``.

    Raises:
        ValueError: If ``fmt`` is ``None`` or an empty string.
    """
    # The original check used `or`, which is true for every input.
    if not fmt:
        raise ValueError("fmt must be a non-empty string")
    selected_columns = list(columns) if columns is not None else []

    print(f"df shape: {df_raw.shape}")

    df = df_raw[["audio_paths", "hypothesis"]]
    df = pd.merge(df, mapping_df, on="audio_paths")
    df = pd.merge(df, split_df, on=["idx", "audio_paths"])
    df = df[~df.duplicated(subset="idx")]

    # Work on an explicit copy so the assignments below never write into a
    # view of the merged frame.
    df = df[selected_columns + ["hypothesis"]].copy()
    df["reference"] = df["transcript"]
    df["prediction"] = df["hypothesis"]
    df = df.drop(columns=["hypothesis", "transcript"])

    print(f"df shape (transformed): {df.shape}")
    # Score with the metric that was passed in (not a notebook global). A list
    # comprehension also works for an empty frame, unlike a row-wise apply.
    df["wer"] = [
        metric.compute(predictions=[prediction], references=[reference])
        for prediction, reference in zip(df["prediction"], df["reference"])
    ]
    return df
    


def normalize_and_compute_wer(df, metric):
    """Normalize reference/prediction text and recompute the per-row WER.

    Args:
        df: DataFrame with ``reference`` and ``prediction`` columns. It is
            copied, not modified.
        metric: Object exposing ``compute(predictions=[...], references=[...])``;
            it is called once per row.

    Returns:
        A copy of ``df`` whose ``reference`` and ``prediction`` columns were
        passed through ``clean_text`` and whose ``wer`` column holds the score
        computed on the cleaned text.
    """
    normalized_df = df.copy()
    null_predictions = int(normalized_df["prediction"].isnull().sum())
    print(f"Total null values: {null_predictions}")
    # Missing predictions are scored as empty strings.
    normalized_df["prediction"] = normalized_df["prediction"].fillna("")

    for text_column in ("reference", "prediction"):
        normalized_df[text_column] = normalized_df[text_column].apply(clean_text)

    # Use the metric handed in by the caller rather than a notebook global.
    normalized_df["wer"] = [
        metric.compute(predictions=[prediction], references=[reference])
        for prediction, reference in zip(normalized_df["prediction"], normalized_df["reference"])
    ]
    return normalized_df
    
    
def write_to_folder(model_id_or_path, predictions_df, output_dir="../results/", domain="all", split="test",
                    metric=None):
    """Compute the corpus-level WER of a predictions frame and save it as CSV.

    The file name embeds the split, the model identifier, the WER rounded to
    four decimals and the number of rows.

    Args:
        model_id_or_path: Model identifier inserted into the file name.
        predictions_df: DataFrame with ``prediction`` and ``reference`` columns.
        output_dir: Directory that receives the CSV; created if missing.
        domain: Accepted for interface compatibility; not used in the body.
        split: Split name inserted into the file name.
        metric: Optional object exposing ``compute(predictions=..., references=...)``.
            When omitted, the notebook-level ``wer_metric`` is used.

    Returns:
        None. The CSV is written to disk and its path is printed.
    """
    # Fall back to the notebook-level metric so existing callers keep working.
    scorer = metric if metric is not None else wer_metric
    wer = scorer.compute(predictions=predictions_df.prediction, references=predictions_df.reference)
    print(f"wer: {wer}")

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    file_name = f"intron-open-{split}-{model_id_or_path}-wer-{round(wer, 4)}-{len(predictions_df)}.csv"
    # os.path.join avoids the doubled separator the default "../results/" used to produce.
    output_path = os.path.join(output_dir, file_name)
    print(f"output path: {output_path}")
    print(f"Output shape: {predictions_df.shape}")
    predictions_df.to_csv(output_path, index=False)
    
def consolidate_inference_results(model_csv_path, model_name, metric, domain,
                                  ref_csv_path="../results/intron-open-test-whisper_medium-wer-0.3322-5474.csv",
                                  dataset_csv_path="../data/intron-test-public-6346-clean.csv",
                                  output_dir="../results"):
    """Attach dataset metadata to one model's predictions and save raw/normalized results.

    Steps: read the model predictions, the dataset split CSV and a reference
    results CSV; build an ``idx``/``audio_paths`` mapping from the reference
    rows that exist in the split; score the raw predictions; then score the
    text-normalized predictions. Both frames are written below ``output_dir``
    (``raw/`` and ``normalized/``).

    Args:
        model_csv_path: CSV with the model's ``audio_paths``/``hypothesis`` rows.
        model_name: Key of the internal model-name table; selects the value
            stored in the ``name`` column.
        metric: Metric object forwarded to the scoring helpers.
        domain: Domain label forwarded to ``write_to_folder``.
        ref_csv_path: Reference results CSV, joined with the split on ``audio_paths``.
        dataset_csv_path: Dataset split CSV (needs ``idx``, ``audio_paths``,
            ``split`` and the metadata columns listed in the body).
        output_dir: Root directory for the ``raw`` and ``normalized`` outputs.

    Returns:
        Tuple ``(df, normalized_df)`` with the raw and the normalized results.

    Raises:
        KeyError: If ``model_name`` is not a known model key.
    """
    models_name_mapping = {
        "openai_whisper_medium_all": "openai/whisper-medium-all",
        "openai_whisper_medium_general": "openai/whisper-medium-general",
        "openai_whisper_medium_clinical": "openai/whisper-medium-clinical",
        "facebook_wav2vec2_large_xlsr_53_english_all": "facebook/wav2vec2-large-xlsr-53-english-all",
        "facebook_wav2vec2_large_xlsr_53_english_general": "facebook/wav2vec2-large-xlsr-53-english-general",
        "facebook_wav2vec2_large_xlsr_53_english_clinical": "facebook/wav2vec2-large-xlsr-53-english-clinical"
    }
    # Fail before any file is read or scored instead of after the WER pass.
    if model_name not in models_name_mapping:
        raise KeyError(f"Unknown model_name {model_name!r}; expected one of {sorted(models_name_mapping)}")

    print(f"Input path: {model_csv_path}")
    raw_df = pd.read_csv(model_csv_path)
    print(f"Input shape: {raw_df.shape}")

    split_df = pd.read_csv(dataset_csv_path)
    print(f"Dataset split shape: {split_df.shape}")
    split_df = split_df[~split_df.duplicated(subset="audio_paths")]
    # Explicit copy: the column assignment below must not target a filtered view.
    split_df = split_df[~split_df.duplicated(subset="idx")].copy()
    print(f"Dataset split shape (without duplicates): {split_df.shape}")
    split_name = split_df["split"].unique()[0]

    # The prediction CSVs use the local prefix; the dataset CSV uses the
    # dataset prefix. Paths are rewritten to the local form for the joins and
    # mapped back afterwards.
    dataset_prefix = f"/AfriSpeech-100/{split_name}/"
    local_prefix = "/data/data/intron/"
    split_df["audio_paths"] = split_df["audio_paths"].str.replace(dataset_prefix, local_prefix, regex=False)

    ref_df = pd.read_csv(ref_csv_path)
    print(f"Reference csv shape: {ref_df.shape}")
    ref_df = pd.merge(ref_df, split_df, on="audio_paths")
    # Actually drop repeated audio paths; previously the filtered frame was
    # computed and discarded, so the shape printed next was misleading.
    ref_df = ref_df[~ref_df.duplicated(subset="audio_paths")]
    print(f"Reference csv shape (without duplicates): {ref_df.shape}")

    mapping_df = ref_df[["idx", "audio_paths"]]
    columns = ["idx", "domain", "gender", "duration",
               "age_group", "accent", "user_ids",
               "transcript", "audio_paths", "origin",
               "country"]

    # create output directory if not exist
    os.makedirs(f"{output_dir}/raw", exist_ok=True)
    os.makedirs(f"{output_dir}/normalized", exist_ok=True)

    # transform the raw dataframe
    df = transform_raw_df(raw_df, split_df=split_df, mapping_df=mapping_df, columns=columns, metric=metric)
    df["name"] = models_name_mapping[model_name]
    df["split"] = split_name
    df["audio_paths"] = df["audio_paths"].str.replace(local_prefix, dataset_prefix, regex=False)

    print("***raw")
    # write the result to folder
    write_to_folder(model_id_or_path=model_name,
                    predictions_df=df,
                    domain=domain,
                    output_dir=f"{output_dir}/raw",
                    split=split_name)

    # normalize the raw dataframe; the helper works on a copy of `df`, so the
    # name, split and rewritten audio paths set above are already present.
    normalized_df = normalize_and_compute_wer(df, metric=metric)

    # write the result to folder
    print("***normalized")
    write_to_folder(model_id_or_path=model_name,
                    predictions_df=normalized_df,
                    domain=domain,
                    output_dir=f"{output_dir}/normalized",
                    split=split_name)

    return df, normalized_df
    

# Test Dataset

In [5]:
# Test split: model key -> [predictions CSV path, training domain].
test_model_name_csv_path_dict = {
    model_key: [csv_path, domain]
    for model_key, csv_path, domain in (
        ("openai_whisper_medium_all",
         "../results/intron-open-test-whisper_all-wer-0.2161-5474.csv", "all"),
        ("openai_whisper_medium_general",
         "../results/intron-open-test-whisper_general-wer-0.3508-5474.csv", "general"),
        ("openai_whisper_medium_clinical",
         "../results/intron-open-test-whisper_clinical-wer-0.3678-5474.csv", "clinical"),
        ("facebook_wav2vec2_large_xlsr_53_english_all",
         "../results/intron-open-test-wav2vec2-large-xlsr-53-all-wer-0.2931-5474.csv", "all"),
        ("facebook_wav2vec2_large_xlsr_53_english_general",
         "../results/intron-open-test-wav2vec2-large-xlsr-53-general-wer-0.3487-5474.csv", "general"),
        ("facebook_wav2vec2_large_xlsr_53_english_clinical",
         "../results/intron-open-test-wav2vec2-large-xlsr-53-clinical-wer-0.3675-5474.csv", "clinical"),
    )
}

In [6]:
# Consolidate every fine-tuned model on the test split. The per-model frames
# are collected in lists and concatenated once, instead of re-copying a
# growing DataFrame (seeded from an empty frame) on every iteration.
test_raw_frames = []
test_normalized_frames = []

for model_name, model_csv_path_and_domain in test_model_name_csv_path_dict.items():
    model_csv_path, domain = model_csv_path_and_domain
    print(f"Model name: {model_name}")
    df, normalized_df = consolidate_inference_results(
        model_csv_path=model_csv_path,
        model_name=model_name,
        metric=wer_metric,
        domain=domain,
        ref_csv_path="../results/intron-open-test-whisper_medium-wer-0.3322-5474.csv",
        dataset_csv_path="../data/intron-test-public-6346-clean.csv",
        output_dir="../results",
    )

    test_raw_frames.append(df)
    test_normalized_frames.append(normalized_df)
    print("="*10)

# pd.concat rejects an empty list, so keep the empty-frame result for that case.
test_combined_df = pd.concat(test_raw_frames) if test_raw_frames else pd.DataFrame()
test_combined_normalized_df = (
    pd.concat(test_normalized_frames) if test_normalized_frames else pd.DataFrame()
)

print(f"Combined df shape: {test_combined_df.shape}")
print(f"Combined df (normalized) shape: {test_combined_normalized_df.shape}")

test_combined_df.to_csv("../results/raw/intron-open-test-all_models_finetuned.csv", index=False)
test_combined_normalized_df.to_csv("../results/normalized/intron-open-test-all_models_finetuned.csv", index=False)

Model name: openai_whisper_medium_all
Input path: ../results/intron-open-test-whisper_all-wer-0.2161-5474.csv
Input shape: (5474, 8)
Dataset split shape: (6346, 14)
Dataset split shape (without duplicates): (6316, 14)
Reference csv shape: (5474, 8)
Reference csv shape (without duplicates): (5471, 21)
df shape: (5474, 8)
df shape (transformed): (5471, 12)
***raw
wer: 0.3420153211549794
output path: ../results/raw/intron-open-test-openai_whisper_medium_all-wer-0.342-5471.csv
Output shape: (5471, 15)
Total null values: 0
***normalized
wer: 0.2161564046827122
output path: ../results/normalized/intron-open-test-openai_whisper_medium_all-wer-0.2162-5471.csv
Output shape: (5471, 15)
Model name: openai_whisper_medium_general
Input path: ../results/intron-open-test-whisper_general-wer-0.3508-5474.csv
Input shape: (5474, 8)
Dataset split shape: (6346, 14)
Dataset split shape (without duplicates): (6316, 14)
Reference csv shape: (5474, 8)
Reference csv shape (without duplicates): (5471, 21)
df sh

In [7]:
# Preview the first three rows of the combined raw (un-normalized) test results.
test_combined_df.head(3)

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,audio_paths,origin,country,reference,prediction,wer,name,split
0,149716,clinical,Female,10.776984,26-40,twi,5ab7b49ad1cab6392a764a69578dc822,/AfriSpeech-100/test/e696aff8-ce05-4c8e-a3b3-5...,african,GH,Proteins break down to release amino acids whi...,proteins breakdown to release amino acids whic...,0.391304,openai/whisper-medium-all,test
1,360395,clinical,Female,12.39,19-25,igbo,543c037ff44816e8b5ccd0d6cc92fe13,/AfriSpeech-100/test/f28baac3-cdcd-45a2-888c-a...,nigerian,NG,To grant such a patent license to a party mean...,togrant such a patent license to a party means...,0.16,openai/whisper-medium-all,test
2,153514,clinical,Female,5.826984,26-40,isizulu,e87486db0c365bded42184d56b46a2a7,/AfriSpeech-100/test/13062a1b-662b-4afa-85b4-9...,african,ZA,Flatus indicates return of peristalsis.,flatus indicates return ofperistalsis.,0.6,openai/whisper-medium-all,test


# Dev Dataset

In [8]:
# Dev split: model key -> [predictions CSV path, training domain].
dev_model_name_csv_path_dict = {
    model_key: [csv_path, domain]
    for model_key, csv_path, domain in (
        ("openai_whisper_medium_all",
         "../results/intron-open-dev-whisper_all-wer-0.2272-2883.csv", "all"),
        ("openai_whisper_medium_general",
         "../results/intron-open-dev-whisper_general-wer-0.3469-2883.csv", "general"),
        ("openai_whisper_medium_clinical",
         "../results/intron-open-dev-whisper_clinical-wer-0.3763-2883.csv", "clinical"),
        ("facebook_wav2vec2_large_xlsr_53_english_all",
         "../results/intron-open-dev-wav2vec2-large-xlsr-53-all-wer-0.3017-2883.csv", "all"),
        ("facebook_wav2vec2_large_xlsr_53_english_general",
         "../results/intron-open-dev-wav2vec2-large-xlsr-53-general-wer-0.3468-2883.csv", "general"),
        ("facebook_wav2vec2_large_xlsr_53_english_clinical",
         "../results/intron-open-dev-wav2vec2-large-xlsr-53-clinical-wer-0.3739-2883.csv", "clinical"),
    )
}

In [9]:
# Consolidate every fine-tuned model on the dev split. The per-model frames
# are collected in lists and concatenated once, instead of re-copying a
# growing DataFrame (seeded from an empty frame) on every iteration.
dev_raw_frames = []
dev_normalized_frames = []

for model_name, model_csv_path_and_domain in dev_model_name_csv_path_dict.items():
    model_csv_path, domain = model_csv_path_and_domain
    print(f"Model name: {model_name}")
    df, normalized_df = consolidate_inference_results(
        model_csv_path=model_csv_path,
        model_name=model_name,
        metric=wer_metric,
        domain=domain,
        ref_csv_path="../results/intron-open-dev-whisper_medium-wer-0.3049-2872.csv",
        dataset_csv_path="../data/intron-dev-public-3231-clean.csv",
        output_dir="../results",
    )

    dev_raw_frames.append(df)
    dev_normalized_frames.append(normalized_df)
    print("="*10)

# pd.concat rejects an empty list, so keep the empty-frame result for that case.
dev_combined_df = pd.concat(dev_raw_frames) if dev_raw_frames else pd.DataFrame()
dev_combined_normalized_df = (
    pd.concat(dev_normalized_frames) if dev_normalized_frames else pd.DataFrame()
)

print(f"Combined df shape: {dev_combined_df.shape}")
print(f"Combined df (normalized) shape: {dev_combined_normalized_df.shape}")

dev_combined_df.to_csv("../results/raw/intron-open-dev-all_models_finetuned.csv", index=False)
dev_combined_normalized_df.to_csv("../results/normalized/intron-open-dev-all_models_finetuned.csv", index=False)

Model name: openai_whisper_medium_all
Input path: ../results/intron-open-dev-whisper_all-wer-0.2272-2883.csv
Input shape: (2883, 8)
Dataset split shape: (3231, 14)
Dataset split shape (without duplicates): (3227, 14)
Reference csv shape: (2872, 8)
Reference csv shape (without duplicates): (2872, 21)
df shape: (2883, 8)
df shape (transformed): (2872, 12)
***raw
wer: 0.3351252376816357
output path: ../results/raw/intron-open-dev-openai_whisper_medium_all-wer-0.3351-2872.csv
Output shape: (2872, 15)
Total null values: 0
***normalized
wer: 0.226967505634861
output path: ../results/normalized/intron-open-dev-openai_whisper_medium_all-wer-0.227-2872.csv
Output shape: (2872, 15)
Model name: openai_whisper_medium_general
Input path: ../results/intron-open-dev-whisper_general-wer-0.3469-2883.csv
Input shape: (2883, 8)
Dataset split shape: (3231, 14)
Dataset split shape (without duplicates): (3227, 14)
Reference csv shape: (2872, 8)
Reference csv shape (without duplicates): (2872, 21)
df shape: 