In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) that the
# cells below may raise -- consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_metric
import os
import sys
sys.path.append("..")
from src.utils.text_processing import clean_text

In [3]:
# Load the word-error-rate metric once; it is shared by the helper functions and the
# driver cells further down in this notebook.
wer_metric = load_metric("wer")

In [4]:
# Peek at the first two rows of the test-split metadata CSV to see which columns it has.
pd.read_csv("../data/intron-test-public-6346-clean.csv").head(2)

Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,gender
0,149716,5ab7b49ad1cab6392a764a69578dc822,twi,26-40,GH,Proteins break down to release amino acids whi...,136,3a21d00eec39a31f089b9b4f0f8afa21,/AfriSpeech-100/test/e696aff8-ce05-4c8e-a3b3-5...,10.776984,african,clinical,test,Female
1,360395,543c037ff44816e8b5ccd0d6cc92fe13,igbo,19-25,NG,To grant such a patent license to a party mean...,128,544fbff921d13b224adbbb0f637196ed,/AfriSpeech-100/test/f28baac3-cdcd-45a2-888c-a...,12.39,nigerian,clinical,test,Female


In [5]:
# Handy functions
def transform_raw_df(df_raw, mapping_df, split_df, metric, columns=None, fmt="intron_whisper"):
    """Join raw model predictions with dataset metadata and score every row.

    The raw predictions are merged with ``mapping_df`` (which links ``idx`` and
    ``audio_paths``) and then with ``split_df`` on ``["idx", "audio_paths"]``.
    The result is reduced to ``columns`` plus two normalised text columns,
    ``reference`` (copied from ``transcript``) and ``prediction`` (copied from
    the format-specific prediction column), and a per-row ``wer`` column.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw predictions; the required columns depend on ``fmt`` (see below).
    mapping_df : pandas.DataFrame
        Must contain the join key of the chosen format plus ``idx`` and
        ``audio_paths``.
    split_df : pandas.DataFrame
        Dataset metadata containing ``idx``, ``audio_paths`` and every name
        listed in ``columns``.
    metric : object
        Anything exposing ``compute(predictions=[...], references=[...])``.
    columns : list, optional
        Metadata columns to keep. Must include ``"transcript"``. The list is
        never mutated.
    fmt : str
        ``"intron_whisper"`` (``audio_paths`` + ``hypothesis``),
        ``"aws"`` (``idx`` + ``predictions``),
        ``"azure"`` (``idx`` + ``predictions_raw``),
        ``"african_nlp"`` (header-less frame: column ``0`` is the audio path,
        column ``1`` the prediction), or the name of any column of ``df_raw``
        that holds the predictions (joined on ``idx``).

    Returns
    -------
    pandas.DataFrame
        ``columns`` without ``transcript``, followed by ``reference``,
        ``prediction`` and ``wer``.

    Raises
    ------
    ValueError
        If ``fmt`` is ``None`` or empty.
    NotImplementedError
        If ``fmt`` is neither a known format nor a column of ``df_raw``.
    """
    # The original `assert fmt != "" or fmt is not None` could never fail.
    if fmt is None or fmt == "":
        raise ValueError("fmt must be a non-empty format name or prediction column")

    # Avoid the shared-mutable-default pitfall and never touch the caller's list.
    columns = list(columns) if columns is not None else []

    df = df_raw.copy()
    print(f"df shape: {df.shape}")

    # fmt -> (join key shared with mapping_df, prediction column, drop duplicate idx?)
    known_formats = {
        "intron_whisper": ("audio_paths", "hypothesis", True),
        "aws": ("idx", "predictions", True),
        "azure": ("idx", "predictions_raw", True),
        # NOTE(review): the original code did not de-duplicate on idx for this
        # format; that behaviour is preserved here -- confirm it is intentional.
        "african_nlp": ("audio_paths", 1, False),
    }

    if fmt in known_formats:
        join_key, pred_col, dedupe = known_formats[fmt]
    elif fmt in df.columns:
        # Generic case: `fmt` names the prediction column directly.
        join_key, pred_col, dedupe = "idx", fmt, True
    else:
        raise NotImplementedError(f"Unsupported prediction format: {fmt!r}")

    if fmt == "african_nlp":
        # Rewrite the cluster-specific prefix so paths match mapping_df / split_df.
        df["audio_paths"] = df[0].apply(
            lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/")
        )

    df = df[[join_key, pred_col]]
    df = pd.merge(df, mapping_df, on=join_key)
    df = pd.merge(df, split_df, on=["idx", "audio_paths"])
    if dedupe:
        df = df[~df.duplicated(subset="idx")]

    df = df[columns + [pred_col]]
    # `assign` returns a fresh frame, so no chained-assignment on a slice.
    df = df.assign(reference=df["transcript"], prediction=df[pred_col])
    # Keep the output columns even if the source column was already called that.
    obsolete = [c for c in (pred_col, "transcript") if c not in ("reference", "prediction")]
    df = df.drop(columns=obsolete)

    print(f"df shape (transformed): {df.shape}")
    # Use the metric that was passed in (the original silently used a global) and
    # build the column from a list so an empty frame does not break the assignment.
    df["wer"] = [
        metric.compute(predictions=[pred], references=[ref])
        for pred, ref in zip(df["prediction"], df["reference"])
    ]
    return df
    


def normalize_and_compute_wer(df, metric):
    """Return a copy of ``df`` with cleaned text and a recomputed per-row WER.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prediction`` and ``reference`` columns.
    metric : object
        Anything exposing ``compute(predictions=[...], references=[...])``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where missing predictions are replaced by ``""``, both
        text columns have been passed through ``clean_text`` and ``wer`` holds
        the score of each cleaned prediction/reference pair. The input frame
        is left untouched.
    """
    normalized_df = df.copy()
    print(f"Total null values: {int(normalized_df['prediction'].isnull().sum())}")

    # Missing predictions are treated as empty transcriptions before cleaning.
    normalized_df["prediction"] = normalized_df["prediction"].fillna("").apply(clean_text)
    normalized_df["reference"] = normalized_df["reference"].apply(clean_text)

    # Score with the metric that was passed in (the original ignored the argument
    # and used a module-level global); a list keeps this safe for empty frames.
    normalized_df["wer"] = [
        metric.compute(predictions=[pred], references=[ref])
        for pred, ref in zip(normalized_df["prediction"], normalized_df["reference"])
    ]
    return normalized_df
    
    
def write_to_folder(model_id_or_path, predictions_df, output_dir="../results/", split="test", metric=None):
    """Compute the corpus-level WER of ``predictions_df`` and save it as CSV.

    The file is named
    ``intron-open-{split}-{model_id_or_path}-wer-{wer rounded to 4 dp}-{row count}.csv``
    and written inside ``output_dir`` (created if it does not exist yet).

    Parameters
    ----------
    model_id_or_path : str
        Identifier embedded in the file name.
    predictions_df : pandas.DataFrame
        Must contain ``prediction`` and ``reference`` columns.
    output_dir : str
        Destination directory.
    split : str
        Dataset split name embedded in the file name.
    metric : object, optional
        Anything exposing ``compute(predictions=..., references=...)``. When
        omitted, the notebook-level ``wer_metric`` is used, as before.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    scorer = wer_metric if metric is None else metric
    wer = scorer.compute(predictions=predictions_df.prediction, references=predictions_df.reference)
    print(f"wer: {wer}")

    # Make sure the target exists so the write cannot fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    file_name = f"intron-open-{split}-{model_id_or_path}-wer-{round(wer, 4)}-{len(predictions_df)}.csv"
    # os.path.join avoids the "//" the default output_dir ("../results/") used to produce.
    output_path = os.path.join(output_dir, file_name)

    print(f"output path: {output_path}")
    print(f"Output shape: {predictions_df.shape}")
    predictions_df.to_csv(output_path, index=False)
    return output_path
    
def consolidate_zero_shot_results(model_csv_path, model_name, metric,
                                  ref_csv_path="../results/intron-open-test-whisper_medium-wer-0.3322-5474.csv",
                                  dataset_csv_path="../data/intron-test-public-6346-clean.csv",
                                  output_dir="../results"):
    """Align one model's predictions with the dataset split and export raw/normalized results.

    Steps:

    1. Load the split metadata, drop rows with duplicated ``audio_paths`` or
       ``idx`` and rewrite the ``/AfriSpeech-100/{split}/`` path prefix to
       ``/data/data/intron/`` so it can be joined with the prediction files.
    2. Load the model predictions and a reference predictions CSV; the latter
       is only used to build the ``idx`` <-> ``audio_paths`` mapping.
    3. Pick the raw-file format from ``model_name`` and call
       ``transform_raw_df``.
    4. Write the raw frame to ``{output_dir}/raw`` and the text-normalized
       frame to ``{output_dir}/normalized`` via ``write_to_folder``.

    Parameters
    ----------
    model_csv_path : str
        Path of the model's predictions file.
    model_name : str
        Model key. Unknown keys are treated as the name of the prediction
        column in the predictions file and are used verbatim as display name.
    metric : object
        Metric forwarded to ``transform_raw_df`` and ``normalize_and_compute_wer``.
    ref_csv_path : str
        CSV with an ``audio_paths`` column used to derive the mapping.
    dataset_csv_path : str
        Split metadata CSV (needs at least ``idx``, ``audio_paths``, ``split``
        and the metadata columns kept in the output).
    output_dir : str
        Root directory under which ``raw/`` and ``normalized/`` are created.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, normalized_df)`` -- the raw and the normalized result frames.

    Raises
    ------
    ValueError
        If the split metadata is empty after de-duplication.
    """
    local_prefix = "/data/data/intron/"

    # Models whose *dev* predictions come as header-less, tab-separated files.
    african_nlp_models = (
        "nvidia_nemo_conformer_ctc_large",
        "nvidia_nemo_conformer_transducer_large",
        "speechbrain_crdnn_rnnlm_librispeech",
    )
    # Models whose predictions use the audio_paths/hypothesis layout.
    intron_whisper_models = (
        "openai_whisper_small",
        "openai_whisper_small_en",
        "openai_whisper_medium",
        "openai_whisper_medium_en",
        "openai_whisper_large",
        "facebook_hubert_large_ls960_ft",
        "facebook_hubert_xlarge_ls960_ft",
        "facebook_wav2vec2_large_robust_ft_swbd_300h",
        "facebook_wav2vec2_large_xlsr_53_english",
        "facebook_wav2vec2_large_960h_lv60_self",
        "facebook_wav2vec2_large_960h",
        "facebook_wav2vec2_xls_r_1b_english",
        "nvidia_nemo_conformer_ctc_large",
        "nvidia_nemo_conformer_transducer_large",
        "microsoft_wavlm_libri_clean_100h_base",
        "microsoft_wavlm_libri_clean_100h_large",
        "speechbrain_crdnn_rnnlm_librispeech",
    )
    # Cloud APIs whose predictions use the idx/predictions layout.
    cloud_api_models = (
        "amazon_aws_transcribe_api",
        "amazon_aws_transcribe_medical_api_primary_care",
        "google_gcp_speech_api",
        "google_gcp_medical_speech_api",
    )
    # Model key -> display name stored in the "name" column.
    models_name_mapping = {
        "openai_whisper_small": "openai/whisper-small",
        "openai_whisper_small_en": "openai/whisper-small-en",
        "openai_whisper_medium": "openai/whisper-medium",
        "openai_whisper_medium_en": "openai/whisper-medium-en",
        "openai_whisper_large": "openai/whisper-large",
        "facebook_hubert_large_ls960_ft": "facebook/hubert-large-ls960-ft",
        "facebook_hubert_xlarge_ls960_ft": "facebook/hubert-xlarge-ls960-ft",
        "facebook_wav2vec2_large_robust_ft_swbd_300h": "facebook/wav2vec2-large-robust-ft-swbd-300h",
        "facebook_wav2vec2_large_xlsr_53_english": "facebook/wav2vec2-large-xlsr-53-english",
        "facebook_wav2vec2_large_960h_lv60_self": "facebook/wav2vec2-large-960h-lv60-self",
        "facebook_wav2vec2_large_960h": "facebook/wav2vec2-large-960h",
        "facebook_wav2vec2_xls_r_1b_english": "facebook/wav2vec2-xls-r-1b-english",
        "nvidia_nemo_conformer_ctc_large": "nvidia/nemo-conformer-ctc-large",
        "nvidia_nemo_conformer_transducer_large": "nvidia/nemo-conformer-transducer-large",
        "microsoft_wavlm_libri_clean_100h_base": "microsoft/wavlm-libri-clean-100h-base",
        "microsoft_wavlm_libri_clean_100h_large": "microsoft/wavlm-libri-clean-100h-large",
        "microsoft_azure_speech_api": "Azure",
        "google_gcp_speech_api": "GCP",
        "google_gcp_medical_speech_api": "GCP [Medical]",
        "amazon_aws_transcribe_api": "AWS",
        "amazon_aws_transcribe_medical_api_primary_care": "AWS [Medical] (Primary Care)",
        "speechbrain_crdnn_rnnlm_librispeech": "speechbrain/crdnn-rnnlm-librispeech",
    }
    columns = ["idx", "domain", "gender", "duration",
               "age_group", "accent", "user_ids",
               "transcript", "audio_paths", "origin",
               "country"]

    # --- dataset split metadata -------------------------------------------------
    split_df = pd.read_csv(dataset_csv_path)
    print(f"Dataset split shape: {split_df.shape}")
    split_df = split_df[~split_df.duplicated(subset="audio_paths")]
    # .copy() so the column assignment below works on an owned frame, not a slice.
    split_df = split_df[~split_df.duplicated(subset="idx")].copy()
    print(f"Dataset split shape (without duplicates): {split_df.shape}")
    if split_df.empty:
        raise ValueError(f"No rows found in dataset split file: {dataset_csv_path}")

    # The file is expected to hold a single split; its first value names it.
    split_name = split_df["split"].unique()[0]
    release_prefix = f"/AfriSpeech-100/{split_name}/"
    split_df["audio_paths"] = split_df["audio_paths"].apply(
        lambda path: path.replace(release_prefix, local_prefix)
    )

    # --- model predictions ------------------------------------------------------
    print(f"Input path: {model_csv_path}")
    is_african_nlp = model_name in african_nlp_models and split_name == "dev"
    if is_african_nlp:
        # Explicit escape instead of a literal (invisible) tab character.
        raw_df = pd.read_csv(model_csv_path, header=None, delimiter="\t")
    else:
        raw_df = pd.read_csv(model_csv_path)
    print(f"Input shape: {raw_df.shape}")

    # --- idx <-> audio_paths mapping derived from the reference predictions ------
    ref_df = pd.read_csv(ref_csv_path)
    print(f"Reference csv shape: {ref_df.shape}")
    ref_df = pd.merge(ref_df, split_df, on="audio_paths")
    # The original evaluated the duplicate filter but discarded the result, so the
    # "without duplicates" log line below did not reflect any de-duplication.
    ref_df = ref_df[~ref_df.duplicated(subset="audio_paths")]
    print(f"Reference csv shape (without duplicates): {ref_df.shape}")
    mapping_df = ref_df[["idx", "audio_paths"]]

    # create output directory if not exist
    os.makedirs(f"{output_dir}/raw", exist_ok=True)
    os.makedirs(f"{output_dir}/normalized", exist_ok=True)

    # --- choose the raw-file format ---------------------------------------------
    if is_african_nlp:
        fmt = "african_nlp"
    elif model_name in intron_whisper_models:
        fmt = "intron_whisper"
    elif model_name in cloud_api_models:
        fmt = "aws"
    elif model_name == "microsoft_azure_speech_api":
        fmt = "azure"
    else:
        # Unknown model: treat its key as the prediction column name.
        fmt = model_name

    # transform the raw dataframe
    df = transform_raw_df(raw_df, split_df=split_df, mapping_df=mapping_df,
                          columns=columns, fmt=fmt, metric=metric)
    # .get() keeps the fall-through branch above usable instead of raising KeyError.
    df["name"] = models_name_mapping.get(model_name, model_name)
    df["split"] = split_name
    # Restore the public release prefix in the exported paths.
    df["audio_paths"] = df["audio_paths"].apply(
        lambda path: path.replace(local_prefix, release_prefix)
    )

    print("***raw")
    # write the result to folder
    write_to_folder(model_id_or_path=model_name,
                    predictions_df=df,
                    output_dir=f"{output_dir}/raw",
                    split=split_name)

    # normalize the raw dataframe; it is a copy of `df`, so it already carries the
    # name/split columns and the restored audio paths set above.
    normalized_df = normalize_and_compute_wer(df, metric=metric)

    # write the result to folder
    print("***normalized")
    write_to_folder(model_id_or_path=model_name,
                    predictions_df=normalized_df,
                    output_dir=f"{output_dir}/normalized",
                    split=split_name)

    return df, normalized_df
    

# Test Dataset

In [6]:
# Model key -> path of that model's predictions CSV for the test split.
# The keys are the model names passed to consolidate_zero_shot_results() in the next cell.
# NOTE(review): some keys use a "facebook_"/"microsoft_" prefix while the file names
# mention jonatasgrosman / patrickvonplaten checkpoints -- confirm the labels are intended.
test_model_name_csv_path_dict = {
    "openai_whisper_small": "../results/intron-open-test-whisper_small-wer-0.3907-5474.csv",
    "openai_whisper_small_en": "../results/intron-open-test-whisper_small.en-wer-0.4142-5474.csv",
    "openai_whisper_medium": "../results/intron-open-test-whisper_medium-wer-0.3322-5474.csv",
    "openai_whisper_medium_en": "../results/intron-open-test-whisper_medium.en-wer-0.3577-5474.csv",
    "openai_whisper_large": "../results/intron-open-test-whisper_large-wer-0.3057-5474.csv",
    "facebook_hubert_large_ls960_ft": "../results/intron-open-test-facebook-hubert-large-ls960-ft-wer-0.633-5474.csv",
    "facebook_hubert_xlarge_ls960_ft": "../results/intron-open-test-facebook-hubert-xlarge-ls960-ft-wer-0.6409-5474.csv",
    "facebook_wav2vec2_large_robust_ft_swbd_300h": "../results/intron-open-test-facebook-wav2vec2-large-robust-ft-swbd-300h-wer-0.8169-5474.csv",
    "facebook_wav2vec2_large_xlsr_53_english": "../results/intron-open-test-jonatasgrosman-wav2vec2-large-xlsr-53-english-wer-0.576-5474.csv",
    "facebook_wav2vec2_large_960h_lv60_self": "../results/intron-open-test-facebook-wav2vec2-large-960h-lv60-self-wer-0.6111-5474.csv",
    "facebook_wav2vec2_large_960h": "../results/intron-open-test-facebook-wav2vec2-large-960h-wer-0.7169-5474.csv",
    "facebook_wav2vec2_xls_r_1b_english": "../results/intron-open-test-jonatasgrosman-wav2vec2-xls-r-1b-english-wer-0.5935-5474.csv",
    "microsoft_wavlm_libri_clean_100h_base": "../results/intron-open-test-patrickvonplaten-wavlm-libri-clean-100h-base-plus-wer-0.8593-5474.csv",
    "microsoft_wavlm_libri_clean_100h_large": "../results/intron-open-test-patrickvonplaten-wavlm-libri-clean-100h-large-wer-0.7051-5474.csv",
    "microsoft_azure_speech_api": "../results/intron-open-test-azure-transcribe-wer-0.4437-5498.csv",
    "google_gcp_speech_api": "../results/intron-open-test-gcp-transcribe-wer-0.6357-5498.csv",
    "google_gcp_medical_speech_api": "../results/intron-open-test-gcp-transcribe-medical-wer-0.625-5498.csv",
    "amazon_aws_transcribe_api": "../results/intron-open-test-aws-transcribe-wer-0.5417-5498.csv",
    "amazon_aws_transcribe_medical_api_primary_care": "../results/intron-open-test-aws-transcribe-medical-wer-0.5682-5498.csv",

    # Disabled entries: no test-split prediction file is listed for these models.
#     "speechbrain_crdnn_rnnlm_librispeech": "",  
#     "nvidia_nemo_conformer_ctc_large": "",
#     "nvidia_nemo_conformer_transducer_large": "",
}

In [7]:
# Consolidate every test-split model, collecting the per-model frames and concatenating
# once at the end (growing a DataFrame with pd.concat inside the loop is quadratic and
# starts from an empty frame, which can upset dtypes).
test_raw_frames = []
test_normalized_frames = []

for model_name, model_csv_path in test_model_name_csv_path_dict.items():
    print(f"Model name: {model_name}")
    df, normalized_df = consolidate_zero_shot_results(
        model_csv_path=model_csv_path,
        model_name=model_name,
        metric=wer_metric,
        ref_csv_path="../results/intron-open-test-whisper_medium-wer-0.3322-5474.csv",
        dataset_csv_path="../data/intron-test-public-6346-clean.csv",
        output_dir="../results",
    )

    test_raw_frames.append(df)
    test_normalized_frames.append(normalized_df)
    print("="*10)

# pd.concat([]) raises, so fall back to an empty frame when no model was processed.
test_combined_df = pd.concat(test_raw_frames) if test_raw_frames else pd.DataFrame()
test_combined_normalized_df = (
    pd.concat(test_normalized_frames) if test_normalized_frames else pd.DataFrame()
)

print(f"Combined df shape: {test_combined_df.shape}")
print(f"Combined df (normalized) shape: {test_combined_normalized_df.shape}")

# Persist the combined per-utterance results for all models.
test_combined_df.to_csv("../results/raw/intron-open-test-all_models.csv", index=False)
test_combined_normalized_df.to_csv("../results/normalized/intron-open-test-all_models.csv", index=False)

Model name: openai_whisper_small
Dataset split shape: (6346, 14)
Dataset split shape (without duplicates): (6316, 14)
Input path: ../results/intron-open-test-whisper_small-wer-0.3907-5474.csv
Input shape: (5474, 8)
Reference csv shape: (5474, 8)
Reference csv shape (without duplicates): (5471, 21)
df shape: (5474, 8)
df shape (transformed): (5471, 12)
***raw
wer: 0.4812937864204806
output path: ../results/raw/intron-open-test-openai_whisper_small-wer-0.4813-5471.csv
Output shape: (5471, 15)
Total null values: 0
***normalized
wer: 0.39069743079380875
output path: ../results/normalized/intron-open-test-openai_whisper_small-wer-0.3907-5471.csv
Output shape: (5471, 15)
Model name: openai_whisper_small_en
Dataset split shape: (6346, 14)
Dataset split shape (without duplicates): (6316, 14)
Input path: ../results/intron-open-test-whisper_small.en-wer-0.4142-5474.csv
Input shape: (5474, 8)
Reference csv shape: (5474, 8)
Reference csv shape (without duplicates): (5471, 21)
df shape: (5474, 8)
d

***raw
wer: 0.6787402605905847
output path: ../results/raw/intron-open-test-facebook_wav2vec2_xls_r_1b_english-wer-0.6787-5471.csv
Output shape: (5471, 15)
Total null values: 1
***normalized
wer: 0.5935363905402928
output path: ../results/normalized/intron-open-test-facebook_wav2vec2_xls_r_1b_english-wer-0.5935-5471.csv
Output shape: (5471, 15)
Model name: microsoft_wavlm_libri_clean_100h_base
Dataset split shape: (6346, 14)
Dataset split shape (without duplicates): (6316, 14)
Input path: ../results/intron-open-test-patrickvonplaten-wavlm-libri-clean-100h-base-plus-wer-0.8593-5474.csv
Input shape: (5474, 8)
Reference csv shape: (5474, 8)
Reference csv shape (without duplicates): (5471, 21)
df shape: (5474, 8)
df shape (transformed): (5471, 12)
***raw
wer: 0.8845151574674261
output path: ../results/raw/intron-open-test-microsoft_wavlm_libri_clean_100h_base-wer-0.8845-5471.csv
Output shape: (5471, 15)
Total null values: 74
***normalized
wer: 0.8593353062881387
output path: ../results/nor

In [8]:
# Show the first rows of the combined raw (un-normalized) test results as a sanity check.
test_combined_df.head()

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,audio_paths,origin,country,reference,prediction,wer,name,split
0,149716,clinical,Female,10.776984,26-40,twi,5ab7b49ad1cab6392a764a69578dc822,/AfriSpeech-100/test/e696aff8-ce05-4c8e-a3b3-5...,african,GH,Proteins break down to release amino acids whi...,Proteins break down to release amino acids whi...,0.26087,openai/whisper-small,test
1,360395,clinical,Female,12.39,19-25,igbo,543c037ff44816e8b5ccd0d6cc92fe13,/AfriSpeech-100/test/f28baac3-cdcd-45a2-888c-a...,nigerian,NG,To grant such a patent license to a party mean...,To grant such a patent license to a party mean...,0.12,openai/whisper-small,test
2,153514,clinical,Female,5.826984,26-40,isizulu,e87486db0c365bded42184d56b46a2a7,/AfriSpeech-100/test/13062a1b-662b-4afa-85b4-9...,african,ZA,Flatus indicates return of peristalsis.,Fletters indicates return of peristalsis.,0.2,openai/whisper-small,test
3,129184,clinical,Female,12.985986,26-40,luganda,9d8db954e680843a47c3b7e224f12371,/AfriSpeech-100/test/7ce32977-b330-43c0-bae0-7...,african,UG,Since the degree of effect produced by a drug ...,Since the degree of effect produced by the dru...,0.142857,openai/whisper-small,test
4,155127,clinical,Female,9.73,26-40,setswana,cdf91cf6e59ee411b985a40a955d4d1f,/AfriSpeech-100/test/27a83595-3d3f-4a6b-b909-7...,african,BW,Protection of the host immune mechanism mighti...,Protection of the host immune mechanism might ...,0.466667,openai/whisper-small,test


# Dev Dataset

In [9]:
# Model key -> path of that model's predictions file for the dev split.
# The keys are the model names passed to consolidate_zero_shot_results() in the next cell.
# The last three entries point at extension-less files; for the dev split
# consolidate_zero_shot_results() reads those as header-less delimited text.
dev_model_name_csv_path_dict = {
    "openai_whisper_small": "../results/intron-open-dev-whisper_small-wer-0.3427-2883.csv",
    "openai_whisper_small_en": "../results/intron-open-dev-whisper_small.en-wer-0.3521-2883.csv",
    "openai_whisper_medium": "../results/intron-open-dev-whisper_medium-wer-0.2735-2883.csv",
    "openai_whisper_medium_en": "../results/intron-open-dev-whisper_medium.en-wer-0.2911-2883.csv",
    "openai_whisper_large": "../results/intron-open-dev-whisper_large-wer-0.2617-2883.csv",
    "facebook_hubert_large_ls960_ft": "../results/intron-open-dev-facebook-hubert-large-ls960-ft-wer-0.5675-2883.csv",
    "facebook_hubert_xlarge_ls960_ft": "../results/intron-open-dev-facebook-hubert-xlarge-ls960-ft-wer-0.571-2883.csv",
    "facebook_wav2vec2_large_robust_ft_swbd_300h": "../results/intron-open-dev-facebook-wav2vec2-large-robust-ft-swbd-300h-wer-0.7338-2883.csv",
    "facebook_wav2vec2_large_xlsr_53_english": "../results/intron-open-dev-jonatasgrosman-wav2vec2-large-xlsr-53-english-wer-0.53-2883.csv",
    "facebook_wav2vec2_large_960h_lv60_self": "../results/intron-open-dev-facebook-wav2vec2-large-960h-lv60-self-wer-0.55-2883.csv",
    "facebook_wav2vec2_large_960h": "../results/intron-open-dev-facebook-wav2vec2-large-960h-wer-0.6528-2883.csv",
    "facebook_wav2vec2_xls_r_1b_english": "../results/intron-open-dev-jonatasgrosman-wav2vec2-xls-r-1b-english-wer-0.5373-2883.csv",
    "microsoft_wavlm_libri_clean_100h_base": "../results/intron-open-dev-patrickvonplaten-wavlm-libri-clean-100h-base-plus-wer-0.8084-2883.csv",
    "microsoft_wavlm_libri_clean_100h_large": "../results/intron-open-dev-patrickvonplaten-wavlm-libri-clean-100h-large-wer-0.6428-2883.csv",
    "microsoft_azure_speech_api": "../results/intron-open-dev-azure-transcribe-wer-0.3729-2887.csv",
    "google_gcp_speech_api": "../results/intron-open-dev-gcp-transcribe-wer-0.5741-2887.csv",
    "google_gcp_medical_speech_api": "../results/intron-open-dev-gcp-transcribe-medical-wer-0.5649-2887.csv",
    "amazon_aws_transcribe_api": "../results/intron-open-dev-aws-transcribe-wer-0.4653-2887.csv",
    "amazon_aws_transcribe_medical_api_primary_care": "../results/intron-open-dev-aws-transcribe-medical-wer-0.4742-2887.csv",
    "speechbrain_crdnn_rnnlm_librispeech": "../results/african-nlp-speechbrain-predictons",  
    "nvidia_nemo_conformer_ctc_large": "../results/african-nlp-nemo-ctc-predictons",
    "nvidia_nemo_conformer_transducer_large": "../results/african-nlp-nemo-transducer-predictons",
}

In [10]:
# Consolidate every dev-split model, collecting the per-model frames and concatenating
# once at the end (growing a DataFrame with pd.concat inside the loop is quadratic and
# starts from an empty frame, which can upset dtypes).
dev_raw_frames = []
dev_normalized_frames = []

for model_name, model_csv_path in dev_model_name_csv_path_dict.items():
    print(f"Model name: {model_name}")
    df, normalized_df = consolidate_zero_shot_results(
        model_csv_path=model_csv_path,
        model_name=model_name,
        metric=wer_metric,
        ref_csv_path="../results/intron-open-dev-whisper_medium-wer-0.3049-2872.csv",
        dataset_csv_path="../data/intron-dev-public-3231-clean.csv",
        output_dir="../results",
    )

    dev_raw_frames.append(df)
    dev_normalized_frames.append(normalized_df)
    print("="*10)

# pd.concat([]) raises, so fall back to an empty frame when no model was processed.
dev_combined_df = pd.concat(dev_raw_frames) if dev_raw_frames else pd.DataFrame()
dev_combined_normalized_df = (
    pd.concat(dev_normalized_frames) if dev_normalized_frames else pd.DataFrame()
)

print(f"Combined df shape: {dev_combined_df.shape}")
print(f"Combined df (normalized) shape: {dev_combined_normalized_df.shape}")

# Persist the combined per-utterance results for all models.
dev_combined_df.to_csv("../results/raw/intron-open-dev-all_models.csv", index=False)
dev_combined_normalized_df.to_csv("../results/normalized/intron-open-dev-all_models.csv", index=False)

Model name: openai_whisper_small
Dataset split shape: (3231, 14)
Dataset split shape (without duplicates): (3227, 14)
Input path: ../results/intron-open-dev-whisper_small-wer-0.3427-2883.csv
Input shape: (2883, 8)
Reference csv shape: (2872, 8)
Reference csv shape (without duplicates): (2872, 21)
df shape: (2883, 8)
df shape (transformed): (2872, 12)
***raw
wer: 0.4282729641540881
output path: ../results/raw/intron-open-dev-openai_whisper_small-wer-0.4283-2872.csv
Output shape: (2872, 15)
Total null values: 0
***normalized
wer: 0.3425525920360631
output path: ../results/normalized/intron-open-dev-openai_whisper_small-wer-0.3426-2872.csv
Output shape: (2872, 15)
Model name: openai_whisper_small_en
Dataset split shape: (3231, 14)
Dataset split shape (without duplicates): (3227, 14)
Input path: ../results/intron-open-dev-whisper_small.en-wer-0.3521-2883.csv
Input shape: (2883, 8)
Reference csv shape: (2872, 8)
Reference csv shape (without duplicates): (2872, 21)
df shape: (2883, 8)
df sha

***raw
wer: 0.6213995633700322
output path: ../results/raw/intron-open-dev-facebook_wav2vec2_xls_r_1b_english-wer-0.6214-2872.csv
Output shape: (2872, 15)
Total null values: 0
***normalized
wer: 0.5371431254695718
output path: ../results/normalized/intron-open-dev-facebook_wav2vec2_xls_r_1b_english-wer-0.5371-2872.csv
Output shape: (2872, 15)
Model name: microsoft_wavlm_libri_clean_100h_base
Dataset split shape: (3231, 14)
Dataset split shape (without duplicates): (3227, 14)
Input path: ../results/intron-open-dev-patrickvonplaten-wavlm-libri-clean-100h-base-plus-wer-0.8084-2883.csv
Input shape: (2883, 8)
Reference csv shape: (2872, 8)
Reference csv shape (without duplicates): (2872, 21)
df shape: (2883, 8)
df shape (transformed): (2872, 12)
***raw
wer: 0.8377661447451819
output path: ../results/raw/intron-open-dev-microsoft_wavlm_libri_clean_100h_base-wer-0.8378-2872.csv
Output shape: (2872, 15)
Total null values: 31
***normalized
wer: 0.8082738542449286
output path: ../results/normali