In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_metric
import sys
sys.path.append("..")
from src.utils.text_processing import clean_text

In [3]:
wer_metric = load_metric("wer")

In [4]:
test_df = pd.read_csv("../data/intron-test-public-6346-clean.csv")
print(test_df.shape)
test_df.head(3)

(6346, 14)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,gender
0,149716,5ab7b49ad1cab6392a764a69578dc822,twi,26-40,GH,Proteins break down to release amino acids whi...,136,3a21d00eec39a31f089b9b4f0f8afa21,/AfriSpeech-100/test/e696aff8-ce05-4c8e-a3b3-5...,10.776984,african,clinical,test,Female
1,360395,543c037ff44816e8b5ccd0d6cc92fe13,igbo,19-25,NG,To grant such a patent license to a party mean...,128,544fbff921d13b224adbbb0f637196ed,/AfriSpeech-100/test/f28baac3-cdcd-45a2-888c-a...,12.39,nigerian,clinical,test,Female
2,153514,e87486db0c365bded42184d56b46a2a7,isizulu,26-40,ZA,Flatus indicates return of peristalsis.,38,2f91ceb3c1d9c61f2b6a77d602f6d5ee,/AfriSpeech-100/test/13062a1b-662b-4afa-85b4-9...,5.826984,african,clinical,test,Female


In [5]:
test_df = test_df[~test_df.duplicated(subset="audio_paths")]
print(test_df.shape)

(6319, 14)


In [6]:
test_df["audio_paths"] = test_df["audio_paths"].apply(lambda x: x.replace("/AfriSpeech-100/test/", "/data/data/intron/"))
df = pd.read_csv("../results/intron-open-test-whisper_medium-wer-0.3322-5474.csv")
df = pd.merge(df, test_df, on="audio_paths")
df.shape

(5474, 21)

In [7]:
mapping_df = df[["idx", "audio_paths"]]
mapping_df.head(3)

Unnamed: 0,idx,audio_paths
0,149716,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...
1,360395,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...
2,153514,/data/data/intron/13062a1b-662b-4afa-85b4-98f7...


In [8]:
models_df = pd.DataFrame()
models_df_clean = pd.DataFrame()

In [9]:
# Handy functions
def transform_raw_df(df_raw, columns=[], fmt="intron_whisper"):
    assert fmt != "" or fmt is not None
    
    df = df_raw.copy()
    
    if fmt == "intron_whisper":
        df = pd.merge(df, test_df, on="audio_paths")
        df = pd.merge(df, mapping_df, on="idx")

        df = df[columns+["transcript", "hypothesis", "audio_paths_x", "accent_x"]]
        df.loc[:, "accent"] = df.loc[:, "accent_x"]
        df.loc[:, "audio_paths"] = df.loc[:, "audio_paths_x"]
        df.loc[:, "reference"] = df.loc[:, "transcript"]
        df.loc[:, "prediction"] = df.loc[:, "hypothesis"]
        df = df.drop(columns=["accent_x", "audio_paths_x", "hypothesis", "transcript"])
    
    elif fmt == "african_nlp":
        df = pd.merge(df, test_df, on="audio_paths")
        df = pd.merge(df, mapping_df, on="idx")

        df = df[columns+["accent", "user_ids", "transcript", 1]]
        df.loc[:, "reference"] = df.loc[:, "transcript"]
        df.loc[:, "prediction"] = df.loc[:, 1]
        df = df.drop(columns=[1, "transcript"])

    elif fmt == "aws":
        df = pd.merge(df[["idx", "transcript", "predictions"]], test_df, on="idx")
        df = pd.merge(df, mapping_df, on="idx")

        df = df[columns+["accent", "user_ids", "transcript_x", "predictions"]]
        df.loc[:, "reference"] = df.loc[:, "transcript_x"]
        df.loc[:, "prediction"] = df.loc[:, "predictions"]
        df = df.drop(columns=["transcript_x", "predictions"])

    elif fmt == "azure":
        df = pd.merge(df[["idx", "transcript", "predictions_raw"]], test_df, on="idx")
        df = pd.merge(df, mapping_df, on="idx")

        df = df[columns+["accent", "user_ids", "transcript_x", "predictions_raw"]]
        df.loc[:, "reference"] = df.loc[:, "transcript_x"]
        df.loc[:, "prediction"] = df.loc[:, "predictions_raw"]
        df = df.drop(columns=["transcript_x", "predictions_raw"])
    
    elif fmt == "wav2vec2":
        df.loc[:, "audio_paths"] = df["audio_path"]
        df = pd.merge(df[["audio_paths", "text", "predictions_raw"]], test_df, on="audio_paths")
        df = pd.merge(df, mapping_df, on="idx")

        df = df[columns+["accent", "user_ids", "text", "predictions_raw"]]
        df.loc[:, "reference"] = df.loc[:, "text"]
        df.loc[:, "prediction"] = df.loc[:, "predictions_raw"]
        df = df.drop(columns=["text", "predictions_raw"])
    
    elif fmt in df.columns:
        df = pd.merge(df[["idx", "transcript", fmt]], test_df, on="idx")
        df = pd.merge(df, mapping_df, on="idx")

        df = df[columns+["accent", "user_ids", "transcript_x", fmt]]
        df.loc[:, "reference"] = df.loc[:, "transcript_x"]
        df.loc[:, "prediction"] = df.loc[:, fmt]
        df = df.drop(columns=["transcript_x", fmt])
  
    else:
        raise NotImplementedError()
    
    print(df.shape)
    df["wer"] = df.apply(lambda x: wer_metric.compute(predictions=[x.prediction], references=[x.reference]), axis=1) 
    return df
    


def clean_and_compute_wer(df):
    df_clean = df.copy()
    
    print(df_clean[df_clean["prediction"].isnull()].shape)
    
    df_clean["prediction"] = df_clean["prediction"].fillna("")
    print(df_clean[df_clean["prediction"].isnull()].shape)
    
    df_clean["reference"] = df_clean["reference"].apply(lambda x: clean_text(x))
    df_clean["prediction"] = df_clean["prediction"].apply(lambda x: clean_text(x))
    
    df_clean["wer"] = df_clean.apply(lambda x: wer_metric.compute(predictions=[x.prediction], 
                                                                  references=[x.reference]), axis=1)
    return df_clean
    
    
def write_to_folder(model_id_or_path, predictions_df, output_dir="../results/", split="test"):
    wer = predictions_df["wer"].mean()
    output_path = f"{output_dir}/intron-open-{split}-{model_id_or_path}-wer-{round(wer, 4)}-{len(predictions_df)}.csv"
    predictions_df.to_csv(output_path, index=False)

In [10]:
# make output dir
import os
os.makedirs("../results/raw", exist_ok=True)
os.makedirs("../results/normalized", exist_ok=True)

In [11]:
columns = ["idx", "domain", "gender", "duration", "age_group"]

# Compute WER

## OpenAI

### Whisper-Large

In [12]:
whspr_large_raw = pd.read_csv("../results/intron-open-test-whisper_large-wer-0.3057-5474.csv")
whspr_large = transform_raw_df(whspr_large_raw, columns=columns, fmt="intron_whisper")
whspr_large_clean = clean_and_compute_wer(whspr_large)

(5480, 9)
(0, 10)
(0, 10)


In [13]:
whspr_large.head(2)

Unnamed: 0,idx,domain,gender,duration,age_group,accent,audio_paths,reference,prediction,wer
0,149716,clinical,Female,10.776984,26-40,twi,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,Proteins break down to release amino acids whi...,Proteins break down to release amino acids whi...,0.130435
1,360395,clinical,Female,12.39,19-25,igbo,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,To grant such a patent license to a party mean...,To grant such a patent license to a party mean...,0.12


In [14]:
whspr_large_clean.head(2)

Unnamed: 0,idx,domain,gender,duration,age_group,accent,audio_paths,reference,prediction,wer
0,149716,clinical,Female,10.776984,26-40,twi,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,proteins break down to release amino acids whi...,proteins break down to release amino acids whi...,0.130435
1,360395,clinical,Female,12.39,19-25,igbo,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,0.0


In [15]:
# WER
print(whspr_large["wer"].mean())
print(whspr_large_clean["wer"].mean())

0.5094426796309084
0.4035450795850903


In [16]:
print(wer_metric.compute(predictions=whspr_large.prediction.array, references=whspr_large.reference.array))
print(wer_metric.compute(predictions=whspr_large_clean.prediction.array, references=whspr_large_clean.reference.array))

0.38991497710922174
0.3056859949510144


In [17]:
model_id_or_path = "openai_whisper_large"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_large, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_large_clean,
                output_dir="../results/normalized")

In [18]:
whspr_large["name"] = "openai_whisper_large"
whspr_large_clean["name"] = "openai_whisper_large"

models_df = pd.concat([models_df, whspr_large])
models_df_clean = pd.concat([models_df_clean, whspr_large_clean])

### Whisper-Medium

In [19]:
whspr_medium_raw = pd.read_csv("../results/intron-open-test-whisper_medium-wer-0.3322-5474.csv")
print(whspr_medium_raw.shape)
whspr_medium_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,Proteins break down to release amino acids whi...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,proteins break down to release amino acids whi...,proteins break down to release amino acids whi...,proteins break down to release amino acids whi...,proteins break down to release amino acids whi...
1,To grant such a patent license to a party mean...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...


In [20]:
whspr_medium = transform_raw_df(whspr_medium_raw, columns=columns, fmt="intron_whisper")
whspr_medium_clean = clean_and_compute_wer(whspr_medium)

(5480, 9)
(0, 10)
(0, 10)


In [21]:
# WER
print(whspr_medium["wer"].mean())
print(whspr_medium_clean["wer"].mean())

0.546139008274179
0.42171545807051186


In [22]:
model_id_or_path = "openai_whisper_medium"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_medium, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_medium_clean,
                output_dir="../results/normalized")

In [23]:
whspr_medium["name"] = "openai_whisper_medium"
whspr_medium_clean["name"] = "openai_whisper_medium"

models_df = pd.concat([models_df, whspr_medium])
models_df_clean = pd.concat([models_df_clean, whspr_medium_clean])

In [24]:
whspr_medium_en_raw = pd.read_csv("../results/intron-open-test-whisper_medium.en-wer-0.3577-5474.csv")
print(whspr_medium_en_raw.shape)
whspr_medium_en = transform_raw_df(whspr_medium_en_raw, columns=columns, fmt="intron_whisper")
whspr_medium_en_clean = clean_and_compute_wer(whspr_medium_en)

(5474, 8)
(5480, 9)
(0, 10)
(0, 10)


In [25]:
print(whspr_medium_en["wer"].mean())
print(whspr_medium_en_clean["wer"].mean())

0.5513099559994978
0.45046854042565565


In [26]:
model_id_or_path = "openai_whisper_medium_en"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_medium_en, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_medium_en_clean,
                output_dir="../results/normalized")

In [27]:
whspr_medium_en["name"] = "openai_whisper_medium_en"
whspr_medium_en_clean["name"] = "openai_whisper_medium_en"

models_df = pd.concat([models_df, whspr_medium_en])
models_df_clean = pd.concat([models_df_clean, whspr_medium_en_clean])

### Whisper-Small

In [28]:
whspr_small_raw = pd.read_csv("../results/intron-open-test-whisper_small-wer-0.3907-5474.csv")
print(whspr_small_raw.shape)
whspr_small_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,Proteins break down to release amino acids whi...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,proteins break down to release amino acids whi...,proteins break down to release amino acids whi...,proteins break down to release amino acids whi...,proteins break down to release amino acids whi...
1,To grant such a patent license to a party mean...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...


In [29]:
whspr_small = transform_raw_df(whspr_small_raw, columns=columns, fmt="intron_whisper")
whspr_small_clean = clean_and_compute_wer(whspr_small)

(5480, 9)
(0, 10)
(0, 10)


In [30]:
print(whspr_small["wer"].mean())
print(whspr_small_clean["wer"].mean())

0.5991580327132721
0.4902641114067483


In [31]:
model_id_or_path = "openai_whisper_small"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_small, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_small_clean,
                output_dir="../results/normalized")

In [32]:
whspr_small["name"] = "openai_whisper_small"
whspr_small_clean["name"] = "openai_whisper_small"

models_df = pd.concat([models_df, whspr_small])
models_df_clean = pd.concat([models_df_clean, whspr_small_clean])

In [33]:
# missing whisper-small-en
# whspr_small_en_raw = pd.read_csv("../results/intron-open-test-whiper")
# print(whspr_small_en_raw.shape)
# whspr_small_en = transform_raw_df(whspr_small_en_raw, columns=columns, fmt="intron_whisper")
# whspr_small_en_clean = clean_and_compute_wer(whspr_small_en)

In [34]:
# print(whspr_small_en["wer"].mean())
# print(whspr_small_en_clean["wer"].mean())

In [35]:
# model_id_or_path = "openai_whisper_small_en"

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=whspr_small_en, 
#                 output_dir="../results/raw")

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=whspr_small_en_clean,
#                 output_dir="../results/normalized")

In [36]:
# whspr_small_en["name"] = "openai_whisper_small_en"
# whspr_small_en_clean["name"] = "openai_whisper_small_en"

# models_df = pd.concat([models_df, whspr_small_en])
# models_df_clean = pd.concat([models_df_clean, whspr_small_en_clean])

In [37]:
models_df.shape, models_df_clean.shape

((21920, 11), (21920, 11))

## Nvidia

### Nemo-conformer-ctc-large

In [38]:
# nemo_ctc_raw = pd.read_csv("../results/intron-open-test", header=None, delimiter="	")
# nemo_ctc_raw["audio_paths"] = nemo_ctc_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
# print(nemo_ctc_raw.shape)
# nemo_ctc_raw.head(2)

In [39]:
# nemo_ctc = transform_raw_df(nemo_ctc_raw, columns=columns, fmt="african_nlp")
# nemo_ctc_clean = clean_and_compute_wer(nemo_ctc)

In [40]:
# print(nemo_ctc["wer"].mean())
# print(nemo_ctc_clean["wer"].mean())

In [41]:
# model_id_or_path = "nvidia_nemo_conformer_ctc_large"

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=nemo_ctc, 
#                 output_dir="../results/raw")

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=nemo_ctc_clean,
#                 output_dir="../results/normalized")

In [42]:
# nemo_ctc["name"] = "nvidia_nemo_conformer_ctc_large"
# nemo_ctc_clean["name"] = "nvidia_nemo_conformer_ctc_large"

# models_df = pd.concat([models_df, nemo_ctc])
# models_df_clean = pd.concat([models_df_clean, nemo_ctc_clean])

### Nemo-conformer-transducer-large

In [43]:
# nemo_transducer_raw = pd.read_csv("../results/intron-open-test", header=None, delimiter="	")
# nemo_transducer_raw["audio_paths"] = nemo_transducer_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
# print(nemo_transducer_raw.shape)
# nemo_transducer_raw.head(2)

In [44]:
# nemo_transducer = transform_raw_df(nemo_transducer_raw, columns=columns, fmt="african_nlp")
# nemo_transducer_clean = clean_and_compute_wer(nemo_transducer)

In [45]:
# print(nemo_transducer["wer"].mean())
# print(nemo_transducer_clean["wer"].mean())

In [46]:
# model_id_or_path = "nvidia_nemo_conformer_transducer_large"

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=nemo_transducer, 
#                 output_dir="../results/raw")

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=nemo_transducer_clean,
#                 output_dir="../results/normalized")

In [47]:
# nemo_transducer["name"] = "nvidia_nemo_conformer_transducer_large"
# nemo_transducer_clean["name"] = "nvidia_nemo_conformer_transducer_large"

# models_df = pd.concat([models_df, nemo_transducer])
# models_df_clean = pd.concat([models_df_clean, nemo_transducer_clean])

## Speech Brain

In [48]:
# speech_brain_raw = pd.read_csv("../results/intron-open-test", header=None, delimiter="	")
# speech_brain_raw["audio_paths"] = nemo_transducer_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
# print(speech_brain_raw.shape)
# speech_brain_raw.head(2)

In [49]:
# speech_brain = transform_raw_df(speech_brain_raw, columns=columns, fmt="african_nlp")
# speech_brain_clean = clean_and_compute_wer(speech_brain)

In [50]:
# print(speech_brain["wer"].mean())
# print(speech_brain_clean["wer"].mean())

In [51]:
# model_id_or_path="speechbrain_crdnn_rnnlm_librispeech"

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=speech_brain, 
#                 output_dir="../results/raw")

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=speech_brain_clean,
#                 output_dir="../results/normalized")

In [52]:
# speech_brain["name"] = "speechbrain_crdnn_rnnlm_librispeech"
# speech_brain_clean["name"] = "speechbrain_crdnn_rnnlm_librispeech"

# models_df = pd.concat([models_df, speech_brain])
# models_df_clean = pd.concat([models_df_clean, speech_brain_clean])

## AWS

### Medical API (Primary Care)

In [53]:
aws_medical_raw = pd.read_csv("../results/intron-open-test-aws-transcribe-medical-wer-0.9998-5498.csv")
print(aws_medical_raw.shape)
aws_medical_raw.head(2)

(5498, 17)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,gender,predictions,predictions_clean,wer
0,149716,5ab7b49ad1cab6392a764a69578dc822,twi,26-40,GH,Proteins break down to release amino acids whi...,136,3a21d00eec39a31f089b9b4f0f8afa21,/AfriSpeech-100/test/e696aff8-ce05-4c8e-a3b3-5...,10.776984,african,clinical,test,Female,,,1.0
1,360395,543c037ff44816e8b5ccd0d6cc92fe13,igbo,19-25,NG,To grant such a patent license to a party mean...,128,544fbff921d13b224adbbb0f637196ed,/AfriSpeech-100/test/f28baac3-cdcd-45a2-888c-a...,12.39,nigerian,clinical,test,Female,,,1.0


In [54]:
aws_medical_raw[aws_medical_raw.predictions.isna()].shape

(5498, 17)

In [55]:
aws_medical = transform_raw_df(aws_medical_raw, columns=columns, fmt="aws")
aws_medical_clean = clean_and_compute_wer(aws_medical)

(5492, 9)
(5492, 10)
(0, 10)


In [56]:
aws_medical

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,reference,prediction,wer
0,149716,clinical,Female,10.776984,26-40,twi,5ab7b49ad1cab6392a764a69578dc822,Proteins break down to release amino acids whi...,,1.0
1,360395,clinical,Female,12.390000,19-25,igbo,543c037ff44816e8b5ccd0d6cc92fe13,To grant such a patent license to a party mean...,,1.0
2,153514,clinical,Female,5.826984,26-40,isizulu,e87486db0c365bded42184d56b46a2a7,Flatus indicates return of peristalsis.,,1.0
3,129184,clinical,Female,12.985986,26-40,luganda,9d8db954e680843a47c3b7e224f12371,Since the degree of effect produced by a drug ...,,1.0
4,155127,clinical,Female,9.730000,26-40,setswana,cdf91cf6e59ee411b985a40a955d4d1f,Protection of the host immune mechanism mighti...,,1.0
...,...,...,...,...,...,...,...,...,...,...
5487,1045966,clinical,Male,7.000000,26-40,igarra,9a65a19b865e25e57cc6c56b4f63c381,Reason: PREOP CABG Admitting Diagnosis: ACUTE ...,,1.0
5488,595068,clinical,Male,5.533000,19-25,delta,4cf08618140220f8bc9860242713c066,A blood culture showed GPCs in 18 blood cultur...,,1.0
5489,1117295,clinical,Male,9.000000,26-40,yoruba,b809f451766446eb837c2ed5ecb40489,RR decreased from 30's to 20's with treatment ...,,1.0
5490,475636,general,Female,7.237000,26-40,tswana,ed4d0a0d0446ad1ff0da31efac2b7578,International Day for Mangrove: Warri to host ...,,1.0


In [57]:
print(aws_medical["wer"].mean())
print(aws_medical_clean["wer"].mean())

1.0
1.0


In [58]:
model_id_or_path="amazon_aws_transcribe_medical_api_primary_care"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=aws_medical, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=aws_medical_clean,
                output_dir="../results/normalized")

In [59]:
aws_medical["name"] = "amazon_aws_transcribe_medical_api_primary_care"
aws_medical_clean["name"] = "amazon_aws_transcribe_medical_api_primary_care"

models_df = pd.concat([models_df, aws_medical])
models_df_clean = pd.concat([models_df_clean, aws_medical_clean])

### Non-medical API

In [60]:
aws_raw = pd.read_csv("../results/intron-open-test-aws-transcribe-wer-1.0023-5498.csv")
print(aws_raw.shape)
aws_raw.head(2)

(5498, 17)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,gender,predictions,predictions_clean,wer
0,149716,5ab7b49ad1cab6392a764a69578dc822,twi,26-40,GH,Proteins break down to release amino acids whi...,136,3a21d00eec39a31f089b9b4f0f8afa21,/AfriSpeech-100/test/e696aff8-ce05-4c8e-a3b3-5...,10.776984,african,clinical,test,Female,,,1.0
1,360395,543c037ff44816e8b5ccd0d6cc92fe13,igbo,19-25,NG,To grant such a patent license to a party mean...,128,544fbff921d13b224adbbb0f637196ed,/AfriSpeech-100/test/f28baac3-cdcd-45a2-888c-a...,12.39,nigerian,clinical,test,Female,We should be asking ourselves whether we want ...,,0.96


In [61]:
aws_raw[aws_raw.predictions.isna()].shape

(5473, 17)

In [62]:
aws = transform_raw_df(aws_raw, columns=columns, fmt="aws")
aws_clean = clean_and_compute_wer(aws)

(5492, 9)
(5467, 10)
(0, 10)


In [63]:
print(aws["wer"].mean())
print(aws_clean["wer"].mean())

1.00278926781173
1.0024512894511732


In [64]:
model_id_or_path="amazon_aws_transcribe_api"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=aws, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=aws_clean,
                output_dir="../results/normalized")

In [65]:
aws["name"] = "amazon_aws_transcribe_api"
aws_clean["name"] = "amazon_aws_transcribe_api"

models_df = pd.concat([models_df, aws])
models_df_clean = pd.concat([models_df_clean, aws_clean])

## Google

### GCP Speech Medical API

In [66]:
gcp_medical_raw = pd.read_csv("../results/intron-open-test-gcp-transcribe-medical-wer-0.625-5498.csv")
print(gcp_medical_raw.shape)
gcp_medical_raw.head(2)

(5498, 17)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,gender,predictions_raw,predictions,wer
0,149716,5ab7b49ad1cab6392a764a69578dc822,twi,26-40,GH,Proteins break down to release amino acids whi...,136,3a21d00eec39a31f089b9b4f0f8afa21,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,10.776984,african,clinical,test,Female,"protein is greater than 60 , which I used as f...","protein is greater than 60 , which i used as f...",0.652174
1,360395,543c037ff44816e8b5ccd0d6cc92fe13,igbo,19-25,NG,To grant such a patent license to a party mean...,128,544fbff921d13b224adbbb0f637196ed,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,12.39,nigerian,clinical,test,Female,2 g such a patent license to a patchy means to...,2 g such a patent license to a patchy means to...,0.28


In [67]:
gcp_medical = transform_raw_df(gcp_medical_raw, columns=columns, fmt="aws")
gcp_medical_clean = clean_and_compute_wer(gcp_medical)

(5492, 9)
(73, 10)
(0, 10)


In [68]:
print(gcp_medical["wer"].mean())
print(gcp_medical_clean["wer"].mean())

0.6985600415335655
0.6250368670367586


In [69]:
model_id_or_path="google_gcp_medical_speech_api"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=gcp_medical, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=gcp_medical_clean,
                output_dir="../results/normalized")

In [70]:
gcp_medical["name"] = "google_gcp_medical_speech_api"
gcp_medical_clean["name"] = "google_gcp_medical_speech_api"

models_df = pd.concat([models_df, gcp_medical])
models_df_clean = pd.concat([models_df_clean, gcp_medical_clean])

### GCP Speech Non-medical API

In [71]:
gcp_raw = pd.read_csv("../results/intron-open-test-gcp-transcribe-wer-0.6357-5498.csv")
print(gcp_raw.shape)
gcp_raw.head(2)

(5498, 17)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,gender,predictions_raw,predictions,wer
0,149716,5ab7b49ad1cab6392a764a69578dc822,twi,26-40,GH,Proteins break down to release amino acids whi...,136,3a21d00eec39a31f089b9b4f0f8afa21,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,10.776984,african,clinical,test,Female,amino acids,amino acids,0.913043
1,360395,543c037ff44816e8b5ccd0d6cc92fe13,igbo,19-25,NG,To grant such a patent license to a party mean...,128,544fbff921d13b224adbbb0f637196ed,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,12.39,nigerian,clinical,test,Female,to Grant such a patent license to a party mean...,to grant such a patent license to a party mean...,0.0


In [72]:
gcp = transform_raw_df(gcp_raw, columns=columns, fmt="aws")
gcp_clean = clean_and_compute_wer(gcp)

(5492, 9)
(170, 10)
(0, 10)


In [73]:
print(gcp["wer"].mean())
print(gcp_clean["wer"].mean())

0.7070144850330559
0.6361551076492743


In [74]:
model_id_or_path="google_gcp_speech_api"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=gcp, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=gcp_clean,
                output_dir="../results/normalized")

In [75]:
gcp["name"] = "google_gcp_speech_api"
gcp_clean["name"] = "google_gcp_speech_api"

models_df = pd.concat([models_df, gcp])
models_df_clean = pd.concat([models_df_clean, gcp_clean])

## Microsoft

### Azure

In [76]:
# azure_raw = pd.read_csv("../results/intron-open-test")
# print(azure_raw.shape)
# azure_raw.head(2)

In [77]:
# azure = transform_raw_df(azure_raw, columns=columns, fmt="azure")
# azure_clean = clean_and_compute_wer(azure)

In [78]:
# print(azure["wer"].mean())
# print(azure_clean["wer"].mean())

In [79]:
# model_id_or_path="microsoft_azure_speech_api"

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=azure, 
#                 output_dir="../results/raw")

# write_to_folder(model_id_or_path=model_id_or_path, 
#                 predictions_df=azure_clean,
#                 output_dir="../results/normalized")

In [80]:
# azure["name"] = "microsoft_azure_speech_api"
# azure_clean["name"] = "microsoft_azure_speech_api"

# models_df = pd.concat([models_df, azure])
# models_df_clean = pd.concat([models_df_clean, azure_clean])

In [81]:
# models_df.shape, models_df_clean.shape

### WavLm-libri-clean-100h-large

In [82]:
wavlm_libri_clean_100h_large_raw = pd.read_csv("../results/intron-open-test-patrickvonplaten-wavlm-libri-clean-100h-large-wer-0.7051-5474.csv")
print(wavlm_libri_clean_100h_large_raw.shape)
wavlm_libri_clean_100h_large_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,protns break down to relese amenoases which ie...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,protns break down to relese amenoases which ie...,proteins break down to release amino acids whi...,protns break down to relese amenoases which ie...,proteins break down to release amino acids whi...
1,to grant such a patent license to a partty mea...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a partty mea...,to grant such a patent license to a party mean...,to grant such a patent license to a partty mea...,to grant such a patent license to a party mean...


In [83]:
wavlm_libri_clean_100h_large = transform_raw_df(wavlm_libri_clean_100h_large_raw, columns=columns, fmt="intron_whisper")
wavlm_libri_clean_100h_large_clean = clean_and_compute_wer(wavlm_libri_clean_100h_large)

(5480, 9)
(1, 10)
(0, 10)


In [84]:
print(wavlm_libri_clean_100h_large["wer"].mean())
print(wavlm_libri_clean_100h_large_clean["wer"].mean())

0.8834926509643685
0.8174427579736396


In [85]:
model_id_or_path="microsoft_wavlm_libri_clean_100h_large"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wavlm_libri_clean_100h_large, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wavlm_libri_clean_100h_large_clean,
                output_dir="../results/normalized")

In [86]:
wavlm_libri_clean_100h_large["name"] = "microsoft_wavlm_libri_clean_100h_large"
wavlm_libri_clean_100h_large_clean["name"] = "microsoft_wavlm_libri_clean_100h_large"

models_df = pd.concat([models_df, wavlm_libri_clean_100h_large])
models_df_clean = pd.concat([models_df_clean, wavlm_libri_clean_100h_large_clean])

### WavLm-libri-clean-100h-base-plus

In [87]:
wavlm_libri_clean_100h_base_raw = pd.read_csv("../results/intron-open-test-patrickvonplaten-wavlm-libri-clean-100h-base-plus-wer-0.8593-5474.csv")
print(wavlm_libri_clean_100h_base_raw.shape)
wavlm_libri_clean_100h_base_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,crortinisbrik danserilisa menasus which i sed ...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,crortinisbrik danserilisa menasus which i sed ...,proteins break down to release amino acids whi...,crortinisbrik danserilisa menasus which i sed ...,proteins break down to release amino acids whi...
1,to grant such a patent licence to a patsy mean...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent licence to a patsy mean...,to grant such a patent license to a party mean...,to grant such a patent license to a patsy mean...,to grant such a patent license to a party mean...


In [88]:
wavlm_libri_clean_100h_base = transform_raw_df(wavlm_libri_clean_100h_base_raw, columns=columns, fmt="intron_whisper")
wavlm_libri_clean_100h_base_clean = clean_and_compute_wer(wavlm_libri_clean_100h_base)

(5480, 9)
(74, 10)
(0, 10)


In [89]:
print(wavlm_libri_clean_100h_base["wer"].mean())
print(wavlm_libri_clean_100h_base_clean["wer"].mean())

0.981884889169573
0.9596212752583314


In [90]:
model_id_or_path="microsoft_wavlm_libri_clean_100h_base"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wavlm_libri_clean_100h_base, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wavlm_libri_clean_100h_base_clean,
                output_dir="../results/normalized")

In [91]:
wavlm_libri_clean_100h_base["name"] = "microsoft_wavlm_libri_clean_100h_base"
wavlm_libri_clean_100h_base_clean["name"] = "microsoft_wavlm_libri_clean_100h_base"

models_df = pd.concat([models_df, wavlm_libri_clean_100h_base])
models_df_clean = pd.concat([models_df_clean, wavlm_libri_clean_100h_base_clean])

## Facebook

### Wav2vec2-large-xlsr-53-english

In [92]:
wav2vec2_large_xlsr_53_english_raw = pd.read_csv("../results/intron-open-test-jonatasgrosman-wav2vec2-large-xlsr-53-english-wer-0.576-5474.csv")
print(wav2vec2_large_xlsr_53_english_raw.shape)
wav2vec2_large_xlsr_53_english_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,protings breat dancer elease are munivasis whi...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,protings breat dancer elease are munivasis whi...,proteins break down to release amino acids whi...,protings breat dancer elease are munivasis whi...,proteins break down to release amino acids whi...
1,to grant such a patent license to a party mean...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...


In [93]:
wav2vec2_large_xlsr_53_english = transform_raw_df(wav2vec2_large_xlsr_53_english_raw, columns=columns, fmt="intron_whisper")
wav2vec2_large_xlsr_53_english_clean = clean_and_compute_wer(wav2vec2_large_xlsr_53_english)

(5480, 9)
(1, 10)
(0, 10)


In [94]:
print(wav2vec2_large_xlsr_53_english["wer"].mean())
print(wav2vec2_large_xlsr_53_english_clean["wer"].mean())

0.7685491696566653
0.6841450483812219


In [95]:
print(wer_metric.compute(predictions=wav2vec2_large_xlsr_53_english.prediction.array, references=wav2vec2_large_xlsr_53_english.reference.array))
print(wer_metric.compute(predictions=wav2vec2_large_xlsr_53_english_clean.prediction.array, references=wav2vec2_large_xlsr_53_english_clean.reference.array))

0.6463570961412688
0.5759767694340165


In [96]:
model_id_or_path="facebook_wav2vec2_large_xlsr_53_english"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_xlsr_53_english, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_xlsr_53_english_clean,
                output_dir="../results/normalized")

In [97]:
wav2vec2_large_xlsr_53_english["name"] = "facebook_wav2vec2_large_xlsr_53_english"
wav2vec2_large_xlsr_53_english_clean["name"] = "facebook_wav2vec2_large_xlsr_53_english"

models_df = pd.concat([models_df, wav2vec2_large_xlsr_53_english])
models_df_clean = pd.concat([models_df_clean, wav2vec2_large_xlsr_53_english_clean])

### Wav2vec2-xls-r-1b-english

In [98]:
wav2vec2_xls_r_1b_english_raw = pd.read_csv("../results/intron-open-test-jonatasgrosman-wav2vec2-xls-r-1b-english-wer-0.5935-5474.csv")
print(wav2vec2_xls_r_1b_english_raw.shape)
wav2vec2_xls_r_1b_english_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,frotin's brig danty release aminoasis which ar...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,frotin's brig danty release aminoasis which ar...,proteins break down to release amino acids whi...,frotin is brig danty release aminoasis which a...,proteins break down to release amino acids whi...
1,to grant such a patent license to a patsy mean...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a patsy mean...,to grant such a patent license to a party mean...,to grant such a patent license to a patsy mean...,to grant such a patent license to a party mean...


In [99]:
wav2vec2_xls_r_1b_english = transform_raw_df(wav2vec2_xls_r_1b_english_raw, columns=columns, fmt="intron_whisper")
wav2vec2_xls_r_1b_english_clean = clean_and_compute_wer(wav2vec2_xls_r_1b_english)

(5480, 9)
(1, 10)
(0, 10)


In [100]:
print(wav2vec2_xls_r_1b_english["wer"].mean())
print(wav2vec2_xls_r_1b_english_clean["wer"].mean())

0.8085042295813446
0.7061287579323898


In [101]:
model_id_or_path="facebook_wav2vec2_xls_r_1b_english"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_xls_r_1b_english, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_xls_r_1b_english_clean,
                output_dir="../results/normalized")

In [102]:
wav2vec2_xls_r_1b_english["name"] = "facebook_wav2vec2_xls_r_1b_english"
wav2vec2_xls_r_1b_english_clean["name"] = "facebook_wav2vec2_xls_r_1b_english"

models_df = pd.concat([models_df, wav2vec2_xls_r_1b_english])
models_df_clean = pd.concat([models_df_clean, wav2vec2_xls_r_1b_english_clean])

### Wav2vec2-large-960h-lv60-self

In [103]:
wav2vec2_large_960h_lv60_self_raw = pd.read_csv("../results/intron-open-test-facebook-wav2vec2-large-960h-lv60-self-wer-0.6111-5474.csv")
print(wav2vec2_large_960h_lv60_self_raw.shape)
wav2vec2_large_960h_lv60_self_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,FROTINS BREK DOWN TO RELEASE A MINASES WHICH I...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,frotins brek down to release a minases which i...,proteins break down to release amino acids whi...,frotins brek down to release a minases which i...,proteins break down to release amino acids whi...
1,TO GRANT SUCH A PATENT LICENSE TO A PARTY MEAN...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...


In [104]:
wav2vec2_large_960h_lv60_self = transform_raw_df(wav2vec2_large_960h_lv60_self_raw, columns=columns, fmt="intron_whisper")
wav2vec2_large_960h_lv60_self_clean = clean_and_compute_wer(wav2vec2_large_960h_lv60_self)

(5480, 9)
(1, 10)
(0, 10)


In [105]:
print(wav2vec2_large_960h_lv60_self["wer"].mean())
print(wav2vec2_large_960h_lv60_self_clean["wer"].mean())

1.2277196118612432
0.7333392614599654


In [106]:
model_id_or_path="facebook_wav2vec2_large_960h_lv60_self"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_960h_lv60_self, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_960h_lv60_self_clean,
                output_dir="../results/normalized")

In [107]:
wav2vec2_large_960h_lv60_self["name"] = "facebook_wav2vec2_large_960h_lv60_self"
wav2vec2_large_960h_lv60_self_clean["name"] = "facebook_wav2vec2_large_960h_lv60_self"

models_df = pd.concat([models_df, wav2vec2_large_960h_lv60_self])
models_df_clean = pd.concat([models_df_clean, wav2vec2_large_960h_lv60_self_clean])

### Wav2vec2-large-960h

In [108]:
wav2vec2_large_960h_raw = pd.read_csv("../results/intron-open-test-facebook-wav2vec2-large-960h-wer-0.7169-5474.csv")
print(wav2vec2_large_960h_raw.shape)
wav2vec2_large_960h_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,FROATING'S RIG DANCERYLIS A MINASES WHICH I US...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,froating's rig dancerylis a minases which i us...,proteins break down to release amino acids whi...,froating is rig dancerylis a minases which i u...,proteins break down to release amino acids whi...
1,TO GRANT SUCH A PATENT LICENSE TO A PARI MEANS...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a pari means...,to grant such a patent license to a party mean...,to grant such a patent license to a pari means...,to grant such a patent license to a party mean...


In [109]:
wav2vec2_large_960h = transform_raw_df(wav2vec2_large_960h_raw, columns=columns, fmt="intron_whisper")
wav2vec2_large_960h_clean = clean_and_compute_wer(wav2vec2_large_960h)

(5480, 9)
(16, 10)
(0, 10)


In [110]:
print(wav2vec2_large_960h["wer"].mean())
print(wav2vec2_large_960h_clean["wer"].mean())

1.2465420177464461
0.8381530709763311


In [111]:
model_id_or_path="facebook_wav2vec2_large_960h"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_960h, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_960h_clean,
                output_dir="../results/normalized")

In [112]:
wav2vec2_large_960h["name"] = "facebook_wav2vec2_large_960h"
wav2vec2_large_960h_clean["name"] = "facebook_wav2vec2_large_960h"

models_df = pd.concat([models_df, wav2vec2_large_960h])
models_df_clean = pd.concat([models_df_clean, wav2vec2_large_960h_clean])

### Wav2vec2-large-robust-ft-swbd-300h

In [113]:
wav2vec2_large_robust_ft_swbd_300h_raw = pd.read_csv("../results/intron-open-test-facebook-wav2vec2-large-robust-ft-swbd-300h-wer-0.8169-5474.csv")
print(wav2vec2_large_robust_ft_swbd_300h_raw.shape)
wav2vec2_large_robust_ft_swbd_300h_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,FROSING IS FRAGED ONTORE LEAS I MENO AS A SWIC...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,frosing is fraged ontore leas i meno as a swic...,proteins break down to release amino acids whi...,frosing is fraged ontore leas i meno as a swic...,proteins break down to release amino acids whi...
1,TO GRANT SUCH A PATENT LICENSE TO A PAZI MEANS...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a pazi means...,to grant such a patent license to a party mean...,to grant such a patent license to a pazi means...,to grant such a patent license to a party mean...


In [114]:
wav2vec2_large_robust_ft_swbd_300h = transform_raw_df(wav2vec2_large_robust_ft_swbd_300h_raw, columns=columns, fmt="intron_whisper")
wav2vec2_large_robust_ft_swbd_300h_clean = clean_and_compute_wer(wav2vec2_large_robust_ft_swbd_300h)

(5480, 9)
(4, 10)
(0, 10)


In [115]:
print(wav2vec2_large_robust_ft_swbd_300h["wer"].mean())
print(wav2vec2_large_robust_ft_swbd_300h_clean["wer"].mean())

1.3151703325563109
0.9491971715406234


In [116]:
model_id_or_path="facebook_wav2vec2_large_robust_ft_swbd_300h"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_robust_ft_swbd_300h, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_robust_ft_swbd_300h_clean,
                output_dir="../results/normalized")

In [117]:
wav2vec2_large_robust_ft_swbd_300h["name"] = "facebook_wav2vec2_large_robust_ft_swbd_300h"
wav2vec2_large_robust_ft_swbd_300h_clean["name"] = "facebook_wav2vec2_large_robust_ft_swbd_300h"

models_df = pd.concat([models_df, wav2vec2_large_robust_ft_swbd_300h])
models_df_clean = pd.concat([models_df_clean, wav2vec2_large_robust_ft_swbd_300h_clean])

### Hubert-large-ls960-ft

In [118]:
hubert_large_ls960_ft_raw = pd.read_csv("../results/intron-open-test-facebook-hubert-large-ls960-ft-wer-0.633-5474.csv")
print(hubert_large_ls960_ft_raw.shape)
hubert_large_ls960_ft_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,PROTINS BREAK DOWN TO RELEASE ARMINOASIS WHICH...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,protins break down to release arminoasis which...,proteins break down to release amino acids whi...,protins break down to release arminoasis which...,proteins break down to release amino acids whi...
1,TO GRANT SUCH A PATENT LICENSE TO A PARTY MEAN...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...


In [119]:
hubert_large_ls960_ft = transform_raw_df(hubert_large_ls960_ft_raw, columns=columns, fmt="intron_whisper")
hubert_large_ls960_ft_clean = clean_and_compute_wer(hubert_large_ls960_ft)

(5480, 9)
(1, 10)
(0, 10)


In [120]:
print(hubert_large_ls960_ft["wer"].mean())
print(hubert_large_ls960_ft_clean["wer"].mean())

1.2260313946467565
0.7524701017884475


In [121]:
model_id_or_path="facebook_hubert_large_ls960_ft"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=hubert_large_ls960_ft, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=hubert_large_ls960_ft_clean,
                output_dir="../results/normalized")

In [122]:
hubert_large_ls960_ft["name"] = "facebook_hubert_large_ls960_ft"
hubert_large_ls960_ft_clean["name"] = "facebook_hubert_large_ls960_ft"

models_df = pd.concat([models_df, hubert_large_ls960_ft])
models_df_clean = pd.concat([models_df_clean, hubert_large_ls960_ft_clean])

### Hubert-xlarge-ls960-ft

In [123]:
hubert_xlarge_ls960_ft_raw = pd.read_csv("../results/intron-open-test-facebook-hubert-xlarge-ls960-ft-wer-0.6409-5474.csv")
print(hubert_xlarge_ls960_ft_raw.shape)
hubert_xlarge_ls960_ft_raw.head(2)

(5474, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,FLORITINGIS WRAGD DOWN TORELESE A MINOASUS WHI...,Proteins break down to release amino acids whi...,/data/data/intron/e696aff8-ce05-4c8e-a3b3-5634...,twi,floritingis wragd down torelese a minoasus whi...,proteins break down to release amino acids whi...,floritingis wragd down torelese a minoasus whi...,proteins break down to release amino acids whi...
1,TO GRANT SUCH A PATENT LICENSE TO A PARTY MEAN...,To grant such a patent license to a party mean...,/data/data/intron/f28baac3-cdcd-45a2-888c-a59f...,igbo,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...,to grant such a patent license to a party mean...


In [124]:
hubert_xlarge_ls960_ft = transform_raw_df(hubert_xlarge_ls960_ft_raw, columns=columns, fmt="intron_whisper")
hubert_xlarge_ls960_ft_clean = clean_and_compute_wer(hubert_xlarge_ls960_ft)

(5480, 9)
(1, 10)
(0, 10)


In [125]:
print(hubert_xlarge_ls960_ft["wer"].mean())
print(hubert_xlarge_ls960_ft_clean["wer"].mean())

1.2538348376800743
0.7685927716377526


In [126]:
model_id_or_path="facebook_hubert_xlarge_ls960_ft"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=hubert_xlarge_ls960_ft, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=hubert_xlarge_ls960_ft_clean,
                output_dir="../results/normalized")

In [127]:
hubert_xlarge_ls960_ft["name"] = "facebook_hubert_xlarge_ls960_ft"
hubert_xlarge_ls960_ft_clean["name"] = "facebook_hubert_xlarge_ls960_ft"

models_df = pd.concat([models_df, hubert_xlarge_ls960_ft])
models_df_clean = pd.concat([models_df_clean, hubert_xlarge_ls960_ft_clean])

In [128]:
models_df.shape, models_df_clean.shape

((93208, 12), (93208, 12))

In [129]:
models_df.to_csv("../results/raw/intron-open-test-all_models.csv")
models_df_clean.to_csv("../results/normalized/intron-open-test-all_models.csv")

# Other statistics

- compute per-accent statistics
- compute per-gender statistics
- compute per-domain statistics
- compute per age-group statistics

In [131]:
import os

if os.path.exists("../results/raw/all_models.csv"):
    models_df = pd.read_csv("../results/raw/intron-open-test-all_models.csv")

if os.path.exists("../results/normalized/all_models.csv"):
    models_df_clean = pd.read_csv("../results/normalized/intron-open-test-all_models.csv")

In [132]:
list(set(models_df.name))

['amazon_aws_transcribe_medical_api_primary_care',
 'google_gcp_medical_speech_api',
 'openai_whisper_medium',
 'facebook_wav2vec2_large_960h_lv60_self',
 'microsoft_wavlm_libri_clean_100h_large',
 'facebook_wav2vec2_large_robust_ft_swbd_300h',
 'openai_whisper_small',
 'facebook_hubert_xlarge_ls960_ft',
 'amazon_aws_transcribe_api',
 'openai_whisper_medium_en',
 'openai_whisper_large',
 'facebook_wav2vec2_large_960h',
 'microsoft_wavlm_libri_clean_100h_base',
 'facebook_hubert_large_ls960_ft',
 'google_gcp_speech_api',
 'facebook_wav2vec2_large_xlsr_53_english',
 'facebook_wav2vec2_xls_r_1b_english']

In [133]:
models_mapping = {
    "amazon_aws_transcribe_api": "AWS",
    "google_gcp_speech_api": "GCP",
    "google_gcp_medical_speech_api": "GCP [Medical]",
    "facebook_hubert_xlarge_ls960_ft": "facebook/hubert-xlarge-ls960-ft",
    "facebook_wav2vec2_large_robust_ft_swbd_300h": "facebook/wav2vec2-large-robust-ft-swbd-300h",
    "microsoft_azure_speech_api": "Azure",
    "openai_whisper_small": "openai/whisper-small",
    "openai_whisper_small_en": "openai/whisper-small-en",
    "speechbrain_crdnn_rnnlm_librispeech": "speechbrain/crdnn-rnnlm-librispeech",
    "facebook_wav2vec2_large_xlsr_53_english": "facebook/wav2vec2-large-xlsr-53-english",
    "nvidia_nemo_conformer_ctc_large": "nvidia/nemo-conformer-ctc-large",
    "nvidia_nemo_conformer_transducer_large": "nvidia/nemo-conformer-transducer-large",
    "facebook_hubert_large_ls960_ft": "facebook/hubert-large-ls960-ft",
    "openai_whisper_medium": "openai/whisper-medium",
    "microsoft_wavlm_libri_clean_100h_base": "microsoft/wavlm-libri-clean-100h-base",
    "amazon_aws_transcribe_medical_api_primary_care": "AWS [Medical] (Primary Care)",
    "facebook_wav2vec2_large_960h_lv60_self": "facebook/wav2vec2-large-960h-lv60-self",
    "openai_whisper_large": "openai/whisper-large",
    "facebook_wav2vec2_large_960h": "facebook/wav2vec2-large-960h",
    "openai_whisper_medium_en": "openai/whisper-medium-en",
    "facebook_wav2vec2_xls_r_1b_english": "facebook/wav2vec2-xls-r-1b-english",
    "microsoft_wavlm_libri_clean_100h_large": "microsoft/wavlm-libri-clean-100h-large"
}

In [134]:
models_df.name = models_df.name.apply(lambda x: models_mapping[x])
models_df_clean.name = models_df_clean.name.apply(lambda x: models_mapping[x])

In [135]:
name = "facebook"
models_df[models_df.name.str.startswith(name)].groupby(["name", "domain"])["wer"].mean().reset_index()

Unnamed: 0,name,domain,wer
0,facebook/hubert-large-ls960-ft,clinical,1.276967
1,facebook/hubert-large-ls960-ft,general,1.159191
2,facebook/hubert-xlarge-ls960-ft,clinical,1.311211
3,facebook/hubert-xlarge-ls960-ft,general,1.178544
4,facebook/wav2vec2-large-960h,clinical,1.302091
5,facebook/wav2vec2-large-960h,general,1.173649
6,facebook/wav2vec2-large-960h-lv60-self,clinical,1.27674
7,facebook/wav2vec2-large-960h-lv60-self,general,1.163393
8,facebook/wav2vec2-large-robust-ft-swbd-300h,clinical,1.381025
9,facebook/wav2vec2-large-robust-ft-swbd-300h,general,1.228753


In [136]:
# name = "nvidia"
# models_df_clean[models_df_clean.name.str.startswith(name)].groupby(["name", "domain"])["wer"].mean().reset_index()

In [137]:
models_df.domain = models_df.domain.str.capitalize()
models_df_clean.domain = models_df_clean.domain.str.capitalize()

In [152]:
domain_df = models_df.groupby(["name", "domain"])["wer"].mean().reset_index().set_index("name").pivot(columns="domain").sort_values(("wer", "Clinical"))
domain_df = domain_df.droplevel(0, axis=1)
domain_df

domain,Clinical,General
name,Unnamed: 1_level_1,Unnamed: 2_level_1
openai/whisper-large,0.610564,0.376748
openai/whisper-medium-en,0.63403,0.442761
openai/whisper-medium,0.63559,0.428758
openai/whisper-small,0.690537,0.479247
GCP [Medical],0.736015,0.64944
GCP,0.771967,0.621832
facebook/wav2vec2-large-xlsr-53-english,0.87103,0.63407
facebook/wav2vec2-xls-r-1b-english,0.914974,0.66879
microsoft/wavlm-libri-clean-100h-large,0.988569,0.745608
AWS [Medical] (Primary Care),1.0,1.0


In [153]:
domain_df= domain_df.merge(models_df.groupby(["name"])["wer"].mean().reset_index(), on="name")
domain_df["Both"] = domain_df["wer"]
domain_df = domain_df[["name", "Clinical", "General", "Both"]]
domain_df = domain_df.set_index("name")
domain_df = domain_df[["General", "Clinical", "Both"]].round(3)
domain_df

Unnamed: 0_level_0,General,Clinical,Both
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
openai/whisper-large,0.377,0.611,0.509
openai/whisper-medium-en,0.443,0.634,0.551
openai/whisper-medium,0.429,0.636,0.546
openai/whisper-small,0.479,0.691,0.599
GCP [Medical],0.649,0.736,0.699
GCP,0.622,0.772,0.707
facebook/wav2vec2-large-xlsr-53-english,0.634,0.871,0.769
facebook/wav2vec2-xls-r-1b-english,0.669,0.915,0.809
microsoft/wavlm-libri-clean-100h-large,0.746,0.989,0.883
AWS [Medical] (Primary Care),1.0,1.0,1.0


In [154]:
print(domain_df.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  General &  Clinical &   Both \\
name                                        &          &           &        \\
\midrule
openai/whisper-large                        &    0.377 &     0.611 &  0.509 \\
openai/whisper-medium-en                    &    0.443 &     0.634 &  0.551 \\
openai/whisper-medium                       &    0.429 &     0.636 &  0.546 \\
openai/whisper-small                        &    0.479 &     0.691 &  0.599 \\
GCP [Medical]                               &    0.649 &     0.736 &  0.699 \\
GCP                                         &    0.622 &     0.772 &  0.707 \\
facebook/wav2vec2-large-xlsr-53-english     &    0.634 &     0.871 &  0.769 \\
facebook/wav2vec2-xls-r-1b-english          &    0.669 &     0.915 &  0.809 \\
microsoft/wavlm-libri-clean-100h-large      &    0.746 &     0.989 &  0.883 \\
AWS [Medical] (Primary Care)                &    1.000 &     1.000 &  1.000 \\
AWS                                         &    1.00

In [155]:
domain_df_clean = models_df_clean.groupby(["name", "domain"])["wer"].mean().reset_index().set_index("name").pivot(columns="domain").sort_values(("wer", "Clinical"))
domain_df_clean = domain_df_clean.droplevel(0, axis=1)
domain_df_clean

domain,Clinical,General
name,Unnamed: 1_level_1,Unnamed: 2_level_1
openai/whisper-large,0.495154,0.283333
openai/whisper-medium,0.502572,0.315612
openai/whisper-medium-en,0.526946,0.350112
openai/whisper-small,0.577093,0.376324
GCP [Medical],0.643839,0.600379
GCP,0.704339,0.546736
facebook/wav2vec2-large-xlsr-53-english,0.791901,0.542743
facebook/wav2vec2-xls-r-1b-english,0.817967,0.559371
facebook/wav2vec2-large-960h-lv60-self,0.849437,0.580992
facebook/hubert-large-ls960-ft,0.867848,0.601067


In [156]:
domain_df_clean= domain_df_clean.merge(models_df_clean.groupby(["name"])["wer"].mean().reset_index(), on="name")
domain_df_clean["Both"] = domain_df_clean["wer"]
domain_df_clean = domain_df_clean[["name", "Clinical", "General", "Both"]]
domain_df_clean = domain_df_clean.set_index("name")
domain_df_clean = domain_df_clean[["General", "Clinical", "Both"]].round(3)
domain_df_clean

Unnamed: 0_level_0,General,Clinical,Both
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
openai/whisper-large,0.283,0.495,0.404
openai/whisper-medium,0.316,0.503,0.422
openai/whisper-medium-en,0.35,0.527,0.45
openai/whisper-small,0.376,0.577,0.49
GCP [Medical],0.6,0.644,0.625
GCP,0.547,0.704,0.636
facebook/wav2vec2-large-xlsr-53-english,0.543,0.792,0.684
facebook/wav2vec2-xls-r-1b-english,0.559,0.818,0.706
facebook/wav2vec2-large-960h-lv60-self,0.581,0.849,0.733
facebook/hubert-large-ls960-ft,0.601,0.868,0.752


In [157]:
print(domain_df_clean.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  General &  Clinical &   Both \\
name                                        &          &           &        \\
\midrule
openai/whisper-large                        &    0.283 &     0.495 &  0.404 \\
openai/whisper-medium                       &    0.316 &     0.503 &  0.422 \\
openai/whisper-medium-en                    &    0.350 &     0.527 &  0.450 \\
openai/whisper-small                        &    0.376 &     0.577 &  0.490 \\
GCP [Medical]                               &    0.600 &     0.644 &  0.625 \\
GCP                                         &    0.547 &     0.704 &  0.636 \\
facebook/wav2vec2-large-xlsr-53-english     &    0.543 &     0.792 &  0.684 \\
facebook/wav2vec2-xls-r-1b-english          &    0.559 &     0.818 &  0.706 \\
facebook/wav2vec2-large-960h-lv60-self      &    0.581 &     0.849 &  0.733 \\
facebook/hubert-large-ls960-ft              &    0.601 &     0.868 &  0.752 \\
facebook/hubert-xlarge-ls960-ft             &    0.60

In [158]:
# accent_df_clean = models_df_clean.groupby(["name", "accent"])["wer"].mean().reset_index().set_index("name").pivot(columns="accent").sort_values(("wer", "english"))
# accent_df_clean = accent_df_clean.droplevel(0, axis=1)
# accent_df_clean = accent_df_clean.round(3)
# accent_df_clean

In [None]:
# accent_df_clean.T

In [None]:
# models_df_best_clean = models_df_clean[models_df_clean.name == "openai/whisper-medium"]

In [159]:
# accent_df_best_clean = models_df_best_clean.groupby(["name", "accent", "domain"])[["wer"]].mean().reset_index()
# accent_df_best_clean
# accent_df_clean = accent_df_clean.droplevel(0, axis=1)
