In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_metric
import sys
sys.path.append("..")
from src.utils.text_processing import clean_text

In [3]:
# Word-error-rate metric object shared by every WER computation in this notebook.
# NOTE(review): `datasets.load_metric` is deprecated upstream in favour of the
# separate `evaluate` package (`evaluate.load("wer")`) -- confirm the pinned
# `datasets` version still ships it before re-running on a fresh environment.
wer_metric = load_metric("wer")

In [4]:
dev_df = pd.read_csv("../data/intron-dev-public-3231-clean.csv")
print(dev_df.shape)
dev_df.head(3)

(3231, 14)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_path,duration,origin,domain,split,gender
0,155349,659e36c14baaf7fa7bb197f951251f4b,setswana,26-40,BW,We should be asking ourselves whether we want ...,110,86ad3ef21e469217f28a749c990c81fd,/AfriSpeech-100/dev/92d2b94e-3e31-40be-b479-50...,8.400998,african,general,dev,Male
1,60812,1fd5f717cede9a867bf37d03c7d2166b,siswati,26-40,ZA,Other sagittal planes parallel to this off cen...,88,5409734bcc4a0f053e73f405d0f135da,/AfriSpeech-100/dev/127bdba1-8bc8-44a4-9c37-8e...,5.236984,african,clinical,dev,Female
2,139668,f2d08d2e1c47a187bfa6869fadc1f755,setswana,26-40,ZA,Tuberculoma is an intracranial massoccurring s...,109,b10dc4e959b6596d31866e5094ef500a,/AfriSpeech-100/dev/de0b1ee6-46f1-4eb6-a747-47...,7.072993,african,clinical,dev,Male


In [5]:
# Keep only the first row for every audio file. The explicit `.copy()` makes the
# result an independent frame, so adding columns to `dev_df` afterwards does not
# write into a slice of the originally loaded frame (pandas would otherwise emit
# a SettingWithCopyWarning, which the global warning filter above hides).
dev_df = dev_df.drop_duplicates(subset="audio_path").copy()
print(dev_df.shape)

(3227, 14)


In [6]:
# Rewrite the dataset-relative prefix so `audio_paths` can serve as the join key
# against the result file loaded below.
dev_df["audio_paths"] = dev_df["audio_path"].map(
    lambda path: path.replace("/AfriSpeech-100/dev/", "/data/data/intron/")
)
df = pd.read_csv("../results/intron-open-dev-whisper_medium-wer-0.3049-2872.csv").merge(
    dev_df, on="audio_paths"
)
df.shape

(2872, 22)

In [7]:
# (idx, audio_paths) pairs of the merged frame above; `transform_raw_df` joins
# against this table on `idx`.
mapping_df = df.loc[:, ["idx", "audio_paths"]]
mapping_df.head(3)

Unnamed: 0,idx,audio_paths
0,72663,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...
1,153902,/data/data/intron/7ce31a48-f507-4344-bc49-96de...
2,82671,/data/data/intron/b8f6fbc1-426f-4a4b-99db-c38d...


In [22]:
# Handy functions
def _merge_with_dev_metadata(preds, key):
    """Join predictions with `dev_df` on `key`, then with `mapping_df` on `idx`."""
    merged = pd.merge(preds, dev_df, on=key)
    return pd.merge(merged, mapping_df, on="idx")


def transform_raw_df(df_raw, columns=None, fmt="intron_whisper"):
    """Bring a raw prediction file into a common layout and compute per-row WER.

    The raw frame is joined with the notebook-level `dev_df` (metadata) and
    `mapping_df` (idx/audio_paths pairs); both joins are inner joins, so only
    rows present in all three frames survive. `df_raw` itself is not modified.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw predictions. The columns that must be present depend on `fmt`.
    columns : list, optional
        Metadata columns to carry over, placed first in the output.
        Defaults to no extra columns.
    fmt : str
        Layout of `df_raw`:

        * ``"intron_whisper"`` -- join on ``audio_paths``; prediction in ``hypothesis``.
        * ``"african_nlp"``    -- join on ``audio_paths``; prediction in column ``1``.
        * ``"aws"``            -- join on ``idx``; prediction in ``predictions``.
        * ``"azure"``          -- join on ``idx``; prediction in ``predictions_raw``.
        * ``"wav2vec2"``       -- join on ``audio_path``; reference in ``text``,
          prediction in ``predictions_raw``.
        * any other value that is a column of `df_raw` -- join on ``idx``;
          prediction in that column.

    Returns
    -------
    pd.DataFrame
        `columns` followed by ``accent``, ``audio_paths`` (``intron_whisper``) or
        ``user_ids`` (all other formats), ``reference``, ``prediction``, ``wer``.

    Raises
    ------
    ValueError
        If `fmt` is ``None`` or an empty string.
    NotImplementedError
        If `fmt` is neither a known format nor a column of `df_raw`.
    """
    # The original `assert fmt != "" or fmt is not None` was always true and
    # therefore never rejected anything; validate explicitly instead.
    if fmt is None or fmt == "":
        raise ValueError("fmt must be a known format name or a prediction column of df_raw")

    # Avoid a shared mutable default and accept any sequence of column names.
    columns = [] if columns is None else list(columns)

    # Formats joined on `idx` whose prediction column is not named after the format.
    idx_prediction_columns = {"aws": "predictions", "azure": "predictions_raw"}

    if fmt == "intron_whisper":
        merged = _merge_with_dev_metadata(df_raw, key="audio_paths")
        # `accent` exists in both the raw file and dev_df, and `audio_paths` in
        # both the first merge result and mapping_df, hence the `_x` suffixes.
        selected = columns + ["accent_x", "audio_paths_x", "transcript", "hypothesis"]
        renames = {
            "accent_x": "accent",
            "audio_paths_x": "audio_paths",
            "transcript": "reference",
            "hypothesis": "prediction",
        }

    elif fmt == "african_nlp":
        merged = _merge_with_dev_metadata(df_raw, key="audio_paths")
        # These files are read with header=None, so the prediction is column 1.
        selected = columns + ["accent", "user_ids", "transcript", 1]
        renames = {"transcript": "reference", 1: "prediction"}

    elif fmt == "wav2vec2":
        # NOTE(review): the recorded outputs show 2877 rows for this format versus
        # 2872 for the others -- presumably duplicated `audio_path` rows in the raw
        # files are each matched by the join; confirm before comparing mean WERs.
        preds = df_raw.assign(audio_paths=df_raw["audio_path"])
        merged = _merge_with_dev_metadata(
            preds[["audio_paths", "text", "predictions_raw"]], key="audio_paths"
        )
        selected = columns + ["accent", "user_ids", "text", "predictions_raw"]
        renames = {"text": "reference", "predictions_raw": "prediction"}

    elif fmt in idx_prediction_columns or fmt in df_raw.columns:
        prediction_col = idx_prediction_columns.get(fmt, fmt)
        merged = _merge_with_dev_metadata(
            df_raw[["idx", "transcript", prediction_col]], key="idx"
        )
        # `transcript` is in both the raw file and dev_df; keep the raw file's copy.
        selected = columns + ["accent", "user_ids", "transcript_x", prediction_col]
        renames = {"transcript_x": "reference", prediction_col: "prediction"}

    else:
        raise NotImplementedError(
            f"Unsupported fmt {fmt!r}: not a known format and not a column of df_raw"
        )

    # Select + rename returns a fresh frame, so the `wer` assignment below never
    # writes into a slice of `merged`.
    df = merged[selected].rename(columns=renames)

    print(df.shape)
    df["wer"] = df.apply(
        lambda row: wer_metric.compute(predictions=[row.prediction], references=[row.reference]),
        axis=1,
    )
    return df
    


def clean_and_compute_wer(df):
    """Return a copy of `df` with cleaned text and WER recomputed on it.

    Missing predictions are replaced by empty strings, `reference` and
    `prediction` are passed through `clean_text`, and the `wer` column is
    recomputed row by row. The shape of the rows with a missing prediction is
    printed before and after the fill.
    """
    normalized = df.copy()

    missing_before = normalized["prediction"].isnull()
    print(normalized[missing_before].shape)

    normalized["prediction"] = normalized["prediction"].fillna("")
    missing_after = normalized["prediction"].isnull()
    print(normalized[missing_after].shape)

    for text_col in ("reference", "prediction"):
        normalized[text_col] = normalized[text_col].apply(clean_text)

    def _row_wer(row):
        return wer_metric.compute(predictions=[row.prediction], references=[row.reference])

    normalized["wer"] = normalized.apply(_row_wer, axis=1)
    return normalized
    
    
def write_to_folder(model_id_or_path, predictions_df, output_dir="../results/", split="dev"):
    """Write `predictions_df` to a CSV file inside `output_dir`.

    The file name encodes the split, the model identifier, the mean of the
    `wer` column rounded to four decimals, and the number of rows. The index
    is not written.
    """
    mean_wer = round(predictions_df["wer"].mean(), 4)
    n_rows = len(predictions_df)
    file_name = f"intron-open-{split}-{model_id_or_path}-wer-{mean_wer}-{n_rows}.csv"
    predictions_df.to_csv(f"{output_dir}/{file_name}", index=False)

In [45]:
# Create the two output directories written to below (no-op if they already exist).
import os
for results_subdir in ("raw", "normalized"):
    os.makedirs(f"../results/{results_subdir}", exist_ok=True)

# Compute WER

## OpenAI

In [46]:
whspr = pd.read_csv("../results/whisper_wer_asr_dev.csv")
print(whspr.shape)
whspr.head(3)

(3232, 10)


Unnamed: 0.1,Unnamed: 0,idx,user_ids,transcript,whisper_medium,whisper_large,wer,cleanup_references,cleanup_whisper,wer_cleanup
0,0,72663,f559cb4f16bc465ea44b56a8d3b5513e,Ensure the correct placement of the feeding tu...,Ensure the correct placement of the feeding t...,Ensure the correct placement of the feeding t...,0.451613,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,0.290323
1,1,153902,2be9bc423e70a24c703f8336e60af3f3,"Regardlessof how amyocyteis stimulated,however...","Regardless of how amyokites stimulated, howev...","Regardless of how a myocardial stimulation, h...",0.444444,regardlessof how amyocyteis stimulated however...,regardless of how amyokites stimulated howeve...,0.25
2,2,82671,a3c2c182b6b53089ef4bc3eccff103bf,"For Nigeria to achieve similar noble goals, th...","For Nigeria to achieve similar noble goals, t...","For Nigeria to achieve similar noble goals, t...",0.222222,for nigeria to achieve similar noble goals th...,for nigeria to achieve similar noble goals th...,0.027778


In [47]:
columns = ["idx", "domain", "gender", "duration", "age_group"]

### Whisper-Large

In [48]:
whspr_large = transform_raw_df(whspr, columns=columns, fmt="whisper_large")
whspr_large_clean = clean_and_compute_wer(whspr_large)

(2872, 9)
(3, 10)
(0, 10)


In [49]:
whspr_large.head(2)

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,reference,prediction,wer
0,72663,clinical,Female,16.421995,19-25,swahili,f559cb4f16bc465ea44b56a8d3b5513e,Ensure the correct placement of the feeding tu...,Ensure the correct placement of the feeding t...,0.290323
1,153902,clinical,Female,11.393991,19-25,swahili,2be9bc423e70a24c703f8336e60af3f3,"Regardlessof how amyocyteis stimulated,however...","Regardless of how a myocardial stimulation, h...",0.5


In [50]:
whspr_large_clean.head(2)

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,reference,prediction,wer
0,72663,clinical,Female,16.421995,19-25,swahili,f559cb4f16bc465ea44b56a8d3b5513e,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,0.290323
1,153902,clinical,Female,11.393991,19-25,swahili,2be9bc423e70a24c703f8336e60af3f3,"regardlessof how amyocyteis stimulated,however...","regardless of how a myocardial stimulation, ho...",0.5


In [51]:
# WER
print(whspr_large["wer"].mean())
print(whspr_large_clean["wer"].mean())

0.5039740585544689
0.4108835197703194


In [52]:
model_id_or_path = "openai_whisper_large"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_large, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_large_clean,
                output_dir="../results/normalized")

### Whisper-Medium

In [53]:
whspr_medium_raw = pd.read_csv("../results/intron-open-dev-whisper_medium-wer-0.3049-2872.csv")
print(whspr_medium_raw.shape)
whspr_medium_raw.head(2)

(2872, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,Ensure the correct placement of the feeding tu...,Ensure the correct placement of the feeding tu...,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,swahili,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...
1,"Regardless of how amyokites stimulated, howeve...","Regardlessof how amyocyteis stimulated,however...",/data/data/intron/7ce31a48-f507-4344-bc49-96de...,swahili,"regardless of how amyokites stimulated, howeve...","regardlessof how amyocyteis stimulated,however...",regardless of how amyokites stimulated however...,regardlessof how amyocyteis stimulated however...


In [54]:
whspr_medium = transform_raw_df(whspr_medium_raw, columns=columns, fmt="intron_whisper")
whspr_medium_clean = clean_and_compute_wer(whspr_medium)

(2872, 9)
(0, 10)
(0, 10)


In [55]:
# WER
print(whspr_medium["wer"].mean())
print(whspr_medium_clean["wer"].mean())

0.4643886864352219
0.34914988813403375


In [56]:
model_id_or_path = "openai_whisper_medium"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_medium, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_medium_clean,
                output_dir="../results/normalized")

In [57]:
whspr_medium_en_raw = pd.read_csv("../results/intron-open-dev-whisper_medium.en-wer-0.3219-2872.csv")
print(whspr_medium_en_raw.shape)
whspr_medium_en = transform_raw_df(whspr_medium_en_raw, columns=columns, fmt="intron_whisper")
whspr_medium_en_clean = clean_and_compute_wer(whspr_medium_en)

(2872, 8)
(2872, 9)
(0, 10)
(0, 10)


In [58]:
print(whspr_medium_en["wer"].mean())
print(whspr_medium_en_clean["wer"].mean())

0.46137424273526506
0.3678484564644302


In [59]:
model_id_or_path = "openai_whisper_medium_en"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_medium_en, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_medium_en_clean,
                output_dir="../results/normalized")

### Whisper-Small

In [60]:
whspr_small_raw = pd.read_csv("../results/intron-open-dev-whisper_small-wer-0.3743-2872.csv")
print(whspr_small_raw.shape)
whspr_small_raw.head(2)

(2872, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,Ensure the correct placement of the feeding tu...,Ensure the correct placement of the feeding tu...,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,swahili,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...
1,"Regardless of how a moeocatase stimulated, how...","Regardlessof how amyocyteis stimulated,however...",/data/data/intron/7ce31a48-f507-4344-bc49-96de...,swahili,"regardless of how a moeocatase stimulated, how...","regardlessof how amyocyteis stimulated,however...",regardless of how a moeocatase stimulated howe...,regardlessof how amyocyteis stimulated however...


In [61]:
whspr_small = transform_raw_df(whspr_small_raw, columns=columns, fmt="intron_whisper")
whspr_small_clean = clean_and_compute_wer(whspr_small)

(2872, 9)
(0, 10)
(0, 10)


In [62]:
print(whspr_small["wer"].mean())
print(whspr_small_clean["wer"].mean())

0.5337821026197487
0.43133830977562343


In [63]:
model_id_or_path = "openai_whisper_small"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_small, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_small_clean,
                output_dir="../results/normalized")

In [64]:
whspr_small_en_raw = pd.read_csv("../results/intron-open-dev-whisper_small.en-wer-0.383-2872.csv")
print(whspr_small_en_raw.shape)
whspr_small_en = transform_raw_df(whspr_small_en_raw, columns=columns, fmt="intron_whisper")
whspr_small_en_clean = clean_and_compute_wer(whspr_small_en)

(2872, 8)
(2872, 9)
(0, 10)
(0, 10)


In [65]:
print(whspr_small_en["wer"].mean())
print(whspr_small_en_clean["wer"].mean())

0.519713022377241
0.43402733892570033


In [66]:
model_id_or_path = "openai_whisper_small_en"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_small_en, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=whspr_small_en_clean,
                output_dir="../results/normalized")

## Nvidia

### Nemo-conformer-ctc-large

In [67]:
nemo_ctc_raw = pd.read_csv("../results/african-nlp-nemo-ctc-predictons", header=None, delimiter="	")
nemo_ctc_raw["audio_paths"] = nemo_ctc_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
print(nemo_ctc_raw.shape)
nemo_ctc_raw.head(2)

(3227, 3)


Unnamed: 0,0,1,audio_paths
0,/scratch/pbsjobs/axy327/dev/a25d5981-d814-4eef...,construction and sea trials,/data/data/intron/a25d5981-d814-4eef-8637-cc35...
1,/scratch/pbsjobs/axy327/dev/35a17e2f-5b12-4886...,unstable bos fracture of unspecified thoracic ...,/data/data/intron/35a17e2f-5b12-4886-ad2c-c1aa...


In [68]:
nemo_ctc = transform_raw_df(nemo_ctc_raw, columns=columns, fmt="african_nlp")
nemo_ctc_clean = clean_and_compute_wer(nemo_ctc)

(2872, 9)
(2, 10)
(0, 10)


In [69]:
print(nemo_ctc["wer"].mean())
print(nemo_ctc_clean["wer"].mean())

0.6767011051916588
0.5503160383940416


In [70]:
model_id_or_path = "nvidia_nemo_conformer_ctc_large"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=nemo_ctc, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=nemo_ctc_clean,
                output_dir="../results/normalized")

### Nemo-conformer-transducer-large

In [71]:
nemo_transducer_raw = pd.read_csv("../results/african-nlp-nemo-transducer-predictons", header=None, delimiter="	")
nemo_transducer_raw["audio_paths"] = nemo_transducer_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
print(nemo_transducer_raw.shape)
nemo_transducer_raw.head(2)

(3227, 3)


Unnamed: 0,0,1,audio_paths
0,/scratch/pbsjobs/axy327/dev/a25d5981-d814-4eef...,construction and sea trials,/data/data/intron/a25d5981-d814-4eef-8637-cc35...
1,/scratch/pbsjobs/axy327/dev/35a17e2f-5b12-4886...,unstable boss fracture of unspecified thoracic...,/data/data/intron/35a17e2f-5b12-4886-ad2c-c1aa...


In [72]:
nemo_transducer = transform_raw_df(nemo_transducer_raw, columns=columns, fmt="african_nlp")
nemo_transducer_clean = clean_and_compute_wer(nemo_transducer)

(2872, 9)
(6, 10)
(0, 10)


In [73]:
print(nemo_transducer["wer"].mean())
print(nemo_transducer_clean["wer"].mean())

0.6637992751483184
0.5245269422772046


In [74]:
model_id_or_path = "nvidia_nemo_conformer_transducer_large"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=nemo_transducer, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=nemo_transducer_clean,
                output_dir="../results/normalized")

## SpeechBrain

In [75]:
speech_brain_raw = pd.read_csv("../results/african-nlp-speechbrain-predictons", header=None, delimiter="\t")
# Derive the join key from this frame's own path column (column 0). The previous
# version read `nemo_transducer_raw[0]`, which silently depended on another cell
# having run and on both prediction files listing the audio files in the same order.
speech_brain_raw["audio_paths"] = speech_brain_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
print(speech_brain_raw.shape)
speech_brain_raw.head(2)

(3227, 3)


Unnamed: 0,0,1,audio_paths
0,/scratch/pbsjobs/axy327/dev/a25d5981-d814-4eef...,CONSTRUCTION AND C TRIALS,/data/data/intron/a25d5981-d814-4eef-8637-cc35...
1,/scratch/pbsjobs/axy327/dev/35a17e2f-5b12-4886...,STUBBLE BOSS RUPTURE OF OSPREY'S WIFE'S THORAC...,/data/data/intron/35a17e2f-5b12-4886-ad2c-c1aa...


In [76]:
speech_brain = transform_raw_df(speech_brain_raw, columns=columns, fmt="african_nlp")
speech_brain_clean = clean_and_compute_wer(speech_brain)

(2872, 9)
(1, 10)
(0, 10)


In [77]:
print(speech_brain["wer"].mean())
print(speech_brain_clean["wer"].mean())

1.3610988905098031
0.976453580724285


In [78]:
model_id_or_path="speechbrain_crdnn_rnnlm_librispeech"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=speech_brain, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=speech_brain_clean,
                output_dir="../results/normalized")

## AWS

### Medical API (Primary Care)

In [79]:
aws_medical_raw = pd.read_csv("../results/intron-open-dev-aws-transcribe-medical-wer-0.5544-3232.csv")
print(aws_medical_raw.shape)
aws_medical_raw.head(2)

(3232, 15)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,predictions,wer
0,72663,f559cb4f16bc465ea44b56a8d3b5513e,swahili,19-25,KE,Ensure the correct placement of the feeding tu...,206,29af500daee748498b3329352fbc170a,/AfriSpeech-100/dev/e809b58c-4f05-4754-b98c-fb...,16.421995,african,clinical,dev,ensure the correct placement of the feeding tu...,0.451613
1,153902,2be9bc423e70a24c703f8336e60af3f3,swahili,19-25,KE,"Regardlessof how amyocyteis stimulated,however...",143,b87324a3c5ca065494f385e4f49e4b23,/AfriSpeech-100/dev/7ce31a48-f507-4344-bc49-96...,11.393991,african,clinical,dev,"regardless of how a market is stimulated, howe...",0.555556


In [80]:
aws_medical = transform_raw_df(aws_medical_raw, columns=columns, fmt="aws")
aws_medical_clean = clean_and_compute_wer(aws_medical)

(2872, 9)
(1, 10)
(0, 10)


In [81]:
print(aws_medical["wer"].mean())
print(aws_medical_clean["wer"].mean())

0.6347359704426636
0.5358062344446374


In [82]:
model_id_or_path="amazon_aws_transcribe_medical_api_primary_care"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=aws_medical, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=aws_medical_clean,
                output_dir="../results/normalized")

### Non-medical API

In [83]:
aws_raw = pd.read_csv("../results/intron-open-dev-aws-transcribe-wer-0.5212-3232.csv")
print(aws_raw.shape)
aws_raw.head(2)

(3232, 15)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,predictions,wer
0,72663,f559cb4f16bc465ea44b56a8d3b5513e,swahili,19-25,KE,Ensure the correct placement of the feeding tu...,206,29af500daee748498b3329352fbc170a,/AfriSpeech-100/dev/e809b58c-4f05-4754-b98c-fb...,16.421995,african,clinical,dev,ensure the correct placement of the feeding tu...,0.516129
1,153902,2be9bc423e70a24c703f8336e60af3f3,swahili,19-25,KE,"Regardlessof how amyocyteis stimulated,however...",143,b87324a3c5ca065494f385e4f49e4b23,/AfriSpeech-100/dev/7ce31a48-f507-4344-bc49-96...,11.393991,african,clinical,dev,"regardless of how a market is stimulated, howe...",0.555556


In [84]:
aws = transform_raw_df(aws_raw, columns=columns, fmt="aws")
aws_clean = clean_and_compute_wer(aws)

(2872, 9)
(7, 10)
(0, 10)


In [85]:
print(aws["wer"].mean())
print(aws_clean["wer"].mean())

0.5992453253269371
0.49882406211336133


In [86]:
model_id_or_path="amazon_aws_transcribe_api"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=aws, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=aws_clean,
                output_dir="../results/normalized")

## Google

### GCP Speech Non-medical API

In [87]:
gcp_raw = pd.read_csv("../results/gcp_wer_asr_dev.csv")
print(gcp_raw.shape)
gcp_raw.head(2)

(3232, 9)


Unnamed: 0.1,Unnamed: 0,idx,user_ids,transcript,gcp,wer,cleanup_references,cleanup_gcp,wer_cleanup
0,0,72663,f559cb4f16bc465ea44b56a8d3b5513e,Ensure the correct placement of the feeding tu...,cancel the correct placement of the feeding tu...,0.580645,ensure the correct placement of the feeding tu...,cancel the correct placement of the feeding tu...,0.580645
1,1,153902,2be9bc423e70a24c703f8336e60af3f3,"Regardlessof how amyocyteis stimulated,however...",regardless of how American cheese stimulated a...,0.722222,regardlessof how amyocyteis stimulated however...,regardless of how american cheese stimulated a...,0.55


In [88]:
gcp = transform_raw_df(gcp_raw, columns=columns, fmt="gcp")
gcp_clean = clean_and_compute_wer(gcp)

(2872, 9)
(85, 10)
(0, 10)


In [89]:
print(gcp["wer"].mean())
print(gcp_clean["wer"].mean())

0.6566726487334155
0.5993110736614581


In [90]:
model_id_or_path="google_gcp_speech_api"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=gcp, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=gcp_clean,
                output_dir="../results/normalized")

## Microsoft

### Azure

In [91]:
azure_raw = pd.read_csv("../results/intron-open-dev-azure-transcribe-wer-0.5038-2883.csv")
print(azure_raw.shape)
azure_raw.head(2)

(2883, 16)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,predictions_raw,predictions,wer
0,72663,f559cb4f16bc465ea44b56a8d3b5513e,swahili,19-25,KE,Ensure the correct placement of the feeding tu...,206,29af500daee748498b3329352fbc170a,/AfriSpeech-100/dev/e809b58c-4f05-4754-b98c-fb...,16.421995,african,clinical,dev,Ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,0.290323
1,153902,2be9bc423e70a24c703f8336e60af3f3,swahili,19-25,KE,"Regardlessof how amyocyteis stimulated,however...",143,b87324a3c5ca065494f385e4f49e4b23,/AfriSpeech-100/dev/7ce31a48-f507-4344-bc49-96...,11.393991,african,clinical,dev,"Regardless of how America is stimulated, howev...","regardless of how america is stimulated, howev...",0.5


In [92]:
azure = transform_raw_df(azure_raw, columns=columns, fmt="azure")
azure_clean = clean_and_compute_wer(azure)

(2872, 9)
(501, 10)
(0, 10)


In [93]:
print(azure["wer"].mean())
print(azure_clean["wer"].mean())

0.5948180547456207
0.5169678167305342


In [94]:
model_id_or_path="microsoft_azure_speech_api"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=azure, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=azure_clean,
                output_dir="../results/normalized")

### WavLM-libri-clean-100h-large

In [95]:
wavlm_libri_clean_100h_large_raw = pd.read_csv("../results/intron-open-dev-ful-patrickvonplaten-wavlm-libri-clean-100h-large-wer-0.6938-3232.csv")
print(wavlm_libri_clean_100h_large_raw.shape)
wavlm_libri_clean_100h_large_raw.head(2)

(3232, 7)


Unnamed: 0,audio_path,text,reference,predictions,predictions_raw,wer,accent
0,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,Ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,enshured the corect placement of the feiding t...,enshured the corect placement of the feiding t...,0.806452,swahili
1,/data/data/intron/7ce31a48-f507-4344-bc49-96de...,"Regardlessof how amyocyteis stimulated,however...","regardlessof how amyocyteis stimulated,however...",regadless of how amocety stimulated however th...,regadless of how amocety stimulated however th...,0.666667,swahili


In [96]:
wavlm_libri_clean_100h_large = transform_raw_df(wavlm_libri_clean_100h_large_raw, columns=columns, fmt="wav2vec2")
wavlm_libri_clean_100h_large_clean = clean_and_compute_wer(wavlm_libri_clean_100h_large)

(2877, 9)
(1, 10)
(0, 10)


In [97]:
print(wavlm_libri_clean_100h_large["wer"].mean())
print(wavlm_libri_clean_100h_large_clean["wer"].mean())

0.8328127595669096
0.7692137602897249


In [98]:
model_id_or_path="microsoft_wavlm_libri_clean_100h_large"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wavlm_libri_clean_100h_large, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wavlm_libri_clean_100h_large_clean,
                output_dir="../results/normalized")

### WavLM-libri-clean-100h-base-plus

In [99]:
wavlm_libri_clean_100h_base_raw = pd.read_csv("../results/wavLM_wer_asr_dev.csv")
print(wavlm_libri_clean_100h_base_raw.shape)
wavlm_libri_clean_100h_base_raw.head(2)

(3232, 9)


Unnamed: 0.1,Unnamed: 0,idx,user_ids,transcript,wavLM,wer,cleanup_references,cleanup_wavLM,wer_cleanup
0,0,72663,f559cb4f16bc465ea44b56a8d3b5513e,Ensure the correct placement of the feeding tu...,ensu the correct placement of the fiding tube ...,0.903226,ensure the correct placement of the feeding tu...,ensu the correct placement of the fiding tube ...,0.83871
1,1,153902,2be9bc423e70a24c703f8336e60af3f3,"Regardlessof how amyocyteis stimulated,however...",regardless of how amuketisstimulated however t...,0.722222,regardlessof how amyocyteis stimulated however...,regardless of how amuketisstimulated however t...,0.6


In [100]:
wavlm_libri_clean_100h_base = transform_raw_df(wavlm_libri_clean_100h_base_raw, columns=columns, fmt="wavLM")
wavlm_libri_clean_100h_base_clean = clean_and_compute_wer(wavlm_libri_clean_100h_base)

(2872, 9)
(2, 10)
(0, 10)


In [101]:
print(wavlm_libri_clean_100h_base["wer"].mean())
print(wavlm_libri_clean_100h_base_clean["wer"].mean())

0.9022293171522732
0.8549641714400352


In [102]:
model_id_or_path="microsoft_wavlm_libri_clean_100h_base"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wavlm_libri_clean_100h_base, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wavlm_libri_clean_100h_base_clean,
                output_dir="../results/normalized")

## Facebook

### Wav2vec2-large-xlsr-53-english

In [103]:
wav2vec2_large_xlsr_53_english_raw = pd.read_csv("../results/intron-open-dev-ful-jonatasgrosman-wav2vec2-large-xlsr-53-english-wer-0.5606-3232.csv")
print(wav2vec2_large_xlsr_53_english_raw.shape)
wav2vec2_large_xlsr_53_english_raw.head(2)

(3232, 7)


Unnamed: 0,audio_path,text,reference,predictions,predictions_raw,wer,accent
0,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,Ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct replacement of the feeding ...,ensure the correct replacement of the feeding ...,0.709677,swahili
1,/data/data/intron/7ce31a48-f507-4344-bc49-96de...,"Regardlessof how amyocyteis stimulated,however...","regardlessof how amyocyteis stimulated,however...",regardless of how amerchetis stimulated howeve...,regardless of how amerchetis stimulated howeve...,0.611111,swahili


In [104]:
wav2vec2_large_xlsr_53_english = transform_raw_df(wav2vec2_large_xlsr_53_english_raw, columns=columns, fmt="wav2vec2")
wav2vec2_large_xlsr_53_english_clean = clean_and_compute_wer(wav2vec2_large_xlsr_53_english)

(2877, 9)
(1, 10)
(0, 10)


In [105]:
print(wav2vec2_large_xlsr_53_english["wer"].mean())
print(wav2vec2_large_xlsr_53_english_clean["wer"].mean())

0.7152722570406036
0.6398663107153179


In [106]:
model_id_or_path="facebook_wav2vec2_large_xlsr_53_english"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_xlsr_53_english, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_large_xlsr_53_english_clean,
                output_dir="../results/normalized")

### Wav2vec2-xls-r-1b-english

In [107]:
wav2vec2_xls_r_1b_english_raw = pd.read_csv("../results/intron-open-dev-ful-jonatasgrosman-wav2vec2-xls-r-1b-english-wer-0.5761-3232.csv")
print(wav2vec2_xls_r_1b_english_raw.shape)
wav2vec2_xls_r_1b_english_raw.head(2)

(3232, 7)


Unnamed: 0,audio_path,text,reference,predictions,predictions_raw,wer,accent
0,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,Ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct replacement of the feeding ...,ensure the correct replacement of the feeding ...,0.709677,swahili
1,/data/data/intron/7ce31a48-f507-4344-bc49-96de...,"Regardlessof how amyocyteis stimulated,however...","regardlessof how amyocyteis stimulated,however...",regardless of how amceteis stimulated however ...,regardless of how amceteis stimulated however ...,0.611111,swahili


In [108]:
wav2vec2_xls_r_1b_english = transform_raw_df(wav2vec2_xls_r_1b_english_raw, columns=columns, fmt="wav2vec2")
wav2vec2_xls_r_1b_english_clean = clean_and_compute_wer(wav2vec2_xls_r_1b_english)

(2877, 9)
(0, 10)
(0, 10)


In [109]:
print(wav2vec2_xls_r_1b_english["wer"].mean())
print(wav2vec2_xls_r_1b_english_clean["wer"].mean())

0.743025983322594
0.6514921574251561


In [110]:
model_id_or_path="facebook_wav2vec2_xls_r_1b_english"

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_xls_r_1b_english, 
                output_dir="../results/raw")

write_to_folder(model_id_or_path=model_id_or_path, 
                predictions_df=wav2vec2_xls_r_1b_english_clean,
                output_dir="../results/normalized")

### Wav2vec2-large-960h-lv60-self

In [111]:
wav2vec2_large_960h_lv60_self_raw = pd.read_csv("../results/intron-open-dev-ful-facebook-wav2vec2-large-960h-lv60-self-wer-0.5944-3232.csv")
print(wav2vec2_large_960h_lv60_self_raw.shape)
wav2vec2_large_960h_lv60_self_raw.head(2)

(3232, 7)


Unnamed: 0,audio_path,text,reference,predictions,predictions_raw,wer,accent
0,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,Ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct eplacement of the feeding t...,ENSURE THE CORRECT EPLACEMENT OF THE FEEDING T...,0.645161,swahili
1,/data/data/intron/7ce31a48-f507-4344-bc49-96de...,"Regardlessof how amyocyteis stimulated,however...","regardlessof how amyocyteis stimulated,however...",regardless of how amikitas stimulated however ...,REGARDLESS OF HOW AMIKITAS STIMULATED HOWEVER ...,0.666667,swahili


In [112]:
wav2vec2_large_960h_lv60_self = transform_raw_df(wav2vec2_large_960h_lv60_self_raw, columns=columns, fmt="wav2vec2")
wav2vec2_large_960h_lv60_self_clean = clean_and_compute_wer(wav2vec2_large_960h_lv60_self)

(2877, 9)
(1, 10)
(0, 10)


In [113]:
print(wav2vec2_large_960h_lv60_self["wer"].mean())
print(wav2vec2_large_960h_lv60_self_clean["wer"].mean())

1.2229264243916067
0.67118697565761


In [114]:
model_id_or_path = "facebook_wav2vec2_large_960h_lv60_self"

# Export both frames under the same model id: the transformed frame goes to
# results/raw, the cleaned frame to results/normalized.
write_to_folder(
    output_dir="../results/raw",
    predictions_df=wav2vec2_large_960h_lv60_self,
    model_id_or_path=model_id_or_path,
)

write_to_folder(
    output_dir="../results/normalized",
    predictions_df=wav2vec2_large_960h_lv60_self_clean,
    model_id_or_path=model_id_or_path,
)

### Wav2vec2-large-960h

In [115]:
# Load the saved predictions CSV for this model, report its shape,
# and preview the first two rows as the cell output.
wav2vec2_large_960h_raw = pd.read_csv(
    filepath_or_buffer="../results/intron-open-dev-ful-facebook-wav2vec2-large-960h-wer-0.7031-3232.csv"
)
print(wav2vec2_large_960h_raw.shape)
wav2vec2_large_960h_raw.head(n=2)

(3232, 7)


Unnamed: 0,audio_path,text,reference,predictions,predictions_raw,wer,accent
0,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,Ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ENSURE THE CORRECT PLACEMENT OF THE FEEDING TU...,0.741935,swahili
1,/data/data/intron/7ce31a48-f507-4344-bc49-96de...,"Regardlessof how amyocyteis stimulated,however...","regardlessof how amyocyteis stimulated,however...",regardless of how amercitis stimulated however...,REGARDLESS OF HOW AMERCITIS STIMULATED HOWEVER...,0.666667,swahili


In [116]:
# Feed the raw predictions through transform_raw_df (wav2vec2 layout),
# then hand the result to clean_and_compute_wer for the "_clean" variant.
wav2vec2_large_960h = wav2vec2_large_960h_raw.pipe(
    transform_raw_df, columns=columns, fmt="wav2vec2"
)
wav2vec2_large_960h_clean = wav2vec2_large_960h.pipe(clean_and_compute_wer)

(2877, 9)
(1, 10)
(0, 10)


In [117]:
# Mean of the "wer" column before and after clean_and_compute_wer,
# printed one value per line.
# NOTE(review): the saved run shows the first mean above 1.0 -- confirm
# whether a few outlier rows are inflating it.
print(
    wav2vec2_large_960h["wer"].mean(),
    wav2vec2_large_960h_clean["wer"].mean(),
    sep="\n",
)

1.246550763360166
0.789551281262618


In [118]:
model_id_or_path = "facebook_wav2vec2_large_960h"

# Export both frames under the same model id: the transformed frame goes to
# results/raw, the cleaned frame to results/normalized.
write_to_folder(
    output_dir="../results/raw",
    predictions_df=wav2vec2_large_960h,
    model_id_or_path=model_id_or_path,
)

write_to_folder(
    output_dir="../results/normalized",
    predictions_df=wav2vec2_large_960h_clean,
    model_id_or_path=model_id_or_path,
)

### Wav2vec2-large-robust-ft-swbd-300h

In [119]:
# Load the saved predictions CSV for this model, report its shape,
# and preview the first two rows as the cell output.
wav2vec2_large_robust_ft_swbd_300h_raw = pd.read_csv(
    filepath_or_buffer="../results/intron-open-dev-ful-facebook-wav2vec2-large-robust-ft-swbd-300h-wer-0.7749-3232.csv"
)
print(wav2vec2_large_robust_ft_swbd_300h_raw.shape)
wav2vec2_large_robust_ft_swbd_300h_raw.head(n=2)

(3232, 7)


Unnamed: 0,audio_path,text,reference,predictions,predictions_raw,wer,accent
0,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,Ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the corextplacement of the feoding tube...,ENSURE THE COREXTPLACEMENT OF THE FEODING TUBE...,0.83871,swahili
1,/data/data/intron/7ce31a48-f507-4344-bc49-96de...,"Regardlessof how amyocyteis stimulated,however...","regardlessof how amyocyteis stimulated,however...",regardless of how amercatis stimulated however...,REGARDLESS OF HOW AMERCATIS STIMULATED HOWEVER...,0.777778,swahili


In [120]:
# Feed the raw predictions through transform_raw_df (wav2vec2 layout),
# then hand the result to clean_and_compute_wer for the "_clean" variant.
wav2vec2_large_robust_ft_swbd_300h = wav2vec2_large_robust_ft_swbd_300h_raw.pipe(
    transform_raw_df, columns=columns, fmt="wav2vec2"
)
wav2vec2_large_robust_ft_swbd_300h_clean = wav2vec2_large_robust_ft_swbd_300h.pipe(
    clean_and_compute_wer
)

(2877, 9)
(7, 10)
(0, 10)


In [121]:
# Mean of the "wer" column before and after clean_and_compute_wer,
# printed one value per line.
# NOTE(review): the saved run shows the first mean above 1.0 -- confirm
# whether a few outlier rows are inflating it.
print(
    wav2vec2_large_robust_ft_swbd_300h["wer"].mean(),
    wav2vec2_large_robust_ft_swbd_300h_clean["wer"].mean(),
    sep="\n",
)

1.2880591945755344
0.8636325335980687


In [122]:
model_id_or_path = "facebook_wav2vec2_large_robust_ft_swbd_300h"

# Export both frames under the same model id: the transformed frame goes to
# results/raw, the cleaned frame to results/normalized.
write_to_folder(
    output_dir="../results/raw",
    predictions_df=wav2vec2_large_robust_ft_swbd_300h,
    model_id_or_path=model_id_or_path,
)

write_to_folder(
    output_dir="../results/normalized",
    predictions_df=wav2vec2_large_robust_ft_swbd_300h_clean,
    model_id_or_path=model_id_or_path,
)

### Hubert-large-ls960-ft

In [123]:
# Load the saved Hubert results CSV, report its shape,
# and preview the first two rows as the cell output.
hubert_large_ls960_ft_raw = pd.read_csv(
    filepath_or_buffer="../results/hubert_wer_asr_dev.csv"
)
print(hubert_large_ls960_ft_raw.shape)
hubert_large_ls960_ft_raw.head(n=2)

(3232, 10)


Unnamed: 0.1,Unnamed: 0,idx,user_ids,transcript,hubert_large,hubert_xlarge,wer,cleanup_references,cleanup_hubert,wer_cleanup
0,0,72663,f559cb4f16bc465ea44b56a8d3b5513e,Ensure the correct placement of the feeding tu...,ENSURE THE CORRECT PLACEMENT OF THE FIDING TUB...,ENSURE THE CORRECT PLACEMENT OF THE FEEDING TU...,0.677419,ensure the correct placement of the feeding tu...,ensure the correct placement of the fiding tub...,0.580645
1,1,153902,2be9bc423e70a24c703f8336e60af3f3,"Regardlessof how amyocyteis stimulated,however...",REGARDLESS OF HOW AMECITIS STIMULATED HOWEVER ...,REGARDLESS OF HOW A MCIT IS STIMULATED HOWEVER...,0.722222,regardlessof how amyocyteis stimulated however...,regardless of how amecitis stimulated however ...,0.5


In [124]:
# Feed the raw Hubert frame through transform_raw_df using the
# "hubert_large" format, then hand the result to clean_and_compute_wer
# for the "_clean" variant.
hubert_large_ls960_ft = hubert_large_ls960_ft_raw.pipe(
    transform_raw_df, columns=columns, fmt="hubert_large"
)
hubert_large_ls960_ft_clean = hubert_large_ls960_ft.pipe(clean_and_compute_wer)

(2872, 9)
(1, 10)
(0, 10)


In [125]:
# Mean of the "wer" column before and after clean_and_compute_wer,
# printed one value per line.
# NOTE(review): the saved run shows the first mean above 1.0 -- confirm
# whether a few outlier rows are inflating it.
print(
    hubert_large_ls960_ft["wer"].mean(),
    hubert_large_ls960_ft_clean["wer"].mean(),
    sep="\n",
)

1.2269235515829158
0.7085538560634761


In [126]:
model_id_or_path = "facebook_hubert_large_ls960_ft"

# Export both frames under the same model id: the transformed frame goes to
# results/raw, the cleaned frame to results/normalized.
write_to_folder(
    output_dir="../results/raw",
    predictions_df=hubert_large_ls960_ft,
    model_id_or_path=model_id_or_path,
)

write_to_folder(
    output_dir="../results/normalized",
    predictions_df=hubert_large_ls960_ft_clean,
    model_id_or_path=model_id_or_path,
)

### Hubert-xlarge-ls960-ft

In [127]:
# Load the Hubert results CSV (the same file the Hubert-large section reads),
# report its shape, and preview the first two rows as the cell output.
hubert_xlarge_ls960_ft_raw = pd.read_csv(
    filepath_or_buffer="../results/hubert_wer_asr_dev.csv"
)
print(hubert_xlarge_ls960_ft_raw.shape)
hubert_xlarge_ls960_ft_raw.head(n=2)

(3232, 10)


Unnamed: 0.1,Unnamed: 0,idx,user_ids,transcript,hubert_large,hubert_xlarge,wer,cleanup_references,cleanup_hubert,wer_cleanup
0,0,72663,f559cb4f16bc465ea44b56a8d3b5513e,Ensure the correct placement of the feeding tu...,ENSURE THE CORRECT PLACEMENT OF THE FIDING TUB...,ENSURE THE CORRECT PLACEMENT OF THE FEEDING TU...,0.677419,ensure the correct placement of the feeding tu...,ensure the correct placement of the fiding tub...,0.580645
1,1,153902,2be9bc423e70a24c703f8336e60af3f3,"Regardlessof how amyocyteis stimulated,however...",REGARDLESS OF HOW AMECITIS STIMULATED HOWEVER ...,REGARDLESS OF HOW A MCIT IS STIMULATED HOWEVER...,0.722222,regardlessof how amyocyteis stimulated however...,regardless of how amecitis stimulated however ...,0.5


In [128]:
# Feed the raw Hubert frame through transform_raw_df using the
# "hubert_xlarge" format, then hand the result to clean_and_compute_wer
# for the "_clean" variant.
hubert_xlarge_ls960_ft = hubert_xlarge_ls960_ft_raw.pipe(
    transform_raw_df, columns=columns, fmt="hubert_xlarge"
)
hubert_xlarge_ls960_ft_clean = hubert_xlarge_ls960_ft.pipe(clean_and_compute_wer)

(2872, 9)
(1, 10)
(0, 10)


In [129]:
# Mean of the "wer" column before and after clean_and_compute_wer,
# printed one value per line.
# NOTE(review): the saved run shows the first mean above 1.0 -- confirm
# whether a few outlier rows are inflating it.
print(
    hubert_xlarge_ls960_ft["wer"].mean(),
    hubert_xlarge_ls960_ft_clean["wer"].mean(),
    sep="\n",
)

1.2361624783348384
0.7148852567015267


In [130]:
model_id_or_path = "facebook_hubert_xlarge_ls960_ft"

# Export both frames under the same model id: the transformed frame goes to
# results/raw, the cleaned frame to results/normalized.
write_to_folder(
    output_dir="../results/raw",
    predictions_df=hubert_xlarge_ls960_ft,
    model_id_or_path=model_id_or_path,
)

write_to_folder(
    output_dir="../results/normalized",
    predictions_df=hubert_xlarge_ls960_ft_clean,
    model_id_or_path=model_id_or_path,
)

# Other statistics

- compute per-accent statistics
- compute per-gender statistics
- compute per-domain statistics
- compute per-age-group statistics