In [1]:
import warnings
warnings.filterwarnings("ignore")

In [76]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_metric
import sys
sys.path.append("..")
from src.utils.text_processing import clean_text

In [3]:
# Load the word-error-rate metric once; transform_raw_df and
# clean_and_compute_wer call wer_metric.compute(...) for every row.
wer_metric = load_metric("wer")

In [4]:
# Load the dev-split metadata table, then show its shape and first rows.
dev_df = pd.read_csv("../data/intron-dev-public-3231-clean.csv")
print(dev_df.shape)
dev_df.head(3)

(3231, 14)


Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_path,duration,origin,domain,split,gender
0,155349,659e36c14baaf7fa7bb197f951251f4b,setswana,26-40,BW,We should be asking ourselves whether we want ...,110,86ad3ef21e469217f28a749c990c81fd,/AfriSpeech-100/dev/92d2b94e-3e31-40be-b479-50...,8.400998,african,general,dev,Male
1,60812,1fd5f717cede9a867bf37d03c7d2166b,siswati,26-40,ZA,Other sagittal planes parallel to this off cen...,88,5409734bcc4a0f053e73f405d0f135da,/AfriSpeech-100/dev/127bdba1-8bc8-44a4-9c37-8e...,5.236984,african,clinical,dev,Female
2,139668,f2d08d2e1c47a187bfa6869fadc1f755,setswana,26-40,ZA,Tuberculoma is an intracranial massoccurring s...,109,b10dc4e959b6596d31866e5094ef500a,/AfriSpeech-100/dev/de0b1ee6-46f1-4eb6-a747-47...,7.072993,african,clinical,dev,Male


In [5]:
# Drop rows that repeat an audio_path, keeping the first occurrence of each.
dev_df = dev_df.drop_duplicates(subset="audio_path")
print(dev_df.shape)

(3227, 14)


In [6]:
# Re-root every clip path from the dataset prefix to the prefix used inside
# the results files, so both tables can be joined on "audio_paths".
# assign() builds a new frame instead of writing a column into the
# de-duplicated slice (no chained-assignment warning, safe to re-run), and
# the vectorised, literal str.replace avoids a Python-level lambda per row.
dev_df = dev_df.assign(
    audio_paths=dev_df["audio_path"].str.replace(
        "/AfriSpeech-100/dev/", "/data/data/intron/", regex=False
    )
)
df = pd.read_csv("../results/intron-open-dev-whisper_medium-wer-0.3049-2872.csv")
df = pd.merge(df, dev_df, on="audio_paths")
df.shape

(2872, 22)

In [7]:
# Lookup table pairing each utterance idx with its re-rooted audio path;
# transform_raw_df joins it back in on "idx".
mapping_df = df[["idx", "audio_paths"]]
mapping_df.head(3)

Unnamed: 0,idx,audio_paths
0,72663,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...
1,153902,/data/data/intron/7ce31a48-f507-4344-bc49-96de...
2,82671,/data/data/intron/b8f6fbc1-426f-4a4b-99db-c38d...


In [91]:
# Handy functions
def transform_raw_df(df_raw, columns=None, fmt="intron"):
    """Join raw model output with the dev-set metadata and score every row.

    The raw frame is merged with the module-level ``dev_df`` (on a key that
    depends on ``fmt``) and then with ``mapping_df`` on ``"idx"``.  The result
    is reduced to ``columns`` plus a format-specific set of columns that are
    renamed to a common schema, and a per-row ``"wer"`` column is added using
    the module-level ``wer_metric``.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw predictions in the layout named by ``fmt``.
    columns : list of str, optional
        Extra metadata columns to keep, placed first in the output.
        Defaults to no extra columns.
    fmt : str
        One of ``"intron"``, ``"whisper_large"``, ``"whisper_medium"`` or
        ``"african-nlp"``.

    Returns
    -------
    pandas.DataFrame
        ``columns`` followed by ``accent``, then ``audio_paths`` (``"intron"``)
        or ``user_ids`` (all other formats), then ``reference``,
        ``prediction`` and ``wer``.

    Raises
    ------
    ValueError
        If ``fmt`` is not one of the supported layouts.
    """
    # fmt -> (key used to join df_raw with dev_df,
    #         {column name after the merges: column name in the output}).
    # The dict order is the output column order.
    layouts = {
        "intron": (
            "audio_paths",
            {
                "accent_x": "accent",
                "audio_paths_x": "audio_paths",
                "transcript": "reference",
                "hypothesis": "prediction",
            },
        ),
        "whisper_large": (
            "idx",
            {
                "accent": "accent",
                "user_ids_x": "user_ids",
                "transcript_x": "reference",
                "whisper_large": "prediction",
            },
        ),
        "whisper_medium": (
            "idx",
            {
                "accent": "accent",
                "user_ids_x": "user_ids",
                "transcript_x": "reference",
                "whisper_medium": "prediction",
            },
        ),
        "african-nlp": (
            "audio_paths",
            {
                "accent": "accent",
                "user_ids": "user_ids",
                "transcript": "reference",
                1: "prediction",  # headerless prediction files: column 1 holds the text
            },
        ),
    }

    # Validate before touching any data so a typo in `fmt` gives a clear
    # error instead of an UnboundLocalError further down.
    if fmt not in layouts:
        raise ValueError(
            f"Unknown fmt {fmt!r}; expected one of {sorted(layouts)}"
        )
    join_key, renames = layouts[fmt]
    meta_columns = list(columns) if columns is not None else []

    df = pd.merge(df_raw, dev_df, on=join_key)
    df = pd.merge(df, mapping_df, on="idx")
    if join_key == "idx":
        # The idx-joined (whisper_*) layouts also report the merged shape.
        print(df.shape)

    # Select and rename in one step; rename() returns a new frame, so the
    # "wer" assignment below never writes into a slice of the merged frame.
    df = df[meta_columns + list(renames)].rename(columns=renames)
    print(df.shape)

    df["wer"] = df.apply(
        lambda row: wer_metric.compute(
            predictions=[row.prediction], references=[row.reference]
        ),
        axis=1,
    )
    return df
    


def clean_and_compute_wer(df):
    """Return a copy of ``df`` with cleaned text and a recomputed ``wer``.

    Missing predictions become empty strings, ``clean_text`` is applied to the
    ``reference`` and ``prediction`` columns, and WER is recomputed row by row
    with the module-level ``wer_metric``.  The shape of the sub-frame with a
    null prediction is printed before and after the fill.  ``df`` itself is
    not modified.
    """
    def _row_wer(row):
        # WER of a single prediction/reference pair.
        return wer_metric.compute(predictions=[row.prediction],
                                  references=[row.reference])

    cleaned = df.copy()

    missing_before = cleaned["prediction"].isnull()
    print(cleaned[missing_before].shape)

    cleaned["prediction"] = cleaned["prediction"].fillna("")
    missing_after = cleaned["prediction"].isnull()
    print(cleaned[missing_after].shape)

    # Same normalisation for both text columns, reference first.
    for text_col in ("reference", "prediction"):
        cleaned[text_col] = cleaned[text_col].apply(clean_text)

    cleaned["wer"] = cleaned.apply(_row_wer, axis=1)
    return cleaned
    
    
def write_to_folder(model_id_or_path, predictions_df, output_dir="../results/clean", split="dev"):
    """Write ``predictions_df`` to a CSV whose name records its mean WER and size.

    The file is written to
    ``{output_dir}/intron-open-{split}-{model_id_or_path}-wer-{mean wer rounded
    to 4 places}-{row count}.csv`` without the index.  The target directory is
    created when it does not exist yet.

    Parameters
    ----------
    model_id_or_path : str
        Label embedded in the file name.
    predictions_df : pandas.DataFrame
        Frame to export; must contain a ``"wer"`` column.
    output_dir : str
        Directory the file is written into.
    split : str
        Split name embedded in the file name.
    """
    import os  # local import: only needed to create the output directory

    wer = predictions_df["wer"].mean()
    output_path = f"{output_dir}/intron-open-{split}-{model_id_or_path}-wer-{round(wer, 4)}-{len(predictions_df)}.csv"

    # to_csv refuses to write into a missing directory, so create it first.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    predictions_df.to_csv(output_path, index=False)

# Whisper models

In [9]:
# Load the combined Whisper results and discard the index column that was
# saved into the CSV.
whspr = pd.read_csv("../results/whisper_wer_asr_dev.csv")
whspr = whspr.drop("Unnamed: 0", axis="columns")
print(whspr.shape)
whspr.head(3)

(3232, 9)


Unnamed: 0,idx,user_ids,transcript,whisper_medium,whisper_large,wer,cleanup_references,cleanup_whisper,wer_cleanup
0,72663,f559cb4f16bc465ea44b56a8d3b5513e,Ensure the correct placement of the feeding tu...,Ensure the correct placement of the feeding t...,Ensure the correct placement of the feeding t...,0.451613,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,0.290323
1,153902,2be9bc423e70a24c703f8336e60af3f3,"Regardlessof how amyocyteis stimulated,however...","Regardless of how amyokites stimulated, howev...","Regardless of how a myocardial stimulation, h...",0.444444,regardlessof how amyocyteis stimulated however...,regardless of how amyokites stimulated howeve...,0.25
2,82671,a3c2c182b6b53089ef4bc3eccff103bf,"For Nigeria to achieve similar noble goals, th...","For Nigeria to achieve similar noble goals, t...","For Nigeria to achieve similar noble goals, t...",0.222222,for nigeria to achieve similar noble goals th...,for nigeria to achieve similar noble goals th...,0.027778


In [10]:
# Metadata columns that transform_raw_df keeps, in this order, at the front
# of every per-model results frame.
columns = ["idx", "domain", "gender", "duration", "age_group"]

## Large

In [11]:
# Score the Whisper-large predictions as-is, then again after text clean-up.
# Prints the merged and final shapes, then the null-prediction sub-frame
# shape before and after filling.
whspr_large = transform_raw_df(whspr, columns=columns, fmt="whisper_large")
whspr_large_clean = clean_and_compute_wer(whspr_large)

(2872, 24)
(2872, 9)
(3, 10)
(0, 10)


In [12]:
# Preview the Whisper-large frame scored on the original text.
whspr_large.head(2)

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,reference,prediction,wer
0,72663,clinical,Female,16.421995,19-25,swahili,f559cb4f16bc465ea44b56a8d3b5513e,Ensure the correct placement of the feeding tu...,Ensure the correct placement of the feeding t...,0.290323
1,153902,clinical,Female,11.393991,19-25,swahili,2be9bc423e70a24c703f8336e60af3f3,"Regardlessof how amyocyteis stimulated,however...","Regardless of how a myocardial stimulation, h...",0.5


In [13]:
# Preview the Whisper-large frame after clean_text was applied.
whspr_large_clean.head(2)

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,reference,prediction,wer
0,72663,clinical,Female,16.421995,19-25,swahili,f559cb4f16bc465ea44b56a8d3b5513e,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,0.290323
1,153902,clinical,Female,11.393991,19-25,swahili,2be9bc423e70a24c703f8336e60af3f3,"regardlessof how amyocyteis stimulated,however...","regardless of how a myocardial stimulation, ho...",0.5


In [29]:
# Mean WER for Whisper-large: original text first, cleaned text second.
# NOTE: this is the average of per-utterance WER values (each row is scored
# on its own), not a single WER computed over the pooled corpus.
print(whspr_large["wer"].mean())
print(whspr_large_clean["wer"].mean())

0.5039740585544689
0.4108835197703194


In [115]:
# Export the Whisper-large results scored on the original text; the file
# name produced by write_to_folder embeds the mean WER and row count.
predictions_df = whspr_large
model_id_or_path = "whisper_large"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [116]:
# Export the Whisper-large results scored on the cleaned text.
predictions_df = whspr_large_clean
model_id_or_path = "whisper_large_clean"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

## Medium

In [68]:

# NOTE(review): scratch copy of the "african-nlp" branch of transform_raw_df.
# It reads nemo_ctc_raw, which is only loaded in the Nemo section further
# down, so it cannot run in top-to-bottom order -- consider deleting this cell.
df = pd.merge(nemo_ctc_raw, dev_df, on="audio_paths")
# The second merge must continue from `df`; it previously read `adf`, a name
# that is not defined anywhere in the notebook.
df = pd.merge(df, mapping_df, on="idx")

df = df[columns+["accent", "user_ids", "transcript", 1]]
df.loc[:, "reference"] = df.loc[:, "transcript"]
df.loc[:, "prediction"] = df.loc[:, 1]
df = df.drop(columns=[1, "transcript"])

print(df.shape)

(2872, 9)


In [16]:
# Load the Whisper-medium results file (hypothesis/reference text keyed by
# audio_paths) and preview it.
whspr_medium_raw = pd.read_csv("../results/intron-open-dev-whisper_medium-wer-0.3049-2872.csv")
print(whspr_medium_raw.shape)
whspr_medium_raw.head(2)

(2872, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,Ensure the correct placement of the feeding tu...,Ensure the correct placement of the feeding tu...,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,swahili,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...
1,"Regardless of how amyokites stimulated, howeve...","Regardlessof how amyocyteis stimulated,however...",/data/data/intron/7ce31a48-f507-4344-bc49-96de...,swahili,"regardless of how amyokites stimulated, howeve...","regardlessof how amyocyteis stimulated,however...",regardless of how amyokites stimulated however...,regardlessof how amyocyteis stimulated however...


In [17]:
# Score the Whisper-medium predictions as-is, then again after text clean-up.
whspr_medium = transform_raw_df(whspr_medium_raw, columns=columns, fmt="intron")
whspr_medium_clean = clean_and_compute_wer(whspr_medium)

(2872, 10)
(0, 11)
(0, 11)


In [30]:
# Mean per-utterance WER for Whisper-medium: original text, then cleaned text.
print(whspr_medium["wer"].mean())
print(whspr_medium_clean["wer"].mean())

0.4643886864352219
0.34914988813403375


In [113]:
# Export the Whisper-medium results scored on the original text.
predictions_df = whspr_medium
model_id_or_path = "whisper_medium"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [114]:
# Export the Whisper-medium results scored on the cleaned text.
predictions_df = whspr_medium_clean
model_id_or_path = "whisper_medium_clean"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [27]:
# Load the Whisper-medium.en results, then score them on the original text
# and again after text clean-up.
whspr_medium_en_raw = pd.read_csv("../results/intron-open-dev-whisper_medium.en-wer-0.3219-2872.csv")
print(whspr_medium_en_raw.shape)
whspr_medium_en = transform_raw_df(whspr_medium_en_raw, columns=columns, fmt="intron")
whspr_medium_en_clean = clean_and_compute_wer(whspr_medium_en)

(2872, 8)
(2872, 10)
(0, 11)
(0, 11)


In [28]:
# Mean per-utterance WER for Whisper-medium.en: original text, then cleaned text.
print(whspr_medium_en["wer"].mean())
print(whspr_medium_en_clean["wer"].mean())

0.46137424273526506
0.3678484564644302


In [111]:
# Export the Whisper-medium.en results scored on the original text.
predictions_df = whspr_medium_en
model_id_or_path = "whisper_medium_en"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [112]:
# Export the Whisper-medium.en results scored on the cleaned text.
predictions_df = whspr_medium_en_clean
model_id_or_path = "whisper_medium_en_clean"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

## Small

In [20]:
# Load the Whisper-small results file and preview it.
whspr_small_raw = pd.read_csv("../results/intron-open-dev-whisper_small-wer-0.3743-2872.csv")
print(whspr_small_raw.shape)
whspr_small_raw.head(2)

(2872, 8)


Unnamed: 0,hypothesis,reference,audio_paths,accent,pred_clean,ref_clean,hypothesis_clean,reference_clean
0,Ensure the correct placement of the feeding tu...,Ensure the correct placement of the feeding tu...,/data/data/intron/e809b58c-4f05-4754-b98c-fbf2...,swahili,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...,ensure the correct placement of the feeding tu...
1,"Regardless of how a moeocatase stimulated, how...","Regardlessof how amyocyteis stimulated,however...",/data/data/intron/7ce31a48-f507-4344-bc49-96de...,swahili,"regardless of how a moeocatase stimulated, how...","regardlessof how amyocyteis stimulated,however...",regardless of how a moeocatase stimulated howe...,regardlessof how amyocyteis stimulated however...


In [21]:
# Score the Whisper-small predictions as-is, then again after text clean-up.
whspr_small = transform_raw_df(whspr_small_raw, columns=columns, fmt="intron")
whspr_small_clean = clean_and_compute_wer(whspr_small)

(2872, 10)
(0, 11)
(0, 11)


In [31]:
# Mean per-utterance WER for Whisper-small: original text, then cleaned text.
print(whspr_small["wer"].mean())
print(whspr_small_clean["wer"].mean())

0.5337821026197487
0.43133830977562343


In [110]:
# Export the Whisper-small results scored on the original text.
predictions_df = whspr_small
model_id_or_path = "whisper_small"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [109]:
# Export the Whisper-small results scored on the cleaned text.
predictions_df = whspr_small_clean
model_id_or_path = "whisper_small_clean"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [24]:
# Load the Whisper-small.en results, then score them on the original text
# and again after text clean-up.
whspr_small_en_raw = pd.read_csv("../results/intron-open-dev-whisper_small.en-wer-0.383-2872.csv")
print(whspr_small_en_raw.shape)
whspr_small_en = transform_raw_df(whspr_small_en_raw, columns=columns, fmt="intron")
whspr_small_en_clean = clean_and_compute_wer(whspr_small_en)

(2872, 8)
(2872, 10)
(0, 11)
(0, 11)


In [25]:
# Mean per-utterance WER for Whisper-small.en: original text, then cleaned text.
print(whspr_small_en["wer"].mean())
print(whspr_small_en_clean["wer"].mean())

0.519713022377241
0.43402733892570033


In [107]:
# Export the Whisper-small.en results scored on the original text.
predictions_df = whspr_small_en
model_id_or_path = "whisper_small_en"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [106]:
# Export the Whisper-small.en results scored on the cleaned text.
predictions_df = whspr_small_en_clean
model_id_or_path = "whisper_small_en_clean"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

# Nemo

## Nemo-conformer-ctc-large

In [50]:
# Headerless, tab-separated predictions: column 0 is the audio path on the
# inference machine, column 1 the predicted text.  The delimiter is spelled
# as the "\t" escape so it stays visible and survives editors that convert
# tabs to spaces.
nemo_ctc_raw = pd.read_csv("../results/african-nlp-nemo-ctc-predictons", header=None, delimiter="\t")
# Re-root the paths so they match dev_df["audio_paths"].
nemo_ctc_raw["audio_paths"] = nemo_ctc_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
print(nemo_ctc_raw.shape)
nemo_ctc_raw.head(2)

(3227, 3)


Unnamed: 0,0,1,audio_paths
0,/scratch/pbsjobs/axy327/dev/a25d5981-d814-4eef...,construction and sea trials,/data/data/intron/a25d5981-d814-4eef-8637-cc35...
1,/scratch/pbsjobs/axy327/dev/35a17e2f-5b12-4886...,unstable bos fracture of unspecified thoracic ...,/data/data/intron/35a17e2f-5b12-4886-ad2c-c1aa...


In [71]:
# Score the Nemo CTC predictions as-is, then again after text clean-up.
# "african-nlp" is the transform_raw_df layout for these headerless
# prediction files; the previous value "nemo" is not a branch of that
# function and fails on a fresh run.
nemo_ctc = transform_raw_df(nemo_ctc_raw, columns=columns, fmt="african-nlp")
nemo_ctc_clean = clean_and_compute_wer(nemo_ctc)

(2872, 9)
(2, 10)
(0, 10)


In [72]:
# Mean per-utterance WER for Nemo conformer-CTC: original text, then cleaned text.
print(nemo_ctc["wer"].mean())
print(nemo_ctc_clean["wer"].mean())

0.6767011051916588
0.5503160383940416


In [105]:
# Export the Nemo conformer-CTC results scored on the original text.
predictions_df = nemo_ctc
model_id_or_path = "nemo_conformer_ctc_large"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [104]:
# Export the Nemo conformer-CTC results scored on the cleaned text.
predictions_df = nemo_ctc_clean
model_id_or_path = "nemo_conformer_ctc_large_clean"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

## Nemo-conformer-transducer-large

In [73]:
# Headerless, tab-separated predictions: column 0 is the audio path on the
# inference machine, column 1 the predicted text.  The delimiter is spelled
# as the "\t" escape so it stays visible and survives editors that convert
# tabs to spaces.
nemo_transducer_raw = pd.read_csv("../results/african-nlp-nemo-transducer-predictons", header=None, delimiter="\t")
# Re-root the paths so they match dev_df["audio_paths"].
nemo_transducer_raw["audio_paths"] = nemo_transducer_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
print(nemo_transducer_raw.shape)
nemo_transducer_raw.head(2)

(3227, 3)


Unnamed: 0,0,1,audio_paths
0,/scratch/pbsjobs/axy327/dev/a25d5981-d814-4eef...,construction and sea trials,/data/data/intron/a25d5981-d814-4eef-8637-cc35...
1,/scratch/pbsjobs/axy327/dev/35a17e2f-5b12-4886...,unstable boss fracture of unspecified thoracic...,/data/data/intron/35a17e2f-5b12-4886-ad2c-c1aa...


In [92]:
# Score the Nemo transducer predictions as-is, then again after text clean-up.
nemo_transducer = transform_raw_df(nemo_transducer_raw, columns=columns, fmt="african-nlp")
nemo_transducer_clean = clean_and_compute_wer(nemo_transducer)

(2872, 9)
(6, 10)
(0, 10)


In [75]:
# Mean per-utterance WER for Nemo conformer-transducer: original text, then cleaned text.
print(nemo_transducer["wer"].mean())
print(nemo_transducer_clean["wer"].mean())

0.6637992751483184
0.5245269422772046


In [102]:
# Export the Nemo conformer-transducer results scored on the original text.
predictions_df = nemo_transducer
model_id_or_path = "nemo_conformer_transducer_large"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [103]:
# Export the Nemo conformer-transducer results scored on the cleaned text.
predictions_df = nemo_transducer_clean
model_id_or_path = "nemo_conformer_transducer_large_clean"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

# SpeechBrain

In [90]:
# Headerless, tab-separated predictions: column 0 is the audio path on the
# inference machine, column 1 the predicted text.  The delimiter is spelled
# as the "\t" escape so it stays visible and survives editors that convert
# tabs to spaces.
speech_brain_raw = pd.read_csv("../results/african-nlp-speechbrain-predictons", header=None, delimiter="\t")
# Derive audio_paths from this frame's own path column.  It previously read
# nemo_transducer_raw[0], which only gives the right paths if both files list
# the clips in exactly the same row order, and needlessly ties this cell to
# the Nemo section.
speech_brain_raw["audio_paths"] = speech_brain_raw[0].apply(lambda x: x.replace("/scratch/pbsjobs/axy327/dev/", "/data/data/intron/"))
print(speech_brain_raw.shape)
speech_brain_raw.head(2)

(3227, 3)


Unnamed: 0,0,1,audio_paths
0,/scratch/pbsjobs/axy327/dev/a25d5981-d814-4eef...,CONSTRUCTION AND C TRIALS,/data/data/intron/a25d5981-d814-4eef-8637-cc35...
1,/scratch/pbsjobs/axy327/dev/35a17e2f-5b12-4886...,STUBBLE BOSS RUPTURE OF OSPREY'S WIFE'S THORAC...,/data/data/intron/35a17e2f-5b12-4886-ad2c-c1aa...


In [93]:
# Score the SpeechBrain predictions as-is, then again after text clean-up.
speech_brain = transform_raw_df(speech_brain_raw, columns=columns, fmt="african-nlp")
speech_brain_clean = clean_and_compute_wer(speech_brain)

(2872, 9)
(1, 10)
(0, 10)


In [94]:
# Mean per-utterance WER for SpeechBrain: original text, then cleaned text.
# NOTE(review): the previewed raw predictions are upper-case, which
# presumably inflates the first figure -- confirm against clean_text.
print(speech_brain["wer"].mean())
print(speech_brain_clean["wer"].mean())

1.3610988905098031
0.976453580724285


In [100]:
# Export the SpeechBrain results scored on the original text.
predictions_df=speech_brain
model_id_or_path="speechbrain_crdnn_rnnlm_librispeech"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

In [101]:
# Export the SpeechBrain results scored on the cleaned text.
predictions_df=speech_brain_clean
model_id_or_path="speechbrain_crdnn_rnnlm_librispeech_clean"
write_to_folder(model_id_or_path=model_id_or_path, predictions_df=predictions_df)

# AWS

## Medical

In [96]:
# Load the AWS Transcribe Medical results file and report its shape.
aws_medical_raw = pd.read_csv("../results/intron-open-dev-aws-transcribe-medical-wer-0.5544-3232.csv")
print(aws_medical_raw.shape)

(3232, 15)


In [97]:
# Preview the AWS Transcribe Medical results.
aws_medical_raw.head(2)

Unnamed: 0,idx,user_ids,accent,age_group,country,transcript,nchars,audio_ids,audio_paths,duration,origin,domain,split,predictions,wer
0,72663,f559cb4f16bc465ea44b56a8d3b5513e,swahili,19-25,KE,Ensure the correct placement of the feeding tu...,206,29af500daee748498b3329352fbc170a,/AfriSpeech-100/dev/e809b58c-4f05-4754-b98c-fb...,16.421995,african,clinical,dev,ensure the correct placement of the feeding tu...,0.451613
1,153902,2be9bc423e70a24c703f8336e60af3f3,swahili,19-25,KE,"Regardlessof how amyocyteis stimulated,however...",143,b87324a3c5ca065494f385e4f49e4b23,/AfriSpeech-100/dev/7ce31a48-f507-4344-bc49-96...,11.393991,african,clinical,dev,"regardless of how a market is stimulated, howe...",0.555556


# Other statistics

- compute per-accent statistics
- compute per-gender statistics
- compute per-domain statistics
- compute per-age-group statistics