In [1]:
import pandas as pd
import ast
pd.set_option('display.max_colwidth', 0)

import json

In [2]:
# All the pretrained model results are stored here
pretrained_df = pd.read_csv("../results/normalized/intron-open-test-all_models.csv")
print(pretrained_df.shape)

# This is finetuned model result
finetuned_df = pd.read_csv("../results/normalized/intron-open-test-all_models_finetuned.csv")
print(finetuned_df.shape)

# Only consider general domain samples
pretrained_df = pretrained_df[pretrained_df.domain == "general"]

finetuned_df = finetuned_df[finetuned_df.domain == "general"]

print(len(pretrained_df), len(finetuned_df))

(103949, 15)
(32826, 15)
44916 14184


In [3]:
pretrained_df["model_tag"] = "pretrained"
finetuned_df["model_tag"] = "finetuned"

In [4]:
# df = pd.read_csv("../results/ner/intron-test-public-6346-clean_with_named_entity.csv")

# # only consider samples in pretrained df
# df_general = df_general[df_general.idx.isin(pretrained_df.idx.unique())]

# df_entity_ = df_general[df_general.has_entity == 1].reset_index(drop=True)

# df_no_entity = df_general[df_general.entities_group.isna()].reset_index(drop=True)

# # Only consider samples with named entities and contains either location or person named entities
# df_entity = df_entity_[(df_entity_.PER.notna()) | (df_entity_.LOC.notna())].reset_index(drop=True)

In [5]:
def get_dfs(csv):
    df = pd.read_csv(csv)
    
    df_general = df[df.domain == "general"]

    df_entity_ = df_general[df_general.has_entity == 1].reset_index(drop=True)

    df_no_entity = df_general[df_general.entities_group.isna()].reset_index(drop=True)

    # Only consider samples with named entities and contains either location or person named entities
    df_entity = df_entity_[(df_entity_.PER.notna()) | (df_entity_.LOC.notna())].reset_index(drop=True)
    
    return df_general, df_no_entity, df_entity, df_entity_, df

In [6]:
csv = "../results/ner/intron-test-public-6346-clean_with_named_entity.csv"
df_general, df_no_entity, df_entity, df_entity_, df = get_dfs(csv)

In [7]:
# only consider samples in pretrained df
df_general = df_general[df_general.idx.isin(pretrained_df.idx.unique())]
df_entity = df_entity[df_entity.idx.isin(pretrained_df.idx.unique())]
df_no_entity = df_no_entity[df_no_entity.idx.isin(pretrained_df.idx.unique())]
df_entity_ = df_entity_[df_entity_.idx.isin(pretrained_df.idx.unique())]

In [8]:
df_entity_.shape, df_entity.shape, df_no_entity.shape, df_general.shape, df.shape

((1156, 21), (971, 21), (1029, 21), (2364, 21), (6346, 21))

In [296]:
# Only focus on sentences with named entities 
pretrained_df_entity = pretrained_df[pretrained_df["idx"].isin(df_entity["idx"].to_list())]
pretrained_df_no_entity = pretrained_df[pretrained_df["idx"].isin(df_no_entity["idx"].to_list())]

print(pretrained_df_entity.shape, pretrained_df_no_entity.shape)

finetuned_df_entity = finetuned_df[finetuned_df["idx"].isin(df_entity["idx"].to_list())]
finetuned_df_no_entity = finetuned_df[finetuned_df["idx"].isin(df_no_entity["idx"].to_list())]

print(finetuned_df_entity.shape, finetuned_df_no_entity.shape)

(18449, 16) (19551, 16)
(5826, 16) (6174, 16)


In [297]:
pretrained_df_entity.groupby(["name"])["idx"].apply(lambda x: len(x))

name
AWS                                            971
AWS [Medical] (Primary Care)                   971
Azure                                          971
GCP                                            971
GCP [Medical]                                  971
facebook/hubert-large-ls960-ft                 971
facebook/hubert-xlarge-ls960-ft                971
facebook/wav2vec2-large-960h                   971
facebook/wav2vec2-large-960h-lv60-self         971
facebook/wav2vec2-large-robust-ft-swbd-300h    971
facebook/wav2vec2-large-xlsr-53-english        971
facebook/wav2vec2-xls-r-1b-english             971
microsoft/wavlm-libri-clean-100h-base          971
microsoft/wavlm-libri-clean-100h-large         971
openai/whisper-large                           971
openai/whisper-medium                          971
openai/whisper-medium-en                       971
openai/whisper-small                           971
openai/whisper-small-en                        971
Name: idx, dtype: int64

In [298]:
pretrained_df_no_entity.groupby(["name"])["idx"].apply(lambda x: len(x))

name
AWS                                            1029
AWS [Medical] (Primary Care)                   1029
Azure                                          1029
GCP                                            1029
GCP [Medical]                                  1029
facebook/hubert-large-ls960-ft                 1029
facebook/hubert-xlarge-ls960-ft                1029
facebook/wav2vec2-large-960h                   1029
facebook/wav2vec2-large-960h-lv60-self         1029
facebook/wav2vec2-large-robust-ft-swbd-300h    1029
facebook/wav2vec2-large-xlsr-53-english        1029
facebook/wav2vec2-xls-r-1b-english             1029
microsoft/wavlm-libri-clean-100h-base          1029
microsoft/wavlm-libri-clean-100h-large         1029
openai/whisper-large                           1029
openai/whisper-medium                          1029
openai/whisper-medium-en                       1029
openai/whisper-small                           1029
openai/whisper-small-en                        1029
Name: i

In [299]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained(
    "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
)
model = AutoModelForTokenClassification.from_pretrained(
    "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [300]:
# Replace model without prediction with empty string
pretrained_df_entity.loc[:, "prediction"] = pretrained_df_entity.loc[:, "prediction"].fillna("")
finetuned_df_entity.loc[:, "prediction"] = finetuned_df_entity.loc[:, "prediction"].fillna("")

pretrained_df_no_entity.loc[:, "prediction"] = pretrained_df_no_entity.loc[:, "prediction"].fillna("")
finetuned_df_no_entity.loc[:, "prediction"] = finetuned_df_no_entity.loc[:, "prediction"].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pretrained_df_entity.loc[:, "prediction"] = pretrained_df_entity.loc[:, "prediction"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finetuned_df_entity.loc[:, "prediction"] = finetuned_df_entity.loc[:, "prediction"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pretrained_df

In [301]:
pretrained_df_entity.shape, finetuned_df_entity.shape, pretrained_df_no_entity.shape, finetuned_df_no_entity.shape

((18449, 16), (5826, 16), (19551, 16), (6174, 16))

### Extract named entities

In [302]:
# pretty slow

import os

n = len(pretrained_df_entity) + len(finetuned_df_entity)
# use old result
n = 27950
save_path = f"../results/normalized/intron-open-test-all_models_filtered_ner-{n}.csv"

if not os.path.exists(save_path):
    pretrained_df_entity["entities_group_prediction"] = pretrained_df_entity.apply(lambda x: nlp.group_entities(nlp(x["prediction"])), axis=1)
    finetuned_df_entity["entities_group_prediction"] = finetuned_df_entity.apply(lambda x: nlp.group_entities(nlp(x["prediction"])), axis=1)

    pretrained_df_entity["model_tag"] = "pretrained"
    finetuned_df_entity["model_tag"] = "finetuned"
    
    all_model_entity = pd.concat([pretrained_df_entity, finetuned_df_entity], axis=0)
    all_model_entity = all_model_entity.reset_index(drop=True)
    
    all_model_entity = all_model_entity.join(df_entity[["idx", "entities_group"]].set_index("idx"), on="idx")
    
    assert n == len(all_model_entity)
    all_model_entity.to_csv(save_path, index=None)

else:
    all_model_entity = pd.read_csv(save_path)
    if "Unnamed: 0" in all_model_entity.columns:
        all_model_entity= all_model_entity.drop(columns=["Unnamed: 0"])
        
    all_model_entity.loc[:, "entities_group"] = all_model_entity["entities_group"].apply(json.loads)
    all_model_entity.loc[:, "entities_group_prediction"] = all_model_entity["entities_group_prediction"].apply(ast.literal_eval)
    
    all_model_entity = all_model_entity[all_model_entity.idx.isin(df_general.idx.unique())]
    

In [228]:
all_model_no_entity = pd.concat([pretrained_df_no_entity, finetuned_df_no_entity], axis=0)
all_model_no_entity = all_model_no_entity.reset_index(drop=True)

all_model_no_entity = all_model_no_entity.join(df_no_entity[["idx", "entities_group"]].set_index("idx"), on="idx")

In [229]:
len(all_model_entity.idx.unique()), len(all_model_entity.name.unique()), len(all_model_no_entity.idx.unique()), len(all_model_no_entity.name.unique())

(971, 25, 1029, 25)

In [230]:
all_model_no_entity[["reference", "prediction", "entities_group"]].sample(10)

Unnamed: 0,reference,prediction,entities_group
12191,"innovation will be as much a matter of imagination and networking, as technological development.",innovation will be as moth ibbeta of imagination and networking comer as technological development osto,
22119,"it partly read, we condemn this incident in the strongest term and call on the authorities to investigate this tragedy and ensure that all the culprits are brought to book.","a partly read, we condent this incident into strongest tem and come on the authorities to investigate this trauma and ensure that all the corporates are brought to book.",
13876,when the next pandemic hits and can anyone doubt that it will?,what the next pandemic ets and can anyone doubt that itt wil quiston mak,
15462,life will hide you for your protection,life will hide you for your protection,
14790,she was informed the stricken man lived in a housing unit under her supervision.,she was informed the stricken man lived in a housing unit under supervision.,
22864,why would a non-profit pay people hundreds of thousands in salaries is beyond my understanding.,why wid a nonprofit pay people hundreds of thousands an salaries is beyond my understanding.,
8567,on the goals she wishes her brand will accomplish:,on the goals she wishes ire burned with accomplished colong,
24086,we are working on elevating the condition of our secondary healthcare facilities.,we are working on elevating the condition of our secondary healthcare facilities.,
18360,primary and secondary schools,primary and secondary schools.,
24698,"he stated, some were discriminated against while others were completely disenfranchised and their teeming supporters, who were amongst the delegates, could not cast their votes while the exercise lasted.","e stated, some were discluminated against while others were completely disemfrenchised and their teamin supportors, who were amongt the deligates, could not cause their voats while the exercise last.",


### Extract named entities for the models based on set threshold

Note:

The named entities for the reference is already thresholded.
It makes sense to threshold the extracted named entities from the predicted models as well
but this should be double-checked with the team

In [231]:
all_model_entity["entities_group"][0][0]

{'entity_group': 'PER',
 'score': 0.9998998641967773,
 'word': 'Tinubu',
 'start': 0,
 'end': 6}

In [232]:
all_model_entity["entities_group_prediction"][0]

[{'entity_group': 'PER',
  'score': 0.768043,
  'word': 'tan',
  'start': 0,
  'end': 3}]

In [233]:
## Extract entities

threshhold=0.8

all_model_entity.loc[:, "ner_cat"] = all_model_entity.loc[\
                                                          :, "entities_group"].apply(\
                                                          lambda x: "".join(\
                                                                            [x[i]["word"].lower().strip() \
                                                                            for i in range(len(x)) \
                                                                            if x[i]["entity_group"] == "PER" \
                                                                            or x[i]["entity_group"] == "LOC" \
                                                                            or x[i]["entity_group"] == "ORG"
                                                                            ]))

all_model_entity.loc[:, "ner_cat_prediction"] = all_model_entity.loc[\
                                                                     :, "entities_group_prediction"].apply(\
                                                                     lambda x: "".join(\
                                                                                     [x[i]["word"].lower().strip() \
                                                                                      for i in range(len(x)) \
                                                                                      if 
                                                                                      (x[i]["entity_group"] == "PER" \
                                                                                      or x[i]["entity_group"] == "LOC" \
                                                                                      or x[i]["entity_group"] == "ORG") \
                                                                                      and x[i]["score"] >= threshhold
                                                                                    ]))

In [234]:
len(all_model_entity.idx.unique()), \
len(all_model_entity[all_model_entity.loc[:, "ner_cat_prediction"] != ""])

(971, 10900)

In [235]:
# join the entities with space
all_model_entity.loc[:, "ner_cat"] = all_model_entity.loc[:, "ner_cat"].apply(lambda x: "".join(x.split(" ")))
all_model_entity.loc[:, "ner_cat_prediction"] = all_model_entity.loc[\
                                                                     :, "ner_cat_prediction"].apply(\
                                                                     lambda x: "".join(x.split(" ")))

In [236]:
all_model_entity.loc[:, "ner_cat_prediction"][all_model_entity.loc[:, "ner_cat_prediction"] != ""].sample(5)

16392    elebay                                                        
21603    mokolodinetralreservegaboronegamereservemanyelanonggamereserve
19226    berlin                                                        
11382    contagoracity                                                 
22533    douggomez                                                     
Name: ner_cat_prediction, dtype: object

In [237]:
example_pred_ner_less_than_threshold = all_model_entity[all_model_entity.loc[:, "entities_group_prediction"].apply(lambda x: any([True if len(x) >0 and (x[i]["entity_group"] == "PER" \
                                                                                      or x[i]["entity_group"] == "LOC" \
                                                                                      or x[i]["entity_group"] == "ORG") \
                                                                                      and x[i]["score"] < threshhold \
                                                          else False for i in range(len(x))]))]

In [238]:
example_pred_ner_less_than_threshold[["reference", "prediction", "ner_cat", "ner_cat_prediction", "entities_group", "entities_group_prediction"]][example_pred_ner_less_than_threshold.loc[:, "ner_cat_prediction"] != ""].sample(3)

Unnamed: 0,reference,prediction,ner_cat,ner_cat_prediction,entities_group,entities_group_prediction
26198,"i think youre going to see quite a bit change, superintendent tony sanders said.","i think youre going to see quite a big change, supretendent to nisander said.",tonysanders,ni,"[{'entity_group': 'PER', 'score': 0.9999992251396179, 'word': 'Tony Sanders', 'start': 62, 'end': 74}]","[{'entity_group': 'PER', 'score': 0.9965587, 'word': 'ni', 'start': 63, 'end': 65}, {'entity_group': 'PER', 'score': 0.59518075, 'word': 'sander', 'start': 65, 'end': 71}]"
1366,"children chinweizu ojo and bukola were found last night wandering the streets unattended after their mother and father, went missing while returning from work at eket","children chinouzu, ojo and bukola were found last night wandering the streets unattended after their mother and father, went missing while returning from work at ekhet.",chinweizuojobukolaeket,chinouzuobukolaekhet,"[{'entity_group': 'PER', 'score': 0.9999948740005493, 'word': 'Chinweizu Ojo', 'start': 9, 'end': 22}, {'entity_group': 'PER', 'score': 0.9999982118606567, 'word': 'Bu', 'start': 27, 'end': 29}, {'entity_group': 'PER', 'score': 0.9968841671943665, 'word': 'kola', 'start': 29, 'end': 33}, {'entity_group': 'DATE', 'score': 0.9991463422775269, 'word': 'last night', 'start': 45, 'end': 55}, {'entity_group': 'LOC', 'score': 0.994438648223877, 'word': 'Eket', 'start': 162, 'end': 166}]","[{'entity_group': 'PER', 'score': 0.9999354, 'word': '', 'start': 9, 'end': 10}, {'entity_group': 'PER', 'score': 0.9980495, 'word': 'chinouzu', 'start': 9, 'end': 17}, {'entity_group': 'PER', 'score': 0.999899, 'word': 'o', 'start': 19, 'end': 20}, {'entity_group': 'PER', 'score': 0.7785832, 'word': 'jo', 'start': 20, 'end': 22}, {'entity_group': 'PER', 'score': 0.80091864, 'word': 'bukola', 'start': 27, 'end': 33}, {'entity_group': 'DATE', 'score': 0.9997582, 'word': 'last night', 'start': 45, 'end': 55}, {'entity_group': 'LOC', 'score': 0.996971, 'word': 'ekhet', 'start': 162, 'end': 167}]"
7808,this morning chimdiomimi came to feyisayo hospital in buguma to see dr zauna,dis morning chim dio mini came to fenishayo hospital in bukuma to see doctor zona,chimdiomimifeyisayohospitalbugumazauna,yohospitalbukumazona,"[{'entity_group': 'DATE', 'score': 0.9419776201248169, 'word': 'This morning', 'start': 0, 'end': 12}, {'entity_group': 'PER', 'score': 0.999982476234436, 'word': 'Chimdiomimi', 'start': 13, 'end': 24}, {'entity_group': 'ORG', 'score': 0.9999440908432007, 'word': 'Feyisayo hospital', 'start': 33, 'end': 50}, {'entity_group': 'LOC', 'score': 0.9994399547576904, 'word': 'Buguma', 'start': 54, 'end': 60}, {'entity_group': 'PER', 'score': 0.9999699592590332, 'word': 'Zauna', 'start': 71, 'end': 76}]","[{'entity_group': 'DATE', 'score': 0.9964913, 'word': 'dis morning', 'start': 0, 'end': 11}, {'entity_group': 'ORG', 'score': 0.69308424, 'word': 'fenisha', 'start': 34, 'end': 41}, {'entity_group': 'LOC', 'score': 0.8139883, 'word': 'yo hospital', 'start': 41, 'end': 52}, {'entity_group': 'LOC', 'score': 0.9903598, 'word': 'bukuma', 'start': 56, 'end': 62}, {'entity_group': 'PER', 'score': 0.99104685, 'word': 'zona', 'start': 77, 'end': 81}]"


In [239]:
example_pred_ner_less_than_threshold[["reference", "prediction", "ner_cat", "ner_cat_prediction", "entities_group", "entities_group_prediction"]][example_pred_ner_less_than_threshold.loc[:, "ner_cat_prediction"] == ""].sample(5)

Unnamed: 0,reference,prediction,ner_cat,ner_cat_prediction,entities_group,entities_group_prediction
9507,"he said zimbabwes new government, which came to power after mugabes ousting, posed an unusual and extraordinary threat to his country.",s coma which came to power after mogabby's autstin comma posed an unusual and extraordinary treat to his country frsto,zimbabwemugabe,,"[{'entity_group': 'LOC', 'score': 0.9892527461051941, 'word': 'Zimbabwe', 'start': 8, 'end': 16}, {'entity_group': 'PER', 'score': 0.9999889731407166, 'word': 'Mugabe', 'start': 60, 'end': 66}]","[{'entity_group': 'PER', 'score': 0.54825103, 'word': 'mo', 'start': 33, 'end': 35}]"
9151,"he said zimbabwes new government, which came to power after mugabes ousting, posed an unusual and extraordinary threat to his country.",he sated imburbwis new government which came to power after mocabus ousting posed as unusual and extraordinary threat to his country,zimbabwemugabe,,"[{'entity_group': 'LOC', 'score': 0.9892527461051941, 'word': 'Zimbabwe', 'start': 8, 'end': 16}, {'entity_group': 'PER', 'score': 0.9999889731407166, 'word': 'Mugabe', 'start': 60, 'end': 66}]","[{'entity_group': 'PER', 'score': 0.73481476, 'word': 'moc', 'start': 60, 'end': 63}]"
11629,s southwest service provides monday-saturday rail service at the ashburn railroad station.,s southwest a viporise mande heivmont atridiges avis at deo ashborn areli redos tichan vuftav,southwestserviceashburnrailroadstation,,"[{'entity_group': 'ORG', 'score': 0.9999750256538391, 'word': 'SouthWest Service', 'start': 2, 'end': 19}, {'entity_group': 'DATE', 'score': 0.9999855160713196, 'word': 'Monday-Saturday', 'start': 29, 'end': 44}, {'entity_group': 'LOC', 'score': 0.9116502404212952, 'word': 'Ashburn railroad station', 'start': 65, 'end': 89}]","[{'entity_group': 'LOC', 'score': 0.721553, 'word': 'southwest', 'start': 2, 'end': 11}, {'entity_group': 'LOC', 'score': 0.72869045, 'word': 'he', 'start': 29, 'end': 31}, {'entity_group': 'LOC', 'score': 0.7487573, 'word': 'iv', 'start': 31, 'end': 33}]"
23958,"nova is married to ada de la cruz , a dominican beauty pageant champion .","niver is married to adadelacus, a dominant andpeutic vagentian.",novaadadelacruz,,"[{'entity_group': 'PER', 'score': 0.999997615814209, 'word': 'Nova', 'start': 0, 'end': 4}, {'entity_group': 'PER', 'score': 0.9999973773956299, 'word': 'Ada de la Cruz', 'start': 19, 'end': 33}]","[{'entity_group': 'PER', 'score': 0.7609201, 'word': 'niver', 'start': 0, 'end': 5}]"
24925,"tamara has also starred in the independent film limbo lounge, directed by tom pankratz.",sama has also staired in the independent fielt leimbo lonc directed by tron sancrat.,tamaratompankratz,,"[{'entity_group': 'PER', 'score': 0.9999775290489197, 'word': 'Tamara', 'start': 0, 'end': 6}, {'entity_group': 'PER', 'score': 0.9999905824661255, 'word': 'Tom Pankratz', 'start': 74, 'end': 86}]","[{'entity_group': 'PER', 'score': 0.7509741, 'word': 'sama', 'start': 0, 'end': 4}]"


In [240]:
all_model_entity[["reference", "prediction", "ner_cat", "ner_cat_prediction", "entities_group", "entities_group_prediction"]][\
                all_model_entity.loc[:, "ner_cat_prediction"] != ""].sample(5)

Unnamed: 0,reference,prediction,ner_cat,ner_cat_prediction,entities_group,entities_group_prediction
14559,"the san onofre nuclear generating station songs, located on the pacific coast about five miles southeast of san clemente, is one such site.",this san onifri neucla genertin station songgs command loocated on the pacific cooust about five miles south est of saint clemens comand is one such site fulsto,sanonofrenucleargeneratingstationsongsthepacificcoastsoutheastofsanclemente,sanonifrineuclasouthestofsaintclemenscomand,"[{'entity_group': 'ORG', 'score': 0.8928154706954956, 'word': 'San Onofre Nuclear Generating Station SONGS', 'start': 4, 'end': 47}, {'entity_group': 'LOC', 'score': 0.6259666085243225, 'word': 'the', 'start': 60, 'end': 63}, {'entity_group': 'LOC', 'score': 0.9874609112739563, 'word': 'Pacific coast', 'start': 64, 'end': 77}, {'entity_group': 'LOC', 'score': 0.9604066014289856, 'word': 'southeast of San Clemente', 'start': 95, 'end': 120}]","[{'entity_group': 'ORG', 'score': 0.98967266, 'word': 'san onifri neucla', 'start': 5, 'end': 22}, {'entity_group': 'LOC', 'score': 0.89931655, 'word': 'south est of saint clemens comand', 'start': 103, 'end': 136}]"
8833,zlatan is well-known for his rap abilities and very different adlibs.,latan is well known for is roppaabilities on very different uglips fosto,zlatan,latan,"[{'entity_group': 'PER', 'score': 0.999998927116394, 'word': 'Zlatan', 'start': 0, 'end': 6}]","[{'entity_group': 'PER', 'score': 0.9884328, 'word': 'la', 'start': 0, 'end': 2}, {'entity_group': 'PER', 'score': 0.9494498, 'word': 'tan', 'start': 2, 'end': 5}]"
8599,"chimazuru came down with abarshi disease, and was treated with jalloh drug at batagarawa hospital on 27/03/1986",chi mazo came down with abashi disease and was trita with jalo druk at basa gaawa hospitu on twenty sevenths march nineteen eighty six,chimazurubatagarawahospital,basagaawahospitu,"[{'entity_group': 'PER', 'score': 0.9999211430549622, 'word': 'Chimazuru', 'start': 0, 'end': 9}, {'entity_group': 'ORG', 'score': 0.9994169473648071, 'word': 'Batagarawa hospital', 'start': 78, 'end': 97}, {'entity_group': 'DATE', 'score': 0.9999960064888, 'word': '27/03/1986', 'start': 101, 'end': 111}]","[{'entity_group': 'ORG', 'score': 0.98729205, 'word': 'basa gaawa hospitu', 'start': 71, 'end': 89}, {'entity_group': 'DATE', 'score': 0.9999958, 'word': 'twenty sevenths march nineteen eighty six', 'start': 93, 'end': 134}]"
640,"the elderly clergy man, revd akanu ekeoma developed a heart attack","the elderly clergyman, reverend akanu ekeoma, develop a heart attack.",akanuekeoma,reverendakanuekeoma,"[{'entity_group': 'PER', 'score': 0.9999970197677612, 'word': 'Akanu Ekeoma', 'start': 29, 'end': 41}]","[{'entity_group': 'PER', 'score': 0.96008724, 'word': 'reverend akanu ekeoma', 'start': 23, 'end': 44}]"
1666,spend at least 100 and this cheesy toronto institution will deliver a custom order straight to your muskoka cottage every friday.,spend at least 100 and this cheesy toronto institution will deliver a custom or that streets your muscocha cottage every friday.,torontomuskoka,toronto,"[{'entity_group': 'LOC', 'score': 0.9999924898147583, 'word': 'Toronto', 'start': 35, 'end': 42}, {'entity_group': 'LOC', 'score': 0.9999896287918091, 'word': 'Muskoka', 'start': 100, 'end': 107}, {'entity_group': 'DATE', 'score': 0.9842779040336609, 'word': 'Friday', 'start': 122, 'end': 128}]","[{'entity_group': 'LOC', 'score': 0.93624175, 'word': 'toronto', 'start': 35, 'end': 42}, {'entity_group': 'DATE', 'score': 0.9112494, 'word': 'friday', 'start': 121, 'end': 127}]"


In [241]:
df_names = pd.read_csv("../data/African_names/List of Nigerian names - Sheet1.csv")
df_names2 = pd.read_csv("../data/African_names/mmc2-igbo names.csv", delimiter=";")
df_names3 = pd.read_csv("../data/African_names/AfricanNamesDatabase.csv")
df_names_list = df_names.YORUBA.str.lower().to_list() + df_names.HAUSA.str.lower().to_list()+ \
                    df_names.IBO.str.lower().to_list() + df_names2["Igbo Names"].str.lower().to_list() 
#                    + \ df_names3.Name.str.lower().to_list()
df_names_list = df_names_list
print(len(df_names_list))

# afri ner
all_model_entity.loc[:, "afri_ner_cat"] = all_model_entity.loc[\
                                                               :, "entities_group"].apply(\
                                                                lambda x: "".join(\
                                                                [j.lower().strip() \
                                                                for i in range(len(x)) \
                                                                if x[i]["entity_group"] == "PER" \
                                                                for j in x[i]["word"].split()
                                                                if j.lower() in df_names_list]
                                                               ))
all_model_entity.loc[:, "afri_ner_cat_prediction"] = all_model_entity.loc[\
                                                                        :, "entities_group_prediction"].apply(\
                                                                        lambda x: "".join(\
                                                                        [j.lower().strip() \
                                                                        for i in range(len(x)) \
                                                                        if x[i]["entity_group"] == "PER" \
#                                                                         if x[i]["word"].lower() in df_names_list]
                                                                        for j in x[i]["word"].split()
                                                                        if j.lower() in df_names_list]
                                                                        ))



all_model_entity.loc[:, "afri_ner_cat"] = all_model_entity.loc[\
                                                               :, "afri_ner_cat"].apply(\
                                                               lambda x: "".join(x.split(" ")))
all_model_entity.loc[:, "afri_ner_cat_prediction"] = all_model_entity.loc[\
                                                                          :, "afri_ner_cat_prediction"].apply(\
                                                                          lambda x: "".join(x.split(" ")))

# df_entity_PER.loc[:, "afri_ner_count"] = df_entity_PER.loc[:, "entities_group"].
# apply(lambda x: len([x[i]["word"] for i in range(len(x)) if x[i]["entity_group"] == "PER" 
#                      if x[i]["word"].lower() in df_names_list]))


all_model_entity.loc[:, "afri_ner_count"] = all_model_entity.loc[:, "entities_group"].apply(\
                                            lambda x: len([j \
                                            for i in range(len(x)) \
                                            if x[i]["entity_group"] == "PER" \
                                            for j in x[i]["word"].split()
                                            if j.lower() in df_names_list]
#                                             if x[i]["word"].lower() in df_names_list]             
                                            ))

all_model_entity.loc[:, "afri_ner"] = all_model_entity.loc[:, "entities_group"].apply(\
                                            lambda x: [j \
                                            for i in range(len(x)) \
                                            if x[i]["entity_group"] == "PER" \
                                            for j in x[i]["word"].split()
                                            if j.lower() in df_names_list]
#                                             if x[i]["word"].lower() in df_names_list]             
                                            )


2462


In [242]:
len(all_model_entity[all_model_entity.loc[:, "afri_ner_cat"] != ""].idx.unique()), \
len(all_model_entity[all_model_entity.loc[:, "afri_ner_cat_prediction"] != ""])

(229, 793)

In [243]:
len(all_model_entity[all_model_entity["afri_ner_count"] >= 1].idx.unique())

229

In [244]:
xx = all_model_entity[all_model_entity.loc[:, "afri_ner_count"] >= 1]
xx = xx[~xx.duplicated(subset=["idx"])]
xx.afri_ner_count.sum()

376

In [245]:
xx[["reference", "prediction", "entities_group", "entities_group_prediction", "afri_ner", "afri_ner_count"]].sample(5)

Unnamed: 0,reference,prediction,entities_group,entities_group_prediction,afri_ner,afri_ner_count
755,"dr ewaoluwa doyinsola speaking. next line. fever is high grade, intermittent, temporarily relieved by paracetamol. next line",dr. ewao luwa doing solar speaking. next line. fever is high grade intermittent coma. temporarily relieved by paracetamol. next line.,"[{'entity_group': 'PER', 'score': 0.9995025992393494, 'word': 'Ewaoluwa Doyinsola', 'start': 3, 'end': 21}]","[{'entity_group': 'PER', 'score': 0.99760866, 'word': 'ewao luwa', 'start': 4, 'end': 13}]","[Ewaoluwa, Doyinsola]",2
694,"seeing how the ghost of museveni was walking majestically round the balogun stadium, the buhari group was said to have ran round to jerk the slush fund to 3,000.","seeing how the ghost of missy venny was walking madesi kali round the balogu stadium, the buhari group was said to have ran around to check the slush fund to 3,000.","[{'entity_group': 'PER', 'score': 0.9999949932098389, 'word': 'Museveni', 'start': 24, 'end': 32}, {'entity_group': 'LOC', 'score': 0.9999313354492188, 'word': 'Balogun Stadium', 'start': 68, 'end': 83}, {'entity_group': 'PER', 'score': 0.9996496438980103, 'word': 'Buhari', 'start': 89, 'end': 95}]","[{'entity_group': 'PER', 'score': 0.9252866, 'word': 'miss venny', 'start': 24, 'end': 35}, {'entity_group': 'LOC', 'score': 0.99739695, 'word': 'balogu stadium', 'start': 70, 'end': 84}, {'entity_group': 'PER', 'score': 0.7316538, 'word': 'bu', 'start': 90, 'end': 92}, {'entity_group': 'ORG', 'score': 0.5671392, 'word': 'hari', 'start': 92, 'end': 96}]",[Buhari],1
635,they delivered a baby girl on 27-07-2024 who they named uwaezuoke sade ebidougha and itunuoluwa,"they delivered the baby girl on 27 july 2024, who they named oasu ok shadeh ebuduga and itunubu luwa.","[{'entity_group': 'DATE', 'score': 0.9999915361404419, 'word': '27-07-2024', 'start': 30, 'end': 40}, {'entity_group': 'PER', 'score': 0.9998416304588318, 'word': 'Uwaezuoke Sade Ebidougha', 'start': 56, 'end': 80}, {'entity_group': 'PER', 'score': 0.999657392501831, 'word': 'Itunuoluwa', 'start': 85, 'end': 95}]","[{'entity_group': 'DATE', 'score': 0.9999973, 'word': '27 july 2024', 'start': 32, 'end': 44}]","[Uwaezuoke, Sade, Itunuoluwa]",3
625,"im happy that im still progressing, said seyi shay.","i am happy that i am still progressing, c'est si a chat pour tout.","[{'entity_group': 'PER', 'score': 0.9999529123306274, 'word': 'Seyi Shay', 'start': 41, 'end': 50}]",[],[Seyi],1
965,mr and mrs dabira strive to provide their family in dutse a balanced diet.,"mr. and mrs. dabira strive to provide their family in, do you say, a balanced diet.","[{'entity_group': 'PER', 'score': 0.9997553825378418, 'word': 'Dabira', 'start': 11, 'end': 17}, {'entity_group': 'LOC', 'score': 0.9997099041938782, 'word': 'Dutse', 'start': 52, 'end': 57}]","[{'entity_group': 'PER', 'score': 0.9959441, 'word': 'dabira', 'start': 13, 'end': 19}]",[Dabira],1


In [246]:
all_model_entity[["reference", "prediction", "afri_ner_cat", "afri_ner_cat_prediction", "entities_group", "entities_group_prediction"]][\
                all_model_entity.loc[:, "afri_ner_cat_prediction"] != ""].sample(5)

Unnamed: 0,reference,prediction,afri_ner_cat,afri_ner_cat_prediction,entities_group,entities_group_prediction
22029,the patient was transferred to hospital mmasichukwu under dr adesida supervision for further care after 6 week hospitalization at hospital yenagoa,the patient was transferred to hospital maasichukwu under dr adesida supervision for further care after 6 week hospitalization at hospital yenagoa,adesida,ade,"[{'entity_group': 'ORG', 'score': 0.9985774159431458, 'word': 'Hospital Mmasichukwu', 'start': 31, 'end': 51}, {'entity_group': 'PER', 'score': 0.9998680353164673, 'word': 'Adesida', 'start': 61, 'end': 68}, {'entity_group': 'DATE', 'score': 0.9999942779541016, 'word': '6 week', 'start': 104, 'end': 110}, {'entity_group': 'ORG', 'score': 0.9998908042907715, 'word': 'Hospital Yenagoa', 'start': 130, 'end': 146}]","[{'entity_group': 'LOC', 'score': 0.6293032, 'word': 'asi', 'start': 42, 'end': 45}, {'entity_group': 'PER', 'score': 0.9896376, 'word': 'ade', 'start': 61, 'end': 64}, {'entity_group': 'DATE', 'score': 0.9999959, 'word': '6 week', 'start': 104, 'end': 110}, {'entity_group': 'ORG', 'score': 0.66185236, 'word': 'yenagoa', 'start': 139, 'end': 146}]"
22164,dr. izundu oluchi thinks pt eluma is stable enough to go home.,dr. izundu oluche thinks pt eluma is stable enough to go home.,izunduoluchieluma,izundu,"[{'entity_group': 'PER', 'score': 0.9999353289604187, 'word': 'Izundu Oluchi', 'start': 4, 'end': 17}, {'entity_group': 'PER', 'score': 0.9984029531478882, 'word': 'Eluma', 'start': 28, 'end': 33}]","[{'entity_group': 'PER', 'score': 0.99641514, 'word': 'izundu oluche', 'start': 4, 'end': 17}, {'entity_group': 'PER', 'score': 0.9970421, 'word': 'el', 'start': 28, 'end': 30}]"
10310,"children chinweizu ojo and bukola were found last night wandering the streets unattended after their mother and father, went missing while returning from work at eket",children chinoz ojo and bucola were found last night wandering the streets unattended after their mother and father coma went missing while returning from work at eket,chinweizuojokola,ojo,"[{'entity_group': 'PER', 'score': 0.9999948740005493, 'word': 'Chinweizu Ojo', 'start': 9, 'end': 22}, {'entity_group': 'PER', 'score': 0.9999982118606567, 'word': 'Bu', 'start': 27, 'end': 29}, {'entity_group': 'PER', 'score': 0.9968841671943665, 'word': 'kola', 'start': 29, 'end': 33}, {'entity_group': 'DATE', 'score': 0.9991463422775269, 'word': 'last night', 'start': 45, 'end': 55}, {'entity_group': 'LOC', 'score': 0.994438648223877, 'word': 'Eket', 'start': 162, 'end': 166}]","[{'entity_group': 'PER', 'score': 0.9999932, 'word': '', 'start': 9, 'end': 10}, {'entity_group': 'PER', 'score': 0.9998604, 'word': 'chinoz ojo', 'start': 9, 'end': 19}, {'entity_group': 'PER', 'score': 0.99997723, 'word': 'bu', 'start': 24, 'end': 26}, {'entity_group': 'PER', 'score': 0.9959992, 'word': 'cola', 'start': 26, 'end': 30}, {'entity_group': 'DATE', 'score': 0.99987614, 'word': 'last night', 'start': 42, 'end': 52}, {'entity_group': 'LOC', 'score': 0.99646324, 'word': '', 'start': 163, 'end': 164}, {'entity_group': 'LOC', 'score': 0.805931, 'word': 'eket', 'start': 163, 'end': 167}]"
22976,parents called at 11:22am and were updated by dr chizaramekpere nwachukwu,parents called at 11:22am and were updated by dr chizaramekpere nwachukwu,chizaramekperenwachukwu,chizaramekperenwachukwu,"[{'entity_group': 'DATE', 'score': 0.9999885559082031, 'word': '11:22am', 'start': 18, 'end': 25}, {'entity_group': 'PER', 'score': 0.9999945163726807, 'word': 'Chizaramekpere Nwachukwu', 'start': 49, 'end': 73}]","[{'entity_group': 'DATE', 'score': 0.99999285, 'word': '11:22am', 'start': 18, 'end': 25}, {'entity_group': 'PER', 'score': 0.9851276, 'word': 'chizaramekpere nwachukwu', 'start': 49, 'end': 73}]"
23069,"olusola was a negerian movie director, historian, movie critic and teacher, born in akpawfu on 02 apr, 1985","olusola was a negerian movie director, historian, movie critic and teacher, born in akpawfu on 02 apr, 1985",olusola,olusola,"[{'entity_group': 'PER', 'score': 0.9999923706054688, 'word': 'Olusola', 'start': 0, 'end': 7}, {'entity_group': 'LOC', 'score': 0.9999462366104126, 'word': 'Akpawfu', 'start': 84, 'end': 91}, {'entity_group': 'DATE', 'score': 0.9999940395355225, 'word': '02 Apr, 1985', 'start': 95, 'end': 107}]","[{'entity_group': 'PER', 'score': 0.999707, 'word': 'olusola', 'start': 0, 'end': 7}, {'entity_group': 'LOC', 'score': 0.99920076, 'word': 'akpawfu', 'start': 84, 'end': 91}, {'entity_group': 'DATE', 'score': 0.9999937, 'word': '02 apr, 1985', 'start': 95, 'end': 107}]"


In [247]:
# some words in predicted afri entities but not in reference
# for example, reference has "al-mustapha" which is predicted as mustapha
# mustapha exist in our list but mustapha does not.
# df_names_list.index("mustafa")  vs "mustapha"

In [248]:
# df_names_list.index("bashir")

### Compute metric

In [249]:
from datasets import load_metric
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

In [250]:
all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "clinical")]

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,audio_paths,origin,country,...,split,entities_group_prediction,model_tag,entities_group,ner_cat,ner_cat_prediction,afri_ner_cat,afri_ner_cat_prediction,afri_ner_count,afri_ner


In [251]:
# Sanity check

x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "general")]
print(wer_metric.compute(predictions=x.prediction, references=x.reference))

x = all_model_entity[all_model_entity.name == "openai/whisper-large"]
print(wer_metric.compute(predictions=x.prediction, references=x.reference))

0.30014974211080914
0.30014974211080914


In [252]:

x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "general")]
print(cer_metric.compute(predictions=x.prediction, references=x.reference))

x = all_model_entity[all_model_entity.name == "openai/whisper-large"]
print(cer_metric.compute(predictions=x.prediction, references=x.reference))

0.14382112160983127
0.14382112160983127


In [253]:
# Sanity check

# x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "clinical")]
# print(cer_metric.compute(predictions=x.ner_cat_prediction, references=x.ner_cat))

x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "general")]
print(cer_metric.compute(predictions=x.ner_cat_prediction, references=x.ner_cat))

x = all_model_entity[all_model_entity.name == "openai/whisper-large"]
print(cer_metric.compute(predictions=x.ner_cat_prediction, references=x.ner_cat))

0.5650354738948518
0.5650354738948518


In [254]:
def domain_wer(df, metric="wer", columns=None, sort_by=None, predictions="ner_cat_prediction", references="ner_cat"):
    
    if columns is None:
        columns = [
                ("Test (baseline)", "General"), ("Test (baseline)", "Clinical"), ("Test (baseline)", "Both"), \
               ("Test (without ner)", "General"), ("Test (without ner)", "Clinical"), ("Test (without ner)", "Both"), \
                ("Test (ner)", "General"), ("Test (ner)", "Clinical"), ("Test (ner)", "Both"), \
                ("Test (ner ng)", "General"), ("Test (ner ng)", "Clinical"), ("Test (ner ng)", "Both"),
             ]
    
    if sort_by is None:
        sort_by = ("Test (ner)", "Both")
        
    df = df.copy()
    df.domain = df.domain.str.capitalize()
    df.split = df.split.str.capitalize()
    
    
    # compute the domain WER
    if metric == "cer":
        domain_df = df.groupby(["model_tag", "name", "split", "domain"]).apply(lambda x: cer_metric.compute(predictions=x[predictions], references=x[references]))        
        domain_df_ = df.groupby(["model_tag", "name", "split"]).apply(lambda x: cer_metric.compute(predictions=x[predictions], references=x[references])).reset_index()
    
    elif metric=="wer":
        domain_df = df.groupby(["model_tag", "name", "split", "domain"]).apply(lambda x: wer_metric.compute(predictions=x[predictions], references=x[references]))
        domain_df_ = df.groupby(["model_tag", "name", "split"]).apply(lambda x: wer_metric.compute(predictions=x[predictions], references=x[references])).reset_index()
        
    _domain_df = domain_df.reset_index().set_index(["model_tag", "name"]).pivot(columns=["split", "domain"])
    _domain_df = _domain_df.droplevel(0, axis=1)
        
    domain_df_["domain"] = "Both"    
    domain_df_ = domain_df_.set_index(["model_tag", "name"]).pivot(columns=["split", "domain"])
    domain_df_ = domain_df_.droplevel(0, axis=1)
    
    # merge
    domain_df__ = _domain_df.merge(domain_df_, on=["model_tag", "name"]).sort_values(sort_by)    
    domain_df__ = domain_df__[columns]
    domain_df__ = domain_df__.round(3)
    domain_df__.columns.names = ["", ""]
    return domain_df__


In [255]:
all_model_entity.columns

Index(['idx', 'domain', 'gender', 'duration', 'age_group', 'accent',
       'user_ids', 'audio_paths', 'origin', 'country', 'reference',
       'prediction', 'wer', 'name', 'split', 'entities_group_prediction',
       'model_tag', 'entities_group', 'ner_cat', 'ner_cat_prediction',
       'afri_ner_cat', 'afri_ner_cat_prediction', 'afri_ner_count',
       'afri_ner'],
      dtype='object')

In [256]:
model_of_interest = ["facebook/wav2vec2-large-960h",
                    "facebook/wav2vec2-large-960h-lv60-self",
                    "facebook/hubert-xlarge-ls960-ft",
                    "microsoft/wavlm-libri-clean-100h-large",
                    "openai/whisper-large",
                    "openai/whisper-medium",
                    "facebook/wav2vec2-large-xlsr-53-english",
                    "Azure", "GCP", "AWS",
                    "openai/whisper-medium-general",
                    "facebook/wav2vec2-large-xlsr-53-english-general"]

In [258]:
split = "Test (all -- wer)"
all_model = pd.concat([pretrained_df, finetuned_df])
all_model["split"] = split

columns = [(split, "General")]
sort_by = (split, "General")

domain_df = domain_wer(all_model, 
                       metric="wer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="reference",
                       predictions="prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (all -- wer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General
model_tag,name,Unnamed: 2_level_2
finetuned,openai/whisper-medium-general,0.186
finetuned,openai/whisper-medium-all,0.192
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.236
pretrained,openai/whisper-large,0.24
pretrained,openai/whisper-medium,0.276
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.279
pretrained,openai/whisper-medium-en,0.304
pretrained,openai/whisper-small,0.33
pretrained,Azure,0.34
pretrained,openai/whisper-small-en,0.35


In [259]:
domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T

  domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T


Unnamed: 0_level_0,Test (all -- wer)
Unnamed: 0_level_1,General
name,Unnamed: 1_level_2
facebook/wav2vec2-large-960h,0.641
facebook/wav2vec2-large-960h-lv60-self,0.533
facebook/hubert-xlarge-ls960-ft,0.562
microsoft/wavlm-libri-clean-100h-large,0.631
openai/whisper-large,0.24
openai/whisper-medium,0.276
facebook/wav2vec2-large-xlsr-53-english,0.506
Azure,0.34
GCP,0.534
AWS,0.354


In [260]:
split = "Test (no ner -- wer)"
all_model_no_entity["split"] = split

columns = [(split, "General")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_no_entity, 
                       metric="wer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="reference",
                       predictions="prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (no ner -- wer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General
model_tag,name,Unnamed: 2_level_2
finetuned,openai/whisper-medium-general,0.172
finetuned,openai/whisper-medium-all,0.175
pretrained,openai/whisper-large,0.187
pretrained,openai/whisper-medium,0.206
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.211
pretrained,openai/whisper-medium-en,0.226
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.253
pretrained,openai/whisper-small,0.258
pretrained,openai/whisper-small-en,0.268
pretrained,Azure,0.273


In [261]:
domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T

  domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T


Unnamed: 0_level_0,Test (no ner -- wer)
Unnamed: 0_level_1,General
name,Unnamed: 1_level_2
facebook/wav2vec2-large-960h,0.565
facebook/wav2vec2-large-960h-lv60-self,0.458
facebook/hubert-xlarge-ls960-ft,0.487
microsoft/wavlm-libri-clean-100h-large,0.562
openai/whisper-large,0.187
openai/whisper-medium,0.206
facebook/wav2vec2-large-xlsr-53-english,0.447
Azure,0.273
GCP,0.464
AWS,0.279


In [262]:
split = "Test (ner -- wer)"
all_model_entity["split"] = split

columns = [(split, "General")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_entity, 
                       metric="wer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="reference",
                       predictions="prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (ner -- wer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General
model_tag,name,Unnamed: 2_level_2
finetuned,openai/whisper-medium-general,0.198
finetuned,openai/whisper-medium-all,0.208
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.258
pretrained,openai/whisper-large,0.3
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.302
pretrained,openai/whisper-medium,0.352
pretrained,openai/whisper-medium-en,0.388
pretrained,Azure,0.402
pretrained,openai/whisper-small,0.405
pretrained,AWS,0.426


In [263]:
domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T

  domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T


Unnamed: 0_level_0,Test (ner -- wer)
Unnamed: 0_level_1,General
name,Unnamed: 1_level_2
facebook/wav2vec2-large-960h,0.696
facebook/wav2vec2-large-960h-lv60-self,0.584
facebook/hubert-xlarge-ls960-ft,0.613
microsoft/wavlm-libri-clean-100h-large,0.68
openai/whisper-large,0.3
openai/whisper-medium,0.352
facebook/wav2vec2-large-xlsr-53-english,0.55
Azure,0.402
GCP,0.603
AWS,0.426


In [264]:
len(df_entity[df_entity.idx.isin(all_model_entity[all_model_entity.loc[:, "afri_ner_count"] >= 1].idx.unique())])

229

In [265]:
split = "Test (afri-ner -- wer)"
all_model_entity["split"] = split

columns = [(split, "General")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_entity[all_model_entity.loc[:, "afri_ner_count"] >= 1], 
                       metric="wer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="reference",
                       predictions="prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (afri-ner -- wer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General
model_tag,name,Unnamed: 2_level_2
finetuned,openai/whisper-medium-general,0.108
finetuned,openai/whisper-medium-all,0.117
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.212
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.256
pretrained,openai/whisper-large,0.412
finetuned,facebook/wav2vec2-large-xlsr-53-english-clinical,0.451
pretrained,openai/whisper-medium-en,0.473
finetuned,openai/whisper-medium-clinical,0.476
pretrained,openai/whisper-medium,0.488
pretrained,Azure,0.509


In [266]:
domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T

  domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T


Unnamed: 0_level_0,Test (afri-ner -- wer)
Unnamed: 0_level_1,General
name,Unnamed: 1_level_2
facebook/wav2vec2-large-960h,0.802
facebook/wav2vec2-large-960h-lv60-self,0.683
facebook/hubert-xlarge-ls960-ft,0.701
microsoft/wavlm-libri-clean-100h-large,0.769
openai/whisper-large,0.412
openai/whisper-medium,0.488
facebook/wav2vec2-large-xlsr-53-english,0.617
Azure,0.509
GCP,0.7
AWS,0.556


In [267]:
split = "Test (ner -- prediction cer)"
all_model_entity["split"] = split

columns = [(split, "General")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_entity, 
                       metric="cer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="ner_cat",
                       predictions="ner_cat_prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (ner -- prediction cer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General
model_tag,name,Unnamed: 2_level_2
pretrained,openai/whisper-large,0.565
finetuned,openai/whisper-medium-general,0.576
finetuned,openai/whisper-medium-all,0.596
pretrained,openai/whisper-medium,0.607
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.622
pretrained,openai/whisper-medium-en,0.623
pretrained,openai/whisper-small,0.653
pretrained,openai/whisper-small-en,0.668
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.67
pretrained,Azure,0.674


In [268]:
domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T

  domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T


Unnamed: 0_level_0,Test (ner -- prediction cer)
Unnamed: 0_level_1,General
name,Unnamed: 1_level_2
facebook/wav2vec2-large-960h,0.861
facebook/wav2vec2-large-960h-lv60-self,0.808
facebook/hubert-xlarge-ls960-ft,0.803
microsoft/wavlm-libri-clean-100h-large,0.864
openai/whisper-large,0.565
openai/whisper-medium,0.607
facebook/wav2vec2-large-xlsr-53-english,0.772
Azure,0.674
GCP,0.827
AWS,0.735


In [269]:
split = "Test (afri-ner -- predicted cer)"
all_model_entity["split"] = split

columns = [(split, "General")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_entity[all_model_entity.loc[:, "afri_ner_cat"] != ""], 
                       metric="cer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="afri_ner_cat",
                       predictions="afri_ner_cat_prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (afri-ner -- predicted cer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General
model_tag,name,Unnamed: 2_level_2
finetuned,openai/whisper-medium-all,0.7
finetuned,openai/whisper-medium-general,0.704
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.815
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.816
pretrained,openai/whisper-large,0.855
pretrained,openai/whisper-medium,0.913
pretrained,openai/whisper-medium-en,0.927
pretrained,openai/whisper-small-en,0.934
finetuned,openai/whisper-medium-clinical,0.938
pretrained,openai/whisper-small,0.94


In [270]:
domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T

  domain_df.reset_index().T.drop("model_tag").T.set_index("name").T[model_of_interest].T


Unnamed: 0_level_0,Test (afri-ner -- predicted cer)
Unnamed: 0_level_1,General
name,Unnamed: 1_level_2
facebook/wav2vec2-large-960h,0.986
facebook/wav2vec2-large-960h-lv60-self,0.978
facebook/hubert-xlarge-ls960-ft,0.986
microsoft/wavlm-libri-clean-100h-large,0.984
openai/whisper-large,0.855
openai/whisper-medium,0.913
facebook/wav2vec2-large-xlsr-53-english,0.965
Azure,0.946
GCP,0.991
AWS,0.97


### Named entities statistics

In [9]:
def get_entities(df):
    df = df.copy()
    df.entities_group = df.entities_group.apply(lambda x: json.loads(x))
    print(len(df))
    per_ent =  df.entities_group.apply(lambda x: len([x[i]["word"] for i in range(len(x)) if x[i]["entity_group"] == "PER"])).sum()
    org_ent = df.entities_group.apply(lambda x: len([x[i]["word"] for i in range(len(x)) if x[i]["entity_group"] == "ORG"])).sum()
    loc_ent = df.entities_group.apply(lambda x: len([x[i]["word"] for i in range(len(x)) if x[i]["entity_group"] == "LOC"])).sum()
    all_ent = df.entities_group.apply(lambda x: len([x[i]["word"] for i in range(len(x)) if  \
                                                   x[i]["entity_group"] == "PER"
                                                   or x[i]["entity_group"] == "LOC" \
                                                 or x[i]["entity_group"] == "ORG"])).sum()
    
    return pd.DataFrame([{"#PER": per_ent, "#LOC": loc_ent, "#ORG": org_ent, "Total": all_ent}])

In [10]:
get_entities(df_entity)

971


Unnamed: 0,#PER,#LOC,#ORG,Total
0,1064,526,279,1869


In [15]:
csv = "../results/ner/intron-train-public-58000-clean_with_named_entity.csv"
df_general_train, df_no_entity_train, df_entity_train, _, _ = get_dfs(csv)
len(df_general_train), len(df_no_entity_train), len(df_entity_train)

(21682, 8275, 10034)

In [13]:
get_entities(df_entity_train)

10034


Unnamed: 0,#PER,#LOC,#ORG,Total
0,11011,6322,3194,20527


In [16]:
csv = "../results/ner/intron-dev-public-3231-clean_with_named_entity.csv"
df_general_dev, df_no_entity_dev, df_entity_dev, _, _ = get_dfs(csv)
len(df_general_dev), len(df_no_entity_dev), len(df_entity_dev)

(1407, 565, 600)

In [18]:
get_entities(df_entity_dev)

600


Unnamed: 0,#PER,#LOC,#ORG,Total
0,669,372,192,1233
