In [190]:
import pandas as pd
import ast
pd.set_option('display.max_colwidth', 0)

import json

In [20]:
df = pd.read_csv("../results/ner/intron-test-public-6346-clean_with_named_entity.csv")
df_entity_ = df[df.has_entity == 1].reset_index(drop=True)

# Only consider samples with named entities and contains either location or person named entities
df_entity = df_entity_[(df_entity_.PER.notna()) | (df_entity_.LOC.notna())].reset_index(drop=True)

In [21]:
df_entity_.shape, df_entity.shape, df.shape

((1703, 21), (1396, 21), (6346, 21))

In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained(
    "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
)
model = AutoModelForTokenClassification.from_pretrained(
    "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [18]:
# All the pretrained model results are stored here
pretrained_df = pd.read_csv("../results/normalized/intron-open-test-all_models.csv")
print(pretrained_df.shape)

(103949, 15)


In [19]:
# 19 models, 5,471 samples per model
len(pretrained_df.idx.unique()), len(pretrained_df.name.unique())

(5471, 19)

In [23]:
# Only focus on sentences with named entities 
pretrained_df_entity = pretrained_df[pretrained_df["idx"].isin(df_entity["idx"].to_list())]
print(pretrained_df_entity.shape)

(21242, 15)


In [24]:
# Filtered down to 1,118 samples per model
len(pretrained_df_entity.idx.unique()), len(pretrained_df_entity.name.unique())

(1118, 19)

In [25]:
# This is finetuned model result
finetuned_df = pd.read_csv("../results/normalized/intron-open-test-all_models_finetuned.csv")
print(finetuned_df.shape)

(32826, 15)


In [26]:
# 6 finetuned models
len(finetuned_df.idx.unique()), len(finetuned_df.name.unique())

(5471, 6)

In [27]:
finetuned_df_entity = finetuned_df[finetuned_df["idx"].isin(df_entity["idx"].to_list())]
print(finetuned_df_entity.shape)

(6708, 15)


In [28]:
len(finetuned_df_entity.idx.unique()), len(finetuned_df_entity.name.unique())

(1118, 6)

In [29]:
# Replace model without prediction with empty string
pretrained_df_entity.loc[:, "prediction"] = pretrained_df_entity.loc[:, "prediction"].fillna("")
finetuned_df_entity.loc[:, "prediction"] = finetuned_df_entity.loc[:, "prediction"].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pretrained_df_entity.loc[:, "prediction"] = pretrained_df_entity.loc[:, "prediction"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finetuned_df_entity.loc[:, "prediction"] = finetuned_df_entity.loc[:, "prediction"].fillna("")


In [30]:
pretrained_df_entity.shape, finetuned_df_entity.shape

((21242, 15), (6708, 15))

### Extract named entities

In [191]:
# pretty slow

import os

n = len(pretrained_df_entity) + len(finetuned_df_entity)
save_path = f"../results/normalized/intron-open-test-all_models_filtered_ner-{n}.csv"

if not os.path.exists(save_path):
    pretrained_df_entity["entities_group_prediction"] = pretrained_df_entity.apply(lambda x: nlp.group_entities(nlp(x["prediction"])), axis=1)
    finetuned_df_entity["entities_group_prediction"] = finetuned_df_entity.apply(lambda x: nlp.group_entities(nlp(x["prediction"])), axis=1)

    pretrained_df_entity["model_tag"] = "pretrained"
    finetuned_df_entity["model_tag"] = "finetuned"
    
    all_model_entity = pd.concat([pretrained_df_entity, finetuned_df_entity], axis=0)
    all_model_entity = all_model_entity.reset_index(drop=True)
    
    all_model_entity = all_model_entity.join(df_entity[["idx", "entities_group"]].set_index("idx"), on="idx")
    
    assert n == len(all_model_entity)
    all_model_entity.to_csv(save_path, index=None)

else:
    all_model_entity = pd.read_csv(save_path)
    if "Unnamed: 0" in all_model_entity.columns:
        all_model_entity= all_model_entity.drop(columns=["Unnamed: 0"])
        
    all_model_entity.loc[:, "entities_group"] = all_model_entity["entities_group"].apply(json.loads)
    all_model_entity.loc[:, "entities_group_prediction"] = all_model_entity["entities_group_prediction"].apply(ast.literal_eval)
    

In [192]:
all_model_entity.head(2)

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,audio_paths,origin,country,reference,prediction,wer,name,split,entities_group_prediction,model_tag,entities_group
0,357071,general,Female,7.25898,19-25,jaba,6653712c0e2c617bfa5f8469d69c2163,/AfriSpeech-100/test/3089079d-b2ff-482e-8a5a-39c949698f5c/28f87600531493767159c4abd14b3593.wav,nigerian,NG,tinubu as a game-master in politics can sacrifice personal interests for his partys win.,"tanubu as a key master in politicians has a christ personal interest for his sponsors, being first of all.",0.928571,openai/whisper-small,test,"[{'entity_group': 'PER', 'score': 0.768043, 'word': 'tan', 'start': 0, 'end': 3}]",pretrained,"[{'entity_group': 'PER', 'score': 0.9998998641967773, 'word': 'Tinubu', 'start': 0, 'end': 6}]"
1,450381,general,Male,10.427982,19-25,ukwuani,062a6e856921674d55a3b54f31725f81,/AfriSpeech-100/test/ab80cd5b-cb2b-4b58-a2c4-208d20acc380/983a51e61e3e3bc34177ff4813ba96c5.wav,nigerian,NG,"reuters emails and calls to grandefex went unanswered. in june 2020, germanys regulator said the platform was unauthorised and ordered its closure.","reuters emails and calls to grandfx went unanswered. full stock. in june 2020, german regulator said the plaza must unauthorize and order its closure. full stock.",0.454545,openai/whisper-small,test,"[{'entity_group': 'ORG', 'score': 0.9700304, 'word': 're', 'start': 0, 'end': 2}, {'entity_group': 'ORG', 'score': 0.77678293, 'word': 'grandfx', 'start': 28, 'end': 35}, {'entity_group': 'DATE', 'score': 0.99998695, 'word': 'june 2020', 'start': 68, 'end': 77}]",pretrained,"[{'entity_group': 'ORG', 'score': 0.999998927116394, 'word': 'Reuters', 'start': 0, 'end': 7}, {'entity_group': 'ORG', 'score': 0.9992976784706116, 'word': 'Grandefex', 'start': 28, 'end': 37}, {'entity_group': 'DATE', 'score': 0.999998152256012, 'word': 'June 2020', 'start': 58, 'end': 67}, {'entity_group': 'LOC', 'score': 0.9943286776542664, 'word': 'Germany', 'start': 69, 'end': 76}]"


In [193]:
all_model_entity.shape, len(all_model_entity.idx.unique()), len(all_model_entity.name.unique())

((27950, 18), 1118, 25)

### Extract named entities for the models based on set threshold

Note:

The named entities for the reference is already thresholded.
It makes sense to threshold the extracted named entities from the predicted models as well
but this should be double-checked with the team

In [194]:
all_model_entity["entities_group"][0][0]

{'entity_group': 'PER',
 'score': 0.9998998641967773,
 'word': 'Tinubu',
 'start': 0,
 'end': 6}

In [195]:
all_model_entity["entities_group_prediction"][0]

[{'entity_group': 'PER',
  'score': 0.768043,
  'word': 'tan',
  'start': 0,
  'end': 3}]

In [239]:
## Extract entities

threshhold=0.8

all_model_entity.loc[:, "ner_cat"] = all_model_entity.loc[\
                                                          :, "entities_group"].apply(\
                                                          lambda x: "".join(\
                                                                            [x[i]["word"].lower().strip() \
                                                                            for i in range(len(x)) \
                                                                            if x[i]["entity_group"] == "PER" \
                                                                            or x[i]["entity_group"] == "LOC" \
                                                                            or x[i]["entity_group"] == "ORG"
                                                                            ]))

all_model_entity.loc[:, "ner_cat_prediction"] = all_model_entity.loc[\
                                                                     :, "entities_group_prediction"].apply(\
                                                                     lambda x: "".join(\
                                                                                     [x[i]["word"].lower().strip() \
                                                                                      for i in range(len(x)) \
                                                                                      if 
                                                                                      (x[i]["entity_group"] == "PER" \
                                                                                      or x[i]["entity_group"] == "LOC" \
                                                                                      or x[i]["entity_group"] == "ORG") \
                                                                                      and x[i]["score"] >= threshhold
                                                                                    ]))

In [240]:
len(all_model_entity.idx.unique()), \
len(all_model_entity[all_model_entity.loc[:, "ner_cat_prediction"] != ""])

(1118, 11602)

In [241]:
all_model_entity.loc[:, "ner_cat_prediction"][all_model_entity.loc[:, "ner_cat_prediction"] != ""].sample(5)

27475    wabuwu       
3146     amanda       
954      chikadibia   
4439     omaer yegbaou
22591    busafi       
Name: ner_cat_prediction, dtype: object

In [242]:
# join the entities with space
all_model_entity.loc[:, "ner_cat"] = all_model_entity.loc[:, "ner_cat"].apply(lambda x: "".join(x.split(" ")))
all_model_entity.loc[:, "ner_cat_prediction"] = all_model_entity.loc[\
                                                                     :, "ner_cat_prediction"].apply(\
                                                                     lambda x: "".join(x.split(" ")))

In [281]:
all_model_entity[["reference", "prediction", "ner_cat", "ner_cat_prediction", "entities_group", "entities_group_prediction"]][\
                all_model_entity.loc[:, "ner_cat_prediction"] != ""].sample(5)

Unnamed: 0,reference,prediction,ner_cat,ner_cat_prediction,entities_group,entities_group_prediction
11840,"we will just ignore it and move on, said the part vice-chairman job sikhala.",we vill jos ecnoet an movon commer said take pots vaisa fron cherman job se kala fulston,jobsikhala,fulston,"[{'entity_group': 'PER', 'score': 0.999993085861206, 'word': 'Job Sikhala', 'start': 64, 'end': 75}]","[{'entity_group': 'LOC', 'score': 0.9889206, 'word': 'fulston', 'start': 81, 'end': 88}]"
11727,"will schedule family meeting as soon as brothers damieibi and adeboye are feeling well. saturday 12 july, 1975",ruscedio family meets in assona's brotheras tami bi an a deboye afilinel fusto saturdato of july commer nineteen seventy five,damieibiadeboye,rus,"[{'entity_group': 'PER', 'score': 0.9999575614929199, 'word': 'Damieibi', 'start': 49, 'end': 57}, {'entity_group': 'PER', 'score': 0.9998622536659241, 'word': 'Adeboye', 'start': 62, 'end': 69}, {'entity_group': 'DATE', 'score': 0.9999920129776001, 'word': 'Saturday 12 July, 1975', 'start': 88, 'end': 110}]","[{'entity_group': 'PER', 'score': 0.922902, 'word': 'rus', 'start': 0, 'end': 3}, {'entity_group': 'DATE', 'score': 0.9868036, 'word': 'saturdato of july commer nineteen seventy five', 'start': 79, 'end': 125}]"
13283,ancelotti assured pickford that he would return to play against manchester united this weekend and southgate believes that a brief spell out of the side at club level will not hurt his first-choice goalkeepers confidence.,ancelozi assured pickford that he would return to play against manchester united this weekend and south gates believes that a brief spellout of decide at cloub level wunohots his first hifon choice gookipa's confidence phosto,ancelottipickfordmanchesterunitedsouthgate,ancelozipickfordmanchesterunitedsouthgates,"[{'entity_group': 'PER', 'score': 0.9776719212532043, 'word': 'Ancelotti', 'start': 0, 'end': 9}, {'entity_group': 'PER', 'score': 0.9999873042106628, 'word': 'Pickford', 'start': 18, 'end': 26}, {'entity_group': 'ORG', 'score': 0.9999927282333374, 'word': 'Manchester United', 'start': 64, 'end': 81}, {'entity_group': 'DATE', 'score': 0.7631992101669312, 'word': 'weekend', 'start': 87, 'end': 94}, {'entity_group': 'PER', 'score': 0.9999954104423523, 'word': 'Southgate', 'start': 99, 'end': 108}]","[{'entity_group': 'PER', 'score': 0.9912971, 'word': '', 'start': 0, 'end': 1}, {'entity_group': 'PER', 'score': 0.9717666, 'word': 'ancelozi', 'start': 0, 'end': 8}, {'entity_group': 'PER', 'score': 0.9998683, 'word': 'pickford', 'start': 17, 'end': 25}, {'entity_group': 'ORG', 'score': 0.9999887, 'word': 'manchester united', 'start': 63, 'end': 80}, {'entity_group': 'DATE', 'score': 0.87487984, 'word': 'weekend', 'start': 86, 'end': 93}, {'entity_group': 'PER', 'score': 0.9911662, 'word': 'south gates', 'start': 98, 'end': 109}]"
9222,"bullying can look like a lot of different things, said noah biddlecombe, program director at youth one.",birlinka and joky like a lot of descentis commors restaurant idi condy co program directs ar cheat mark sisters,noahbiddlecombeyouthone,bir,"[{'entity_group': 'PER', 'score': 0.981478214263916, 'word': 'Noah Biddlecombe', 'start': 55, 'end': 71}, {'entity_group': 'ORG', 'score': 0.9999938607215881, 'word': 'Youth One', 'start': 93, 'end': 102}]","[{'entity_group': 'PER', 'score': 0.8432506, 'word': 'bir', 'start': 0, 'end': 3}]"
16190,"when alison murphy sold her tacoma condo this summer, she quickly broadened her search for a new home beyond the city to areas like spanaway and parkland.","when i listen morpheus. so that tacoma condo this summer, she quickly brought in her search for a new home beyond the city to areas like spanaway and parkland..",alisonmurphytacomaspanawayparkland,spanaway,"[{'entity_group': 'PER', 'score': 0.9998760223388672, 'word': 'Alison Murphy', 'start': 5, 'end': 18}, {'entity_group': 'LOC', 'score': 0.9999785423278809, 'word': 'Tacoma', 'start': 28, 'end': 34}, {'entity_group': 'DATE', 'score': 0.8296452760696411, 'word': 'this summer', 'start': 41, 'end': 52}, {'entity_group': 'LOC', 'score': 0.9999881386756897, 'word': 'Spanaway', 'start': 132, 'end': 140}, {'entity_group': 'LOC', 'score': 0.9999922513961792, 'word': 'Parkland', 'start': 145, 'end': 153}]","[{'entity_group': 'LOC', 'score': 0.7809031, 'word': 'tacom', 'start': 32, 'end': 37}, {'entity_group': 'DATE', 'score': 0.6983818, 'word': 'this summer', 'start': 45, 'end': 56}, {'entity_group': 'LOC', 'score': 0.91855717, 'word': 'spanaway', 'start': 137, 'end': 145}, {'entity_group': 'LOC', 'score': 0.6892765, 'word': 'parkland', 'start': 150, 'end': 158}]"


In [356]:
df_names = pd.read_csv("../data/African_names/List of Nigerian names - Sheet1.csv")
df_names2 = pd.read_csv("../data/African_names/mmc2-igbo names.csv", delimiter=";")
df_names3 = pd.read_csv("../data/African_names/AfricanNamesDatabase.csv")
df_names_list = df_names.YORUBA.str.lower().to_list() + df_names.HAUSA.str.lower().to_list()+ \
                    df_names.IBO.str.lower().to_list() + df_names2["Igbo Names"].str.lower().to_list() 
#                    + \ df_names3.Name.str.lower().to_list()
df_names_list = df_names_list
print(len(df_names_list))

# afri ner
all_model_entity.loc[:, "afri_ner_cat"] = all_model_entity.loc[\
                                                               :, "entities_group"].apply(\
                                                                lambda x: "".join(\
                                                                [j.lower().strip() \
                                                                for i in range(len(x)) \
                                                                if x[i]["entity_group"] == "PER" \
                                                                for j in x[i]["word"].split()
                                                                if j.lower() in df_names_list]
                                                               ))
all_model_entity.loc[:, "afri_ner_cat_prediction"] = all_model_entity.loc[\
                                                                        :, "entities_group_prediction"].apply(\
                                                                        lambda x: "".join(\
                                                                        [j.lower().strip() \
                                                                        for i in range(len(x)) \
                                                                        if x[i]["entity_group"] == "PER" \
#                                                                         if x[i]["word"].lower() in df_names_list]
                                                                        for j in x[i]["word"].split()
                                                                        if j.lower() in df_names_list]
                                                                        ))



all_model_entity.loc[:, "afri_ner_cat"] = all_model_entity.loc[\
                                                               :, "afri_ner_cat"].apply(\
                                                               lambda x: "".join(x.split(" ")))
all_model_entity.loc[:, "afri_ner_cat_prediction"] = all_model_entity.loc[\
                                                                          :, "afri_ner_cat_prediction"].apply(\
                                                                          lambda x: "".join(x.split(" ")))

# df_entity_PER.loc[:, "afri_ner_count"] = df_entity_PER.loc[:, "entities_group"].
# apply(lambda x: len([x[i]["word"] for i in range(len(x)) if x[i]["entity_group"] == "PER" 
#                      if x[i]["word"].lower() in df_names_list]))


all_model_entity.loc[:, "afri_ner_count"] = all_model_entity.loc[:, "entities_group"].apply(\
                                            lambda x: len([j \
                                            for i in range(len(x)) \
                                            if x[i]["entity_group"] == "PER" \
                                            for j in x[i]["word"].split()
                                            if j.lower() in df_names_list]
#                                             if x[i]["word"].lower() in df_names_list]             
                                            ))

all_model_entity.loc[:, "afri_ner"] = all_model_entity.loc[:, "entities_group"].apply(\
                                            lambda x: [j \
                                            for i in range(len(x)) \
                                            if x[i]["entity_group"] == "PER" \
                                            for j in x[i]["word"].split()
                                            if j.lower() in df_names_list]
#                                             if x[i]["word"].lower() in df_names_list]             
                                            )


2462


In [352]:
len(all_model_entity[all_model_entity.loc[:, "afri_ner_cat"] != ""].idx.unique()), \
len(all_model_entity[all_model_entity.loc[:, "afri_ner_cat_prediction"] != ""])

(263, 826)

In [353]:
len(all_model_entity[all_model_entity["afri_ner_count"] >= 1].idx.unique())

263

In [354]:
xx = all_model_entity[all_model_entity.loc[:, "afri_ner_count"] >= 1]
xx = xx[~xx.duplicated(subset=["idx"])]
xx.afri_ner_count.sum()

415

In [357]:
xx[["reference", "prediction", "entities_group", "entities_group_prediction", "afri_ner", "afri_ner_count"]].sample(5)

KeyError: "['afri_ner'] not in index"

In [280]:
all_model_entity[["reference", "prediction", "afri_ner_cat", "afri_ner_cat_prediction", "entities_group", "entities_group_prediction"]][\
                all_model_entity.loc[:, "afri_ner_cat_prediction"] != ""].sample(5)

Unnamed: 0,reference,prediction,afri_ner_cat,afri_ner_cat_prediction,entities_group,entities_group_prediction
24921,"children kilani ali and ejefenihu were found last night wandering the streets unattended after their mother and father, went missing while returning from work at minna","children kilani ale and ejefenihi were found last night onderin the streets unattended after their mother and father, went missing while returning from work at minna",kilaniali,kilani,"[{'entity_group': 'PER', 'score': 0.9999977946281433, 'word': 'Kilani Ali', 'start': 9, 'end': 19}, {'entity_group': 'PER', 'score': 0.9999982118606567, 'word': 'E', 'start': 24, 'end': 25}, {'entity_group': 'PER', 'score': 0.9544576406478882, 'word': 'jefenihu', 'start': 25, 'end': 33}, {'entity_group': 'DATE', 'score': 0.9998205900192261, 'word': 'last night', 'start': 45, 'end': 55}, {'entity_group': 'LOC', 'score': 0.9930636882781982, 'word': 'Minna', 'start': 162, 'end': 167}]","[{'entity_group': 'PER', 'score': 0.9852264, 'word': 'kilani ale', 'start': 9, 'end': 19}, {'entity_group': 'PER', 'score': 0.9989297, 'word': 'e', 'start': 24, 'end': 25}, {'entity_group': 'PER', 'score': 0.9246713, 'word': 'jefenihi', 'start': 25, 'end': 33}, {'entity_group': 'DATE', 'score': 0.98987144, 'word': 'last night', 'start': 45, 'end': 55}, {'entity_group': 'LOC', 'score': 0.99964964, 'word': 'minna', 'start': 160, 'end': 165}]"
23417,the family history of nwadiogwa disease was important for udodi who is a doctor at birnin kebbi university but lives in jalingo with his wife nwabugo,the family history of nwadiogwa disease was important for udodi who is a doctor at birnin kebbi university but lives in jalingo with his wife nwabugo,nwadiogwaudodinwabugo,udodinwabugo,"[{'entity_group': 'PER', 'score': 0.9929107427597046, 'word': 'Nwadiogwa', 'start': 22, 'end': 31}, {'entity_group': 'PER', 'score': 0.999977707862854, 'word': 'Udodi', 'start': 58, 'end': 63}, {'entity_group': 'ORG', 'score': 0.9999947547912598, 'word': 'Birnin Kebbi university', 'start': 83, 'end': 106}, {'entity_group': 'LOC', 'score': 0.9980893731117249, 'word': 'Jalingo', 'start': 120, 'end': 127}, {'entity_group': 'PER', 'score': 0.9999624490737915, 'word': 'Nwabugo', 'start': 142, 'end': 149}]","[{'entity_group': 'PER', 'score': 0.9362965, 'word': 'n', 'start': 22, 'end': 23}, {'entity_group': 'PER', 'score': 0.80511516, 'word': 'wadiogwa', 'start': 23, 'end': 31}, {'entity_group': 'PER', 'score': 0.9436272, 'word': 'udodi', 'start': 58, 'end': 63}, {'entity_group': 'ORG', 'score': 0.9999881, 'word': 'birnin kebbi university', 'start': 83, 'end': 106}, {'entity_group': 'LOC', 'score': 0.99116945, 'word': 'ja', 'start': 120, 'end': 122}, {'entity_group': 'LOC', 'score': 0.64981294, 'word': 'lingo', 'start': 122, 'end': 127}, {'entity_group': 'PER', 'score': 0.9382707, 'word': 'nwabugo', 'start': 142, 'end': 149}]"
21715,"stan doty , daniel herbst , deborah miles johnson , haydn johnston , matt zivich production assistants , technical assistance","kandibe, dane had, deborah maus johnson, edin johnson, mitz heavy production assistant, technical assistant",,kandibe,"[{'entity_group': 'PER', 'score': 0.9999957084655762, 'word': 'Stan Doty', 'start': 0, 'end': 9}, {'entity_group': 'PER', 'score': 0.9999991655349731, 'word': 'Daniel Herbst', 'start': 12, 'end': 25}, {'entity_group': 'PER', 'score': 0.9999995231628418, 'word': 'De', 'start': 28, 'end': 30}, {'entity_group': 'PER', 'score': 0.9993184804916382, 'word': 'borah Miles Johnson', 'start': 30, 'end': 49}, {'entity_group': 'PER', 'score': 0.996477484703064, 'word': 'Haydn Johnston', 'start': 52, 'end': 66}, {'entity_group': 'PER', 'score': 0.9999775290489197, 'word': 'Matt Zivich', 'start': 69, 'end': 80}]","[{'entity_group': 'PER', 'score': 0.98145884, 'word': 'kandibe', 'start': 0, 'end': 7}, {'entity_group': 'PER', 'score': 0.9955486, 'word': 'dane had', 'start': 9, 'end': 17}, {'entity_group': 'PER', 'score': 0.9999974, 'word': 'de', 'start': 19, 'end': 21}, {'entity_group': 'PER', 'score': 0.99988514, 'word': 'borah maus johnson', 'start': 21, 'end': 39}, {'entity_group': 'PER', 'score': 0.9998403, 'word': 'edin johnson', 'start': 41, 'end': 53}, {'entity_group': 'PER', 'score': 0.96757674, 'word': 'mitz', 'start': 55, 'end': 59}]"
5187,"nigerian singer, peter okoye, popularly known as mr p, has stated that people without their permanent voters cards were not allowed into his home and office.","nigeria's singer, peter okoye, popularly known as mr. p, has stated that people without their permanent voter's card were not allowed into his home and off his.",okoye,okoye,"[{'entity_group': 'PER', 'score': 0.9999988675117493, 'word': 'Peter Okoye', 'start': 17, 'end': 28}, {'entity_group': 'PER', 'score': 0.9999970197677612, 'word': 'Mr P', 'start': 49, 'end': 53}]","[{'entity_group': 'PER', 'score': 0.9999946, 'word': 'peter okoye', 'start': 18, 'end': 29}, {'entity_group': 'PER', 'score': 0.9999946, 'word': 'mr. p', 'start': 50, 'end': 55}]"
6016,dr. erinola agreed to keep pt sedated for airway protection.,doctor erinola agreed to keep pecien sedated for aiway protation,erinola,erinola,"[{'entity_group': 'PER', 'score': 0.9922321438789368, 'word': 'Erinola', 'start': 4, 'end': 11}]","[{'entity_group': 'PER', 'score': 0.9926196, 'word': 'erinola', 'start': 7, 'end': 14}]"


In [279]:
# some words in predicted afri entities but not in reference
# for example, reference has "al-mustapha" which is predicted as mustapha
# mustapha exist in our list but mustapha does not.
# df_names_list.index("mustafa")  vs "mustapha"

In [278]:
# df_names_list.index("bashir")

### Compute metric

In [282]:
from datasets import load_metric
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

In [284]:
len(all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "clinical")])

147

In [285]:
# Sanity check

x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "clinical")]
print(wer_metric.compute(predictions=x.prediction, references=x.reference))

x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "general")]
print(wer_metric.compute(predictions=x.prediction, references=x.reference))

x = all_model_entity[all_model_entity.name == "openai/whisper-large"]
print(wer_metric.compute(predictions=x.prediction, references=x.reference))

0.5388821385176185
0.30014974211080914
0.32011993698226354


In [286]:
x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "clinical")]
print(cer_metric.compute(predictions=x.prediction, references=x.reference))

x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "general")]
print(cer_metric.compute(predictions=x.prediction, references=x.reference))

x = all_model_entity[all_model_entity.name == "openai/whisper-large"]
print(cer_metric.compute(predictions=x.prediction, references=x.reference))

0.2403334337651903
0.14382112160983127
0.1519860656782361


In [287]:
# Sanity check

x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "clinical")]
print(cer_metric.compute(predictions=x.ner_cat_prediction, references=x.ner_cat))

x = all_model_entity[(all_model_entity.name == "openai/whisper-large") & (all_model_entity.domain == "general")]
print(cer_metric.compute(predictions=x.ner_cat_prediction, references=x.ner_cat))

x = all_model_entity[all_model_entity.name == "openai/whisper-large"]
print(cer_metric.compute(predictions=x.ner_cat_prediction, references=x.ner_cat))

0.8412371134020619
0.5650354738948518
0.5874289535272484


In [298]:
def domain_wer(df, metric="wer", columns=None, sort_by=None, predictions="ner_cat_prediction", references="ner_cat"):
    
    if columns is None:
        columns = [
                ("Test (baseline)", "General"), ("Test (baseline)", "Clinical"), ("Test (baseline)", "Both"), \
               ("Test (without ner)", "General"), ("Test (without ner)", "Clinical"), ("Test (without ner)", "Both"), \
                ("Test (ner)", "General"), ("Test (ner)", "Clinical"), ("Test (ner)", "Both"), \
                ("Test (ner ng)", "General"), ("Test (ner ng)", "Clinical"), ("Test (ner ng)", "Both"),
             ]
    
    if sort_by is None:
        sort_by = ("Test (ner)", "Both")
        
    df = df.copy()
    df.domain = df.domain.str.capitalize()
    df.split = df.split.str.capitalize()
    
    
    # compute the domain WER
    if metric == "cer":
        domain_df = df.groupby(["model_tag", "name", "split", "domain"]).apply(lambda x: cer_metric.compute(predictions=x[predictions], references=x[references]))        
        domain_df_ = df.groupby(["model_tag", "name", "split"]).apply(lambda x: cer_metric.compute(predictions=x[predictions], references=x[references])).reset_index()
    
    elif metric=="wer":
        domain_df = df.groupby(["model_tag", "name", "split", "domain"]).apply(lambda x: wer_metric.compute(predictions=x[predictions], references=x[references]))
        domain_df_ = df.groupby(["model_tag", "name", "split"]).apply(lambda x: wer_metric.compute(predictions=x[predictions], references=x[references])).reset_index()
        
    _domain_df = domain_df.reset_index().set_index(["model_tag", "name"]).pivot(columns=["split", "domain"])
    _domain_df = _domain_df.droplevel(0, axis=1)
        
    domain_df_["domain"] = "Both"    
    domain_df_ = domain_df_.set_index(["model_tag", "name"]).pivot(columns=["split", "domain"])
    domain_df_ = domain_df_.droplevel(0, axis=1)
    
    # merge
    domain_df__ = _domain_df.merge(domain_df_, on=["model_tag", "name"]).sort_values(sort_by)    
    domain_df__ = domain_df__[columns]
    domain_df__ = domain_df__.round(3)
    domain_df__.columns.names = ["", ""]
    return domain_df__


In [299]:
all_model_entity.columns

Index(['idx', 'domain', 'gender', 'duration', 'age_group', 'accent',
       'user_ids', 'audio_paths', 'origin', 'country', 'reference',
       'prediction', 'wer', 'name', 'split', 'entities_group_prediction',
       'model_tag', 'entities_group', 'ner_cat', 'ner_cat_prediction',
       'afri_ner_cat', 'afri_ner_cat_prediction'],
      dtype='object')

In [300]:
split = "Test (ner -- wer)"
all_model_entity["split"] = split

columns = [(split, "General"), (split, "Clinical"), (split, "Both")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_entity, 
                       metric="wer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="reference",
                       predictions="prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (ner -- wer),Test (ner -- wer),Test (ner -- wer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General,Clinical,Both
model_tag,name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
finetuned,openai/whisper-medium-general,0.198,0.575,0.23
finetuned,openai/whisper-medium-all,0.208,0.304,0.216
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.258,0.541,0.282
pretrained,openai/whisper-large,0.3,0.539,0.32
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.302,0.388,0.31
pretrained,openai/whisper-medium,0.352,0.595,0.373
pretrained,openai/whisper-medium-en,0.388,0.574,0.403
pretrained,Azure,0.402,0.597,0.418
pretrained,openai/whisper-small,0.405,0.696,0.429
pretrained,AWS,0.426,0.696,0.448


In [310]:
all_model_entity[all_model_entity.loc[:, "afri_ner_count"] >= 1]

Unnamed: 0,idx,domain,gender,duration,age_group,accent,user_ids,audio_paths,origin,country,...,name,split,entities_group_prediction,model_tag,entities_group,ner_cat,ner_cat_prediction,afri_ner_cat,afri_ner_cat_prediction,afri_ner_count
0,357071,general,Female,7.258980,19-25,jaba,6653712c0e2c617bfa5f8469d69c2163,/AfriSpeech-100/test/3089079d-b2ff-482e-8a5a-39c949698f5c/28f87600531493767159c4abd14b3593.wav,nigerian,NG,...,openai/whisper-small,Test (afri-ner -- wer),"[{'entity_group': 'PER', 'score': 0.768043, 'word': 'tan', 'start': 0, 'end': 3}]",pretrained,"[{'entity_group': 'PER', 'score': 0.9998998641967773, 'word': 'Tinubu', 'start': 0, 'end': 6}]",tinubu,,tinubu,,1
2,421195,general,Female,8.366984,19-25,igbo,803297b4bbd524fa708a18f3f117b79e,/AfriSpeech-100/test/ee3a0205-d73d-4890-9c4f-bb29a38e5181/1f6f931ca80ef5f8fad73a4e5649f0db.wav,nigerian,NG,...,openai/whisper-small,Test (afri-ner -- wer),"[{'entity_group': 'DATE', 'score': 0.9996214, 'word': 'tuesday 28 april 1998', 'start': 0, 'end': 21}]",pretrained,"[{'entity_group': 'DATE', 'score': 0.9999867677688599, 'word': 'Tuesday 28 April, 1998', 'start': 0, 'end': 22}, {'entity_group': 'PER', 'score': 0.9967962503433228, 'word': 'Daramola Achike', 'start': 44, 'end': 59}, {'entity_group': 'DATE', 'score': 0.999918520450592, 'word': '08 February, 2003', 'start': 60, 'end': 77}]",daramolaachike,,daramolaachike,,2
3,307962,general,Female,8.720998,26-40,oklo,0025a3634a3f45e792d1567b9cdcd246,/AfriSpeech-100/test/b6c84a08-7e3a-4cf9-ac28-3ff9cbfde3df/388f79f5b4b8de8bda91223cff67c06f.wav,nigerian,NG,...,openai/whisper-small,Test (afri-ner -- wer),[],pretrained,"[{'entity_group': 'PER', 'score': 0.9999991655349731, 'word': 'A', 'start': 19, 'end': 20}, {'entity_group': 'PER', 'score': 0.9731560349464417, 'word': 'beni Chinememma', 'start': 20, 'end': 35}, {'entity_group': 'PER', 'score': 0.9991277456283569, 'word': 'Chinaecherem', 'start': 47, 'end': 59}]",abenichinememmachinaecherem,,chinememmachinaecherem,,2
6,312469,general,Male,5.360000,56yrs>,igbo,a1dd0007687f8c1a5a22e67fc8edbb64,/AfriSpeech-100/test/693814b9-21be-49b2-9644-731fad0c2003/10c9abe5ca5dbe03fb8f54cd4d80ab7f.wav,nigerian,NG,...,openai/whisper-small,Test (afri-ner -- wer),"[{'entity_group': 'DATE', 'score': 0.9988054, 'word': 'this morning', 'start': 0, 'end': 12}, {'entity_group': 'LOC', 'score': 0.9966927, 'word': 'ibala', 'start': 24, 'end': 29}, {'entity_group': 'LOC', 'score': 0.98816526, 'word': 'ibala', 'start': 33, 'end': 38}, {'entity_group': 'PER', 'score': 0.92170006, 'word': 'utena', 'start': 50, 'end': 55}]",pretrained,"[{'entity_group': 'DATE', 'score': 0.8826487064361572, 'word': 'This morning', 'start': 0, 'end': 12}, {'entity_group': 'PER', 'score': 0.9999626278877258, 'word': 'Chimereya', 'start': 13, 'end': 22}, {'entity_group': 'ORG', 'score': 0.99996018409729, 'word': 'Chinoye hospital', 'start': 31, 'end': 47}, {'entity_group': 'LOC', 'score': 0.9987977743148804, 'word': 'Ibadan', 'start': 51, 'end': 57}, {'entity_group': 'PER', 'score': 0.9999955892562866, 'word': 'Hussaina', 'start': 68, 'end': 76}]",chimereyachinoyehospitalibadanhussaina,ibalaibalautena,chimereyahussaina,,2
9,310476,general,Male,7.816984,19-25,kalabari,5328a1cc3e8a8bb87e9736e5201f8c1a,/AfriSpeech-100/test/e7e5f40f-be39-4473-9f41-b46a5f014c66/3ef1f7240d84bc9ab466da9829b38721.wav,nigerian,NG,...,openai/whisper-small,Test (afri-ner -- wer),"[{'entity_group': 'PER', 'score': 0.9973328, 'word': '', 'start': 4, 'end': 5}, {'entity_group': 'PER', 'score': 0.70736927, 'word': 'dotto', 'start': 4, 'end': 9}]",pretrained,"[{'entity_group': 'PER', 'score': 0.9994386434555054, 'word': 'Nwanneka', 'start': 4, 'end': 12}, {'entity_group': 'PER', 'score': 0.9999184608459473, 'word': 'Na', 'start': 26, 'end': 28}, {'entity_group': 'PER', 'score': 0.576937735080719, 'word': 'gona', 'start': 28, 'end': 32}, {'entity_group': 'PER', 'score': 0.9997363686561584, 'word': 'ne', 'start': 74, 'end': 76}, {'entity_group': 'PER', 'score': 0.9977167844772339, 'word': 'ice Kujore', 'start': 76, 'end': 86}]",nwannekanagonaneicekujore,,nwannekakujore,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27931,575112,general,Other,16.961000,41-55,"yoruba, hausa",3f7402dad2ccf1d2f93f931d41ddb72c,/AfriSpeech-100/test/1f5803ce-28f6-4ef4-b4d9-c40e974904bb/e9873957c35ebb74325b330bda5eba1a.wav,nigerian,NG,...,facebook/wav2vec2-large-xlsr-53-english-clinical,Test (afri-ner -- wer),"[{'entity_group': 'DATE', 'score': 0.9997705, 'word': '15march, 1988', 'start': 6, 'end': 19}, {'entity_group': 'PER', 'score': 0.98847884, 'word': 'unkama izichi', 'start': 23, 'end': 36}]",finetuned,"[{'entity_group': 'DATE', 'score': 0.9999293684959412, 'word': 'Tue 15 Mar, 1988', 'start': 0, 'end': 16}, {'entity_group': 'PER', 'score': 0.9999050498008728, 'word': 'Nkama Ezichi', 'start': 21, 'end': 33}, {'entity_group': 'ORG', 'score': 0.9191330075263977, 'word': 'Hospital Potiskum', 'start': 54, 'end': 71}]",nkamaezichihospitalpotiskum,unkamaizichi,nkamaezichi,,2
27932,594568,general,Male,9.430000,41-55,eggon,33e28feaae9e298e95471515415a233d,/AfriSpeech-100/test/4253c742-9ae2-4c29-b7c1-953e9742c0e2/ca22a37fbd640ff2733a10a640cb5c2b.wav,nigerian,NG,...,facebook/wav2vec2-large-xlsr-53-english-clinical,Test (afri-ner -- wer),"[{'entity_group': 'PER', 'score': 0.89530236, 'word': 'chi', 'start': 10, 'end': 13}, {'entity_group': 'ORG', 'score': 0.95886433, 'word': 'fenichayo hospital', 'start': 29, 'end': 47}, {'entity_group': 'LOC', 'score': 0.824607, 'word': 'buguma', 'start': 51, 'end': 57}, {'entity_group': 'PER', 'score': 0.9933101, 'word': 'za', 'start': 68, 'end': 70}, {'entity_group': 'PER', 'score': 0.5758351, 'word': 'wna', 'start': 70, 'end': 73}]",finetuned,"[{'entity_group': 'DATE', 'score': 0.9419776201248169, 'word': 'This morning', 'start': 0, 'end': 12}, {'entity_group': 'PER', 'score': 0.999982476234436, 'word': 'Chimdiomimi', 'start': 13, 'end': 24}, {'entity_group': 'ORG', 'score': 0.9999440908432007, 'word': 'Feyisayo hospital', 'start': 33, 'end': 50}, {'entity_group': 'LOC', 'score': 0.9994399547576904, 'word': 'Buguma', 'start': 54, 'end': 60}, {'entity_group': 'PER', 'score': 0.9999699592590332, 'word': 'Zauna', 'start': 71, 'end': 76}]",chimdiomimifeyisayohospitalbugumazauna,chifenichayohospitalbugumaza,chimdiomimizauna,,2
27938,613596,general,Male,16.677000,19-25,yala mbembe,b2043f0c05b1d68b6c7194e78247da8d,/AfriSpeech-100/test/69150ad5-a088-48bb-b0a4-1beb29aa4aeb/09ddea703a7a7fc505d662bc5ae77a8c.wav,nigerian,NG,...,facebook/wav2vec2-large-xlsr-53-english-clinical,Test (afri-ner -- wer),"[{'entity_group': 'DATE', 'score': 0.9999844, 'word': '56yr', 'start': 19, 'end': 23}]",finetuned,"[{'entity_group': 'PER', 'score': 0.9812123775482178, 'word': 'Ihechinyere Chinweizu', 'start': 0, 'end': 21}, {'entity_group': 'DATE', 'score': 0.9998412132263184, 'word': '56 yr', 'start': 22, 'end': 27}, {'entity_group': 'LOC', 'score': 0.9896509647369385, 'word': 'OR', 'start': 49, 'end': 51}, {'entity_group': 'ORG', 'score': 0.4551510512828827, 'word': 'S', 'start': 55, 'end': 56}, {'entity_group': 'DATE', 'score': 0.9999850392341614, 'word': '08-11-1989', 'start': 75, 'end': 85}]",ihechinyerechinweizuors,,ihechinyerechinweizu,,2
27941,828544,general,Male,14.237000,19-25,ekpeye,9dc1431272e68c73d54ad310595bed50,/AfriSpeech-100/test/482541f2-75d4-4b3a-9db6-2dea5b16bd81/b9035f5a84ba9c6273052b5aaea69158.wav,nigerian,NG,...,facebook/wav2vec2-large-xlsr-53-english-clinical,Test (afri-ner -- wer),[],finetuned,"[{'entity_group': 'PER', 'score': 0.9942795634269714, 'word': 'Ifechiluru Kaetechukwu', 'start': 8, 'end': 30}]",ifechilurukaetechukwu,,ifechilurukaetechukwu,,2


In [309]:
split = "Test (afri-ner -- wer)"
all_model_entity["split"] = split

columns = [(split, "General"), (split, "Clinical"), (split, "Both")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_entity[all_model_entity.loc[:, "afri_ner_count"] >= 1], 
                       metric="wer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="reference",
                       predictions="prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (afri-ner -- wer),Test (afri-ner -- wer),Test (afri-ner -- wer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General,Clinical,Both
model_tag,name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
finetuned,openai/whisper-medium-general,0.108,0.592,0.149
finetuned,openai/whisper-medium-all,0.117,0.28,0.131
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.212,0.576,0.243
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.256,0.395,0.268
pretrained,openai/whisper-large,0.412,0.623,0.43
finetuned,facebook/wav2vec2-large-xlsr-53-english-clinical,0.451,0.455,0.452
pretrained,openai/whisper-medium-en,0.473,0.662,0.489
finetuned,openai/whisper-medium-clinical,0.476,0.421,0.471
pretrained,openai/whisper-medium,0.488,0.628,0.5
pretrained,Azure,0.509,0.704,0.526


In [301]:
split = "Test (ner -- prediction cer)"
all_model_entity["split"] = split

columns = [(split, "General"), (split, "Clinical"), (split, "Both")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_entity, 
                       metric="cer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="ner_cat",
                       predictions="ner_cat_prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (ner -- cer),Test (ner -- cer),Test (ner -- cer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General,Clinical,Both
model_tag,name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
pretrained,openai/whisper-large,0.565,0.841,0.587
finetuned,openai/whisper-medium-general,0.576,0.829,0.597
finetuned,openai/whisper-medium-all,0.596,0.836,0.615
pretrained,openai/whisper-medium,0.607,0.854,0.627
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.622,0.848,0.64
pretrained,openai/whisper-medium-en,0.623,0.884,0.644
pretrained,openai/whisper-small,0.653,0.86,0.67
pretrained,openai/whisper-small-en,0.668,0.877,0.685
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.67,0.882,0.687
pretrained,Azure,0.674,0.899,0.693


In [302]:
split = "Test (afri-ner -- predicted cer)"
all_model_entity["split"] = split

columns = [(split, "General"), (split, "Clinical"), (split, "Both")]
sort_by = (split, "General")

domain_df = domain_wer(all_model_entity[all_model_entity.loc[:, "afri_ner_cat"] != ""], 
                       metric="cer", 
                       columns=columns, 
                       sort_by=sort_by, 
                       references="afri_ner_cat",
                       predictions="afri_ner_cat_prediction")
domain_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Test (afri-ner -- cer),Test (afri-ner -- cer),Test (afri-ner -- cer)
Unnamed: 0_level_1,Unnamed: 1_level_1,General,Clinical,Both
model_tag,name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
finetuned,openai/whisper-medium-all,0.7,0.866,0.717
finetuned,openai/whisper-medium-general,0.704,0.948,0.729
finetuned,facebook/wav2vec2-large-xlsr-53-english-all,0.815,0.977,0.831
finetuned,facebook/wav2vec2-large-xlsr-53-english-general,0.816,0.977,0.833
pretrained,openai/whisper-large,0.855,0.948,0.865
pretrained,openai/whisper-medium,0.913,0.971,0.918
pretrained,openai/whisper-medium-en,0.927,0.971,0.931
pretrained,openai/whisper-small-en,0.934,0.98,0.939
finetuned,openai/whisper-medium-clinical,0.938,0.977,0.942
pretrained,openai/whisper-small,0.94,1.0,0.946


150