In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
train_csv = "../data/intron-train-public-58000-clean.csv"
test_csv = "../data/intron-test-public-6346-clean.csv"
dev_csv = "../data/intron-dev-public-3231-clean.csv"
all_csv = {"train": train_csv, "test": test_csv, "dev": dev_csv}
dfs = []
for k,v in all_csv.items():
    df = pd.read_csv(v)
    df["split"] = k
    dfs.append(df)
dfs = pd.concat(dfs)

In [3]:
dfs.shape

(67577, 14)

In [4]:
dfs.split.value_counts()

train    58000
test      6346
dev       3231
Name: split, dtype: int64

In [5]:
dfs.country.value_counts(dropna=False)

NG     45875
KE      8304
ZA      7870
GH      2018
BW      1391
UG      1092
RW       469
US       219
NaN       92
TR        66
ZW        63
MW        60
TZ        51
LS         7
Name: country, dtype: int64

### Table 1

In [6]:
country_mapping = pd.read_csv("../data/african_country_code_mapping.csv")
country_mapping.head()

Unnamed: 0,Country,Code
0,ALGERIA,DZ
1,ANGOLA,AO
2,BENIN,BJ
3,BOTSWANA,BW
4,BURKINA FASO,BF


In [7]:
len(country_mapping)

57

In [8]:
afris_country = dfs.country.value_counts().to_frame(name="clips").reset_index(names="code")[["code"]]
country_mapping.columns = country_mapping.columns.str.lower()
afris_country_code = country_mapping.merge(afris_country, on="code", how="right")
afris_non_african_countries = {"US":"UNITED STATES", "TR": "TURKEY"}
afris_country_code.country = afris_country_code.apply(lambda x: x.country if x.country is not np.nan \
                                                      else afris_non_african_countries[x.code], axis=1)
afris_country_code = afris_country_code.rename(columns = {"country": "country_name", "code": "country"}) 
afris_country_code.country_name = afris_country_code.country_name.str.capitalize()

In [9]:
afris_country_code

Unnamed: 0,country_name,country
0,Nigeria,NG
1,Kenya,KE
2,South africa,ZA
3,Ghana,GH
4,Botswana,BW
5,Uganda,UG
6,Rwanda,RW
7,United states,US
8,Turkey,TR
9,Zimbabwe,ZW


In [10]:
dfs = dfs.merge(afris_country_code, on="country", how="left")
dfs.shape

(67577, 15)

In [11]:
dfs.country_name.value_counts(dropna=False)

Nigeria                         45875
Kenya                            8304
South africa                     7870
Ghana                            2018
Botswana                         1391
Uganda                           1092
Rwanda                            469
United states                     219
NaN                                92
Turkey                             66
Zimbabwe                           63
Malawi                             60
Tanzania, united republic of       51
Lesotho                             7
Name: country_name, dtype: int64

In [12]:
dfs.columns

Index(['idx', 'user_ids', 'accent', 'age_group', 'country', 'transcript',
       'nchars', 'audio_ids', 'audio_paths', 'duration', 'origin', 'domain',
       'split', 'gender', 'country_name'],
      dtype='object')

In [13]:
speakers_stats = dfs.groupby(["country_name", "country"]).apply(lambda x: len(set(x.user_ids))).to_frame().rename(columns={0: "speakers"})
clips_stats = dfs.groupby(["country_name", "country"]).apply(lambda x: len(x.audio_ids)).to_frame().rename(columns={0: "clips"})
durations_stats = dfs.groupby(["country_name", "country"]).apply(lambda x: sum(x.duration) / 3600).to_frame().rename(columns={0: "hours"}).round(2)

table_1 = pd.concat([clips_stats, speakers_stats, durations_stats], axis=1).sort_values("clips", ascending=False).reset_index()
table_1.columns = table_1.columns.str.capitalize()
table_1

Unnamed: 0,Country_name,Country,Clips,Speakers,Hours
0,Nigeria,NG,45875,1979,142.4
1,Kenya,KE,8304,137,20.89
2,South africa,ZA,7870,223,22.69
3,Ghana,GH,2018,37,5.16
4,Botswana,BW,1391,38,3.96
5,Uganda,UG,1092,26,2.89
6,Rwanda,RW,469,9,1.47
7,United states,US,219,5,0.53
8,Turkey,TR,66,1,0.18
9,Zimbabwe,ZW,63,3,0.18


In [14]:
print(table_1.to_latex(index=None))

\begin{tabular}{llrrr}
\toprule
                 Country\_name & Country &  Clips &  Speakers &  Hours \\
\midrule
                      Nigeria &      NG &  45875 &      1979 & 142.40 \\
                        Kenya &      KE &   8304 &       137 &  20.89 \\
                 South africa &      ZA &   7870 &       223 &  22.69 \\
                        Ghana &      GH &   2018 &        37 &   5.16 \\
                     Botswana &      BW &   1391 &        38 &   3.96 \\
                       Uganda &      UG &   1092 &        26 &   2.89 \\
                       Rwanda &      RW &    469 &         9 &   1.47 \\
                United states &      US &    219 &         5 &   0.53 \\
                       Turkey &      TR &     66 &         1 &   0.18 \\
                     Zimbabwe &      ZW &     63 &         3 &   0.18 \\
                       Malawi &      MW &     60 &         1 &   0.15 \\
 Tanzania, united republic of &      TZ &     51 &         2 &   0.18 \\
         

  print(table_1.to_latex(index=None))


In [15]:
dfs.groupby(["country_name", "country"]).apply(lambda x: list(set(x.accent))).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
country_name,country,Unnamed: 2_level_1
Botswana,BW,"[luganda, tswana, setswana]"
Ghana,GH,"[akan (fante), twi, akan]"
Kenya,KE,"[luo, kikuyu, meru, luganda, luhya, swahili, d..."
Lesotho,LS,[zulu]
Malawi,MW,[chichewa]
Nigeria,NG,"[afo, jukun, gerawa, bajju, nyandang, kagoma, ..."
Rwanda,RW,[kinyarwanda]
South africa,ZA,"[portuguese, isizulu, siswati, setswana, xhosa..."
"Tanzania, united republic of",TZ,[swahili]
Turkey,TR,[zulu]


In [16]:
# total number of speakers
table_1.Speakers.sum()

2462

In [17]:
table_1.Hours.sum()

200.70000000000005

In [18]:
dfs.groupby(["country_name", "country", "user_ids", "duration"]).sum().reset_index()[["duration"]].mean()

  dfs.groupby(["country_name", "country", "user_ids", "duration"]).sum().reset_index()[["duration"]].mean()


duration    10.72538
dtype: float64

### Table 2

In [19]:
(dfs.gender.value_counts(normalize=True, dropna=False)*100).round(2)

Female    57.11
Male      42.41
Other      0.48
Name: gender, dtype: float64

In [20]:
(dfs.age_group.value_counts(normalize=True, dropna=False)*100).round(2)

19-25     54.35
26-40     27.18
41-55     15.35
<18yrs     1.87
56yrs>     0.83
NaN        0.42
Name: age_group, dtype: float64

In [21]:
(dfs.age_group.value_counts(normalize=False, dropna=False))

19-25     36728
26-40     18366
41-55     10374
<18yrs     1264
56yrs>      563
NaN         282
Name: age_group, dtype: int64

In [22]:
(dfs.origin.value_counts(normalize=True, dropna=False))

nigerian    0.678855
african     0.321145
Name: origin, dtype: float64

In [23]:
(dfs.domain.value_counts(normalize=False, dropna=False))

clinical    41765
general     25812
Name: domain, dtype: int64

In [24]:
(dfs.domain.value_counts(normalize=True, dropna=False) * 100).round(2)

clinical    61.8
general     38.2
Name: domain, dtype: float64

In [25]:
dfs.columns

Index(['idx', 'user_ids', 'accent', 'age_group', 'country', 'transcript',
       'nchars', 'audio_ids', 'audio_paths', 'duration', 'origin', 'domain',
       'split', 'gender', 'country_name'],
      dtype='object')

### Table 3

In [26]:
dfs.groupby(["split"]).apply(lambda x: len(set(x.user_ids))).to_frame().rename(columns={0: "speakers"})


Unnamed: 0_level_0,speakers
split,Unnamed: 1_level_1
dev,247
test,750
train,1466


In [27]:
dfs.groupby(["split"]).apply(lambda x: len(x.audio_ids)).to_frame().rename(columns={0: "clips"})

Unnamed: 0_level_0,clips
split,Unnamed: 1_level_1
dev,3231
test,6346
train,58000


In [28]:
dfs.groupby(["split"]).apply(lambda x: sum(x.duration) / 3600).to_frame().rename(columns={0: "hours"}).round(2)

Unnamed: 0_level_0,hours
split,Unnamed: 1_level_1
dev,8.74
test,18.77
train,173.4


In [29]:
dfs.groupby(["split"]).apply(lambda x: len(set(x.accent))).to_frame().rename(columns={0: "accents"})

Unnamed: 0_level_0,accents
split,Unnamed: 1_level_1
dev,45
test,108
train,71


In [30]:
dfs.groupby(["split"]).apply(lambda x: x.duration.mean())

split
dev       9.732928
test     10.645934
train    10.762566
dtype: float64

In [31]:
tmp = dfs.groupby(["split", "user_ids"]).apply(lambda x: x.duration.sum()).reset_index().rename(columns={0: "duration"})
tmp.groupby(["split"]).mean().round(2)

  tmp.groupby(["split"]).mean().round(2)


Unnamed: 0_level_0,duration
split,Unnamed: 1_level_1
dev,127.32
test,90.08
train,425.8


In [32]:
tmp = dfs.groupby(["split", "user_ids"]).apply(lambda x: len(x.idx)).reset_index().rename(columns={0: "clips"})
tmp.groupby(["split"]).mean().round(2)

  tmp.groupby(["split"]).mean().round(2)


Unnamed: 0_level_0,clips
split,Unnamed: 1_level_1
dev,13.08
test,8.46
train,39.56


In [33]:
tmp = dfs.groupby(["split", "accent"]).apply(lambda x: x.duration.sum()).reset_index().rename(columns={0: "duration"})
tmp.groupby(["split"]).mean().round(2)

  tmp.groupby(["split"]).mean().round(2)


Unnamed: 0_level_0,duration
split,Unnamed: 1_level_1
dev,698.82
test,625.55
train,8791.96


In [34]:
dfs[dfs.domain == "general"].groupby(["split"]).count()["idx"]

split
dev       1407
test      2723
train    21682
Name: idx, dtype: int64

In [35]:
dfs[dfs.domain == "clinical"].groupby(["split"]).count()["idx"]

split
dev       1824
test      3623
train    36318
Name: idx, dtype: int64

### Verify speaker uniqueness accross different split

In [36]:
# are any train speaker seen in dev?
assert len(set(dfs[dfs.split == "train"].user_ids.unique().tolist()) - set(dfs[dfs.split == "dev"].user_ids.unique().tolist())) \
== len(set(dfs[dfs.split == "train"].user_ids.unique().tolist()))

In [37]:
# are any train speaker seen in test?
assert len(set(dfs[dfs.split == "train"].user_ids.unique().tolist()) - set(dfs[dfs.split == "test"].user_ids.unique().tolist())) \
== len(set(dfs[dfs.split == "train"].user_ids.unique().tolist()))

In [38]:
# are any dev speaker seen in test?
assert len(set(dfs[dfs.split == "dev"].user_ids.unique().tolist()) - set(dfs[dfs.split == "test"].user_ids.unique().tolist())) \
== len(set(dfs[dfs.split == "dev"].user_ids.unique().tolist()))

### Afrispeech accents normalization

In [39]:
# dfs.accent.value_counts().keys().to_frame().to_csv("afrispeech_accents.csv", index=None)