In [2]:
import pandas as pd

In [42]:
# Load the dataset index (one row per indexed item) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../data/mls_data_index.csv")

In [45]:
# Number of rows per language in `data`, most frequent language first.
data.loc[:, 'language'].value_counts()

yoruba         4716
afrikaans      4341
sesotho        4011
hausa          3874
kinyarwanda    3774
arabic         3714
tswana         3668
zulu           3432
swahili        3363
xhosa          3335
pedi           3297
igbo           3259
luganda        2181
shona          1553
french         1405
akan           1134
amharic         812
twi             510
fulani           83
ga               75
Name: language, dtype: int64

In [10]:
# Keep only the rows whose `source` is intron-MT.
# NOTE: this rebinds `data`, so any cell that reads `data` after this one
# no longer sees rows from the other sources.
data = data.loc[data['source'].eq('intron-MT')]

In [None]:
# Total audio duration per language, in seconds and in hours.
# The frame is built from `data`; the previous version grouped `df_hrs` by
# itself, which raises NameError on a fresh kernel because `df_hrs` does not
# exist yet at that point.
df_hrs = (
    data.groupby(['language'])['duration']
    .sum()
    .reset_index()
    # `duration` is divided by 3600, i.e. it is treated as seconds.
    .assign(hours=lambda frame: frame['duration'] / 3600)
)
df_hrs.sort_values(by='hours', ascending=False)

Unnamed: 0,language,duration,hours
7,igbo,14394.875,3.998576
13,swahili,14373.395,3.99261
18,zulu,14367.561,3.990989
8,kinyarwanda,14362.256,3.989516
17,yoruba,14362.106,3.989474
6,hausa,14355.222,3.987562
1,akan,14311.839,3.975511
16,xhosa,14298.56,3.971822
14,tswana,14288.501,3.969028
10,pedi,14067.933,3.907759


In [11]:
# Distinct speaker ids per language, shown with the largest speaker pool first.
df_spkrs = (
    data.groupby('language')['speaker_id']
    .nunique()
    .reset_index()
)
df_spkrs.sort_values('speaker_id', ascending=False)

Unnamed: 0,language,speaker_id
13,swahili,215
6,hausa,163
17,yoruba,142
18,zulu,105
7,igbo,85
8,kinyarwanda,80
16,xhosa,69
1,akan,60
0,afrikaans,44
10,pedi,41


In [14]:
# For every language, count how many rows carry each gender label.
(
    data
    .groupby('language')['gender']
    .value_counts()
)

language     gender
afrikaans    Female    1172
             Male        87
akan         Male      1042
             Female      86
amharic      Male       116
             Female      85
arabic       Male       642
             Female      46
french       Male       726
             Female     164
ga           Female      52
             Male         5
hausa        Male       915
igbo         Male      1311
             Female      27
kinyarwanda  Male      1007
             Female       2
luganda      Male        48
pedi         Female     613
             Male       478
sesotho      Female    1285
             Male         4
shona        Female     620
             Male         8
swahili      Male       929
             Female       2
tswana       Male       786
             Female     398
twi          Male       354
             Female      53
xhosa        Female    1020
             Male       345
yoruba       Male      1382
zulu         Male      1487
             Female     209


In [6]:
# Per-(source, language) summary: unique speakers, hours of audio and the
# number of rows carrying each gender label.
#
# This summary needs every source, so the index is re-read here rather than
# taken from `data`: an earlier cell narrows `data` to intron-MT only, and a
# top-to-bottom run would otherwise yield a table with a single source.
# Keep this path in sync with the one used to load `data`.
all_sources = pd.read_csv("../data/mls_data_index.csv")

group_keys = ['source', 'language']

agg_df = (
    all_sources.groupby(group_keys)
    .agg(
        total_duration_seconds=('duration', 'sum'),
        unique_speakers=('speaker_id', 'nunique'),
    )
    # Convert duration in seconds to hours, then drop the raw seconds column.
    .assign(hours=lambda frame: frame['total_duration_seconds'] / 3600)
    .drop(columns=['total_duration_seconds'])
    .reset_index()
)

gender_counts = (
    all_sources.groupby(group_keys + ['gender'])
    .size()
    .reset_index(name='gender_count')
)

# Pivot the table so that each gender label becomes its own column.
# Every (source, language, gender) triple occurs once in `gender_counts`, so
# 'sum' just carries the count through and keeps it an integer.
# NOTE(review): labels are used verbatim, so spellings that differ only in case
# (e.g. 'Female' vs 'female') end up in separate columns -- normalise the
# `gender` column upstream if they are meant to be the same category.
gender_pivot = (
    gender_counts.pivot_table(
        index=group_keys,
        columns='gender',
        values='gender_count',
        aggfunc='sum',
        fill_value=0,
    )
    .rename_axis(columns=None)  # flatten: gender labels become regular columns
    .reset_index()
)
gender_cols = [col for col in gender_pivot.columns if col not in group_keys]

# Left merge: a (source, language) group whose rows have no gender label at all
# is absent from `gender_pivot`; an inner merge would silently drop it.
result = agg_df.merge(gender_pivot, on=group_keys, how='left')
result[gender_cols] = result[gender_cols].fillna(0).astype(int)

result = result.sort_values(by="language")
result


Unnamed: 0,source,language,unique_speakers,hours,Female,Male,female,male,unknown
5,NCHLT,afrikaans,8,2.6647,0,0,1596,1406,0
37,intron-MT,afrikaans,44,3.336081,1172,87,0,0,0
24,fleurs,afrikaans,23,0.072209,0,0,0,23,0
14,common_voice,afrikaans,13,0.107876,0,0,3,1,53
2,Ashesi-Org_Financial-Inclusion-Speech-Dataset,akan,4,0.009593,0,0,1,5,0
38,intron-MT,akan,60,3.975511,86,1042,0,0,0
0,ALFFA,amharic,359,0.730994,0,0,0,0,359
39,intron-MT,amharic,11,0.295589,85,116,0,0,0
25,fleurs,amharic,46,0.14725,0,0,47,0,0
15,common_voice,amharic,17,0.354978,0,0,0,155,50


In [40]:
# Slice the per-source summary down to a single language ("ga").
lang_df = result.loc[result['language'].eq("ga")]
lang_df

Unnamed: 0,source,language,unique_speakers,hours,Female,Male,female,male,unknown
42,intron-MT,ga,6,0.223285,52,5,0,0,0
3,Ashesi-Org_Financial-Inclusion-Speech-Dataset,ga,12,0.026059,0,0,12,6,0


In [41]:
# Headline numbers for the language currently held in `lang_df`.
# NOTE(review): speaker counts are summed across sources, so a speaker id that
# appears in more than one source would be counted once per source -- confirm
# that ids are disjoint between sources.
total_speakers = lang_df['unique_speakers'].sum()
total_hours = lang_df['hours'].sum()

# Reduce to a scalar with .sum(): `.values` produced a one-element array, so
# the figures printed with brackets, and an empty array (printed as "[]") when
# the language has no intron-MT row. With .sum() that case is simply 0.
intron_hrs = lang_df.loc[lang_df['source'] == 'intron-MT', 'hours'].sum()
opensource_hrs = total_hours - intron_hrs

print(f"Unique speakers:  {total_speakers}")
print(f"Total number of hours: {total_hours}")
print(f"Intron (hrs): {intron_hrs}")
print(f"Opensource (hrs): {opensource_hrs}")

Unique speakers:  18
Total number of hours: 0.24934406250000002
Intron (hrs): [0.223285]
Opensource (hrs): [0.02605906]
