In [1]:
import pandas as pd
import numpy as np

In [42]:
data = pd.read_csv("../data/mls_data_index.csv")

In [45]:
data['language'].value_counts()

yoruba         4716
afrikaans      4341
sesotho        4011
hausa          3874
kinyarwanda    3774
arabic         3714
tswana         3668
zulu           3432
swahili        3363
xhosa          3335
pedi           3297
igbo           3259
luganda        2181
shona          1553
french         1405
akan           1134
amharic         812
twi             510
fulani           83
ga               75
Name: language, dtype: int64

In [10]:
data = data[data['source'] == 'intron-MT']

In [None]:
df_hrs = df_hrs.groupby(['language'])['duration'].sum().reset_index()
df_hrs['hours'] = df_hrs['duration'] / 3600
df_hrs.sort_values(by='hours', ascending=False)

Unnamed: 0,language,duration,hours
7,igbo,14394.875,3.998576
13,swahili,14373.395,3.99261
18,zulu,14367.561,3.990989
8,kinyarwanda,14362.256,3.989516
17,yoruba,14362.106,3.989474
6,hausa,14355.222,3.987562
1,akan,14311.839,3.975511
16,xhosa,14298.56,3.971822
14,tswana,14288.501,3.969028
10,pedi,14067.933,3.907759


In [11]:
df_spkrs = data.groupby(['language'])['speaker_id'].nunique().reset_index()
df_spkrs.sort_values(by='speaker_id', ascending=False)

Unnamed: 0,language,speaker_id
13,swahili,215
6,hausa,163
17,yoruba,142
18,zulu,105
7,igbo,85
8,kinyarwanda,80
16,xhosa,69
1,akan,60
0,afrikaans,44
10,pedi,41


In [14]:
data.groupby(['language'])['gender'].value_counts()

language     gender
afrikaans    Female    1172
             Male        87
akan         Male      1042
             Female      86
amharic      Male       116
             Female      85
arabic       Male       642
             Female      46
french       Male       726
             Female     164
ga           Female      52
             Male         5
hausa        Male       915
igbo         Male      1311
             Female      27
kinyarwanda  Male      1007
             Female       2
luganda      Male        48
pedi         Female     613
             Male       478
sesotho      Female    1285
             Male         4
shona        Female     620
             Male         8
swahili      Male       929
             Female       2
tswana       Male       786
             Female     398
twi          Male       354
             Female      53
xhosa        Female    1020
             Male       345
yoruba       Male      1382
zulu         Male      1487
             Female     209


In [6]:
agg_df = data.groupby(['source', 'language']).agg(
    total_duration_seconds=('duration', 'sum'),
    unique_speakers=('speaker_id', 'nunique')
).reset_index()

# Convert duration in seconds to hours
agg_df['hours'] = agg_df['total_duration_seconds'] / 3600

# Optional: drop the total_duration_seconds column if not needed
agg_df.drop(columns=['total_duration_seconds'], inplace=True)

agg_df.sort_values(by="language")

gender_counts = data.groupby(['source', 'language', 'gender']).size().reset_index(name='gender_count')

# Pivot the table so that each gender becomes its own column.
gender_pivot = gender_counts.pivot_table(index=['source', 'language'], 
                                           columns='gender', 
                                           values='gender_count', 
                                           fill_value=0).reset_index()

# If desired, flatten the column names (gender values become regular columns)
gender_pivot.columns.name = None


result = pd.merge(agg_df, gender_pivot, on=['source', 'language'])

result = result.sort_values(by="language")
result


Unnamed: 0,source,language,unique_speakers,hours,Female,Male,female,male,unknown
5,NCHLT,afrikaans,8,2.6647,0,0,1596,1406,0
37,intron-MT,afrikaans,44,3.336081,1172,87,0,0,0
24,fleurs,afrikaans,23,0.072209,0,0,0,23,0
14,common_voice,afrikaans,13,0.107876,0,0,3,1,53
2,Ashesi-Org_Financial-Inclusion-Speech-Dataset,akan,4,0.009593,0,0,1,5,0
38,intron-MT,akan,60,3.975511,86,1042,0,0,0
0,ALFFA,amharic,359,0.730994,0,0,0,0,359
39,intron-MT,amharic,11,0.295589,85,116,0,0,0
25,fleurs,amharic,46,0.14725,0,0,47,0,0
15,common_voice,amharic,17,0.354978,0,0,0,155,50


In [40]:
lang_df = result[result['language']=="ga"]
lang_df

Unnamed: 0,source,language,unique_speakers,hours,Female,Male,female,male,unknown
42,intron-MT,ga,6,0.223285,52,5,0,0,0
3,Ashesi-Org_Financial-Inclusion-Speech-Dataset,ga,12,0.026059,0,0,12,6,0


In [41]:
print(f"Unique speakers:  {lang_df['unique_speakers'].sum()}")
total_hours = lang_df['hours'].sum()
print(f"Total number of hours: {total_hours}")
intron_hrs = lang_df[lang_df['source'] == 'intron-MT']['hours'].values
print(f"Intron (hrs): {intron_hrs}")
print(f"Opensource (hrs): {total_hours - intron_hrs}")

Unique speakers:  18
Total number of hours: 0.24934406250000002
Intron (hrs): [0.223285]
Opensource (hrs): [0.02605906]


## Multilingual LibriSpeech (MLS)

In [2]:
from datasets import load_dataset, Audio 



In [11]:
mls

IterableDataset({
    features: ['audio', 'original_path', 'begin_time', 'end_time', 'transcript', 'audio_duration', 'speaker_id', 'chapter_id', 'file', 'id'],
    n_shards: 1
})

In [3]:
from IPython.display import Audio as IPAudio

In [23]:
from huggingface_hub import snapshot_download

# This will download only files in the "my_folder" directory
local_dir = snapshot_download(
    repo_id="parler-tts/mls_eng",
    repo_type="dataset",
    allow_patterns=["data/test-*"]
)
print(f"Files downloaded to: {local_dir}")


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

test-00000-of-00001.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

Files downloaded to: /home/busayo/.cache/huggingface/hub/datasets--parler-tts--mls_eng/snapshots/faf6604dcf0bb9ec0d9c280a140ad74da9c931b0


In [25]:
data_dir = '/home/busayo/.cache/huggingface/hub/datasets--facebook--multilingual_librispeech/snapshots/2e83e61823b4c47dcbcb1980bb88601274127609/data'

In [40]:
mls = load_dataset("facebook/multilingual_librispeech", "spanish", split="test")
mls = mls.cast_column("audio", Audio(decode=False))
mls = mls.flatten()
mls = mls.rename_column("audio.bytes", "audio_bytes")
mls = mls.rename_column("audio.path", "audio_path")
mls = mls.to_pandas()



Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

In [41]:
# Specify the directory where you want to save audio files
output_directory = "m_librispeech/spanish/"

# Apply the function to each row. Here, we use the DataFrame index to generate a filename.
mls['audio_path'] = mls.apply(
    lambda row: process_and_save_audio(
        row['audio_bytes'],
        output_directory,
        f"{row['id']}.wav"
    ),
    axis=1
)
mls = mls.rename(columns={'transcript': 'text', 'audio_duration': 'duration'})
mls.to_csv("m_librispeech_spanish.csv", index=False)

In [32]:
mls

Unnamed: 0,audio_bytes,audio_path,original_path,begin_time,end_time,transcript,audio_duration,speaker_id,chapter_id,file,id
0,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,1406_1028_000000.opus,http://www.archive.org/download/les1001nuits_t...,210.62,227.11,pendant le second siècle je fis serment d'ouvr...,16.49,1406,1028,1406_1028_000000.opus,1406_1028_000000
1,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,1406_1028_000001.opus,http://www.archive.org/download/les1001nuits_t...,287.24,306.54,non ta mort est certaine dit le génie choisis ...,19.30,1406,1028,1406_1028_000001.opus,1406_1028_000001
2,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,1406_1028_000002.opus,http://www.archive.org/download/les1001nuits_t...,14.16,26.82,la nuit suivante appela sa soeur quand il en f...,12.66,1406,1028,1406_1028_000002.opus,1406_1028_000002
3,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,1406_1028_000003.opus,http://www.archive.org/download/les1001nuits_t...,244.87,255.54,à l'aspect d'un monstre d'une grandeur si déme...,10.67,1406,1028,1406_1028_000003.opus,1406_1028_000003
4,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,1406_1028_000004.opus,http://www.archive.org/download/les1001nuits_t...,416.96,435.04,le sultan qui n'avait pas moins d'envie que di...,18.08,1406,1028,1406_1028_000004.opus,1406_1028_000004
...,...,...,...,...,...,...,...,...,...,...,...
2421,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,9834_9697_000156.opus,http://www.archive.org/download/zadig_ou_la_de...,390.59,404.58,par les plus grands forfaits sur le trône affe...,13.99,9834,9697,9834_9697_000156.opus,9834_9697_000156
2422,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,9834_9697_000157.opus,http://www.archive.org/download/zadig_ou_la_de...,579.53,589.57,il parla avec tant de grâce d'esprit et de rai...,10.04,9834,9697,9834_9697_000157.opus,9834_9697_000157
2423,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,9834_9697_000158.opus,http://www.archive.org/download/zadig_ou_la_de...,335.30,347.14,quand tu manges donne à manger aux chiens duss...,11.84,9834,9697,9834_9697_000158.opus,9834_9697_000158
2424,b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00...,9834_9697_000159.opus,http://www.archive.org/download/zadig_ou_la_de...,322.71,334.82,d'autres traces en un sens différent qui parai...,12.11,9834,9697,9834_9697_000159.opus,9834_9697_000159


In [24]:
eng = '/home/busayo/.cache/huggingface/hub/datasets--parler-tts--mls_eng/snapshots/faf6604dcf0bb9ec0d9c280a140ad74da9c931b0'

In [4]:
test = {"test": "test-00000-of-00001.parquet"}
raw_datasets = load_dataset("parquet", data_dir='/home/busayo/.cache/huggingface/hub/datasets--parler-tts--mls_eng/snapshots/faf6604dcf0bb9ec0d9c280a140ad74da9c931b0/data', data_files=test)



In [5]:
test = {"test": "test-00000-of-00001.parquet"}
raw_datasets = load_dataset("parquet", data_dir='/home/busayo/.cache/huggingface/hub/datasets--parler-tts--mls_eng/snapshots/faf6604dcf0bb9ec0d9c280a140ad74da9c931b0/data', data_files=test)
raw_datasets = raw_datasets.cast_column("audio", Audio(decode=False))

In [6]:
raw_datasets['test']['audio'][0]

{'bytes': b'OggS\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\xba\xff\xe2x\x00\x00\x00\x00x\xfc\xf5\xd3\x01\x13OpusHead\x01\x018\x01\x80>\x00\x00\x00\x00\x00OggS\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xba\xff\xe2x\x01\x00\x00\x000\x05\xe5>\x04\xff\xff\xff\xfeOpusTags\r\x00\x00\x00libopus 1.1.2\t\x00\x00\x00&\x00\x00\x00ENCODER=opusenc from opus-tools 0.1.10\x17\x00\x00\x00ENCODER_OPTIONS=--quiet \x00\x00\x00title=03 - The Revolt of Timothy\x17\x00\x00\x00artist=Jean M. Thompson$\x00\x00\x00album=The Three Bears of Porcupine R(\x00\x00\x00DESCRIPTION=https://archive.org/details/\r\x00\x00\x00TRACKNUMBER=3\x0c\x00\x00\x00genre=Speech\x15\x00\x00\x00encoder=Lavf57.83.100\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

In [7]:
raw_datasets = raw_datasets.flatten()
raw_datasets = raw_datasets.rename_column("audio.bytes", "audio_bytes")
raw_datasets = raw_datasets.rename_column("audio.path", "audio_path")
raw_datasets = raw_datasets['test'].to_pandas()

In [8]:
import io
import os
import librosa
import soundfile as sf  # librosa uses PySoundFile internally
import pandas as pd

In [15]:
pp = raw_datasets['audio_bytes'].iloc[0]
audio_buffer = io.BytesIO(pp)

# Load the audio data using librosa.load
# Note: sr=None preserves the original sampling rate
y, sr = librosa.load(audio_buffer, sr=None)

print("Audio time series shape:", y.shape)
print("Sampling rate:", sr)

Audio time series shape: (269920,)
Sampling rate: 16000


In [16]:
IPAudio(y, rate=sr)

In [20]:
def process_and_save_audio(audio_bytes, save_dir, file_name, target_sr=None):
    """
    Reads audio from raw bytes using librosa and saves the audio file.
    
    Parameters:
        audio_bytes (bytes): The raw audio data.
        save_dir (str): Directory where the audio file will be saved.
        file_name (str): Desired file name (e.g., "audio_0.wav").
        target_sr (int or None): If specified, resample audio to this sampling rate.
                                 If None, use the original sampling rate.
    
    Returns:
        str: The path to the saved audio file.
    """
    # Ensure the output directory exists
    os.makedirs(save_dir, exist_ok=True)
    
    # Wrap bytes in a file-like object
    audio_buffer = io.BytesIO(audio_bytes)
    
    # Load audio with librosa; if target_sr is None, preserve original sample rate.
    y, sr = librosa.load(audio_buffer, sr=target_sr)
    
    # Determine the sampling rate to save with:
    save_sr = target_sr if target_sr is not None else sr
    
    # Construct the full path for saving the file
    file_path = os.path.join(save_dir, file_name)
    
    # Save the audio using soundfile
    sf.write(file_path, y, save_sr)
    
    return file_path



In [23]:
# Specify the directory where you want to save audio files
output_directory = "m_librispeech/english/"

# Apply the function to each row. Here, we use the DataFrame index to generate a filename.
raw_datasets['saved_path'] = raw_datasets.apply(
    lambda row: process_and_save_audio(
        row['audio_bytes'],
        output_directory,
        f"{os.path.splitext(row['audio_path'])[0]}.wav"
    ),
    axis=1
)


In [34]:
raw_datasets = raw_datasets.rename(columns={'transcript': 'text', 'audio_duration': 'duration'})
raw_datasets.to_csv("m_librispeech_english.csv", index=False)

In [90]:
raw_datasets['audio_path'].iloc[0]

'10226_10111_000000.opus'

In [54]:
df_path = '/home/busayo/busayo/asr_benchmarking/m_librispeech_spanish.csv'
data = pd.read_csv(df_path)

In [55]:
data.columns

Index(['audio_bytes', 'audio_path', 'original_path', 'begin_time', 'end_time',
       'text', 'duration', 'speaker_id', 'chapter_id', 'file', 'id'],
      dtype='object')

In [56]:
data = data.drop(columns=['audio_bytes'], axis=1)
data = data.rename(columns={'saved_path': 'audio_path'})

In [57]:
data.to_csv(df_path, index=False)

In [63]:
base = "/home/busayo/busayo/asr_benchmarking/m_librispeech_"
df_list = [f"{base}{lang}.csv" for  lang in ["english", "spanish", "french"]] 

In [65]:
pd.read_csv(df_list[0]).columns, pd.read_csv(df_list[1]).columns, pd.read_csv(df_list[2]).columns

(Index(['original_path', 'begin_time', 'end_time', 'text', 'duration',
        'speaker_id', 'book_id', 'audio_path'],
       dtype='object'),
 Index(['audio_path', 'original_path', 'begin_time', 'end_time', 'text',
        'duration', 'speaker_id', 'chapter_id', 'file', 'id'],
       dtype='object'),
 Index(['audio_path', 'original_path', 'begin_time', 'end_time', 'text',
        'duration', 'speaker_id', 'chapter_id', 'file', 'id'],
       dtype='object'))

In [81]:
df_full = pd.DataFrame()
for df in df_list:
    df_temp = pd.read_csv(df)[['audio_path', 'begin_time', 'end_time', 'text',
        'duration', 'speaker_id',]]
    df_temp['language'] = df.split("_")[-1].split(".")[0]
    df_full = pd.concat([df_full, df_temp], ignore_index=True)

In [71]:
df_temp['language'] = df.split("_")[-1].split(".")[0]

In [74]:
df_full['language'].value_counts()

english    3769
french     2426
spanish    2385
Name: language, dtype: int64

In [82]:
df_full['audio_path'] = df_full['audio_path'].apply(lambda x: x.replace("m_librispeech", "/data/m_librispeech"))

In [84]:
df_full.describe()

Unnamed: 0,begin_time,end_time,duration,speaker_id
count,8580.0,8580.0,8580.0,8580.0
mean,665.474608,680.419727,14.945119,7738.164918
std,585.839591,585.892557,2.853238,4256.676642
min,0.0,12.76,10.0,97.0
25%,209.2725,223.3175,12.49,3503.0
50%,498.635,513.53,14.91,7788.0
75%,969.9,984.975,17.33,12037.0
max,3945.36,3958.13,20.0,13894.0


In [77]:
df_full.to_csv("m_librispeech_fulltest.csv", index=False)

In [78]:
df_full

Unnamed: 0,audio_path,begin_time,end_time,text,duration,speaker_id,language
0,/data/m_librispeech/english/10226_10111_000000...,102.02,118.89,after his nap timothy lazily stretched first o...,16.87,10226,english
1,/data/m_librispeech/english/10226_10111_000001...,118.89,131.17,for timothy was a spoiled cat and he allowed n...,12.28,10226,english
2,/data/m_librispeech/english/10226_10111_000002...,239.06,252.27,but timothy resolved to punish them all and th...,13.21,10226,english
3,/data/m_librispeech/english/10226_10111_000003...,229.06,239.06,so timothy went away vainly they searched for ...,10.00,10226,english
4,/data/m_librispeech/english/10226_10111_000004...,282.58,297.79,said timothy then he began to cuff at the hedg...,15.21,10226,english
...,...,...,...,...,...,...,...
8575,/data/m_librispeech/french/9834_9697_000156.wav,390.59,404.58,par les plus grands forfaits sur le trône affe...,13.99,9834,french
8576,/data/m_librispeech/french/9834_9697_000157.wav,579.53,589.57,il parla avec tant de grâce d'esprit et de rai...,10.04,9834,french
8577,/data/m_librispeech/french/9834_9697_000158.wav,335.30,347.14,quand tu manges donne à manger aux chiens duss...,11.84,9834,french
8578,/data/m_librispeech/french/9834_9697_000159.wav,322.71,334.82,d'autres traces en un sens différent qui parai...,12.11,9834,french


In [85]:
df_full['file_size'] = df_full['audio_path'].apply(lambda x: os.path.getsize(x) / (1024 * 1024))
df_full['file_size'].describe()

count    8580.000000
mean        0.456131
std         0.087074
min         0.305218
25%         0.381207
50%         0.455059
75%         0.528912
max         0.610394
Name: file_size, dtype: float64