In [6]:
import os
import requests
import torch
from datasets import load_dataset
import torchcodec

import urllib.request
import shutil
from huggingface_hub import snapshot_download

import pandas as pd
import json
import glob
import os
import zipfile
import soundfile as sf

  from .autonotebook import tqdm as notebook_tqdm


## Primock - UK Datasets

In [None]:


snapshot_download(
    repo_id="sdialog/Primock-57",
    repo_type="dataset",
    local_dir="Primock-57",
)
# load the json files in primock-57/data as dataframe


data_path = '/home/kelechi/bio_ramp_asr/data/Primock-57/data/'
# count number of json files in data_path
json_files = glob.glob(os.path.join(data_path, '*.json'))
print(f'Number of json files in {data_path}: {len(json_files)}')

# read the first json file as dataframe
with open(json_files[0], 'r') as f:
    data = json.load(f)
    primock57_df = pd.json_normalize(data)
# print(primock57_df.head())

# ensure each json item is flattened into a single row
primock57_df = pd.json_normalize(data, sep='_')


# do same for all json files and concatenate into a single dataframe, and include file name as a column
all_dfs = []
for json_file in json_files:
    with open(json_file, 'r') as f:
        data = json.load(f)
        df = pd.json_normalize(data, sep='_')
        df['file_name'] = os.path.basename(json_file)
        all_dfs.append(df)
primock57_df = pd.concat(all_dfs, ignore_index=True)


Fetching 59 files: 100%|██████████| 59/59 [00:00<00:00, 818.73it/s]

Number of json files in /home/kelechi/bio_ramp_asr/Primock-57/data/: 57





## afrispeech - African medical datasets

In [8]:
#intronhealth_dataset = load_dataset("intronhealth/med-convo-nig")
afrispeech_dialog = load_dataset("intronhealth/afrispeech-dialog")

# load the afrispeech-dialog dataset as dataframe
afrispeech_dialog_df = afrispeech_dialog['train'].to_pandas()

# load sample audio file from afrispeech-dialog dataset
audio_1 = afrispeech_dialog_df.iloc[0]['audio']['path']
afrispeech_dialog_df['audio_file'] = afrispeech_dialog_df['audio'].apply(lambda x: x['path'])

# play audio file
# import IPython.display as ipd
# ipd.Audio(audio_1)

## US - Medical dataset

In [1]:
# download dataset from url

url = "https://springernature.figshare.com/ndownloader/files/30598530"
urllib.request.urlretrieve(url, "us_dataset.zip")
# move zip file to data/us_dataset directory
shutil.move("us_dataset.zip", "data/us_datasets/us_dataset.zip")

('us_dataset.zip', <http.client.HTTPMessage at 0x7ee84b39af10>)

In [4]:
# extract zip file
with zipfile.ZipFile("data/us_datasets/us_dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("data/us_datasets/")

us_datasets_path = '/home/kelechi/bio_ramp_asr/data/us_datasets/Data'



# collect transcripts and audio files by base filename (case-insensitive)
audio_exts = {'.wav', '.mp3', '.flac', '.m4a', '.aac', '.ogg'}
transcripts = {}
audios = {}

for root, dirs, files in os.walk(us_datasets_path):
    for fname in files:
        base, ext = os.path.splitext(fname)
        ext = ext.lower()
        fullpath = os.path.join(root, fname)
        if ext == '.txt':
            transcripts[base] = fullpath
        elif ext in audio_exts:
            audios[base] = fullpath

# merge entries that have both transcript and audio
all_data = []
for base, tpath in transcripts.items():
    apath = audios.get(base)
    if apath:
        with open(tpath, 'r', encoding='utf-8', errors='replace') as f:
            transcript = f.read().strip()
        all_data.append({'audio_file': apath, 'transcript': transcript})

us_datasets_df = pd.DataFrame(all_data)

# create utterance IDs from filenames
us_datasets_df['utterance_id'] = us_datasets_df['audio_file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])


In [None]:
# add duration column to us_datasets_df
def get_audio_duration(file_path):
    with sf.SoundFile(file_path) as f:
        return f.frames / f.samplerate

# primock57_df['duration'] = primock57_df['file_name'].apply(get_audio_duration)
us_datasets_df['duration'] = us_datasets_df['audio_file'].apply(get_audio_duration)

## Join datasets

In [None]:
us_datasets_df = us_datasets_df[['utterance_id', 'audio_file', 'transcript', 'duration']]

# randomly sample 20 rows from us_datasets_df.
us_datasets_df = us_datasets_df.sample(n=20, random_state=42).reset_index(drop=True)


afrispeech_dialog_df['utterance_id'] = afrispeech_dialog_df['audio_file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])


medical_afrispeech_df = afrispeech_dialog_df[afrispeech_dialog_df['domain'] == 'medical']
medical_afrispeech_df = medical_afrispeech_df[['utterance_id', 'audio_file', 'transcript', 'duration']]

# Join datasets and include source column
us_datasets_df['source'] = 'uk_medical'
medical_afrispeech_df['source'] = 'afrispeech_medical'

# concatenate dataframes
all_datasets_df = pd.concat([us_datasets_df, medical_afrispeech_df], ignore_index=True)