In [1]:
import os
import requests
import torch
from datasets import load_dataset
import torchcodec
import urllib.request
import shutil
from huggingface_hub import snapshot_download
import pandas as pd
import json
import glob
import os
import zipfile
import soundfile as sf

  from .autonotebook import tqdm as notebook_tqdm


## Help Functions

In [2]:
def get_audio_duration(file_path):
    with sf.SoundFile(file_path) as f:
        return f.frames / f.samplerate

## Primock (UK)

In [3]:
# Download (if needed) and load Primock-57 JSON files into a DataFrame
primock_dir = '/home/kelechi/bio_ramp_asr/Primock-57'
primock_data_dir = os.path.join(primock_dir, 'data')

# Try to snapshot download if data not present locally
if not os.path.exists(primock_data_dir):
    print('Attempting to download Primock-57 via huggingface snapshot')
    safe_snapshot_download(repo_id='sdialog/Primock-57', repo_type='dataset', local_dir=primock_dir)

json_files = []
if os.path.exists(primock_data_dir):
    json_files = glob.glob(os.path.join(primock_data_dir, '*.json'))
    print(f'Found {len(json_files)} Primock JSON files in {primock_data_dir}')

primock57_df = pd.DataFrame()
if json_files:
    all_dfs = []
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            df = pd.json_normalize(data, sep='_')
            df['file_name'] = os.path.basename(json_file)
            # if there is an audio path column try to resolve duration later
            all_dfs.append(df)
        except Exception as e:
            print(f'Warning reading {json_file}: {e}')
    if all_dfs:
        primock57_df = pd.concat(all_dfs, ignore_index=True)

print('Primock loaded rows:', len(primock57_df))
# randomly sample 45 entries from the Primock-57 dataset
primock57_df = primock57_df.sample(n=45, random_state=42).reset_index(drop=True)

Found 57 Primock JSON files in /home/kelechi/bio_ramp_asr/Primock-57/data
Primock loaded rows: 57


## Afrispeech (African medical)

In [4]:
# Load afrispeech-dialog dataset from Hugging Face and filter medical domain
try:
    afrispeech_dialog = load_dataset('intronhealth/afrispeech-dialog')
    afrispeech_dialog_df = afrispeech_dialog['train'].to_pandas()
    afrispeech_dialog_df['audio_file'] = afrispeech_dialog_df['audio'].apply(lambda x: x.get('path') if isinstance(x, dict) else None)
    afrispeech_dialog_df['utterance_id'] = afrispeech_dialog_df['audio_file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0] if isinstance(x, str) else None)
    medical_afrispeech_df = afrispeech_dialog_df[afrispeech_dialog_df.get('domain') == 'medical'].copy()
    medical_afrispeech_df = medical_afrispeech_df[['utterance_id', 'audio_file', 'transcript', 'duration']]
    medical_afrispeech_df['source'] = 'afrispeech_medical'
    print('Loaded afrispeech medical rows:', len(medical_afrispeech_df))
except Exception as e:
    print('Could not load afrispeech dataset:', e)
    medical_afrispeech_df = pd.DataFrame(columns=['utterance_id','audio_file','transcript','duration','source'])

Loaded afrispeech medical rows: 20


## Med-Convo (Nigeria Medical)

In [5]:
# Load med-convo-nig dataset from Hugging Face
try:
    med_convo = load_dataset('intronhealth/med-convo-nig')
    med_convo_df = med_convo['train'].to_pandas()
    med_convo_df['audio_file'] = med_convo_df['audio'].apply(lambda x: x.get('path') if isinstance(x, dict) else None)
    med_convo_df['utterance_id'] = med_convo_df['audio_file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0] if isinstance(x, str) else None)
    med_convo_df = med_convo_df[['utterance_id', 'audio_file', 'transcript', 'duration']]
    med_convo_df['source'] = 'med-convo-nig'
    print('Loaded med-convo-nig medical rows:', len(med_convo_df))
except Exception as e:
    print('Could not load med-convo-nig dataset:', e)
    med_convo_df = pd.DataFrame(columns=['utterance_id','audio_file','transcript','duration','source'])

Loaded med-convo-nig medical rows: 25


## US Medical dataset (download, extract, collect)

In [6]:
# Download and extract the US dataset ZIP (if needed), then collect transcripts and audio files
us_zip_url = 'https://springernature.figshare.com/ndownloader/files/30598530'
working_data_dir = '/home/kelechi/bio_ramp_asr/data'
us_zip_dest = os.path.join(working_data_dir, 'us_datasets', 'us_dataset.zip')
us_extract_dir = os.path.join(working_data_dir, 'us_datasets')

# os.makedirs(os.path.dirname(us_zip_dest), exist_ok=True)
# if not os.path.exists(us_zip_dest):
#     print('Downloading US dataset ZIP...')
#     try:
#         download_file(us_zip_url, us_zip_dest)
#     except Exception as e:
#         print('Failed to download US dataset:', e)

# # Extract if data folder not already present
# if not os.path.isdir(os.path.join(us_extract_dir, 'Data')):
#     try:
#         print('Extracting US dataset ZIP...')
#         extract_zip(us_zip_dest, us_extract_dir)
#     except Exception as e:
#         print('Failed to extract US dataset:', e)

# Collect transcripts and matching audio files by base filename
us_datasets_path = os.path.join(us_extract_dir, 'Data')
audio_exts = {'.wav', '.mp3', '.flac', '.m4a', '.aac', '.ogg'}
transcripts = {}
audios = {}
all_data = []
if os.path.exists(us_datasets_path):
    for root, dirs, files in os.walk(us_datasets_path):
        for fname in files:
            base, ext = os.path.splitext(fname)
            ext = ext.lower()
            fullpath = os.path.join(root, fname)
            if ext == '.txt':
                transcripts[base] = fullpath
            elif ext in audio_exts:
                audios[base] = fullpath
    for base, tpath in transcripts.items():
        apath = audios.get(base)
        if apath:
            try:
                with open(tpath, 'r', encoding='utf-8', errors='replace') as f:
                    transcript = f.read().strip()
            except Exception:
                transcript = ''
            all_data.append({'audio_file': apath, 'transcript': transcript})
else:
    print('US dataset Data directory not found at', us_datasets_path)

# randomly sample 45 entries from the US dataset
us_datasets_df = pd.DataFrame(all_data)
if not us_datasets_df.empty:
    us_datasets_df = us_datasets_df.sample(n=45, random_state=42).reset_index(drop=True)
    us_datasets_df['utterance_id'] = us_datasets_df['audio_file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
    us_datasets_df['duration'] = us_datasets_df['audio_file'].apply(get_audio_duration)
    us_datasets_df = us_datasets_df[['utterance_id', 'audio_file', 'transcript', 'duration']]
    us_datasets_df['source'] = 'us_medical'
    print('US dataset rows:', len(us_datasets_df))
else:
    us_datasets_df = pd.DataFrame(columns=['utterance_id','audio_file','transcript','duration','source'])

US dataset rows: 45


## Merge datasets and save

In [8]:
# Prepare Primock dataframe to a standard shape if possible
def primock_to_standard(df):
    if df.empty:
        return pd.DataFrame(columns=['utterance_id','audio_file','transcript','duration','source'])
    # Attempt to find columns for audio path / transcript in common places
    audio_cols = [c for c in df.columns if 'audio' in c.lower() or 'file' in c.lower()]
    text_cols = [c for c in df.columns if 'trans' in c.lower() or 'text' in c.lower()]
    out = pd.DataFrame()
    if audio_cols:
        out['audio_file'] = df[audio_cols[0]]
    else:
        out['audio_file'] = None
    if text_cols:
        out['transcript'] = df[text_cols[0]]
    else:
        out['transcript'] = None
    out['utterance_id'] = out['audio_file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0] if isinstance(x, str) else None)
    # Primock-57 does not have audio files, so set duration to None
    out['duration'] = None
    out['source'] = 'uk_medical'
    return out[['utterance_id','audio_file','transcript','duration','source']]

primock_std = primock_to_standard(primock57_df)
all_datasets_df = pd.concat([primock_std, us_datasets_df, medical_afrispeech_df, med_convo_df], ignore_index=True, sort=False)
# drop rows that have neither audio nor transcript
all_datasets_df = all_datasets_df[all_datasets_df['audio_file'].notna() | all_datasets_df['transcript'].notna()].reset_index(drop=True)
out_csv = os.path.join('/home/kelechi/bio_ramp_asr', 'all_datasets_merged.csv')
all_datasets_df.to_csv(out_csv, index=False)
print('Merged rows:', len(all_datasets_df), '-> saved to', out_csv)

Merged rows: 135 -> saved to /home/kelechi/bio_ramp_asr/all_datasets_merged.csv


  all_datasets_df = pd.concat([primock_std, us_datasets_df, medical_afrispeech_df, med_convo_df], ignore_index=True, sort=False)
