In [1]:
import os
import requests
import torch
from datasets import load_dataset
import torchcodec
import urllib.request
import shutil
from huggingface_hub import snapshot_download
import pandas as pd
import json
import glob
import os
import zipfile
import soundfile as sf

  from .autonotebook import tqdm as notebook_tqdm


## Helper Functions

In [2]:
def get_audio_duration(file_path):
    """Return the duration of an audio file in seconds.

    The file is opened with ``sf.SoundFile`` and the duration is computed as
    its frame count divided by its sample rate.

    Parameters
    ----------
    file_path
        Location of the audio file, passed unchanged to ``sf.SoundFile``.

    Returns
    -------
    float
        ``frames / samplerate`` for the opened file.
    """
    with sf.SoundFile(file_path) as audio:
        frame_count = audio.frames
        sample_rate = audio.samplerate
    return frame_count / sample_rate

## Primock (UK)

In [3]:
# Download (if needed) and load the Primock-57 JSON files into a DataFrame,
# then keep a seeded random subset of the consultations.
primock_dir = 'data/Primock-57'
primock_data_dir = os.path.join(primock_dir, 'data')
# Maximum number of consultations kept after sampling.
PRIMOCK_SAMPLE_SIZE = 45

# Fetch a snapshot from the Hugging Face Hub only when the data folder is absent.
if not os.path.exists(primock_data_dir):
    print('Attempting to download Primock-57 via huggingface snapshot')
    snapshot_download(repo_id='sdialog/Primock-57', repo_type='dataset', local_dir=primock_dir)

json_files = []
if os.path.exists(primock_data_dir):
    # glob returns entries in filesystem order; sorting fixes the row order so
    # the seeded sample below picks the same files on every machine.
    json_files = sorted(glob.glob(os.path.join(primock_data_dir, '*.json')))
    print(f'Found {len(json_files)} Primock JSON files in {primock_data_dir}')

all_dfs = []
for json_file in json_files:
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.json_normalize(data, sep='_')
        # Remember which file each row came from; later cells derive the id from it.
        df['file_name'] = os.path.basename(json_file)
        all_dfs.append(df)
    except Exception as e:
        # Best effort: an unreadable file is reported and skipped.
        print(f'Warning reading {json_file}: {e}')

primock57_df = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
print('Primock loaded rows:', len(primock57_df))

# Randomly sample up to PRIMOCK_SAMPLE_SIZE entries. Capping at the number of
# loaded rows avoids the ValueError that DataFrame.sample raises when asked for
# more rows than exist (e.g. after a failed or partial download).
if primock57_df.empty:
    print('No Primock-57 rows loaded; skipping sampling')
else:
    primock_sample_n = min(PRIMOCK_SAMPLE_SIZE, len(primock57_df))
    primock57_df = primock57_df.sample(n=primock_sample_n, random_state=42).reset_index(drop=True)

Found 57 Primock JSON files in data/Primock-57/data
Primock loaded rows: 57


In [4]:
# parse Turn columns to extract Doctor and Patient utterances
def extract_utterances(turns):
    """Split a conversation into doctor text and patient text.

    Parameters
    ----------
    turns : iterable of dict, or None / NaN
        Each turn is read through its ``'speaker'`` and ``'text'`` keys.
        A missing conversation (``None``, or the float NaN pandas stores for
        an absent value) is treated as an empty conversation.

    Returns
    -------
    tuple of (str, str)
        ``(doctor_text, patient_text)``. Each element is the space-joined
        text of the turns whose upper-cased speaker contains ``'DOCTOR'`` or
        ``'PATIENT'`` respectively; turns matching neither are ignored.
    """
    # Guard against rows without a conversation instead of failing on iteration.
    if turns is None or isinstance(turns, float):
        return '', ''

    doctor_utterances = []
    patient_utterances = []
    for turn in turns:
        # `or ''` also covers keys that are present but hold None, which would
        # otherwise break .upper() here and ' '.join() below.
        speaker = (turn.get('speaker') or '').upper()
        utterance = turn.get('text') or ''
        if 'DOCTOR' in speaker:
            doctor_utterances.append(utterance)
        elif 'PATIENT' in speaker:
            patient_utterances.append(utterance)
    return ' '.join(doctor_utterances), ' '.join(patient_utterances)
# Run extract_utterances over every conversation and store the two resulting
# strings as separate columns.
def _utterances_as_series(turns):
    """Wrap the (doctor, patient) text pair in a Series so apply() yields two columns."""
    return pd.Series(extract_utterances(turns))


utterance_columns = ['doctor_utterances', 'patient_utterances']
primock57_df[utterance_columns] = primock57_df['turns'].apply(_utterances_as_series)

In [None]:
# Keep only the columns used downstream and derive the utterance id from the
# JSON file name. Renaming before selecting keeps the cell re-runnable: on a
# second run 'file_name' is already gone, the rename is a no-op and the
# selection still succeeds.
primock57_df = (
    primock57_df
    .rename(columns={'file_name': 'utterance_id'})
    [['utterance_id', 'turns', 'doctor_utterances', 'patient_utterances']]
    .assign(
        utterance_id=lambda frame: frame['utterance_id'].str.replace(
            '_conversation.json', '', regex=False
        )
    )
)

# Audio is fetched file by file over HTTPS from the Primock-57 GitHub repository.
audio_dir = os.path.join(primock_dir, 'audio')
os.makedirs(audio_dir, exist_ok=True)

base_audio_url = 'https://github.com/babylonhealth/primock57/raw/main/audio/'

# Each consultation has one recording per speaker: <id>_doctor.wav and <id>_patient.wav.
for utterance_id in primock57_df['utterance_id']:
    for role in ('doctor', 'patient'):
        audio_name = f"{utterance_id}_{role}.wav"
        audio_url = f"{base_audio_url}{audio_name}"
        audio_path = os.path.join(audio_dir, audio_name)

        # Skip files that were already downloaded completely.
        if os.path.exists(audio_path):
            continue

        # Download to a temporary name and move it into place only on success.
        # Writing straight to audio_path would leave a truncated file behind
        # after an interrupted transfer, and the exists() check above would
        # then never retry it.
        partial_path = audio_path + '.part'
        try:
            urllib.request.urlretrieve(audio_url, partial_path)
            os.replace(partial_path, audio_path)
            print(f"Downloaded {audio_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Failed to download {audio_url}: {e}")
            if os.path.exists(partial_path):
                os.remove(partial_path)


In [None]:
# Add per-speaker audio paths and durations to the Primock frame.
def _primock_duration_or_nan(path):
    """Return the clip duration in seconds, or NaN when the file is not on disk.

    The download step only reports failed downloads and carries on, so a
    missing file is expected here and must not abort the whole cell.
    """
    if not os.path.exists(path):
        print(f"Missing audio file, duration set to NaN: {path}")
        return float('nan')
    return get_audio_duration(path)


# Build the paths from audio_dir (the download target) rather than repeating
# the directory literal, so both stay in sync if primock_dir changes.
for role in ('doctor', 'patient'):
    primock57_df[f'{role}_audio_path'] = primock57_df['utterance_id'].apply(
        lambda utterance_id, role=role: os.path.join(audio_dir, f"{utterance_id}_{role}.wav")
    )

# Durations are added after both path columns to keep the original column order.
for role in ('doctor', 'patient'):
    primock57_df[f'{role}_audio_duration'] = primock57_df[f'{role}_audio_path'].apply(
        _primock_duration_or_nan
    )

## Afrispeech (African medical)

In [7]:
# Load the afrispeech-dialog dataset from Hugging Face and keep the rows whose
# domain is 'medical'. Any failure is reported and replaced by an empty frame
# with the expected columns so the merge step can still run.
try:
    afrispeech_dialog = load_dataset('intronhealth/afrispeech-dialog')
    afrispeech_dialog_df = afrispeech_dialog['train'].to_pandas()
    # The audio column is expected to hold dicts; anything else yields None.
    afrispeech_dialog_df['audio_file'] = afrispeech_dialog_df['audio'].apply(
        lambda x: x.get('path') if isinstance(x, dict) else None
    )
    # Utterance id = audio file name without directory or extension.
    afrispeech_dialog_df['utterance_id'] = afrispeech_dialog_df['audio_file'].apply(
        lambda x: os.path.splitext(os.path.basename(x))[0] if isinstance(x, str) else None
    )
    # Index the column directly. DataFrame.get() returns None for a missing
    # column, `None == 'medical'` is the scalar False, and indexing the frame
    # with False fails with the unhelpful message "KeyError: False".
    is_medical = afrispeech_dialog_df['domain'] == 'medical'
    # Select rows and columns in one step and copy, so adding 'source' below
    # writes to an independent frame.
    medical_afrispeech_df = afrispeech_dialog_df.loc[
        is_medical, ['utterance_id', 'audio_file', 'transcript', 'duration']
    ].copy()
    medical_afrispeech_df['source'] = 'afrispeech_medical'
    print('Loaded afrispeech medical rows:', len(medical_afrispeech_df))
except Exception as e:
    print('Could not load afrispeech dataset:', e)
    medical_afrispeech_df = pd.DataFrame(columns=['utterance_id','audio_file','transcript','duration','source'])

Loaded afrispeech medical rows: 20


## Med-Convo (Nigeria Medical)

In [8]:
# Pull the med-convo-nig dataset from Hugging Face into a DataFrame; on any
# failure report it and fall back to an empty frame with the same columns.
def _med_convo_audio_path(audio):
    """Return the 'path' entry of an audio record when it is a dict, otherwise None."""
    if isinstance(audio, dict):
        return audio.get('path')
    return None


def _med_convo_utterance_id(path):
    """Return the file name without directory or extension for a string path, otherwise None."""
    if not isinstance(path, str):
        return None
    file_name = os.path.basename(path)
    return os.path.splitext(file_name)[0]


med_convo_columns = ['utterance_id', 'audio_file', 'transcript', 'duration']
try:
    med_convo = load_dataset('intronhealth/med-convo-nig')
    med_convo_df = med_convo['train'].to_pandas()
    med_convo_df['audio_file'] = med_convo_df['audio'].apply(_med_convo_audio_path)
    med_convo_df['utterance_id'] = med_convo_df['audio_file'].apply(_med_convo_utterance_id)
    med_convo_df = med_convo_df[med_convo_columns]
    med_convo_df['source'] = 'med-convo-nig'
    print('Loaded med-convo-nig medical rows:', len(med_convo_df))
except Exception as err:
    print('Could not load med-convo-nig dataset:', err)
    med_convo_df = pd.DataFrame(columns=med_convo_columns + ['source'])

Loaded med-convo-nig medical rows: 25


## US Medical dataset (download, extract, collect)

In [9]:
# Collect transcript/audio pairs from the extracted US dataset and keep a
# seeded random subset.
# NOTE: this cell does not download anything. The archive at `us_zip_url` has
# to be saved as `us_zip_dest` and extracted into `us_extract_dir` beforehand,
# so that `<us_extract_dir>/Data` exists; otherwise an empty frame is produced.
us_zip_url = 'https://springernature.figshare.com/ndownloader/files/30598530'
working_data_dir = 'data'
us_zip_dest = os.path.join(working_data_dir, 'us_datasets', 'us_dataset.zip')
us_extract_dir = os.path.join(working_data_dir, 'us_datasets')
us_datasets_path = os.path.join(us_extract_dir, 'Data')
audio_exts = {'.wav', '.mp3', '.flac', '.m4a', '.aac', '.ogg'}
# Maximum number of recordings kept after sampling.
US_SAMPLE_SIZE = 45
us_columns = ['utterance_id', 'audio_file', 'transcript', 'duration']


def collect_us_pairs(data_dir, audio_extensions):
    """Pair every transcript under ``data_dir`` with the audio file of the same base name.

    Parameters
    ----------
    data_dir : str
        Directory that is walked recursively.
    audio_extensions : collection of str
        Lower-case extensions (with leading dot) that count as audio.

    Returns
    -------
    list of dict
        One ``{'audio_file': path, 'transcript': text}`` entry per ``.txt``
        file that has a matching audio file, ordered by base file name.
        Transcripts without audio are skipped. A transcript that cannot be
        read is reported and stored as an empty string.
    """
    transcripts = {}
    audios = {}
    for root, dirs, files in os.walk(data_dir):
        # Sorting dirs in place fixes the traversal order, so the result (and
        # which file wins when two share a base name) no longer depends on
        # filesystem ordering.
        dirs.sort()
        for fname in sorted(files):
            base, ext = os.path.splitext(fname)
            ext = ext.lower()
            fullpath = os.path.join(root, fname)
            if ext == '.txt':
                transcripts[base] = fullpath
            elif ext in audio_extensions:
                audios[base] = fullpath

    pairs = []
    for base in sorted(transcripts):
        apath = audios.get(base)
        if not apath:
            continue
        tpath = transcripts[base]
        try:
            with open(tpath, 'r', encoding='utf-8', errors='replace') as f:
                transcript = f.read().strip()
        except OSError as e:
            # Keep the row, but say why its transcript is empty.
            print(f'Warning reading {tpath}: {e}')
            transcript = ''
        pairs.append({'audio_file': apath, 'transcript': transcript})
    return pairs


if os.path.exists(us_datasets_path):
    all_data = collect_us_pairs(us_datasets_path, audio_exts)
else:
    all_data = []
    print('US dataset Data directory not found at', us_datasets_path)

# Randomly sample up to US_SAMPLE_SIZE entries. Capping at the number of
# available rows avoids the ValueError DataFrame.sample raises when asked for
# more rows than exist.
us_datasets_df = pd.DataFrame(all_data)
if not us_datasets_df.empty:
    us_sample_n = min(US_SAMPLE_SIZE, len(us_datasets_df))
    us_datasets_df = us_datasets_df.sample(n=us_sample_n, random_state=42).reset_index(drop=True)
    us_datasets_df['utterance_id'] = us_datasets_df['audio_file'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
    us_datasets_df['duration'] = us_datasets_df['audio_file'].apply(get_audio_duration)
    us_datasets_df = us_datasets_df[us_columns]
    us_datasets_df['source'] = 'us_medical'
    print('US dataset rows:', len(us_datasets_df))
else:
    us_datasets_df = pd.DataFrame(columns=us_columns + ['source'])

US dataset rows: 45


## Merge datasets and save

In [None]:
# Stack the US, afrispeech and med-convo frames into one table and write it to CSV.
all_datasets_df = pd.concat([us_datasets_df, medical_afrispeech_df, med_convo_df], ignore_index=True, sort=False)
# drop rows that have neither audio nor transcript
has_audio_or_transcript = all_datasets_df['audio_file'].notna() | all_datasets_df['transcript'].notna()
all_datasets_df = all_datasets_df[has_audio_or_transcript].reset_index(drop=True)

# Output directory, in order of preference:
#   1. the BIO_RAMP_ASR_OUTPUT_DIR environment variable, when set;
#   2. the original author-specific location, when it exists on this machine;
#   3. the current working directory.
# This keeps earlier behavior where the old path exists while letting the
# notebook run on other machines.
merged_legacy_dir = '/home/kelechi/bio_ramp_asr'
merged_output_dir = os.environ.get('BIO_RAMP_ASR_OUTPUT_DIR') or (
    merged_legacy_dir if os.path.isdir(merged_legacy_dir) else os.getcwd()
)
os.makedirs(merged_output_dir, exist_ok=True)
out_csv = os.path.join(merged_output_dir, 'all_datasets_merged.csv')
all_datasets_df.to_csv(out_csv, index=False)
print('Merged rows:', len(all_datasets_df), '-> saved to', out_csv)

In [None]:
# Save the processed Primock-57 frame to CSV.
# Output directory, in order of preference:
#   1. the BIO_RAMP_ASR_OUTPUT_DIR environment variable, when set;
#   2. the original author-specific location, when it exists on this machine;
#   3. the current working directory.
primock_legacy_dir = '/home/kelechi/bio_ramp_asr'
primock_output_dir = os.environ.get('BIO_RAMP_ASR_OUTPUT_DIR') or (
    primock_legacy_dir if os.path.isdir(primock_legacy_dir) else os.getcwd()
)
os.makedirs(primock_output_dir, exist_ok=True)
primock_out_csv = os.path.join(primock_output_dir, 'primock_uk_datasets_processed.csv')
primock57_df.to_csv(primock_out_csv, index=False)
print('Primock-57 processed rows:', len(primock57_df), '-> saved to', primock_out_csv)