# Overview

This notebook ingests data from multiple sources and formats and converts it to a standard HuggingFace dataset format.

All datasets will have "sentence" and "audio" features. Different datasets may contain additional metadata information.

In [1]:
import os
from pathlib import Path
from typing import Dict, List, Tuple

from datasets import Dataset, Features, Audio, Value

## CSLU Kids

In [5]:
# Root of the decompressed CSLU Kids corpus. extract_and_match_files() walks
# this tree recursively, picking up .wav files that sit under a 'speech'
# directory and .txt files that sit under a 'trans' directory.
cslu_dir_path = "../data/cslu_kids"  # Adjust this to your local decompressed directory path

def read_transcript(transcript_path: str) -> str:
    """
    Load a transcript file and return its text without surrounding whitespace.

    This is a best-effort reader: any failure while opening or decoding the
    file is reported on stdout and signalled to the caller with an empty string
    rather than an exception.

    Args:
        transcript_path: Location of the UTF-8 encoded transcript file

    Returns:
        The stripped file contents, or "" when the file could not be read
    """
    try:
        with open(transcript_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
    except Exception as e:
        print(f"Could not read transcript {transcript_path}: {e}")
        return ""

    # Stripping a str cannot fail, so it lives outside the guarded section
    return raw_text.strip()

def extract_and_match_files(dir_path: str) -> Dataset:
    """
    Find matching audio and transcript files in a directory structure.

    The tree under ``dir_path`` is walked recursively. A ``.wav`` file is
    treated as audio when any component of its directory path is ``speech``;
    otherwise a ``.txt`` file is treated as a transcript when any component of
    its directory path is ``trans``. Audio and transcripts are paired by file
    stem (the file name without its extension). Pairs whose transcript is
    empty or could not be read are dropped.

    Rows are emitted in sorted file-id order so that repeated runs (including
    after a kernel restart) produce the same dataset ordering.

    Args:
        dir_path: Path to the decompressed directory

    Returns:
        Hugging Face Dataset with 'audio' and 'sentence' features
    """
    # Both maps are keyed by file stem. If two files of the same kind share a
    # stem, the one visited later by os.walk overwrites the earlier entry.
    audio_files = {}
    transcript_files = {}

    # Walk through the directory structure
    for root, _dirs, files in os.walk(dir_path):
        path_parts = Path(root).parts

        for file in files:
            file_path = os.path.join(root, file)
            file_id = Path(file).stem

            # Check if it's an audio file in speech directory
            if 'speech' in path_parts and file.endswith('.wav'):
                audio_files[file_id] = file_path

            # Check if it's a transcript file in trans directory
            elif 'trans' in path_parts and file.endswith('.txt'):
                transcript_files[file_id] = file_path

    # Find matching pairs. Sorting matters: iterating the raw set would yield
    # an order that depends on per-process string hash randomization, making
    # the resulting dataset differ from run to run.
    common_ids = sorted(audio_files.keys() & transcript_files.keys())
    print(f"Found {len(common_ids)} matching audio-transcript pairs")

    # Build parallel lists of matched audio paths and sentences
    audio_paths = []
    sentences = []

    for file_id in common_ids:
        # read_transcript returns "" on failure or for an empty file
        sentence = read_transcript(transcript_files[file_id])

        # Only add if we successfully read a non-empty transcript
        if sentence:
            audio_paths.append(audio_files[file_id])
            sentences.append(sentence)

    print(f"Successfully processed {len(audio_paths)} pairs")
    print("Building dataset...")

    # The 'audio' column starts out as plain path strings
    ds = Dataset.from_dict({
        "audio": audio_paths,
        "sentence": sentences
    })

    # Re-type the path column as an Audio feature
    print("Casting dataset column...")
    ds = ds.cast_column("audio", Audio())

    return ds

# Create the dataset by pairing the audio and transcript files found under
# cslu_dir_path
cslu_kids = extract_and_match_files(cslu_dir_path)

# Print dataset info: number of rows and the feature schema
print(f"\nDataset created successfully!")
print(f"Number of samples: {len(cslu_kids)}")
print(f"Features: {cslu_kids.features}")

# Show first sample info as a quick sanity check; the length guard avoids
# indexing row 0 of an empty dataset
if len(cslu_kids) > 0:
    first_sample = cslu_kids[0]
    print(f"\nFirst sample:")
    print(f"Audio path: {first_sample['audio']['path']}")
    print(f"Sentence: {first_sample['sentence'][:100]}...")  # First 100 chars

Found 1101 matching audio-transcript pairs
Successfully processed 1101 pairs
Building dataset...
Casting dataset column...

Dataset created successfully!
Number of samples: 1101
Features: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}

First sample:
Audio path: ../data/cslu_kids/speech/spontaneous/02/0/ksd14/ksd14xx0.wav
Sentence: <bn> a b c d e f g <br> h i j k<ln> l m n o p<ln> <br> q r s t u v w x y and z <bn> <pau> my<bn> fam...


In [6]:
# Save dataset: write cslu_kids to disk with Dataset.save_to_disk.
# The target is a relative path, so it resolves against the kernel's current
# working directory.
output_path = "../data/cslu_kids.ds"
cslu_kids.save_to_disk(output_path)
print(f"\nDataset saved to: {output_path}")

Saving the dataset (0/8 shards):   0%|          | 0/1101 [00:00<?, ? examples/s]


Dataset saved to: ../data/cslu_kids.ds
