In [1]:
from datasets import load_dataset

# Load the AIxBlock Japanese human-to-machine call-center conversation dataset.
# Login using e.g. `huggingface-cli login` to access this dataset
# NOTE(review): a later cell rebinds `ds` to "reazon-research/reazonspeech", so on a
# top-to-bottom run this result is overwritten — confirm which dataset is intended.
ds = load_dataset("AIxBlock/Human-to-machine-Japanese-audio-call-center-conversations")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 3112 examples [00:00, 45907.10 examples/s]


In [1]:
from datasets import load_dataset
# Load the ReazonSpeech dataset; this rebinds `ds`, replacing whatever was loaded before.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's own
# loading script run locally — keep it only for sources you trust, and confirm the
# installed `datasets` version still needs/supports the flag.
ds = load_dataset("reazon-research/reazonspeech", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the loaded object's repr to inspect its splits, feature names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['name', 'audio', 'transcription'],
        num_rows: 5323
    })
})

In [3]:
# Inspect the first audio record of the train split (path, sample array, sampling rate).
# Index the row first and the column second: asking for the whole "audio" column and
# then taking element 0 can force every audio file in the split to be decoded just to
# look at one entry.
ds["train"][0]["audio"]

{'path': '/Users/joey/.cache/huggingface/datasets/downloads/extracted/6931f71321a9af351e7e177c78bee3da636dbc9151fcfbc2b8e14cf9c0b3ab91/000/000734dcb35d6.flac',
 'array': array([-0.01309204, -0.01068115, -0.006073  , ...,  0.00613403,
         0.00558472,  0.00674438], shape=(22291,)),
 'sampling_rate': 16000}

In [4]:
# Keep only the train split for the rest of the notebook.
# Guarded so the cell is safe to re-run: once `ds` is already a single split (no longer
# a dict of splits), indexing it with "train" again would raise instead of being a no-op.
if isinstance(ds, dict):
    ds = ds["train"]

In [5]:
# Check all available fields in the dataset and preview the first row.
# Fetch row 0 once and reuse it: every `ds[0]` access builds the row again
# (audio included), so repeating it for each print is wasted work.
first_sample = ds[0]

print("Dataset columns:", ds.column_names)
print("\nFirst sample with all fields:")
print(first_sample.keys())
print("\nSample data (excluding large audio array):")
for key, value in first_sample.items():
    if key != 'audio':
        print(f"  {key}: {value}")
    else:
        # The waveform itself is too long to print; report only its shape.
        print(f"  {key}: [audio data with shape {value['array'].shape}]")

Dataset columns: ['name', 'audio', 'transcription']

First sample with all fields:
dict_keys(['name', 'audio', 'transcription'])

Sample data (excluding large audio array):
  name: 000/000734dcb35d6.flac
  audio: [audio data with shape (22291,)]
  transcription: „Åì„Çå„Åæ„Åü„Ç∏„Éü„Éº„Åï„Çì


In [19]:
import json
from pathlib import Path
import soundfile as sf

# Export the first few samples as WAV files with matching transcript .txt files,
# then write a metadata.json index describing everything that was saved.

# Create output directory (parents=True so a nested path would also work)
output_dir = Path("sample_audios")
output_dir.mkdir(parents=True, exist_ok=True)

# Number of samples to save
num_samples = 10

# Column names that may hold the reference text, in order of preference
transcript_fields = ['transcript', 'transcription', 'text', 'sentence']
# Placeholder recorded in the metadata when a sample has no usable transcript
missing_transcript = "[No transcript available]"

print(f"Saving {num_samples} audio samples with transcripts to: {output_dir.absolute()}")
print("="*80)

saved_data = []

# Never index past the end of the dataset if it holds fewer rows than requested
for i in range(min(num_samples, len(ds))):
    sample = ds[i]

    # Extract audio
    audio_array = sample['audio']['array']
    sample_rate = sample['audio']['sampling_rate']
    duration = len(audio_array) / sample_rate

    # Create filenames (audio and transcript share the same zero-padded stem)
    audio_filename = f"audio_{i:03d}.wav"
    transcript_filename = f"audio_{i:03d}.txt"

    audio_path = output_dir / audio_filename
    transcript_path = output_dir / transcript_filename

    # Save audio file
    sf.write(audio_path, audio_array, sample_rate)

    # Extract transcript: take the first candidate field that actually holds text.
    # A field that exists but is empty/None must not stop the search, otherwise a
    # usable transcript stored under a later field name would be missed.
    transcript = None
    for field in transcript_fields:
        if field in sample and sample[field]:
            transcript = sample[field]
            break

    # Track availability explicitly instead of comparing against the placeholder
    # text later on (a real transcript could, in principle, equal the placeholder).
    has_transcript = transcript is not None

    # Save transcript if available
    if has_transcript:
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(transcript)
    else:
        transcript = missing_transcript

    # Store metadata
    metadata = {
        'index': i,
        'audio_file': audio_filename,
        'transcript_file': transcript_filename if has_transcript else None,
        'transcript': transcript,
        'duration_seconds': round(duration, 2),
        'sample_rate': sample_rate
    }
    saved_data.append(metadata)

    # Slice and measure the same string so the preview and its ellipsis always agree
    preview = str(transcript)
    print(f"‚úì Sample {i:03d}: {audio_filename} ({duration:.2f}s)")
    print(f"  Transcript: {preview[:100]}{'...' if len(preview) > 100 else ''}")

# Save metadata to JSON file (ensure_ascii=False keeps non-ASCII text human-readable)
metadata_path = output_dir / "metadata.json"
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(saved_data, f, ensure_ascii=False, indent=2)

print("="*80)
print(f"\n‚úì Saved {len(saved_data)} samples to: {output_dir.absolute()}")
print(f"‚úì Metadata saved to: {metadata_path.absolute()}")

Saving 10 audio samples with transcripts to: /Users/joey/Workspace/interPro/ASR/dataset/sample_audios
‚úì Sample 000: audio_000.wav (1.39s)
  Transcript: „Åì„Çå„Åæ„Åü„Ç∏„Éü„Éº„Åï„Çì
‚úì Sample 001: audio_001.wav (7.65s)
  Transcript: ‰ªä„ÇÇÁõ∏Êâã„Å´„É≠„É≥„Éê„É´„Éâ„ÅÆ„Åª„ÅÜ„Å´ËÇ©Âè£„ÅßÊè°„Çâ„Çå„Å¶„ÇÇ„Åô„Åê„Åï„ÅæÊµÅ„Çå„ÇíÂàá„ÇãÂºï„ÅçËæº„ÅøËøî„Åó„Å´Â§â„Åà„Åü„Å®„ÄÇ
‚úì Sample 002: audio_002.wav (2.94s)
  Transcript: ÂÉï„ÅØ„Çø„ÇØ„Ç∑„Éº„ÅÆ„Åì„Å®„Å´Èñ¢„Åó„Å¶„ÅØ„ÅÇ„Çì„Åæ„Çä„Åì„ÅÜ„ÄÇ
‚úì Sample 003: audio_003.wav (4.78s)
  Transcript: Ôºª„Éê„Éº„Éü„É§„É≥„Ç∫„Ç™„É≥„Ç®„Ç¢Áç≤Âæó„ÇÇÁñëÊÉëÊµÆ‰∏äÔºΩ
‚úì Sample 004: audio_004.wav (4.63s)
  Transcript: „Åù„Åó„Å¶„ÇÇ„ÅÜ‰∏ÄÊûö„Åå„Åì„Å°„Çâ„ÄÇ
‚úì Sample 005: audio_005.wav (0.62s)
  Transcript: ‰ΩïÔºü
‚úì Sample 006: audio_006.wav (8.15s)
  Transcript: Á©çÊ•µÁöÑ„Å´„ÅäÈáë„Çí‰Ωø„ÅÜ„Åπ„Åç„Å†„Å®‰∏ªÂºµ„Åô„ÇãÊîøÊ≤ªÂÆ∂„ÇÑÁúÅÂ∫Å„Å®ÊîØÂá∫„ÇíÊäë„Åà„Åü„ÅÑË≤°ÂãôÁúÅ„Å®„ÅÆÈñì„Åß„Åõ„ÇÅ„ÅéÂêà„ÅÑ„ÅåÁ∂ö„Åç„Åæ„Åô„ÄÇ
‚úì Sample 007: audio_007.wav (5.26s)
  Transcript: ‰ªäÂ§ß‰ºö„

In [20]:
# Example: Load and use the saved audio and transcript
sample_idx = 0

# Paths follow the audio_XXX naming used when the samples were exported
audio_file = output_dir / f"audio_{sample_idx:03d}.wav"
transcript_file = output_dir / f"audio_{sample_idx:03d}.txt"

# Load audio
audio, sr = sf.read(audio_file)

# Load transcript
with open(transcript_file, 'r', encoding='utf-8') as f:
    transcript = f.read()

print(f"Loaded sample {sample_idx}:")
print(f"  Audio file: {audio_file.name}")
print(f"  Duration: {len(audio) / sr:.2f}s")
print(f"  Sample rate: {sr} Hz")
print(f"  Transcript: {transcript}")

# Load metadata. Read it explicitly as UTF-8: the file is written as UTF-8 with
# ensure_ascii=False, so relying on the platform's default encoding can fail to
# decode the non-ASCII transcript text.
with open(output_dir / "metadata.json", 'r', encoding='utf-8') as f:
    metadata = json.load(f)

print(f"\nTotal samples saved: {len(metadata)}")

Loaded sample 0:
  Audio file: audio_000.wav
  Duration: 1.39s
  Sample rate: 16000 Hz
  Transcript: „Åì„Çå„Åæ„Åü„Ç∏„Éü„Éº„Åï„Çì

Total samples saved: 10


# 📁 Saved Files Structure

The `sample_audios/` folder now contains:

```
sample_audios/
├── audio_000.wav          # Audio file 1
├── audio_000.txt          # Transcript for audio 1
├── audio_001.wav          # Audio file 2
├── audio_001.txt          # Transcript for audio 2
├── ...
├── audio_009.wav          # Audio file 10
├── audio_009.txt          # Transcript for audio 10
└── metadata.json          # Complete metadata for all samples
```

## metadata.json Format:
```json
[
  {
    "index": 0,
    "audio_file": "audio_000.wav",
    "transcript_file": "audio_000.txt",
    "transcript": "The actual transcript text...",
    "duration_seconds": 54.16,
    "sample_rate": 44100
  },
  ...
]
```

## Usage with ASR Systems:

You can now test your ASR systems and compare with ground truth transcripts!

```python
# Test ASR system
predicted_transcript = your_asr_system('sample_audios/audio_000.wav')

# Compare with ground truth
with open('sample_audios/audio_000.txt', 'r') as f:
    ground_truth = f.read()

print(f"Ground truth: {ground_truth}")
print(f"Predicted:    {predicted_transcript}")
```

In [6]:
import soundfile as sf
from pathlib import Path

# Save the first audio sample to a local WAV file
# NOTE: this rebinds `output_dir` to the current working directory.
output_dir = Path(".")
output_file = output_dir / "call_center_sample_0.wav"

# Extract audio data. Fetch row 0 once: each `ds[0]` access builds the row again
# (audio included), so reading both fields from one copy avoids repeating that work.
first_audio = ds[0]['audio']
audio_array = first_audio['array']
sample_rate = first_audio['sampling_rate']

# Save to WAV file
sf.write(output_file, audio_array, sample_rate)

print(f"‚úì Audio saved to: {output_file.absolute()}")
print(f"  Duration: {len(audio_array) / sample_rate:.2f} seconds")
print(f"  Sample rate: {sample_rate} Hz")
# Size is read back from disk, so it reflects the file that was actually written
print(f"  File size: {output_file.stat().st_size / 1024:.1f} KB")

‚úì Audio saved to: /Users/joey/Workspace/interPro/ASR/dataset/call_center_sample_0.wav
  Duration: 54.18 seconds
  Sample rate: 44100 Hz
  File size: 4666.5 KB


In [None]:
# Save multiple audio samples (5 samples)
# Relies on `output_dir`, `sf` and `ds` already being defined by earlier cells.
num_samples = 5
saved_files = []

# Clamp to the dataset size so a short dataset cannot trigger an out-of-range index
for i in range(min(num_samples, len(ds))):
    sample = ds[i]
    audio_array = sample['audio']['array']
    sample_rate = sample['audio']['sampling_rate']

    output_file = output_dir / f"call_center_sample_{i}.wav"
    sf.write(output_file, audio_array, sample_rate)

    # Duration in seconds = number of samples / samples per second
    duration = len(audio_array) / sample_rate
    saved_files.append({
        'index': i,
        'file': str(output_file),
        'duration': duration
    })

    print(f"‚úì Sample {i}: {output_file.name} ({duration:.2f}s)")

In [None]:
# Round-trip check: read the first exported WAV back from disk.
# Relies on `saved_files` and `sf` from earlier cells.
test_file = saved_files[0]['file']
audio_loaded, sr_loaded = sf.read(test_file)

# Report what was read back (one print call, one item per output line)
print(
    f"Loaded audio from: {test_file}",
    f"  Shape: {audio_loaded.shape}",
    f"  Sample rate: {sr_loaded} Hz",
    f"  Duration: {len(audio_loaded) / sr_loaded:.2f}s",
    f"  Data type: {audio_loaded.dtype}",
    sep="\n",
)

# Usage hints: command lines for feeding the saved file to each ASR script
print(
    "\n" + "="*60,
    "How to Use These Audio Files:",
    "="*60,
    "\n1. Kotoba Whisper (optimized):",
    f"   !python ../test_asr_optimized.py {test_file}",
    "\n2. Kotoba Whisper (streaming):",
    f"   !python ../test_asr_streaming.py --mode file --audio {test_file}",
    "\n3. AssemblyAI (batch):",
    f"   !python ../test_asr_assemblyai.py --audio {test_file}",
    sep="\n",
)

# Summary: Audio Loading Process

## What You Just Did:

1. **Loaded Dataset** - Successfully loaded 3,112 Japanese call center audio samples from HuggingFace
2. **Extracted Audio** - Got the audio as NumPy arrays with shape information
3. **Saved to Local Files** - Saved audio samples as WAV files to disk
4. **Loaded Back** - Verified you can load the saved WAV files

## Audio Data Structure:

The dataset provides audio in this format:
```python
{
    'audio': {
        'path': 'zip://...',           # Original location in cache
        'array': numpy.ndarray,         # Audio samples as array
        'sampling_rate': 44100          # Sample rate in Hz
    }
}
```

## How to Use the Audio Files:

### Method 1: Load from saved WAV file
```python
import soundfile as sf
audio_array, sample_rate = sf.read('call_center_sample_0.wav')
```

### Method 2: Load directly from dataset
```python
audio_array = ds[0]['audio']['array']
sample_rate = ds[0]['audio']['sampling_rate']
```

### Method 3: Use with ASR scripts
Run the audio through your ASR systems using the saved WAV files!

## Next Steps:
- Test these audio files with your ASR systems
- Compare transcription quality across different systems
- Download more samples as needed (you have 3,112 available!)