## Python imports

In [1]:
%pip install -r requirements.txt

import os
import sys
import numpy as np
import datasets
import math
from scipy.io import wavfile
from tqdm import tqdm
from pathlib import Path
from src.common import find_samples
from IPython.display import Audio

Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl (864 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m864.1/864.1 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torch>=1.9
  Downloading torch-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl (865.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m488.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:05[0m
[?25hCollecting sentencepiece
  Using cached sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting hyperpyyaml
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cusolver-cu12==11.7.1.2
  Downloading nvidia_cusolver_cu12-11.7.1.2-py3-none

  from .autonotebook import tqdm as notebook_tqdm


## Parameters

In [63]:
SAMPLE_DIR = '../../home-asist-samples'
FEATURES_DIM = 24

## Check the samples



In [65]:

positive_length_max = 0
positive_length_min = float('inf')
negative_length_max = 0
negative_length_min = float('inf')
hist = np.zeros(1000).astype('int32')
for sample in tqdm(find_samples(SAMPLE_DIR)):
    for label in sample.labels:
        length = label.end - label.begin
        if label.text.lower().startswith('p'):
            if positive_length_max < length:
                print(f'File {sample.wav} has a positive label with length {length}, ')
            positive_length_max = max(positive_length_max, length)
            positive_length_min = min(positive_length_min, length)
            hist[math.ceil(length * 100)] += 1
        elif label.text.lower().startswith('n'):
            negative_length_max = max(negative_length_max, length)
            negative_length_min = min(negative_length_min, length)
positive_length_max = 1.8
print(f'Positive length max: {positive_length_max}')
print(f'Positive length min: {positive_length_min}')
print(f'Negative length max: {negative_length_max}')
print(f'Negative length min: {negative_length_min}')
max_dim = math.ceil((positive_length_max * 1000 + 100) / 80)
print(f'Max dimension: {max_dim}x96')
for i in range(math.floor(positive_length_min * 100), math.ceil(positive_length_max * 100) + 1):
    print(f'{i * 100}ms: {hist[i]} samples')
if max_dim > FEATURES_DIM:
    raise ValueError(f'WARNING: max dimension {max_dim} is greater than FEATURES_DIM {FEATURES_DIM}, '
          f'you may need to adjust the model configuration or positive samples.')

100%|██████████| 292/292 [00:00<00:00, 264236.63it/s]

File ../../home-asist-samples/online/elevenlabs.io-1.wav has a positive label with length 1.0435000000000003, 
File ../../home-asist-samples/google/target-pl-PL-Chirp3-HD-Charon-r0.8-p0.wav has a positive label with length 1.2033125, 
File ../../home-asist-samples/google/target-pl-PL-Chirp3-HD-Aoede-r0.8-p0.wav has a positive label with length 1.23125, 
Positive length max: 1.8
Positive length min: 0.42856249999999996
Negative length max: 6.8001875
Negative length min: 1.4336875
Max dimension: 24x96
4200ms: 0 samples
4300ms: 1 samples
4400ms: 1 samples
4500ms: 1 samples
4600ms: 2 samples
4700ms: 1 samples
4800ms: 4 samples
4900ms: 0 samples
5000ms: 4 samples
5100ms: 1 samples
5200ms: 3 samples
5300ms: 5 samples
5400ms: 3 samples
5500ms: 1 samples
5600ms: 4 samples
5700ms: 4 samples
5800ms: 3 samples
5900ms: 4 samples
6000ms: 2 samples
6100ms: 1 samples
6200ms: 5 samples
6300ms: 2 samples
6400ms: 1 samples
6500ms: 2 samples
6600ms: 4 samples
6700ms: 2 samples
6800ms: 4 samples
6900ms: 2




## Download features set

> TODO: Download datasets from https://huggingface.co/datasets/davidscripka/openwakeword_features
> if FEATURES_DIM <= 16

## Download room impulse responses collected by MIT

Source: https://mcdermottlab.mit.edu/Reverb/IR_Survey.html

Actually downloaded from mirror: https://huggingface.co/datasets/davidscripka/MIT_environmental_impulse_responses

In [None]:
output_dir = Path('data/mit_rirs')
if not output_dir.exists():
    tmp_dir = output_dir.with_suffix('.tmp')
    tmp_dir.mkdir(parents=True, exist_ok=True)
    rir_dataset = datasets.load_dataset("davidscripka/MIT_environmental_impulse_responses", split="train", streaming=True)
    for row in tqdm(rir_dataset):
        name = row['audio']['path'].split('/')[-1]
        file = tmp_dir / name
        if row['audio']['sampling_rate'] != 16000:
            raise ValueError(f"Expected sampling rate of 16000, got {row['audio']['sampling_rate']}")
        wavfile.write(file, 16000, (row['audio']['array'] * 32767).astype(np.int16))
    tmp_dir.rename(output_dir)
else:
    print(f"Output directory {output_dir} already exists, skipping download.")

Output directory data/mit_rirs already exists, skipping download.


Postprocess RIR by removing unimportant samples at the beginning. It will prevent from audio shifting when it is used.

In [42]:
output_dir = Path('data/mit_rirs')
removed_max = 0
for file in output_dir.glob('*.wav'):
    sample_rate, data = wavfile.read(file)
    threshold = max(abs(data.max()), abs(data.min())) / 4
    first_real_sample = 0
    for i in range(len(data)):
        if abs(data[i]) > threshold:
            first_real_sample = max(0, i - 1)
            break
    data = data[first_real_sample:]
    if first_real_sample > 0:
        wavfile.write(file, sample_rate, data)
    removed_max = max(removed_max, first_real_sample)
print(f'Maximum removed {removed_max / sample_rate} seconds from the beginning of each RIR file.')

Maximum removed 0.0 seconds from the beginning of each RIR file.


In [43]:

import torchaudio
from speechbrain.processing.signal_processing import reverberate
from IPython.display import display

i = 0
for sample in find_samples(SAMPLE_DIR):
    i += 1
    if i > 0:
        break

#sample_rate, data = wavfile.read(sample.wav)
#sample_rate2, data2 = wavfile.read('data/mit_rirs/h099_Classroom_2txts.wav')

waveform, sr = torchaudio.load(sample.wav)
rir_waveform, sr = torchaudio.load('data/mit_rirs/h045_Livingroom_4txts.wav')

data3 = reverberate(waveform, rir_waveform)

display(
    Audio(waveform, rate=sr, normalize=True),
    Audio(data3, rate=sr, normalize=True, autoplay=True)
)
