In [7]:
import os
import numpy as np
import webrtcvad

def _to_pcm16(frame):
    """Convert a float frame to raw 16-bit integer PCM bytes for the VAD.

    Samples are scaled by 32768 and clipped to the int16 range before the
    cast, so a sample at exactly +1.0 becomes 32767 instead of overflowing.
    """
    scaled = np.clip(frame * 32768.0, -32768, 32767)
    return scaled.astype(np.int16).tobytes()


def _non_speech_frames(y, sr, vad, frame_size):
    """Return the concatenation of all complete frames the VAD rejects.

    Args:
        y: Waveform to scan (assumed 1-D float samples in [-1, 1] -- TODO confirm).
        sr: Sample rate passed through to ``vad.is_speech``.
        vad: Object exposing ``is_speech(pcm_bytes, sample_rate) -> bool``.
        frame_size: Number of samples per frame.

    Returns:
        A float32 array holding every non-speech frame in original order.
        It is empty when no frame is kept.
    """
    kept = []
    # The "+ 1" makes the final complete frame reachable when it ends exactly
    # at len(y); a trailing partial frame is still dropped because it is too
    # short to hand to the VAD.
    for start in range(0, len(y) - frame_size + 1, frame_size):
        frame = y[start:start + frame_size]
        if not vad.is_speech(_to_pcm16(frame), sr):
            kept.append(frame)

    if not kept:
        # np.concatenate rejects an empty list, so build the empty result directly.
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(kept).astype(np.float32, copy=False)


def trim_human_voice(
        input_dir = 'processed/denoised',
        output_dir = 'processed/trimmed',
        frame_duration_ms = 30,
        aggressiveness = 2,
        verbose = True
):
    """Remove frames classified as human speech from every ``.npz`` recording.

    Each ``.npz`` file in ``input_dir`` is expected to hold a waveform under
    the key ``'y'`` and its sample rate under ``'sr'``. The waveform is cut
    into fixed-length frames, each frame is passed to a WebRTC voice-activity
    detector, and only the frames reported as *non*-speech are kept. The
    result is written to ``output_dir`` under the same file name with the same
    two keys.

    Args:
        input_dir: Directory containing the input ``.npz`` files.
        output_dir: Directory for the trimmed files; created if missing.
        frame_duration_ms: Frame length in milliseconds. NOTE(review): the
            webrtcvad documentation lists 10, 20 and 30 ms frames and
            8000/16000/32000/48000 Hz sample rates as supported -- confirm
            the inputs satisfy this, otherwise ``is_speech`` will raise.
        aggressiveness: Mode passed to ``webrtcvad.Vad``.
        verbose: When true, print the sample counts before and after trimming.

    Returns:
        None. Results are written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)
    vad = webrtcvad.Vad(aggressiveness)

    for fname in sorted(os.listdir(input_dir)):
        if not fname.endswith('.npz'):
            continue

        # The context manager closes the archive as soon as both entries have
        # been read, so file handles do not pile up across many files.
        with np.load(os.path.join(input_dir, fname)) as data:
            y = data['y']
            sr = int(data['sr'])

        frame_size = int(sr * frame_duration_ms / 1000)
        trimmed_audio = _non_speech_frames(y, sr, vad, frame_size)

        output_path = os.path.join(output_dir, fname)
        np.savez_compressed(output_path, y = trimmed_audio, sr = sr)

        if verbose:
            print(f"Trimmed {fname}: Original = {len(y)}, After = {len(trimmed_audio)}")

In [8]:
# Run the voice-trimming pass with the default directories and VAD settings;
# with verbose left at its default, one summary line is printed per file.
trim_human_voice()

Trimmed CSA18786.npz: Original = 2273804, After = 1197120
Trimmed CSA35130.npz: Original = 6239441, After = 776640
Trimmed CSA35146.npz: Original = 8881899, After = 2201280
Trimmed CSA35175.npz: Original = 4628519, After = 2871360
Trimmed CSA35188.npz: Original = 760086, After = 511680
Trimmed CSA35800.npz: Original = 3016493, After = 417600
Trimmed CSA36359.npz: Original = 3233071, After = 559680
Trimmed XC10486.npz: Original = 678752, After = 28800
Trimmed XC10557.npz: Original = 1352512, After = 72960
Trimmed XC110422.npz: Original = 449724, After = 77760
Trimmed XC111444.npz: Original = 3124992, After = 1024320
Trimmed XC112708.npz: Original = 412108, After = 30720
Trimmed XC112987.npz: Original = 992235, After = 162240
Trimmed XC113116.npz: Original = 728085, After = 306240
Trimmed XC113451.npz: Original = 1533696, After = 510720
Trimmed XC113781.npz: Original = 657032, After = 132480
Trimmed XC113880.npz: Original = 276689, After = 3840
Trimmed XC113884.npz: Original = 706351, Af

In [None]:
import IPython.display as ipd

# Load the denoised waveform. The context manager closes the .npz archive as
# soon as both entries have been read, so the file handle is not leaked.
npz_path = 'processed/denoised/CSA18786.npz'
with np.load(npz_path) as data:
    y = data['y']
    sr = int(data['sr'])

# Play denoised audio (kept as the last expression so the notebook renders it)
ipd.Audio(y, rate=sr)

In [11]:
# Load the trimmed waveform. The context manager closes the .npz archive as
# soon as both entries have been read, so the file handle is not leaked.
npz_path = 'processed/trimmed/CSA18786.npz'
with np.load(npz_path) as data:
    y = data['y']
    sr = int(data['sr'])

# Play trimmed audio (kept as the last expression so the notebook renders it)
ipd.Audio(y, rate=sr)