In [2]:
import librosa
import soundfile as sf
import os
import numpy as np
import glob

def split_audio_folder(input_folder, output_base_dir, chunk_duration=4.0, top_db=30):
    # Create output base directory if it doesn't exist
    if not os.path.exists(output_base_dir):
        os.makedirs(output_base_dir)
    
    # Supported audio file extensions
    audio_extensions = ['*.wav', '*.mp3', '*.ogg', '*.flac']
    
    # Get list of all audio files in the input folder
    audio_files = []
    for ext in audio_extensions:
        audio_files.extend(glob.glob(os.path.join(input_folder, ext)))
    
    if not audio_files:
        print(f"No audio files found in {input_folder}")
        return
    
    # Process each audio file
    for input_file in audio_files:
        # Create output directory for this specific audio file
        file_name = os.path.splitext(os.path.basename(input_file))[0]
        output_dir = os.path.join(output_base_dir, file_name + "_chunks")
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        print(f"Processing {input_file}...")
        
        try:
            # Load the audio file
            y, sr = librosa.load(input_file)
            
            # Trim non-speech portions from beginning and end
            y_trimmed, _ = librosa.effects.trim(y, top_db=top_db)
            
            # Calculate total duration in seconds of trimmed audio
            total_duration = librosa.get_duration(y=y_trimmed, sr=sr)
            
            # Calculate number of samples per chunk
            chunk_samples = int(chunk_duration * sr)
            
            # Calculate total number of chunks
            num_chunks = int(np.ceil(total_duration / chunk_duration))
            
            # Split and save audio chunks
            for i in range(num_chunks):
                start_sample = i * chunk_samples
                end_sample = min((i + 1) * chunk_samples, len(y_trimmed))
                
                # Extract chunk
                chunk = y_trimmed[start_sample:end_sample]
                
                # Generate output filename
                output_file = os.path.join(output_dir, f"chunk_{i+1:03d}.wav")
                
                # Save chunk to file
                sf.write(output_file, chunk, sr)
                print(f"Saved chunk {i+1} to {output_file}")
                
        except Exception as e:
            print(f"Error processing {input_file}: {str(e)}")
            continue

if __name__ == "__main__":
    # Example usage
    input_folder = "/nvme1/hungdx/Lightning-hydra/notebooks/audio_segments/audio_input"  # Replace with your input folder path
    output_base_directory = "audio_chunks_2"  # Replace with desired output base directory
    split_audio_folder(input_folder, output_base_directory, chunk_duration=4.0, top_db=30)

Processing /nvme1/hungdx/Lightning-hydra/notebooks/audio_segments/audio_input/hung_iconipaudio_2025-04-19_16-18-23.ogg...


  y, sr = librosa.load(input_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Saved chunk 1 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_001.wav
Saved chunk 2 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_002.wav
Saved chunk 3 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_003.wav
Saved chunk 4 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_004.wav
Saved chunk 5 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_005.wav
Saved chunk 6 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_006.wav
Saved chunk 7 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_007.wav
Saved chunk 8 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_008.wav
Saved chunk 9 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_009.wav
Saved chunk 10 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_010.wav
Saved chunk 11 to audio_chunks_2/hung_iconipaudio_2025-04-19_16-18-23_chunks/chunk_011.wav
Saved ch

  y, sr = librosa.load(input_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Saved chunk 1 to audio_chunks_2/audio_2025-04-19_16-18-35_chunks/chunk_001.wav
Saved chunk 2 to audio_chunks_2/audio_2025-04-19_16-18-35_chunks/chunk_002.wav
Saved chunk 3 to audio_chunks_2/audio_2025-04-19_16-18-35_chunks/chunk_003.wav
Saved chunk 4 to audio_chunks_2/audio_2025-04-19_16-18-35_chunks/chunk_004.wav
Saved chunk 5 to audio_chunks_2/audio_2025-04-19_16-18-35_chunks/chunk_005.wav
Saved chunk 6 to audio_chunks_2/audio_2025-04-19_16-18-35_chunks/chunk_006.wav
Saved chunk 7 to audio_chunks_2/audio_2025-04-19_16-18-35_chunks/chunk_007.wav
Processing /nvme1/hungdx/Lightning-hydra/notebooks/audio_segments/audio_input/audio_2025-04-19_16-18-32.ogg...


  y, sr = librosa.load(input_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Saved chunk 1 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_001.wav
Saved chunk 2 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_002.wav
Saved chunk 3 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_003.wav
Saved chunk 4 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_004.wav
Saved chunk 5 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_005.wav
Saved chunk 6 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_006.wav
Saved chunk 7 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_007.wav
Saved chunk 8 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_008.wav
Saved chunk 9 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_009.wav
Saved chunk 10 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_010.wav
Saved chunk 11 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_011.wav
Saved chunk 12 to audio_chunks_2/audio_2025-04-19_16-18-32_chunks/chunk_012.wav
Saved chunk 13 to audio_chunks_2/audio_2025-04-19

  y, sr = librosa.load(input_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Saved chunk 1 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_001.wav
Saved chunk 2 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_002.wav
Saved chunk 3 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_003.wav
Saved chunk 4 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_004.wav
Saved chunk 5 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_005.wav
Saved chunk 6 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_006.wav
Saved chunk 7 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_007.wav
Saved chunk 8 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_008.wav
Saved chunk 9 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_009.wav
Saved chunk 10 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_010.wav
Saved chunk 11 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_011.wav
Saved chunk 12 to audio_chunks_2/audio_2025-04-19_16-18-28_chunks/chunk_012.wav
Saved chunk 13 to audio_chunks_2/audio_2025-04-19