In [1]:
import os
import librosa
import csv
from tqdm import tqdm

def get_audio_durations(audio_folder, output_csv):
    """
    Calculate duration of each audio file in the folder and write results to CSV
    
    Parameters:
        audio_folder (str): Path to folder containing audio files
        output_csv (str): Path to output CSV file
    """
    
    # Get list of audio files
    audio_files = [f for f in os.listdir(audio_folder) if f.endswith(('.wav', '.mp3', '.flac'))]
    
    # Open CSV file to write results
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['filename', 'duration'])  # Write header
        
        # Process each audio file with progress bar
        for audio_file in tqdm(audio_files, desc="Processing audio files"):
            try:
                # Load audio file and get duration
                audio_path = os.path.join(audio_folder, audio_file)
                duration = librosa.get_duration(filename=audio_path)
                
                # Write result to CSV
                writer.writerow([audio_file, f"{duration:.2f}"])
                
            except Exception as e:
                print(f"Error processing {audio_file}: {str(e)}")

if __name__ == "__main__":
    # Example usage
    audio_folder = "/data/hungdx/Lightning-hydra/data/in_the_wild"
    output_csv = "in_the_wild_durations.csv"
    
    get_audio_durations(audio_folder, output_csv)
    print(f"Results written to {output_csv}")

	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
Processing audio files: 100%|██████████| 31779/31779 [02:46<00:00, 190.63it/s]

Results written to in_the_wild_durations.csv





In [3]:
import pandas as pd

df = pd.read_csv('in_the_wild_durations.csv')

print("Average duration: ", df['duration'].mean())


Average duration:  4.287989552849366
