In [None]:
import pandas as pd
import numpy as np
import os
import librosa
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from functools import partial


def get_audio_duration(row, base_dir):
    """Measure one audio file and return its protocol fields plus duration.

    Args:
        row: Mapping with the keys 'utt_id', 'subset' and 'label'; 'utt_id'
            is joined onto ``base_dir`` to locate the file.
        base_dir: Directory that the 'utt_id' paths are relative to.

    Returns:
        dict with keys 'utt_id', 'subset', 'label' and 'duration'. 'duration'
        is the value reported by ``librosa.get_duration`` or -1 when anything
        went wrong while measuring the file (the error is printed).
    """
    def _with_duration(seconds):
        # One place that defines the record layout for both outcomes
        return {
            'utt_id': row['utt_id'],
            'subset': row['subset'],
            'label': row['label'],
            'duration': seconds
        }

    try:
        audio_path = os.path.join(base_dir, row['utt_id'])
        return _with_duration(librosa.get_duration(path=audio_path))
    except Exception as err:
        print(f"Error processing {row['utt_id']}: {str(err)}")
        # -1 is the sentinel for a file that could not be measured
        return _with_duration(-1)


def process_chunk(chunk, base_dir):
    """Measure every row of one dataframe slice.

    Args:
        chunk: DataFrame slice of the protocol; each row is handed to
            ``get_audio_duration`` as a plain dict.
        base_dir: Base directory forwarded to ``get_audio_duration``.

    Returns:
        list of the per-row result dicts, in row order.
    """
    measured = []
    for record in chunk.to_dict(orient='records'):
        measured.append(get_audio_duration(record, base_dir))
    return measured


def calculate_durations(protocol_file, base_dir, output_file, n_workers=None):
    """
    Calculate durations for all audio files in parallel

    Reads the space-separated, header-less protocol (columns utt_id, subset,
    label), measures every listed file in a pool of worker processes, writes
    the per-file table to ``output_file`` as CSV and prints summary statistics.

    Args:
        protocol_file: Path to protocol file
        base_dir: Base directory containing audio files
        output_file: Path to output CSV file
        n_workers: Number of worker processes
            (default: CPU count - 1, but never less than 1)

    Returns:
        Tuple ``(df_results, stats)``. ``df_results`` has the columns utt_id,
        subset, label and duration (duration is -1 for files that failed).
        ``stats`` is a dict with the keys total_files, failed_files,
        total_duration_hours, mean_duration, min_duration and max_duration,
        where the duration figures ignore failed files.

    Raises:
        ValueError: If ``n_workers`` is smaller than 1.
    """
    if n_workers is None:
        # Leave one core free, but a single-core machine still needs one worker
        n_workers = max(1, cpu_count() - 1)
    elif n_workers < 1:
        raise ValueError(f"n_workers must be at least 1, got {n_workers}")

    print("Reading protocol file...")
    protocol = pd.read_csv(protocol_file, sep=" ", header=None)
    protocol.columns = ["utt_id", "subset", "label"]

    # Split row positions instead of the DataFrame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes and emits a
    # FutureWarning. The resulting partition sizes are the same.
    row_positions = np.array_split(np.arange(len(protocol)), n_workers)
    chunks = [protocol.iloc[positions] for positions in row_positions]

    print(f"Processing {len(protocol)} files using {n_workers} workers...")

    # Process chunks in parallel; imap keeps the chunk order, so the flattened
    # results stay in protocol order
    with Pool(n_workers) as pool:
        partial_process = partial(process_chunk, base_dir=base_dir)
        results = list(tqdm(
            pool.imap(partial_process, chunks),
            total=len(chunks),
            desc="Calculating durations"
        ))

    # Flatten results and convert to dataframe
    all_results = [item for sublist in results for item in sublist]
    df_results = pd.DataFrame(all_results)

    # Calculate statistics; -1 marks files whose duration could not be read
    durations = df_results['duration']
    failed = durations == -1
    valid_durations = durations[~failed]
    stats = {
        'total_files': len(df_results),
        'failed_files': int(failed.sum()),
        'total_duration_hours': valid_durations.sum() / 3600,
        'mean_duration': valid_durations.mean(),
        'min_duration': valid_durations.min(),
        'max_duration': valid_durations.max()
    }

    # Save results
    print("\nSaving results...")
    df_results.to_csv(output_file, index=False)

    # Print statistics
    print("\nProcessing Statistics:")
    print(f"Total files processed: {stats['total_files']}")
    print(f"Failed files: {stats['failed_files']}")
    print(f"Total duration: {stats['total_duration_hours']:.2f} hours")
    print(f"Mean duration: {stats['mean_duration']:.2f} seconds")
    print(f"Min duration: {stats['min_duration']:.2f} seconds")
    print(f"Max duration: {stats['max_duration']:.2f} seconds")

    return df_results, stats


# Input locations and output file for this run
BASE_DIR = "/data/hungdx/Lightning-hydra/data/0_large-corpus"
protocol_file = "new_protocol_trim_vocoded.txt"
output_file = "audio_durations.csv"

# Measure every file listed in the protocol (worker count: adjust to your system)
df_results, stats = calculate_durations(protocol_file, BASE_DIR, output_file, n_workers=8)

# Preview the per-file table
print("\nFirst few rows of results:", df_results.head(), sep="\n")

Reading protocol file...
Processing 408572 files using 8 workers...


  return bound(*args, **kwds)
Calculating durations: 100%|██████████| 8/8 [00:49<00:00,  6.20s/it]



Saving results...

Processing Statistics:
Total files processed: 408572
Failed files: 0
Total duration: 671.90 hours
Mean duration: 5.92 seconds
Min duration: 0.08 seconds
Max duration: 26.20 seconds

First few rows of results:
                          utt_id subset     label  duration
0     SNS/2024/Zuckerberg_38.wav  train  bonafide       7.0
1     SNS/2024/Matt_damon_58.wav  train  bonafide       6.5
2  SNS/2024/Anne_Hathaway_83.wav  train  bonafide       6.5
3    SNS/2024/Zuckerberg_127.wav  train  bonafide       7.0
4        SNS/2024/Macron_193.wav  train  bonafide       7.0


In [2]:
# Select the clips that last less than one second and report how many there are
is_short = df_results['duration'] < 1
short_files = df_results[is_short]
print("\nShort audio files (duration < 1 second):", len(short_files), sep="\n")


Short audio files (duration < 1 second):
2993


In [3]:
# Keep only clips of at least one second; files that failed (duration == -1)
# are removed by the same comparison.
# The guard keeps the cell re-runnable: once 'duration' has been dropped the
# frame is already cleaned, and indexing the missing column would raise KeyError.
if 'duration' in df_results.columns:
    # Chained, non-inplace drop so the filtered slice is never mutated in place
    df_results = df_results[df_results['duration'] >= 1].drop(columns=['duration'])
# Save in the same space-separated, header-less layout the protocol is read in
df_results.to_csv("new_protocol_trim_vocoded_cleaned.txt", index=False, header=False, sep=" ")

In [4]:
import pandas as pd

# Reload the cleaned protocol from disk
df = pd.read_csv("new_protocol_trim_vocoded_cleaned.txt", sep=" ", header=None)
df.columns = ["utt_id", "subset", "label"]

print("Before removing vocoded files:", len(df))

# Drop only the rows that are both under the "vocoded" prefix and in the dev subset
is_vocoded = df["utt_id"].str.startswith("vocoded")
is_dev = df["subset"] == "dev"
df = df[~(is_vocoded & is_dev)]

print("After removing vocoded files:", len(df))

Before removing vocoded files: 405579
After removing vocoded files: 390512
