In [2]:
# import os
# import librosa
# import csv
# from tqdm import tqdm

# def get_audio_durations(audio_folder, output_csv):
#     """
#     Calculate duration of each audio file in the folder and write results to CSV
    
#     Parameters:
#         audio_folder (str): Path to folder containing audio files
#         output_csv (str): Path to output CSV file
#     """
    
#     # Get list of audio files
#     audio_files = [f for f in os.listdir(audio_folder) if f.endswith(('.wav', '.mp3', '.flac'))]
    
#     # Open CSV file to write results
#     with open(output_csv, 'w', newline='') as csvfile:
#         writer = csv.writer(csvfile)
#         writer.writerow(['filename', 'duration'])  # Write header
        
#         # Process each audio file with progress bar
#         for audio_file in tqdm(audio_files, desc="Processing audio files"):
#             try:
#                 # Load audio file and get duration
#                 audio_path = os.path.join(audio_folder, audio_file)
#                 duration = librosa.get_duration(filename=audio_path)
                
#                 # Write result to CSV
#                 writer.writerow([audio_file, f"{duration:.2f}"])
                
#             except Exception as e:
#                 print(f"Error processing {audio_file}: {str(e)}")

# if __name__ == "__main__":
#     # Example usage
#     audio_folder = "/nvme1/hungdx/Datasets/ASVspoof2021_DF_eval"
#     output_csv = "ASVspoof2021_DF_eval_durations.csv"
    
#     get_audio_durations(audio_folder, output_csv)
#     print(f"Results written to {output_csv}")

import os
import librosa
import csv
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from functools import partial
import pandas as pd

def process_audio_file(audio_file, audio_folder):
    """
    Process a single audio file and return its duration
    
    Parameters:
        audio_file (str): Name of the audio file
        audio_folder (str): Path to the folder containing audio files
        
    Returns:
        tuple: (filename, duration) or (filename, error_message)
    """
    try:
        audio_path = os.path.join(audio_folder, audio_file)
        duration = librosa.get_duration(filename=audio_path)
        return (audio_file, f"{duration:.2f}")
    except Exception as e:
        return (audio_file, f"Error: {str(e)}")

def get_audio_durations(audio_folder, output_csv, num_processes=None):
    """
    Calculate duration of each audio file in parallel and write results to CSV
    
    Parameters:
        audio_folder (str): Path to folder containing audio files
        output_csv (str): Path to output CSV file
        num_processes (int): Number of processes to use (default: CPU count - 1)
    """
    # Get list of audio files
    audio_files = [f for f in os.listdir(audio_folder) if f.endswith(('.wav', '.mp3', '.flac'))]
    
    # Determine number of processes
    if num_processes is None:
        num_processes = max(1, cpu_count() - 1)  # Leave one CPU free
    
    print(f"Processing {len(audio_files)} files using {num_processes} processes...")
    
    # Create partial function with fixed audio_folder parameter
    process_func = partial(process_audio_file, audio_folder=audio_folder)
    
    # Process files in parallel with progress bar
    results = []
    with Pool(processes=num_processes) as pool:
        for result in tqdm(
            pool.imap_unordered(process_func, audio_files),
            total=len(audio_files),
            desc="Processing audio files"
        ):
            results.append(result)
    
    # Convert results to DataFrame for easier handling
    df = pd.DataFrame(results, columns=['filename', 'duration'])
    
    # Separate successful and failed processes
    success_df = df[~df['duration'].str.contains('Error')]
    error_df = df[df['duration'].str.contains('Error')]
    
    # Write successful results to CSV
    success_df.to_csv(output_csv, index=False)
    
    # If there were errors, write them to a separate file
    if not error_df.empty:
        error_csv = output_csv.replace('.csv', '_errors.csv')
        error_df.to_csv(error_csv, index=False)
        print(f"Errors encountered for {len(error_df)} files. Error details written to {error_csv}")
    
    return len(success_df), len(error_df)

if __name__ == "__main__":
    # Example usage
    audio_folder = "/nvme1/hungdx/Datasets/ASVspoof2021_DF_eval"
    output_csv = "ASVspoof2021_DF_eval_durations.csv"
    
    # Optional: specify number of processes
    # num_processes = 4
    
    success_count, error_count = get_audio_durations(audio_folder, output_csv)
    print(f"Successfully processed {success_count} files")
    print(f"Results written to {output_csv}")

Processing 611829 files using 79 processes...


	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
	This alia

Successfully processed 611829 files
Results written to ASVspoof2021_DF_eval_durations.csv


In [3]:
import pandas as pd

df = pd.read_csv('in_the_wild_durations.csv')

print("Average duration: ", df['duration'].mean())


Average duration:  4.287989552849366


# Check NaN input

In [5]:
import sys

sys.path.append('/data/hungdx/Lightning-hydra/src')

from data.normal_multiview_datamodule import Dataset_for_dev


def genList(protocol_path, is_train=False, is_eval=False, is_dev=False):
    """
            This function generates the list of files and their corresponding labels
            Specifically for the standard CNSL dataset
        """
     # bonafide: 1, spoof: 0
    d_meta = {}
    file_list = []

    if (is_train):
        with open(protocol_path, 'r') as f:
            l_meta = f.readlines()
        for line in l_meta:
            utt, subset, label = line.strip().split()
            if subset == 'train':
                file_list.append(utt)
                d_meta[utt] = 1 if label == 'bonafide' else 0

        return d_meta, file_list
    if (is_dev):
        with open(protocol_path, 'r') as f:
            l_meta = f.readlines()
        for line in l_meta:
            utt, subset, label = line.strip().split()
            if subset == 'dev':
                file_list.append(utt)
                d_meta[utt] = 1 if label == 'bonafide' else 0
        return d_meta, file_list

    if (is_eval):
        # no eval protocol_path yet
        with open(protocol_path, 'r') as f:
            l_meta = f.readlines()
        for line in l_meta:
            utt, subset, label = line.strip().split()
            if subset == 'eval':
                file_list.append(utt)
                d_meta[utt] = 1 if label == 'bonafide' else 0
        # return d_meta, file_list
        return d_meta, file_list

In [21]:
from dataclasses import dataclass
from dataclasses import dataclass, asdict
from typing import List, Optional


@dataclass
class DataConfig:
    augmentation_methods: List[str]
    wav_samp_rate: int
    online_aug: bool
    aug_dir: str
    noise_path: str
    rir_path: str
    repeat_pad: bool
    random_start: bool


@dataclass
class Args:
    views: List[int]
    protocol_path: str
    nBands: int
    minF: int
    maxF: int
    minBW: int
    maxBW: int
    minCoeff: int
    maxCoeff: int
    minG: int
    maxG: int
    minBiasLinNonLin: int
    maxBiasLinNonLin: int
    N_f: int
    P: int
    g_sd: int
    SNRmin: int
    SNRmax: int
    data: DataConfig

    def __getitem__(self, key):
        if key == 'data':
            return asdict(self.data)
        return asdict(self)[key]

    @classmethod
    def from_dict(cls, config_dict: dict):
        data_config = DataConfig(**config_dict['data'])
        config_dict['data'] = data_config
        return cls(**config_dict)


# Example usage:
config_dict = {
    'views': [1, 2, 3, 4],
    'protocol_path': '${oc.env:LARGE_CORPUS_FOR_CNSL_PROTOCOLS}',
    'nBands': 5,
    'minF': 20,
    'maxF': 8000,
    'minBW': 100,
    'maxBW': 1000,
    'minCoeff': 10,
    'maxCoeff': 100,
    'minG': 0,
    'maxG': 0,
    'minBiasLinNonLin': 5,
    'maxBiasLinNonLin': 20,
    'N_f': 5,
    'P': 10,
    'g_sd': 2,
    'SNRmin': 10,
    'SNRmax': 40,
    'data': {
        'augmentation_methods': ["RawBoost12", "pitch_1", "volume_10", "speed_01", "none"],
        'wav_samp_rate': 16000,
        'online_aug': True,
        'aug_dir': '${oc.env:LARGE_CORPUS_FOR_CNSL}/aug',
        'noise_path': '${oc.env:NOISE_PATH}',
        'rir_path': '${oc.env:RIR_PATH}',
        'repeat_pad': True,
        'random_start': True
    }
}
# Create Args object
args = Args.from_dict(config_dict)

In [23]:
protocol_path = "/data/hungdx/Lightning-hydra/notebooks/new_protocol_trim_vocoded_cleaned.txt"
data_dir = "/data/hungdx/Lightning-hydra/data/0_large-corpus"

d_label_dev, file_dev = genList(protocol_path,
    is_train=False, is_eval=False, is_dev=True)

data_val = Dataset_for_dev(args, list_IDs=file_dev, labels=d_label_dev,
                       base_dir=data_dir+'/',  is_train=False, **args['data'])

vocoders: []


In [27]:
from torch.utils.data import DataLoader, Dataset
from data.components.collate_fn import multi_view_collate_fn, variable_multi_view_collate_fn


def collate_fn(x): return multi_view_collate_fn(
    x,
    args.views,
    args.data.wav_samp_rate,
    "repeat",
    True
)

dev_dataloader = DataLoader(
    dataset=data_val,
    batch_size=1,
    num_workers=1,
    pin_memory=True,
    shuffle=False,
    collate_fn=collate_fn,
)

In [30]:
import torch
from tqdm import tqdm

# Iterate through the DataLoader with a progress bar
for batch_idx, batch in enumerate(tqdm(dev_dataloader, desc="Validating")):
    for view, (x, y) in batch.items():
        if torch.isnan(x).any() or torch.isinf(x).any():
            print("Found NaN or Inf in validation inputs!")
        if torch.isnan(y).any() or torch.isinf(y).any():
            print("Found NaN or Inf in validation labels!")

Validating: 100%|██████████| 66725/66725 [08:32<00:00, 130.08it/s]
