In [8]:
import pandas as pd
import numpy as np
import os
import librosa
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from functools import partial


def get_audio_duration(row, base_dir):
    """Measure the duration of one audio file.

    Args:
        row: Mapping with 'utt_id', 'subset' and 'label' entries; 'utt_id'
            is joined onto ``base_dir`` to locate the file.
        base_dir: Directory that the 'utt_id' path is relative to.

    Returns:
        A dict with the keys 'utt_id', 'subset', 'label' and 'duration'.
        If anything raises while measuring, the error is printed and
        'duration' is set to -1.
    """
    try:
        audio_path = os.path.join(base_dir, row['utt_id'])
        seconds = librosa.get_duration(path=audio_path)
        record = {
            'utt_id': row['utt_id'],
            'subset': row['subset'],
            'label': row['label'],
            'duration': seconds
        }
    except Exception as e:
        print(f"Error processing {row['utt_id']}: {str(e)}")
        record = {
            'utt_id': row['utt_id'],
            'subset': row['subset'],
            'label': row['label'],
            'duration': -1  # Mark failed files with -1
        }
    return record


def process_chunk(chunk, base_dir):
    """Run ``get_audio_duration`` on every row of a dataframe chunk.

    Returns the per-row result dicts as a list, in row order.
    """
    durations = []
    for record in chunk.to_dict('records'):
        durations.append(get_audio_duration(record, base_dir))
    return durations


def calculate_durations(protocol_file, base_dir, output_file, n_workers=None):
    """
    Calculate durations for all audio files in parallel

    Reads a space-separated, header-less protocol with three columns
    (utt_id, subset, label), measures every file with ``process_chunk`` in a
    process pool, writes the per-file table to ``output_file`` as CSV and
    prints summary statistics.

    Args:
        protocol_file: Path to protocol file
        base_dir: Base directory containing audio files
        output_file: Path to output CSV file
        n_workers: Number of worker processes (default: CPU count - 1,
            never fewer than 1)

    Returns:
        Tuple ``(df_results, stats)``: the per-file dataframe (failed files
        carry duration -1) and a dict with the keys total_files,
        failed_files, total_duration_hours, mean_duration, min_duration and
        max_duration.
    """
    if n_workers is None:
        n_workers = cpu_count() - 1
    # cpu_count() - 1 is 0 on a single-core machine and Pool(0) raises.
    n_workers = max(1, int(n_workers))

    print("Reading protocol file...")
    protocol = pd.read_csv(protocol_file, sep=" ", header=None)
    protocol.columns = ["utt_id", "subset", "label"]

    # Split row positions instead of the dataframe itself: np.array_split on
    # a DataFrame goes through DataFrame.swapaxes, which recent pandas
    # versions deprecate (FutureWarning). Empty groups are dropped so no
    # worker receives an empty chunk.
    row_groups = np.array_split(np.arange(len(protocol)), n_workers)
    chunks = [protocol.iloc[rows] for rows in row_groups if len(rows)]

    print(f"Processing {len(protocol)} files using {n_workers} workers...")

    # Process chunks in parallel
    with Pool(n_workers) as pool:
        partial_process = partial(process_chunk, base_dir=base_dir)
        results = list(tqdm(
            pool.imap(partial_process, chunks),
            total=len(chunks),
            desc="Calculating durations"
        ))

    # Flatten results and convert to dataframe. The columns are named
    # explicitly so an empty protocol still yields a 'duration' column.
    all_results = [item for sublist in results for item in sublist]
    df_results = pd.DataFrame(
        all_results, columns=['utt_id', 'subset', 'label', 'duration'])

    # Calculate statistics (-1 marks files that could not be measured)
    failed = df_results['duration'] == -1
    valid_durations = df_results.loc[~failed, 'duration']
    stats = {
        'total_files': len(df_results),
        'failed_files': int(failed.sum()),
        'total_duration_hours': valid_durations.sum() / 3600,
        'mean_duration': valid_durations.mean(),
        'min_duration': valid_durations.min(),
        'max_duration': valid_durations.max()
    }

    # Save results
    print("\nSaving results...")
    df_results.to_csv(output_file, index=False)

    # Print statistics
    print("\nProcessing Statistics:")
    print(f"Total files processed: {stats['total_files']}")
    print(f"Failed files: {stats['failed_files']}")
    print(f"Total duration: {stats['total_duration_hours']:.2f} hours")
    print(f"Mean duration: {stats['mean_duration']:.2f} seconds")
    print(f"Min duration: {stats['min_duration']:.2f} seconds")
    print(f"Max duration: {stats['max_duration']:.2f} seconds")

    return df_results, stats


# Usage: inputs and output of the duration scan
BASE_DIR = "/data/hungdx/Lightning-hydra/data/0_large-corpus"
protocol_file = "new_protocol_trim_vocoded_v2.txt"
output_file = "audio_durations_v2.csv"

# Run the processing with 8 worker processes (adjust based on your system)
df_results, stats = calculate_durations(protocol_file, BASE_DIR, output_file, n_workers=8)

# Display first few rows of results
print("\nFirst few rows of results:")
print(df_results.head())

Reading protocol file...
Processing 248159 files using 8 workers...


  return bound(*args, **kwds)
Calculating durations: 100%|██████████| 8/8 [00:02<00:00,  2.93it/s]



Saving results...

Processing Statistics:
Total files processed: 248159
Failed files: 0
Total duration: 433.71 hours
Mean duration: 6.29 seconds
Min duration: 0.08 seconds
Max duration: 26.20 seconds

First few rows of results:
                          utt_id subset     label  duration
0     SNS/2024/Zuckerberg_38.wav  train  bonafide       7.0
1     SNS/2024/Matt_damon_58.wav  train  bonafide       6.5
2  SNS/2024/Anne_Hathaway_83.wav  train  bonafide       6.5
3    SNS/2024/Zuckerberg_127.wav  train  bonafide       7.0
4        SNS/2024/Macron_193.wav  train  bonafide       7.0


In [9]:
# Filter audio files with duration less than 1 second with subset == train.
# Both comparisons are parenthesised: `&` binds tighter than `<`, so the
# unparenthesised form is parsed as duration < (1 & mask).
short_files = df_results[(df_results['duration'] < 1) & (df_results['subset'] == 'train')]
print("\nShort audio files (duration < 1 second):")
print(len(short_files))


Short audio files (duration < 1 second):
436


In [10]:
# Remove all files with duration < 1 second and subset == train, then drop
# the duration column. drop() returns a new frame here instead of mutating
# the filtered slice in place, which avoids pandas' SettingWithCopyWarning.
is_short_train = (df_results['duration'] < 1) & (df_results['subset'] == 'train')
df_results = df_results[~is_short_train].drop(columns=['duration'])

# Save as a space-separated protocol without a header row
df_results.to_csv("new_protocol_trim_vocoded_cleaned_v2.txt", index=False, header=False, sep=" ")

In [4]:
import pandas as pd

# Load the cleaned protocol (space-separated, no header row).
df = pd.read_csv("new_protocol_trim_vocoded_cleaned.txt", header=None, sep=" ")
df.columns = ["utt_id", "subset", "label"]

print("Before removing vocoded files:", len(df))

# Keep every row except dev-subset utterances whose id starts with "vocoded".
is_vocoded = df["utt_id"].str.startswith("vocoded")
is_dev = df["subset"] == "dev"
df = df[~(is_vocoded & is_dev)]

print("After removing vocoded files:", len(df))

Before removing vocoded files: 405579
After removing vocoded files: 390512


# MLAAD

In [5]:
import pandas as pd

# Read the four-column MLAAD protocol (space-separated, no header row).
df = pd.read_csv("~/MLAAD/protocol.txt", header=None, sep=" ")
df.columns = ["utt_id", "subset", "unk", "label"]

# Discard the "unk" column
df = df.drop(columns=['unk'])

# Write the remaining columns back out in the same space-separated format
df.to_csv("~/MLAAD/protocol_lts.txt", sep=" ", header=False, index=False)

In [4]:
import pandas as pd

# Load the cleaned protocol (space-separated, no header row).
df = pd.read_csv("new_protocol_trim_vocoded_cleaned.txt", header=None, sep=" ")
df.columns = ["utt_id", "subset", "label"]

print("Before removing vocoded files:", len(df))

# Keep every row except dev-subset utterances whose id starts with "vocoded".
is_vocoded = df["utt_id"].str.startswith("vocoded")
is_dev = df["subset"] == "dev"
df = df[~(is_vocoded & is_dev)]

print("After removing vocoded files:", len(df))

Before removing vocoded files: 405579
After removing vocoded files: 390512


# MLAAD V5

In [3]:
import os
import pandas as pd
from pathlib import Path
import csv


def read_csv_safely(file_path):
    """Read a '|'-separated CSV, retrying with progressively laxer parsing.

    Three parser configurations are tried in order:
      1. pandas defaults;
      2. quoting disabled, backslash as escape character;
      3. quoting disabled and malformed lines skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        The DataFrame from the first configuration that parses, or None
        (after printing the last error) when all of them fail.
    """
    attempts = (
        {},
        {'quoting': csv.QUOTE_NONE, 'escapechar': '\\'},
        {'on_bad_lines': 'skip', 'quoting': csv.QUOTE_NONE},
    )
    last_error = None
    for options in attempts:
        try:
            return pd.read_csv(file_path, sep='|', **options)
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        except Exception as e:
            last_error = e
    print(f"Error reading file {file_path}: {str(last_error)}")
    return None


def create_protocol_and_merge_meta(root_dir,
                                   meta_output='mlaad_v5_combined_meta.csv',
                                   protocol_output='mlaad_v5_protocol.txt'):
    """Merge every ``<root>/<language>/<model>/meta.csv`` and build a protocol.

    Each meta.csv is read with ``read_csv_safely``. All readable files are
    concatenated into one '|'-separated CSV, and every value of their
    'path' column becomes one protocol line ``"<path> eval spoof"``.

    Args:
        root_dir: Directory whose sub-directories are language folders, each
            containing model folders.
        meta_output: Where the combined metadata CSV is written.
        protocol_output: Where the protocol text file is written.

    Returns:
        None. Progress and a summary are printed.
    """
    # Metadata frames and protocol lines collected across all model folders
    all_meta_data = []
    protocol_lines = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # generated files reproducible between runs and machines.
    for language_dir in sorted(os.listdir(root_dir)):
        language_path = os.path.join(root_dir, language_dir)
        if not os.path.isdir(language_path):
            continue

        # For each model directory in the language directory
        for model_dir in sorted(os.listdir(language_path)):
            model_path = os.path.join(language_path, model_dir)
            if not os.path.isdir(model_path):
                continue

            meta_file = os.path.join(model_path, 'meta.csv')
            if not os.path.exists(meta_file):
                continue

            df = read_csv_safely(meta_file)
            if df is None:
                continue
            if 'path' not in df.columns:
                # Without a 'path' column no protocol line can be built;
                # skip the file so metadata and protocol stay aligned.
                print(f"Skipping {meta_file}: no 'path' column")
                continue

            all_meta_data.append(df)
            protocol_lines.extend(
                f"{audio_path} eval spoof" for audio_path in df['path'])

    if not all_meta_data:
        print("No meta.csv files found or all files had errors")
        return

    # Combine all metadata and save it
    combined_meta = pd.concat(all_meta_data, ignore_index=True)
    combined_meta.to_csv(meta_output, sep='|',
                         index=False, quoting=csv.QUOTE_NONE, escapechar='\\')

    # Save protocol file, one entry per line, each terminated by a newline
    with open(protocol_output, 'w') as f:
        f.writelines(f"{line}\n" for line in protocol_lines)

    print(f"Created {protocol_output} with {len(protocol_lines)} entries")
    print(f"Created {meta_output} with {len(combined_meta)} entries")


# Build the MLAAD v5 protocol when this code runs as the main module
if __name__ == "__main__":
    create_protocol_and_merge_meta('/nvme1/hungdx/Lightning-hydra/data/mlaad_v5/fake')

Created protocol.txt with 152388 entries
Created combined_meta.csv with 152388 entries


# partialSpoof protocol generator

In [4]:
import os
from pathlib import Path
from tqdm import tqdm


def read_protocol_metadata(protocol_file):
    """Read protocol metadata and create mapping dictionary.

    Every line is split on whitespace. The second field (index 1) is used
    as the file ID and the fifth field (index 4) as its label. Lines with
    fewer than five fields are skipped.

    Args:
        protocol_file: Path (``str`` or ``pathlib.Path``) of the protocol.

    Returns:
        Dict mapping file ID to label. If a file ID occurs more than once,
        the last occurrence wins.
    """
    protocol_file = Path(protocol_file)  # accept plain strings as well
    metadata = {}
    with open(protocol_file, 'r') as f:
        lines = f.readlines()
    for line in tqdm(lines, desc=f"Reading {protocol_file.name}", leave=False):
        parts = line.strip().split()
        # parts[4] is read below, so at least five fields are required
        if len(parts) >= 5:
            file_id = parts[1]  # file ID is the second field
            label = parts[4]    # spoof or bonafide
            metadata[file_id] = label
    return metadata


def create_new_protocol(lst_file, wav_folder, protocol_metadata, subset, output_file):
    """Create new protocol file with wav paths and labels.

    For every file ID in ``lst_file`` that has a label in
    ``protocol_metadata``, one line
    ``"<subset>/con_wav/<file_id>.wav <subset> <label>"`` is written.
    IDs without a (non-empty) label are left out and reported on stdout.

    Args:
        lst_file: Text file with one file ID per line.
        wav_folder: Not used by this function (the wav path is built from
            ``subset``); kept so existing callers keep working.
        protocol_metadata: Mapping from file ID to label.
        subset: Subset name; used as the leading path component and as the
            second column of every line.
        output_file: Path of the protocol file to write (overwritten).
    """
    with open(lst_file, 'r') as f:
        # Skip blank lines so an empty or trailing line is not reported as
        # a file without a label.
        file_ids = [line.strip() for line in f if line.strip()]

    missing_labels = []
    with open(output_file, 'w') as out_f:
        for file_id in tqdm(file_ids, desc=f"Processing {subset}", leave=False):
            wav_path = f"{subset}/con_wav/{file_id}.wav"
            label = protocol_metadata.get(file_id)

            if label:
                out_f.write(f"{wav_path} {subset} {label}\n")
            else:
                missing_labels.append(file_id)

    if missing_labels:
        print(
            f"\nWarning: No labels found for {len(missing_labels)} files in {subset}")
        print("First few missing files:", missing_labels[:5])


def main(base_dir="/nvme1/hungdx/Lightning-hydra/data/PartialSpoof/database",
         subsets=('train', 'dev', 'eval')):
    """Generate ``protocol_<subset>.txt`` for each PartialSpoof subset.

    For every subset, labels are read from
    ``<base_dir>/protocols/PartialSpoof_LA_cm_protocols/PartialSpoof.LA.cm.<subset>.trl.txt``
    and joined with the file IDs listed in ``<base_dir>/<subset>/<subset>.lst``.
    The result is written to ``<base_dir>/protocol_<subset>.txt``.

    Args:
        base_dir: Root of the PartialSpoof database (``str`` or ``Path``).
        subsets: Names of the subsets to process, in order.
    """
    # Define paths
    base_dir = Path(base_dir)
    protocol_dir = base_dir / "protocols/PartialSpoof_LA_cm_protocols"

    print("Starting protocol file creation...")
    for subset in tqdm(subsets, desc="Processing subsets"):
        # Input files
        protocol_file = protocol_dir / f"PartialSpoof.LA.cm.{subset}.trl.txt"
        lst_file = base_dir / subset / f"{subset}.lst"
        output_file = base_dir / f"protocol_{subset}.txt"

        # Read protocol metadata
        metadata = read_protocol_metadata(protocol_file)

        # Create new protocol file
        create_new_protocol(
            lst_file=lst_file,
            wav_folder=subset,
            protocol_metadata=metadata,
            subset=subset,
            output_file=output_file
        )

        print(f"✓ Created protocol file for {subset}: {output_file}")

    print("\nProtocol file creation completed!")

main()

Starting protocol file creation...


Processing subsets:   0%|          | 0/3 [00:00<?, ?it/s]

✓ Created protocol file for train: /nvme1/hungdx/Lightning-hydra/data/PartialSpoof/database/protocol_train.txt


Processing subsets:  67%|██████▋   | 2/3 [00:00<00:00, 14.60it/s]

✓ Created protocol file for dev: /nvme1/hungdx/Lightning-hydra/data/PartialSpoof/database/protocol_dev.txt


Processing subsets: 100%|██████████| 3/3 [00:00<00:00, 11.17it/s]


First few missing files: ['CON_E_0034982', 'CON_E_0058039']
✓ Created protocol file for eval: /nvme1/hungdx/Lightning-hydra/data/PartialSpoof/database/protocol_eval.txt

Protocol file creation completed!





## Extended ASVspoof generator

This dataset includes the original ASVspoof dataset and my trimmed version of it.

In [16]:
import pandas as pd


# Column layout shared by the three LA CM protocol files read below
LA_PROTOCOL_COLUMNS = ["unk1", "utt", "unk2", "attack", "label"]


def _read_la_protocol(path, subset):
    """Read a space-separated, header-less 5-column protocol and tag it with ``subset``."""
    protocol = pd.read_csv(path, sep=" ", header=None)
    protocol.columns = LA_PROTOCOL_COLUMNS
    protocol["subset"] = subset
    return protocol


def _as_flac_path(utts, folder):
    """Turn bare utterance ids into ``<folder>/<utt>.flac`` relative paths."""
    # The folder constants end with "/"; strip it so the result does not
    # contain a doubled separator ("folder//utt.flac").
    return folder.rstrip("/") + "/" + utts.astype(str) + ".flac"


# Prepare original train set
df_ori_train = _read_la_protocol(
    "/nvme1/hungdx/Datasets/protocols/database/ASVspoof_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt",
    "train")

# Prepare trimmed train set.
# NOTE(review): "prototcols" looks like a typo of "protocols" — confirm the
# directory name on disk before changing it.
df_trimmed_train = _read_la_protocol(
    "/nvme1/hungdx/Datasets/prototcols/LA19.cm.train.trn_1s.txt",
    "train")

# Keep only utterances whose id does not contain "no_speech".
# .copy() detaches the filtered frame so the later column assignment does
# not trigger SettingWithCopyWarning.
df_trimmed_train = df_trimmed_train[~df_trimmed_train["utt"].str.contains("no_speech")].copy()

# Prepare original dev set
df_ori_dev = _read_la_protocol(
    "/nvme1/hungdx/Datasets/protocols/database/ASVspoof_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt",
    "dev")

index_col = 'trial'
# =====
# Configuration to load CM protocol and score file
# =====
# name of data series for protocol file
p_names = ['speaker', index_col, 'compr', 'source', 'attack',
           'label', 'trim', 'subset', 'vocoder',
           'task', 'team', 'gender-pair', 'language']


df_eval_trial = pd.read_csv(
    '/nvme1/hungdx/Datasets/protocols/database/ASVspoof_DF_cm_protocols/ASVspoof2021.DF.cm.eval.trl.txt', sep=" ", header=None)
df_eval_trial.columns = ['utt']

df_eval_meta = pd.read_csv(
    '/nvme1/hungdx/Lightning-hydra/logs/asvspoof-challenge-2021/eval-package/keys/DF/CM/trial_metadata.txt', sep=" ", header=None)
df_eval_meta.columns = p_names

# Merge trial list and metadata on the trial id (inner join)
df_eval = pd.merge(df_eval_trial, df_eval_meta, left_on='utt', right_on=index_col)

# Keep only utt and label; .copy() so adding "subset" below is a plain
# assignment on an independent frame.
df_eval = df_eval[['utt', 'label']].copy()
df_eval["subset"] = "eval"

TRAIN_PATH = "ASVspoof2019_LA_train/"
TRIMMED_TRAIN_PATH = "LA19_train_1s/"
DEV_PATH = "ASVspoof2019_LA_dev/"
EVAL_PATH = "ASVspoof2021_DF_eval/"


# Merge df_ori_train, df_trimmed_train, df_ori_dev, df_eval into a single dataframe.
# Keep only utt, subset and label columns.
# The new utt is the audio path relative to the dataset root, e.g. for
# subset = "train" and utt = "LA_T_1000001":
#     "ASVspoof2019_LA_train/LA_T_1000001.flac"
df_ori_train["utt"] = _as_flac_path(df_ori_train["utt"], TRAIN_PATH)
df_trimmed_train["utt"] = _as_flac_path(df_trimmed_train["utt"], TRIMMED_TRAIN_PATH)
df_ori_dev["utt"] = _as_flac_path(df_ori_dev["utt"], DEV_PATH)
df_eval["utt"] = _as_flac_path(df_eval["utt"], EVAL_PATH)


df = pd.concat(
    [frame[["utt", "subset", "label"]]
     for frame in (df_ori_train, df_trimmed_train, df_ori_dev, df_eval)],
    ignore_index=True)

df.to_csv("/nvme1/hungdx/Lightning-hydra/data/ExtendedASVspoof/protocol.txt",
          index=False, header=False, sep=" ")

# ADD track 2

In [4]:
import pandas as pd


label = pd.read_csv(
    "/nvme1/hungdx/Lightning-hydra/data/ADD_eval/track2_label.txt", sep=" ", header=None)
label.columns = ["utt", "label"]

# Source label -> protocol label (fake -> spoof, genuine -> bonafide)
ADD_LABEL_MAP = {"fake": "spoof", "genuine": "bonafide"}

# Fail loudly on anything else: previously every value other than "fake"
# (typos, missing values, ...) was silently written as "bonafide".
unexpected_labels = label.loc[
    ~label["label"].isin(list(ADD_LABEL_MAP)), "label"].unique().tolist()
if unexpected_labels:
    raise ValueError(
        f"Unexpected labels in track2_label.txt: {unexpected_labels}")

# Add the subset column, translate the labels and order the columns as
# utt, subset, label. assign() builds a new frame, so no column is set on a
# slice of another frame.
label = label.assign(
    subset="eval",
    label=label["label"].map(ADD_LABEL_MAP),
)[["utt", "subset", "label"]]

# save to file
label.to_csv("/nvme1/hungdx/Lightning-hydra/data/ADD_eval/protocol.txt",
             index=False, header=False, sep=" ")


# Noisy dataset (ASVSpoof_Laundered)

In [1]:
import pandas as pd

def _read_laundered_protocol(suffix):
    """Read ``ASVspoofLauneredDatabase_<suffix>.txt`` (space-separated, no header).

    The six columns are named LA_group, utt, attack, label, group, subgroup.
    """
    frame = pd.read_csv(
        f"/nvme1/Datasets/ASVSpoof_Laundered_protocols/ASVspoofLauneredDatabase_{suffix}.txt",
        sep=" ",
        header=None,
    )
    frame.columns = ["LA_group", "utt", "attack", "label", "group", "subgroup"]
    return frame


# One frame per protocol file of the laundered database
Filtering_df = _read_laundered_protocol("Filtering")
Noise_Addition_df = _read_laundered_protocol("Noise_Addition")
Recompression_df = _read_laundered_protocol("Recompression")
Resampling_df = _read_laundered_protocol("Resampling")
Reverberation_df = _read_laundered_protocol("Reverberation")

In [2]:
# Stack the five laundering protocols into one frame with a fresh 0..N-1 index
laundered_frames = [
    Filtering_df,
    Noise_Addition_df,
    Recompression_df,
    Resampling_df,
    Reverberation_df,
]
df = pd.concat(laundered_frames, ignore_index=True)
# Next step: create a protocol.txt file with utt, subset and label columns.
# The subset column has two values, train and dev, in a 50:50 ratio.




In [18]:


# Select only required columns: utt, label
df = df[['utt', 'label']].copy()

# Shuffle the dataset to ensure randomness (fixed seed for reproducibility)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split by position: the first half is train, the remainder is dev.
# (Label slices with .loc include both endpoints, so `.loc[:split_idx]` and
# `.loc[split_idx:]` overlap at row split_idx; building the column
# positionally removes the dependence on assignment order.)
split_idx = len(df) // 2
df['subset'] = ['train'] * split_idx + ['dev'] * (len(df) - split_idx)

# Reorder columns as required
df = df[['utt', 'subset', 'label']]

# Append ".flac" on a separate frame used only for writing. `df` keeps the
# bare utterance ids, so cells that derive further protocols from `df` do
# not end up with a doubled ".flac" suffix.
protocol_out = df.assign(utt=df['utt'].astype(str) + '.flac')

# Save to protocol.txt (space-separated, no header)
protocol_out.to_csv("protocol.txt", sep=' ', index=False, header=False)

print("protocol.txt file has been created successfully!")

protocol.txt file has been created successfully!


In [15]:
# Number of rows assigned to each subset
subset_counts = df['subset'].value_counts()
print(subset_counts)


subset
dev      1032937
train    1032936
Name: count, dtype: int64


In [20]:
# Select only required columns: utt, label
df = df[['utt', 'label']].copy()

# Separate bonafide and spoof samples
df_bonafide = df[df['label'] == 'bonafide']
df_spoof = df[df['label'] == 'spoof']

# Downsample spoof to the number of bonafide samples (fixed seed)
num_bonafide = len(df_bonafide)
df_spoof_balanced = df_spoof.sample(n=num_bonafide, random_state=42)

# Combine balanced dataset
df_balanced = pd.concat([df_bonafide, df_spoof_balanced])

# Shuffle the dataset for randomness
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split by position: the first half is train, the remainder is dev.
# (Label slices with .loc include both endpoints, so `.loc[:split_idx]` and
# `.loc[split_idx:]` overlap at row split_idx.)
split_idx = len(df_balanced) // 2
df_balanced['subset'] = ['train'] * split_idx + ['dev'] * (len(df_balanced) - split_idx)

# Reorder columns and append ".flac" to utt. The suffix is only added when
# it is not already present, so the result is the same whether or not an
# earlier cell has already suffixed `df['utt']`.
utt = df_balanced['utt'].astype(str)
df_balanced = df_balanced[['utt', 'subset', 'label']].assign(
    utt=utt.where(utt.str.endswith('.flac'), utt + '.flac')
)

# Save to balanced_protocol.txt (space-separated, no header)
df_balanced.to_csv("balanced_protocol.txt", sep=' ', index=False, header=False)

print("✅ balanced_protocol.txt file has been created successfully with balanced classes!")

✅ balanced_protocol.txt file has been created successfully with balanced classes!


In [3]:
# Select only required columns: utt, label
df = df[['utt', 'label']].copy()

# Separate bonafide and spoof samples
df_bonafide = df[df['label'] == 'bonafide']
df_spoof = df[df['label'] == 'spoof']

# Downsample spoof to the number of bonafide samples (fixed seed)
num_bonafide = len(df_bonafide)
df_spoof_balanced = df_spoof.sample(n=num_bonafide, random_state=42)

# Combine balanced dataset
df_balanced = pd.concat([df_bonafide, df_spoof_balanced])

# Shuffle the dataset for randomness
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split by position: the first half is train, the remainder is dev.
# (Label slices with .loc include both endpoints, so `.loc[:split_idx]` and
# `.loc[split_idx:]` overlap at row split_idx.)
split_idx = len(df_balanced) // 2
df_balanced['subset'] = ['train'] * split_idx + ['dev'] * (len(df_balanced) - split_idx)

# Reorder columns and turn utt into "ASVSpoof_Laundered_flac/<utt>.flac".
# ".flac" is only appended when it is not already present, so the result is
# the same whether or not an earlier cell has already suffixed `df['utt']`.
utt = df_balanced['utt'].astype(str)
utt = utt.where(utt.str.endswith('.flac'), utt + '.flac')
df_balanced = df_balanced[['utt', 'subset', 'label']].assign(
    utt="ASVSpoof_Laundered_flac/" + utt
)

# Nothing is written to disk in this cell.

In [5]:
# Write the balanced protocol: space-separated, without index or header row
df_balanced.to_csv(
    "new_balanced_protocol.txt",
    index=False,
    header=False,
    sep=' ',
)

In [None]:
# Preview the first rows instead of dumping the whole frame
df_balanced.head()