In [1]:
import pandas as pd
import os 

LARGE_CORPUS_FOLDER = '../0_large-corpus'
original_protocol_file_path = os.path.join(LARGE_CORPUS_FOLDER, 'protocol.txt')
df = pd.read_csv(original_protocol_file_path, sep=' ', header=None)
df.columns = ['file_name', 'subset', 'label']

df.head()

Unnamed: 0,file_name,subset,label
0,SNS/2024/Zuckerberg_38.wav,train,bonafide
1,SNS/2024/Matt_damon_58.wav,train,bonafide
2,SNS/2024/Anne_Hathaway_83.wav,train,bonafide
3,SNS/2024/Zuckerberg_127.wav,train,bonafide
4,SNS/2024/Macron_193.wav,train,bonafide


# Add more librosa trimmed

In [2]:
df_trimmed_librosa = df.copy()

TRIMMED_LIBROSA_FOLDER = 'trim_librosa'

df_trimmed_librosa['file_name'] = df_trimmed_librosa['file_name'].apply(lambda x: os.path.join(TRIMMED_LIBROSA_FOLDER, x))

df_trimmed_librosa.head()

Unnamed: 0,file_name,subset,label
0,trim_librosa/SNS/2024/Zuckerberg_38.wav,train,bonafide
1,trim_librosa/SNS/2024/Matt_damon_58.wav,train,bonafide
2,trim_librosa/SNS/2024/Anne_Hathaway_83.wav,train,bonafide
3,trim_librosa/SNS/2024/Zuckerberg_127.wav,train,bonafide
4,trim_librosa/SNS/2024/Macron_193.wav,train,bonafide


In [6]:
# Ignore vocoded files in eval set
df_trimmed_librosa = df_trimmed_librosa[~((df_trimmed_librosa["file_name"].str.startswith("trim_librosa")) & (df_trimmed_librosa["subset"] == "dev"))]
df_trimmed_librosa = df_trimmed_librosa[~((df_trimmed_librosa["file_name"].str.startswith("trim_librosa")) & (df_trimmed_librosa["subset"] == "eval"))]

In [7]:
# Concatemate df_trimmed_librosa and df
df_concat = pd.concat([df, df_trimmed_librosa], axis=0)

df_concat.describe()

Unnamed: 0,file_name,subset,label
count,218159,218159,218159
unique,218159,3,2
top,trim_librosa/Real/ASVspoof5/T_0000020664.wav,eval,spoof
freq,1,134441,120695


In [8]:
# save df_trimmed_librosa
df_trimmed_librosa.to_csv(os.path.join(LARGE_CORPUS_FOLDER, 'protocol_trimmed_librosa_v2.txt'), sep=' ', header=False, index=False)

In [17]:
from tqdm import tqdm

with open('./scp_librosa_trim/bonafide_train.lst', 'w') as f:
    real_train = df_trimmed_librosa[(df_trimmed_librosa['subset'] == 'train') &
                         (df_trimmed_librosa['label'] == 'bonafide')]
    for index, row in tqdm(real_train.iterrows(), total=real_train.shape[0]):
        f.write(f'{row["file_name"]}\n')
        # shutil.copy(row['path'], 'vocoded/')

with open('./scp_librosa_trim/bonafide_dev.lst', 'w') as f:
    real_dev = df_trimmed_librosa[(df_trimmed_librosa['subset'] == 'dev') &
                       (df_trimmed_librosa['label'] == 'bonafide')]
    for index, row in tqdm(real_dev.iterrows(), total=real_dev.shape[0]):
        f.write(f'{row["file_name"]}\n')
        # shutil.copy(row['path'], 'vocoded/')

with open('./scp_librosa_trim/spoof_train.lst', 'w') as f:
    real_train = df_trimmed_librosa[(df_trimmed_librosa['subset'] == 'train') &
                         (df_trimmed_librosa['label'] == 'spoof')]
    for index, row in tqdm(real_train.iterrows(), total=real_train.shape[0]):
        f.write(f'{row["file_name"]}\n')


with open('./scp_librosa_trim/spoof_dev.lst', 'w') as f:
    real_dev = df_trimmed_librosa[(df_trimmed_librosa['subset'] == 'dev') &
                       (df_trimmed_librosa['label'] == 'spoof')]
    for index, row in tqdm(real_dev.iterrows(), total=real_dev.shape[0]):
        f.write(f'{row["file_name"]}\n')


with open('./scp_librosa_trim/eval.lst', 'w') as f:
    full_eval = df_trimmed_librosa[df_trimmed_librosa['subset'] == 'eval']
    for index, row in tqdm(full_eval.iterrows(), total=full_eval.shape[0]):
        f.write(f'{row["file_name"]}\n')

  0%|          | 0/9850 [00:00<?, ?it/s]

100%|██████████| 9850/9850 [00:00<00:00, 33045.51it/s]
100%|██████████| 9787/9787 [00:00<00:00, 29515.01it/s]
100%|██████████| 19023/19023 [00:00<00:00, 38587.49it/s]
100%|██████████| 16185/16185 [00:00<00:00, 34050.36it/s]
100%|██████████| 134441/134441 [00:03<00:00, 37602.80it/s]


# Add VAD trimmed

In [None]:
import pandas as pd
import os
import re
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from functools import partial
import numpy as np


def get_base_filename(filename):
    # Extract base filename (number before '___')
    match = re.search(r'(\d+)(?:___\d+.*)?\.wav$', os.path.basename(filename))
    if match:
        return f"{os.path.dirname(filename)}/{match.group(1)}.wav"
    return filename


def is_speech_segment(filename):
    # Check if the file is a speech segment (no 'residual' or 'no_speech' in name)
    basename = os.path.basename(filename)
    if '___' not in basename:
        return True
    return 'residual' not in basename and 'no_speech' not in basename


def process_file_chunk(files, trim_folder):
    speech_segments = {}
    for file in files:
        if is_speech_segment(file):
            base_file = get_base_filename(file)
            if base_file not in speech_segments:
                speech_segments[base_file] = []
            speech_segments[base_file].append(file)
    return speech_segments


def create_new_protocol(meta_data_path, trim_folder, n_workers=None):
    if n_workers is None:
        n_workers = cpu_count() - 1  # Leave one CPU free

    # Read original protocol
    print("Reading protocol file...")
    meta_data = pd.read_csv(meta_data_path, sep=' ', header=None)
    meta_data.columns = ['file_name', 'subset', 'label']

    # Get all files from trim folder
    print("Scanning trim folder...")
    all_files = []
    for root, _, files in tqdm(list(os.walk(trim_folder)), desc="Scanning directories"):
        for file in files:
            if file.endswith('.wav'):
                all_files.append(os.path.relpath(
                    os.path.join(root, file), trim_folder))

    # Split files into chunks for parallel processing
    chunk_size = len(all_files) // n_workers + 1
    file_chunks = np.array_split(all_files, n_workers)

    # Process chunks in parallel
    print(f"Processing files using {n_workers} workers...")
    with Pool(n_workers) as pool:
        partial_process = partial(process_file_chunk, trim_folder=trim_folder)
        results = list(tqdm(
            pool.imap(partial_process, file_chunks),
            total=len(file_chunks),
            desc="Processing file chunks"
        ))

    # Merge results from all workers
    speech_segments = {}
    for result in results:
        for base_file, segments in result.items():
            if base_file not in speech_segments:
                speech_segments[base_file] = []
            speech_segments[base_file].extend(segments)

    # Create new protocol entries
    print("Creating new protocol entries...")
    new_entries = []
    for _, row in tqdm(meta_data.iterrows(), total=len(meta_data), desc="Creating entries"):
        original_file = row['file_name']
        base_file = get_base_filename(original_file)

        if base_file in speech_segments:
            for segment in speech_segments[base_file]:
                new_entries.append({
                    'file_name': segment,
                    'subset': row['subset'],
                    'label': row['label']
                })

    # Create new protocol DataFrame
    print("Creating final DataFrame...")
    new_protocol = pd.DataFrame(new_entries)
    print(f"Original protocol size: {len(meta_data)}")
    print(f"New protocol size: {len(new_protocol)}")
    return new_protocol


# Usage example
meta_data_path = original_protocol_file_path
trim_folder = os.path.join(LARGE_CORPUS_FOLDER, 'trim')

# You can specify the number of workers, or let it use (CPU cores - 1)
new_protocol = create_new_protocol(meta_data_path, trim_folder, n_workers=20)
new_protocol.head()
# new_protocol.to_csv('trim_protocol.txt', sep='\t', index=False)
new_protocol['file_name'] = new_protocol['file_name'].apply(lambda x: os.path.join('trim', x))

Reading protocol file...
Scanning trim folder...


Scanning directories: 100%|██████████| 367/367 [00:03<00:00, 110.99it/s]

Processing files using 20 workers...



Processing file chunks: 100%|██████████| 20/20 [00:01<00:00, 14.45it/s]


Creating new protocol entries...


Creating entries: 100%|██████████| 189286/189286 [01:00<00:00, 3134.22it/s] 


Creating final DataFrame...
Original protocol size: 189286
New protocol size: 1891235


In [7]:
df_concat.groupby(['subset', 'label']).size()

subset  label   
dev     bonafide     19574
        spoof        32370
eval    bonafide    135954
        spoof       132928
train   bonafide     19700
        spoof        38046
dtype: int64

In [5]:
# Concatemate df_trimmed_librosa and df
df_concat2 = pd.concat([df_concat, new_protocol], axis=0)

# save df_concat2
df_concat2.to_csv('new_protocol.txt', sep=' ', index=False, header=False)

In [6]:

df_concat2 = pd.read_csv('new_protocol.txt', sep=' ', header=None)
df_concat2.columns = ['file_name', 'subset', 'label']

# Check number of spoof and bonafide samples each subset
df_concat2.groupby(['subset', 'label']).size()



subset  label   
dev     bonafide     1784317
        spoof         141749
eval    bonafide    14488209
        spoof         909166
train   bonafide     1792113
        spoof         175372
dtype: int64

# Vocoder add

In [9]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

vocoded_list = ['hifigan', 'hn-sinc-nsf-hifi', 'waveglow']
vocoded_dict = {}

print("Scanning vocoded folders...")

for vocoder in vocoded_list:
    print(f"\nProcessing {vocoder}...")
    vocoder_folder = os.path.join(LARGE_CORPUS_FOLDER, 'vocoded', vocoder)
    all_files = []

    # Collect all files with correct relative paths
    for root, _, files in tqdm(list(os.walk(vocoder_folder)), desc="Scanning directories"):
        for file in files:
            if file.endswith('.wav'):
                # Create path in format: vocoded/<vocoder_type>/filename.wav
                file_path = os.path.join('vocoded', vocoder, file)
                all_files.append(file_path)

    print(f"Total files found for {vocoder}: {len(all_files)}")

    if len(all_files) > 10000:
        print(f"Randomly selecting 10000 files from {len(all_files)} files")
        all_files = np.random.choice(all_files, 10000, replace=False)
    else:
        print(f"Using all {len(all_files)} files (less than 10000)")

    vocoded_dict[vocoder] = sorted(all_files)  # Sort for consistency

# Print example paths for verification
print("\nExample paths from each vocoder:")
for vocoder in vocoded_list:
    print(f"\n{vocoder} first 5 files:")
    for file in vocoded_dict[vocoder][:5]:
        print(f"  {file}")

# Return first 10 files from hifigan for the original request
vocoded_dict['hifigan'][:10]

Scanning vocoded folders...

Processing hifigan...


Scanning directories: 100%|██████████| 1/1 [00:00<00:00,  5.59it/s]


Total files found for hifigan: 150591
Randomly selecting 10000 files from 150591 files

Processing hn-sinc-nsf-hifi...


Scanning directories: 100%|██████████| 1/1 [00:00<00:00,  6.67it/s]


Total files found for hn-sinc-nsf-hifi: 150591
Randomly selecting 10000 files from 150591 files

Processing waveglow...


Scanning directories: 100%|██████████| 1/1 [00:00<00:00,  6.57it/s]


Total files found for waveglow: 150591
Randomly selecting 10000 files from 150591 files

Example paths from each vocoder:

hifigan first 5 files:
  vocoded/hifigan/01FHSH0028_00212.wav
  vocoded/hifigan/01FHSH0028_00274.wav
  vocoded/hifigan/01FHSH0028_00309.wav
  vocoded/hifigan/01FHSH0028_00353.wav
  vocoded/hifigan/01FHSH0028_00450.wav

hn-sinc-nsf-hifi first 5 files:
  vocoded/hn-sinc-nsf-hifi/01FHSH0028_00221.wav
  vocoded/hn-sinc-nsf-hifi/01FHSH0028_00397.wav
  vocoded/hn-sinc-nsf-hifi/01FHSH0028_00684.wav
  vocoded/hn-sinc-nsf-hifi/01MLD00033_00763.wav
  vocoded/hn-sinc-nsf-hifi/01MLD00033_00948.wav

waveglow first 5 files:
  vocoded/waveglow/01FHSH0028_00240.wav
  vocoded/waveglow/01FHSH0028_00263.wav
  vocoded/waveglow/01MPJH0028_00196.wav
  vocoded/waveglow/01MPJH0028_00213.wav
  vocoded/waveglow/01MPJH0028_00258.wav


['vocoded/hifigan/01FHSH0028_00212.wav',
 'vocoded/hifigan/01FHSH0028_00274.wav',
 'vocoded/hifigan/01FHSH0028_00309.wav',
 'vocoded/hifigan/01FHSH0028_00353.wav',
 'vocoded/hifigan/01FHSH0028_00450.wav',
 'vocoded/hifigan/01FHSH0028_00782.wav',
 'vocoded/hifigan/01FHSH0028_01001.wav',
 'vocoded/hifigan/01MLD00033_00983.wav',
 'vocoded/hifigan/01MPJH0028_00196.wav',
 'vocoded/hifigan/01MPJH0028_00258.wav']

In [10]:
# Randomly pick 50% of the files from each vocoder for train 50% for dev 
# label them as spoof and concat them with df_concat

# Create new protocol entries
print("Creating new protocol entries...")
new_entries = []
for vocoder, files in vocoded_dict.items():
    for file in tqdm(files, desc=f"Processing {vocoder}"):
        new_entries.append({
            'file_name': file,
            'subset': 'train',
            'label': 'spoof'
        })

# Create new protocol DataFrame
print("Creating final DataFrame...")
new_protocol_vocoded = pd.DataFrame(new_entries)
print(f"Original protocol size: {len(df_concat)}")
print(f"New protocol size: {len(new_protocol_vocoded)}")

# Concatemate df_concat and new_protocol_vocoded
df_concat3 = pd.concat([df_concat, new_protocol_vocoded], axis=0)

# save df_concat3
df_concat3.to_csv('new_protocol_trim_vocoded_v2.txt', sep=' ', index=False, header=False)

Creating new protocol entries...


Processing hifigan: 100%|██████████| 10000/10000 [00:00<00:00, 1463061.25it/s]
Processing hn-sinc-nsf-hifi: 100%|██████████| 10000/10000 [00:00<00:00, 1650780.86it/s]
Processing waveglow: 100%|██████████| 10000/10000 [00:00<00:00, 1680208.31it/s]

Creating final DataFrame...
Original protocol size: 218159
New protocol size: 30000





In [11]:

df_concat3 = pd.read_csv('new_protocol_trim_vocoded_v2.txt', sep=' ', header=None)
df_concat3.columns = ['file_name', 'subset', 'label']

# Check number of spoof and bonafide samples each subset
df_concat3.groupby(['subset', 'label']).size()

subset  label   
dev     bonafide     9787
        spoof       16185
eval    bonafide    67977
        spoof       66464
train   bonafide    19700
        spoof       68046
dtype: int64