## This notebook updates the `protocol.txt` files of the following datasets so that their format matches the protocol files of the [Hugging Face Speech-DF-Arena](https://huggingface.co/spaces/Speech-Arena-2025/Speech-DF-Arena) benchmark

List of datasets:
- ASVSpoof 2019 eval 
- ASVSpoof 2021 LA eval
- ASVSpoof 2021 DF eval
- ASVSpoof 2024 eval
- SONAR
- Codecfake
- Fake or Real
- In the Wild

In [2]:
import pandas as pd
import numpy as np
import os
import sys

### ASVSpoof 2019 eval

In [14]:
# Load the Speech-DF-Arena (Hugging Face) protocol for ASVspoof 2019.
# NOTE(review): read_csv treats the first line of the file as a header by
# default -- confirm asvspoof_2019.csv really starts with a header row.
hf_protocol_dir = "/nvme1/hungdx/Lightning-hydra/data/speech_df_arena/protocol_files"
hf_asv19 = pd.read_csv(os.path.join(hf_protocol_dir, "asvspoof_2019.csv"), sep=",")

# Rename the two columns: 'utt' holds the audio path, 'label' the class.
hf_asv19.columns = ["utt", "label"]

# Display the frame as the cell output.
hf_asv19

Unnamed: 0,utt,label
0,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,spoof
1,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,spoof
2,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,spoof
3,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,spoof
4,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,spoof
...,...,...
71232,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,bonafide
71233,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,spoof
71234,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,spoof
71235,/data/data/AntiSpoofing-Datasets/ASVSpoof2019_...,bonafide


In [15]:
# Load the local ASVspoof 2019 LA eval protocol; the file is space-separated
# and is read without a header row.
current_asv19 = pd.read_csv(
    "/nvme1/hungdx/Lightning-hydra/data/huggingface_benchrmark_Speech-DF-Arena/ASVspoof2019_LA_eval/protocol.txt",
    sep=" ",
    header=None,
)

# Name the three positional columns.
current_asv19.columns = ["utt", "subset", "label"]

# Display the frame as the cell output.
current_asv19

Unnamed: 0,utt,subset,label
0,flac/LA_E_8877452.flac,eval,spoof
1,flac/LA_E_6828287.flac,eval,spoof
2,flac/LA_E_6977360.flac,eval,spoof
3,flac/LA_E_5932896.flac,eval,spoof
4,flac/LA_E_5849185.flac,eval,bonafide
...,...,...,...
71231,flac/LA_E_1665632.flac,eval,bonafide
71232,flac/LA_E_5085671.flac,eval,spoof
71233,flac/LA_E_4926022.flac,eval,spoof
71234,flac/LA_E_2894498.flac,eval,bonafide


In [None]:
# Reconcile the local ASVspoof 2019 LA eval protocol (current_asv19) with the
# Speech-DF-Arena protocol (hf_asv19): compare the two lists by file name and
# append every utterance that only appears in the Hugging Face list.

# Both frames must expose a 'utt' column for the comparison below.
if 'utt' not in current_asv19.columns or 'utt' not in hf_asv19.columns:
    raise ValueError("Column 'utt' not found in one or both DataFrames")

# Reduce every path to its final component (the file name) so that entries
# with different directory prefixes can be compared. Non-string values are
# passed through unchanged. Note that this rewrites 'utt' in both frames.
current_asv19['utt'] = current_asv19['utt'].apply(lambda x: x.split('/')[-1] if isinstance(x, str) else x)
hf_asv19['utt'] = hf_asv19['utt'].apply(lambda x: x.split('/')[-1] if isinstance(x, str) else x)

# File names listed by Hugging Face but absent from the local protocol.
missing_lines = set(hf_asv19['utt']) - set(current_asv19['utt'])
print(f"Missing lines in current_asv19: {missing_lines}")

# Append the missing rows to current_asv19.
if missing_lines:
    # hf_asv19 only has 'utt' and 'label'; fill 'subset' explicitly, otherwise
    # pd.concat would leave NaN in that column for the appended rows.
    missing_rows = hf_asv19[hf_asv19['utt'].isin(missing_lines)].assign(subset='eval')
    current_asv19 = pd.concat([current_asv19, missing_rows], ignore_index=True)
else:
    print("No missing lines to add")

# Report the resulting size as a quick sanity check.
print(f"Updated current_asv19 shape: {current_asv19.shape}")

# The updated frame is not written to disk by this cell -- add the missing
# lines to protocol.txt manually.

Missing lines in current_asv19: {'LA_E_2834763.flac'}
Updated current_asv19 shape: (71237, 3)


### ASVSpoof 2021 LA eval

In [None]:
# Convert the Speech-DF-Arena protocol for ASVspoof 2021 LA into a
# space-separated "utt subset label" file.
hf_protocol_dir = "/nvme1/hungdx/Lightning-hydra/data/speech_df_arena/protocol_files"
hf_asvla21 = pd.read_csv(os.path.join(hf_protocol_dir, "asvspoof_2021_la.csv"), sep=",")
hf_asvla21.columns = ["utt", "label"]

hf_asvla21 = hf_asvla21.assign(
    # Keep only the file name; non-string values pass through unchanged.
    utt=hf_asvla21["utt"].apply(
        lambda path: path.rsplit("/", 1)[-1] if isinstance(path, str) else path
    ),
    # Every row is tagged with the 'eval' subset.
    subset="eval",
)[["utt", "subset", "label"]]

# Write the protocol without index and without a header row.
hf_asvla21.to_csv("asvspoof_2021_la.txt", sep=" ", index=False, header=None)


### ASVSpoof 2021 DF eval

In [22]:
# Convert the Speech-DF-Arena protocol for ASVspoof 2021 DF into a
# space-separated "utt subset label" file.
hf_protocol_dir = "/nvme1/hungdx/Lightning-hydra/data/speech_df_arena/protocol_files"
hf_asvdf21 = pd.read_csv(os.path.join(hf_protocol_dir, "asvspoof_2021_df.csv"), sep=",")
hf_asvdf21.columns = ["utt", "label"]

hf_asvdf21 = hf_asvdf21.assign(
    # Keep only the file name; non-string values pass through unchanged.
    utt=hf_asvdf21["utt"].apply(
        lambda path: path.rsplit("/", 1)[-1] if isinstance(path, str) else path
    ),
    # Every row is tagged with the 'eval' subset.
    subset="eval",
)[["utt", "subset", "label"]]

# Write the protocol without index and without a header row.
hf_asvdf21.to_csv("asvspoof_2021_df.txt", sep=" ", index=False, header=None)


### ASVSpoof 2024 eval
The existing protocol file is already correct; no update is needed.

### SONAR

In [5]:
# Convert the Speech-DF-Arena protocol for SONAR into a space-separated
# "utt subset label" file.
hf_protocol_dir = "/nvme1/hungdx/Lightning-hydra/data/speech_df_arena/protocol_files"
hf_sonar = pd.read_csv(os.path.join(hf_protocol_dir, "sonar.csv"), sep=",")
hf_sonar.columns = ["utt", "label"]

hf_sonar = hf_sonar.assign(
    # Drop the dataset root so the path is relative to the SONAR directory;
    # non-string values pass through unchanged.
    utt=hf_sonar["utt"].apply(
        lambda path: path.split("/data/data/AntiSpoofing-Datasets/SONAR/SONAR_dataset/")[-1]
        if isinstance(path, str)
        else path
    ),
    # Every row of hf_sonar is tagged with the 'eval' subset.
    subset="eval",
)[["utt", "subset", "label"]]

# Write the protocol without index and without a header row.
hf_sonar.to_csv("sonar.txt", sep=" ", index=False, header=None)

### Codecfake

In [6]:
# Convert the Speech-DF-Arena protocol for Codecfake into a space-separated
# "utt subset label" file.
hf_protocol_dir = "/nvme1/hungdx/Lightning-hydra/data/speech_df_arena/protocol_files"
hf_codecfake = pd.read_csv(os.path.join(hf_protocol_dir, "codecfake.csv"), sep=",")
hf_codecfake.columns = ["utt", "label"]

hf_codecfake = hf_codecfake.assign(
    # Drop the dataset root so the path is relative to the CodecFake
    # directory; non-string values pass through unchanged.
    utt=hf_codecfake["utt"].apply(
        lambda path: path.split("/data/data/AntiSpoofing-Datasets/CodecFake/")[-1]
        if isinstance(path, str)
        else path
    ),
    # Every row of hf_codecfake is tagged with the 'eval' subset.
    subset="eval",
)[["utt", "subset", "label"]]

# Write the protocol without index and without a header row.
hf_codecfake.to_csv("codecfake.txt", sep=" ", index=False, header=None)


### Fake or Real

In [7]:
# Convert the Speech-DF-Arena protocol for Fake or Real into a
# space-separated "utt subset label" file.
hf_protocol_dir = "/nvme1/hungdx/Lightning-hydra/data/speech_df_arena/protocol_files"
hf_fake_or_real = pd.read_csv(os.path.join(hf_protocol_dir, "fake_or_real.csv"), sep=",")
hf_fake_or_real.columns = ["utt", "label"]

hf_fake_or_real = hf_fake_or_real.assign(
    # Drop the dataset root so the path is relative to the for-norm
    # directory; non-string values pass through unchanged.
    utt=hf_fake_or_real["utt"].apply(
        lambda path: path.split("/data/data/AntiSpoofing-Datasets/for-norm/")[-1]
        if isinstance(path, str)
        else path
    ),
    # Every row of hf_fake_or_real is tagged with the 'eval' subset.
    subset="eval",
)[["utt", "subset", "label"]]

# Write the protocol without index and without a header row.
hf_fake_or_real.to_csv("fake_or_real.txt", sep=" ", index=False, header=None)


### In the Wild

In [8]:
# Convert the Speech-DF-Arena protocol for In the Wild into a
# space-separated "utt subset label" file.
hf_protocol_dir = "/nvme1/hungdx/Lightning-hydra/data/speech_df_arena/protocol_files"
hf_in_the_wild = pd.read_csv(os.path.join(hf_protocol_dir, "in_the_wild.csv"), sep=",")
hf_in_the_wild.columns = ["utt", "label"]

hf_in_the_wild = hf_in_the_wild.assign(
    # Drop the dataset root so the path is relative to the In_Wild
    # directory; non-string values pass through unchanged.
    utt=hf_in_the_wild["utt"].apply(
        lambda path: path.split("/data/data/AntiSpoofing-Datasets/In_Wild/")[-1]
        if isinstance(path, str)
        else path
    ),
    # Every row of hf_in_the_wild is tagged with the 'eval' subset.
    subset="eval",
)[["utt", "subset", "label"]]

# Write the protocol without index and without a header row.
hf_in_the_wild.to_csv("in_the_wild.txt", sep=" ", index=False, header=None)
