In [7]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m161.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.7.31-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m568.4 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.3 (from transformers)


In [1]:
import os

# Đường dẫn thư mục data
path = "data"

# Đếm số folder con
num_dirs = sum(1 for item in os.listdir(path) if os.path.isdir(os.path.join(path, item)))
print("Số thư mục trong data:", num_dirs)


Số thư mục trong data: 816


In [9]:
import os

base_dir = "data"
output_file = "full_path.txt"

with open(output_file, "w") as f:
    # Duyệt qua từng speaker folder (idxxxxx)
    for speaker in sorted(os.listdir(base_dir)):
        speaker_path = os.path.join(base_dir, speaker)
        if not os.path.isdir(speaker_path):
            continue
        
        # Duyệt qua nhãn (spoof, bonafide)
        for label in ["spoof", "bonafide"]:
            label_path = os.path.join(speaker_path, label)
            if not os.path.isdir(label_path):
                continue
            
            # Duyệt qua từng file audio
            for wav in sorted(os.listdir(label_path)):
                if wav.endswith(".wav"):
                    rel_path = os.path.join(speaker, label, wav)  # relative path
                    line = f"{speaker} {rel_path} {label}\n"
                    f.write(line)

print(f"Tạo file {output_file} thành công ✅")


Tạo file full_path.txt thành công ✅


In [6]:
import random
from collections import defaultdict
import itertools

def make_train_val_pairs(all_file, train_file="train.txt", val_file="val.txt", val_pairs_file="val_pairs.txt",
                         split_ratio=0.8, max_pairs=50000, seed=42):
    random.seed(seed)

    # B1: Gom file theo speaker, chỉ lấy bonafide (label == "0")
    speaker2lines = defaultdict(list)
    with open(all_file, "r") as f:
        for line in f:
            spk, path, label = line.strip().split()
            if label == "bonafide":  # chỉ giữ bonafide
                speaker2lines[spk].append((spk, path, label))

    speakers = list(speaker2lines.keys())
    random.shuffle(speakers)

    # B2: Chia speaker train/val
    n_train = int(len(speakers) * split_ratio)
    train_speakers = set(speakers[:n_train])
    val_speakers = set(speakers[n_train:])

    # B3: Xuất train.txt và val.txt (val chỉ giữ file chứa "orig")
    with open(train_file, "w") as f_train, open(val_file, "w") as f_val:
        for spk in train_speakers:
            for entry in speaker2lines[spk]:
                f_train.write(" ".join(entry) + "\n")
        for spk in val_speakers:
            for entry in speaker2lines[spk]:
                if "orig" in entry[1]:  # chỉ lấy file có 'orig' trong tên
                    f_val.write(" ".join(entry) + "\n")

    print(f"✅ Train: {len(train_speakers)} speakers → {train_file}")
    print(f"✅ Val:   {len(val_speakers)} speakers (only bonafide 'orig' files) → {val_file}")

    # B4: Sinh val pairs (giới hạn 20k cặp, chỉ bonafide)
    val_files_by_spk = {spk: [path for _, path, _ in lines if "orig" in path]
                        for spk, lines in speaker2lines.items() if spk in val_speakers}
    val_speakers_list = [spk for spk, files in val_files_by_spk.items() if len(files) > 0]

    # Positive pairs
    pos_pairs = []
    for spk, files in val_files_by_spk.items():
        if len(files) < 2:
            continue
        pos_pairs.extend([(f1, f2, 1) for f1, f2 in itertools.combinations(files, 2)])

    random.shuffle(pos_pairs)
    n_pos = min(len(pos_pairs), max_pairs // 2)  # 10k positive
    val_pairs = pos_pairs[:n_pos]

    # Negative pairs
    n_neg = n_pos
    while len(val_pairs) < n_pos + n_neg and len(val_speakers_list) >= 2:
        spk1, spk2 = random.sample(val_speakers_list, 2)
        f1 = random.choice(val_files_by_spk[spk1])
        f2 = random.choice(val_files_by_spk[spk2])
        val_pairs.append((f1, f2, 0))

    random.shuffle(val_pairs)

    # Xuất val_pairs.txt
    with open(val_pairs_file, "w") as f:
        for f1, f2, label in val_pairs:
            f.write(f"{f1} {f2} {label}\n")

    print(f"✅ Val pairs: {len(val_pairs)} (≈{n_pos} pos + {n_neg} neg, only bonafide) → {val_pairs_file}")


In [7]:
make_train_val_pairs("full_path.txt")

✅ Train: 652 speakers → train.txt
✅ Val:   163 speakers (only bonafide 'orig' files) → val.txt
✅ Val pairs: 50000 (≈25000 pos + 25000 neg, only bonafide) → val_pairs.txt


In [3]:
import random
from collections import defaultdict, Counter

random.seed(42)

# === 1. Load ASV speaker IDs đã dùng (loại bỏ khỏi cohort) ===
asv_ids = set()
with open("./path_list/train_asv.txt", "r") as f:
    for line in f:
        spk_id = line.strip().split()[0]
        asv_ids.add(spk_id)

# === 2. Load metadata và tách bonafide / spoof ===
spk2bonafide = defaultdict(list)
spk2spoof = defaultdict(list)

with open("./path_list/train_vlsp_2025_metadata.txt", "r") as f:
    for line in f:
        spk, path, label = line.strip().split()
        if spk in asv_ids:
            continue
        if label == "bonafide":
            spk2bonafide[spk].append(path)
        elif label == "spoof":
            spk2spoof[spk].append(path)

speakers = list(spk2bonafide.keys())
print(f"✅ Số speaker còn lại sau khi loại ASV: {len(speakers)}")

output_file = "sasv_binary_trials_3labels.txt"

# === 3. Tạo target trials (cùng speaker, đều bonafide) ===
target_goal = 500000
max_enroll_usage_target = 30
enroll_usage_target = Counter()
target_count = 0

with open(output_file, "w") as f:
    for spk in speakers:
        utts = spk2bonafide[spk]
        if len(utts) < 2:
            continue

        pairs = [(a, b) for i, a in enumerate(utts) for b in utts[i + 1:]]
        random.shuffle(pairs)

        for a, b in pairs:
            if enroll_usage_target[a] >= max_enroll_usage_target:
                continue
            f.write(f"{a} {b} target\n")
            enroll_usage_target[a] += 1
            target_count += 1
            if target_count >= target_goal:
                break
        if target_count >= target_goal:
            break

print(f"✅ Đã sinh {target_count} target pairs")

# === 4. Nontarget trials (khác speaker, đều bonafide) ===
required_nontarget = target_count  # bạn có thể set tỉ lệ khác nếu muốn
nontarget_count = 0
max_enroll_usage_nt = 15
enroll_usage_nontarget = Counter()

with open(output_file, "a") as f:
    for i in range(len(speakers)):
        for j in range(i + 1, len(speakers)):
            u1_list = spk2bonafide[speakers[i]]
            u2_list = spk2bonafide[speakers[j]]
            random.shuffle(u1_list)
            random.shuffle(u2_list)
            for u1 in u1_list:
                if enroll_usage_nontarget[u1] >= max_enroll_usage_nt:
                    continue
                for u2 in u2_list:
                    f.write(f"{u1} {u2} nontarget\n")
                    enroll_usage_nontarget[u1] += 1
                    nontarget_count += 1
                    if nontarget_count >= required_nontarget:
                        break
                if nontarget_count >= required_nontarget:
                    break
            if nontarget_count >= required_nontarget:
                break
        if nontarget_count >= required_nontarget:
            break

print(f"✅ Đã sinh {nontarget_count} nontarget pairs")

# === 5. Spoof trials (file verification là spoof) ===
# a) bonafide–spoof cùng speaker
spoof_count = 0
max_enroll_usage_spoof = 10
enroll_usage_spoof = Counter()

with open(output_file, "a") as f:
    for spk in speakers:
        for b in spk2bonafide[spk]:
            if enroll_usage_spoof[b] >= max_enroll_usage_spoof:
                continue
            for s in spk2spoof.get(spk, []):
                f.write(f"{b} {s} spoof\n")
                enroll_usage_spoof[b] += 1
                spoof_count += 1

    # b) bonafide–spoof khác speaker
    for spk1 in speakers:
        for spk2 in spk2spoof:
            if spk1 == spk2:
                continue
            b_list = spk2bonafide[spk1]
            s_list = spk2spoof[spk2]
            if not s_list:
                continue
            random.shuffle(b_list)
            for b in b_list:
                if enroll_usage_spoof[b] >= max_enroll_usage_spoof:
                    continue
                s = random.choice(s_list)
                f.write(f"{b} {s} spoof\n")
                enroll_usage_spoof[b] += 1
                spoof_count += 1

print(f"✅ Đã sinh {spoof_count} spoof pairs")
print(f"▶ Target: {target_count}")
print(f"▶ Nontarget: {nontarget_count}")
print(f"▶ Spoof: {spoof_count}")
print(f"▶ Tổng cộng: {target_count + nontarget_count + spoof_count}")


✅ Số speaker còn lại sau khi loại ASV: 163
✅ Đã sinh 336913 target pairs
✅ Đã sinh 336913 nontarget pairs
✅ Đã sinh 529603 spoof pairs
▶ Target: 336913
▶ Nontarget: 336913
▶ Spoof: 529603
▶ Tổng cộng: 1203429
