In [7]:
# Install the `transformers` package with pip via a shell escape.
# NOTE(review): none of the cells visible here import `transformers` — confirm it is still needed.
!pip install transformers

Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m161.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.7.31-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m568.4 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.3 (from transformers)


In [1]:
import os

# Location of the data directory
path = "data"

# Count the immediate children of `path` that are directories
subfolders = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
num_dirs = len(subfolders)
print("Số thư mục trong data:", num_dirs)


Số thư mục trong data: 816


In [9]:
import os

base_dir = "data"
output_file = "full_path.txt"

with open(output_file, "w") as f:
    # Visit every speaker folder under base_dir in sorted order
    for speaker in sorted(os.listdir(base_dir)):
        if not os.path.isdir(os.path.join(base_dir, speaker)):
            continue

        # Each speaker may contain one sub-folder per label (spoof first, then bonafide)
        for label in ("spoof", "bonafide"):
            label_dir = os.path.join(base_dir, speaker, label)
            if not os.path.isdir(label_dir):
                continue

            # Keep only the .wav files, in sorted order
            wav_names = sorted(name for name in os.listdir(label_dir) if name.endswith(".wav"))

            # One row per file: "<speaker> <path relative to base_dir> <label>"
            f.writelines(
                f"{speaker} {os.path.join(speaker, label, name)} {label}\n"
                for name in wav_names
            )

print(f"Tạo file {output_file} thành công ✅")


Tạo file full_path.txt thành công ✅


In [6]:
import random
from collections import defaultdict
import itertools

def make_train_val_pairs(all_file, train_file="train.txt", val_file="val.txt", val_pairs_file="val_pairs.txt",
                         split_ratio=0.8, max_pairs=50000, seed=42):
    """Split bonafide utterances by speaker into train/val lists and build validation pairs.

    ``all_file`` holds one whitespace-separated ``speaker path label`` row per
    utterance. Only rows whose label is ``bonafide`` are used. Speakers are
    shuffled with ``seed``; the first ``split_ratio`` fraction becomes the
    training set and the remainder the validation set. For validation speakers
    only paths containing ``"orig"`` are kept.

    Three files are written:

    * ``train_file``: every bonafide row of the training speakers.
    * ``val_file``: the bonafide ``orig`` rows of the validation speakers.
    * ``val_pairs_file``: ``path1 path2 label`` rows, label ``1`` for a
      same-speaker pair and ``0`` for a different-speaker pair.

    Args:
        all_file: Path of the input metadata file.
        train_file: Output path for the training list.
        val_file: Output path for the validation list.
        val_pairs_file: Output path for the validation pairs.
        split_ratio: Fraction of speakers assigned to training.
        max_pairs: Upper bound on the number of pairs; at most half of it is
            used for positive pairs and the same number of negatives is drawn.
        seed: Seed passed to ``random.seed`` (this reseeds the global RNG).

    Raises:
        ValueError: If a non-blank row does not have exactly three fields.
    """
    random.seed(seed)

    # Step 1: group rows by speaker, keeping only the bonafide ones.
    speaker2lines = defaultdict(list)
    with open(all_file, "r") as f:
        for lineno, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank rows (e.g. a trailing newline) instead of crashing.
                continue
            if len(fields) != 3:
                raise ValueError(
                    f"{all_file}:{lineno}: expected 'speaker path label', got {line!r}"
                )
            spk, path, label = fields
            if label == "bonafide":
                speaker2lines[spk].append((spk, path, label))

    speakers = list(speaker2lines.keys())
    random.shuffle(speakers)

    # Step 2: split speakers into train / val.
    # Lists are kept (not sets) so the order rows are written in depends only
    # on the seed and the input order, never on string hash randomisation.
    n_train = int(len(speakers) * split_ratio)
    train_speakers = speakers[:n_train]
    val_speakers = speakers[n_train:]
    val_speaker_set = set(val_speakers)  # membership tests only

    # Step 3: write train.txt and val.txt (val keeps only paths containing "orig").
    with open(train_file, "w") as f_train, open(val_file, "w") as f_val:
        for spk in train_speakers:
            for entry in speaker2lines[spk]:
                f_train.write(" ".join(entry) + "\n")
        for spk in val_speakers:
            for entry in speaker2lines[spk]:
                if "orig" in entry[1]:
                    f_val.write(" ".join(entry) + "\n")

    print(f"✅ Train: {len(train_speakers)} speakers → {train_file}")
    print(f"✅ Val:   {len(val_speakers)} speakers (only bonafide 'orig' files) → {val_file}")

    # Step 4: build validation pairs (at most max_pairs, bonafide only).
    # Iterating speaker2lines keeps input-file order, so sampling stays reproducible.
    val_files_by_spk = {
        spk: [path for _, path, _ in lines if "orig" in path]
        for spk, lines in speaker2lines.items()
        if spk in val_speaker_set
    }
    # Only speakers that still have at least one file can take part in negatives.
    val_speakers_list = [spk for spk, files in val_files_by_spk.items() if len(files) > 0]

    # Positive pairs: every unordered pair of files from the same speaker.
    pos_pairs = []
    for files in val_files_by_spk.values():
        if len(files) < 2:
            continue
        pos_pairs.extend([(f1, f2, 1) for f1, f2 in itertools.combinations(files, 2)])

    random.shuffle(pos_pairs)
    n_pos = min(len(pos_pairs), max_pairs // 2)
    val_pairs = pos_pairs[:n_pos]

    # Negative pairs: as many as positives, one random file from each of two
    # distinct speakers. Skipped entirely when fewer than two speakers qualify.
    n_neg = 0
    if len(val_speakers_list) >= 2:
        for _ in range(n_pos):
            spk1, spk2 = random.sample(val_speakers_list, 2)
            f1 = random.choice(val_files_by_spk[spk1])
            f2 = random.choice(val_files_by_spk[spk2])
            val_pairs.append((f1, f2, 0))
            n_neg += 1

    random.shuffle(val_pairs)

    # Write val_pairs.txt
    with open(val_pairs_file, "w") as f:
        for f1, f2, label in val_pairs:
            f.write(f"{f1} {f2} {label}\n")

    # n_neg is the number of negatives actually generated, not just the target.
    print(f"✅ Val pairs: {len(val_pairs)} (≈{n_pos} pos + {n_neg} neg, only bonafide) → {val_pairs_file}")


In [7]:
# Build train.txt, val.txt and val_pairs.txt from full_path.txt using the function's default settings.
make_train_val_pairs("full_path.txt")

✅ Train: 652 speakers → train.txt
✅ Val:   163 speakers (only bonafide 'orig' files) → val.txt
✅ Val pairs: 50000 (≈25000 pos + 25000 neg, only bonafide) → val_pairs.txt


In [5]:
# File listing the speakers that were used to train the ASV model
asv_train_file = "train.txt"
asv_ids = set()
with open(asv_train_file, "r") as f:
    # The first column of every row is the speaker id
    asv_ids.update(row.strip().split()[0] for row in f)

print("Số speaker đã có trong ASV train:", len(asv_ids))


Số speaker đã có trong ASV train: 652


In [7]:
# Full metadata file: one "speaker path label" row per utterance
full_file = "full_path.txt"
spk2utts = {}

with open(full_file, "r") as f:
    for row in f:
        spk, path, label = row.strip().split()
        # Skip speakers that already appear in the ASV training list
        if spk in asv_ids:
            continue
        if spk not in spk2utts:
            spk2utts[spk] = []
        spk2utts[spk].append((path, label))

print("Số speaker còn lại:", len(spk2utts))


Số speaker còn lại: 163


In [12]:
import random
from collections import defaultdict

random.seed(42)

# 1. Collect the speaker ids that appear in the ASV training list
#    (first column of every row in train.txt).
asv_ids = set()
with open("train.txt", "r") as f:
    asv_ids.update(row.strip().split()[0] for row in f)

print(f"✅ Speakers trong ASV train: {len(asv_ids)}")

# 2. Read the full metadata and keep only speakers that are NOT in the ASV
#    training list, bucketed by label.
spk2bonafide = defaultdict(list)
spk2spoof = defaultdict(list)
label_buckets = {"bonafide": spk2bonafide, "spoof": spk2spoof}

with open("full_path.txt", "r") as f:
    for row in f:
        spk, path, label = row.strip().split()
        if spk not in asv_ids:
            bucket = label_buckets.get(label)
            # Rows with any other label are ignored
            if bucket is not None:
                bucket[spk].append(path)

speakers = list(spk2bonafide.keys())
print(f"✅ Speakers còn lại để tạo trial: {len(speakers)}")

# 3. Generate trials
# Target: every unordered pair of bonafide utterances from the same speaker
target_trials = []
for utts in spk2bonafide.values():
    if len(utts) >= 2:
        for idx, first in enumerate(utts):
            target_trials.extend((first, second, "target") for second in utts[idx + 1:])

# Non-target: three kinds, collected separately
nontarget_a, nontarget_b, nontarget_c = [], [], []

# a) bonafide vs bonafide, different speakers: one random utterance from each
#    speaker for every unordered speaker pair
for idx, spk_a in enumerate(speakers):
    for spk_b in speakers[idx + 1:]:
        u1 = random.choice(spk2bonafide[spk_a])
        u2 = random.choice(spk2bonafide[spk_b])
        nontarget_a.append((u1, u2, "nontarget"))

# b) bonafide vs spoof, same speaker: full cross product.
#    .get() is used so that looking up a speaker without spoof data does not
#    insert a new key into the defaultdict.
for spk, bona in spk2bonafide.items():
    spoofs = spk2spoof.get(spk, [])
    nontarget_b.extend((b, s, "nontarget") for b in bona for s in spoofs)

# c) bonafide vs spoof, different speakers: one random utterance from each side
#    for every (bonafide speaker, spoof speaker) combination
for spk1 in speakers:
    for spk2, spoofs in spk2spoof.items():
        if spk1 != spk2:
            b = random.choice(spk2bonafide[spk1])
            s = random.choice(spoofs)
            nontarget_c.append((b, s, "nontarget"))

# Cap each non-target kind at 25k trials (sampled in the order a, b, c)
nontarget_a, nontarget_b, nontarget_c = [
    random.sample(group, min(25000, len(group)))
    for group in (nontarget_a, nontarget_b, nontarget_c)
]

# Merge the three non-target kinds
nontarget_trials = nontarget_a + nontarget_b + nontarget_c

# 4. Cap the target trials at 25k as well. The non-target total can be larger
#    than the target total because each of its three kinds is capped separately.
target_trials = random.sample(target_trials, min(25000, len(target_trials)))

all_trials = target_trials + nontarget_trials
random.shuffle(all_trials)

# 5. Save one "enroll test label" row per trial
with open("sasv_binary_trials.txt", "w") as f:
    f.writelines(f"{enroll} {test} {label}\n" for enroll, test, label in all_trials)

print("✅ Saved trials:", len(all_trials), "Target:", len(target_trials), "Nontarget:", len(nontarget_trials))


✅ Speakers trong ASV train: 652
✅ Speakers còn lại để tạo trial: 163
✅ Saved trials: 84101 Target: 25000 Nontarget: 59101
