In [1]:
# Standard library
import os
import math
import random
import time
import zipfile
import shutil
from pathlib import Path
import pathlib
import requests
import zipfile
from typing import Tuple, Dict, List

# Third-party libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, Subset, IterableDataset, SubsetRandomSampler
import torchaudio
import torchaudio.functional as Fa
from torchaudio.utils import download_asset
import soundfile as sf
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio
from tqdm import tqdm
import wandb
from safetensors.torch import load_file
from transformers import Trainer, TrainingArguments
from collections import defaultdict
from sklearn import metrics
from sklearn.model_selection import train_test_split

2025-08-18 15:38:49.691584: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755531529.968836      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755531530.061569      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data Augmentation + Build CustomDataset

## Data Augmentation

In [2]:
# Sample rate in Hz used throughout this notebook when slicing the RIR and writing WAV files.
SAMPLE_RATE = 16000

### Add RIR

In [3]:
### 1. Simulate reverberant, in-room speech by convolving with a room impulse response (RIR).
# Approach: load the RIR recording with librosa, asking it for a 16000 Hz sample rate,
# then keep only the main impulse and normalise it.

# Fetch the RIR asset and decode it at 16000 Hz.
SAMPLE_RIR = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")
rir_waveform_librosa, rir_sample_rate_librosa = librosa.load(SAMPLE_RIR, sr=16000)

# Convert to a tensor and keep the main impulse only (1.01 s to 1.3 s of the recording).
rir_waveform = torch.tensor(rir_waveform_librosa)[int(SAMPLE_RATE * 1.01) : int(SAMPLE_RATE * 1.3)]

# Scale the impulse by its L2 norm.
rir_waveform = rir_waveform / rir_waveform.norm(p=2)

# Reverberation: the speech is convolved with the prepared RIR via torchaudio.functional.fftconvolve().
def add_RIR(origin_waveform, num_frames):
  """Convolve ``origin_waveform`` with the module-level ``rir_waveform``.

  Args:
    origin_waveform: waveform tensor to reverberate (assumed 1-D mono — TODO confirm).
    num_frames: frame count; the result keeps at most ``num_frames * 160 + 240``
      leading samples.

  Returns:
    The convolved waveform, truncated as described above and cast to ``torch.float``.
  """
  max_samples = num_frames * 160 + 240
  reverberant = Fa.fftconvolve(origin_waveform, rir_waveform)
  return reverberant[:max_samples].type(torch.float)

100%|██████████| 31.3k/31.3k [00:00<00:00, 36.2MB/s]


### Add Background Noise

In [4]:
### 2. Background noise
# Local path of the background-noise recording fetched through torchaudio's download_asset.
# NOTE(review): the asset name ends in "8000hz" — presumably an 8 kHz recording; confirm
# its rate matches the 16 kHz speech it is mixed with.
SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")

# Decoded noise clips keyed by (path, sample rate): the file is read and resampled
# once instead of on every call.
_BG_NOISE_CACHE = {}


def _load_background_noise(sample_rate):
  """Return the background-noise clip as a mono float32 array at ``sample_rate`` Hz."""
  key = (SAMPLE_NOISE, sample_rate)
  if key not in _BG_NOISE_CACHE:
    # librosa resamples on load, so the noise ends up at the same rate as the speech
    # regardless of the rate the asset was recorded at (and is mixed down to mono).
    noise_np, _ = librosa.load(SAMPLE_NOISE, sr=sample_rate)
    _BG_NOISE_CACHE[key] = noise_np
  return _BG_NOISE_CACHE[key]


def add_background_noise(origin_waveform, snr_db=10):
  """Mix the background-noise recording into ``origin_waveform`` at a fixed SNR.

  The noise is loaded at ``SAMPLE_RATE`` (previously it was read at its native rate
  and mixed sample-for-sample with the speech, which changes its speed/pitch when the
  two rates differ), then truncated or wrapped to the length of the input.

  Args:
    origin_waveform: waveform tensor; its first dimension is taken as the number of
      samples (assumed 1-D mono at ``SAMPLE_RATE`` — TODO confirm).
    snr_db: signal-to-noise ratio in dB passed to ``torchaudio.functional.add_noise``.
      Defaults to 10, the value that was previously hard-coded.

  Returns:
    The noisy waveform cast to ``torch.float``, same length as the input.
  """
  target_len = origin_waveform.shape[0]

  # Truncate when the noise is longer than the speech, wrap around when it is shorter.
  noise_np = _load_background_noise(SAMPLE_RATE)[:target_len]
  noise_np = np.pad(noise_np, (0, target_len - len(noise_np)), 'wrap')
  noise = torch.from_numpy(noise_np)

  # add_noise expects the SNR as a tensor with one fewer dimension than the waveform.
  snr = torch.tensor(float(snr_db))
  bg_added_waveform = Fa.add_noise(origin_waveform, noise, snr)
  return bg_added_waveform.type(torch.float)

100%|██████████| 78.2k/78.2k [00:00<00:00, 11.6MB/s]


### Simulate a Phone Call

In [5]:
import torchaudio.transforms as T

# Built once and reused, instead of re-creating both resamplers on every call.
_PHONE_DOWNSAMPLE = T.Resample(orig_freq=16000, new_freq=8000)
_PHONE_UPSAMPLE = T.Resample(orig_freq=8000, new_freq=16000)


def simulate_phone_call(origin_waveform: torch.Tensor, num_frames: int) -> torch.Tensor:
    """Simulate telephone-quality audio without ``torchaudio.sox_effects``.

    Pipeline: RIR reverberation -> background noise -> downsample to 8 kHz ->
    GSM codec (best effort) -> upsample back to 16 kHz.

    Args:
        origin_waveform: input waveform (assumed 1-D mono at 16 kHz — TODO confirm).
        num_frames: frame count; the result keeps at most ``num_frames * 160 + 240``
            samples.

    Returns:
        The processed waveform with the channel dimension removed, truncated to
        ``num_frames * 160 + 240`` samples.
    """
    import warnings

    # 1. Apply the room impulse response.
    rir_applied_waveform = add_RIR(origin_waveform, num_frames)

    # 2. Add background noise.
    noisy_waveform = add_background_noise(rir_applied_waveform)

    # 3. Downsample to 8000 Hz; the resampler is fed a (1, L) tensor.
    lowrate = _PHONE_DOWNSAMPLE(noisy_waveform.unsqueeze(0))

    # 4. Codec simulation: try the GSM codec and fall back to the plain downsampled
    #    signal when it is unavailable. Only ``Exception`` is caught so that
    #    KeyboardInterrupt / SystemExit still stop a long-running export.
    try:
        coded = Fa.apply_codec(lowrate, 8000, format="gsm")  # (1, L)
    except Exception as exc:
        warnings.warn(
            f"GSM codec unavailable ({type(exc).__name__}); using the downsampled signal without codec.",
            RuntimeWarning,
        )
        coded = lowrate

    # 5. Upsample back to 16 kHz.
    final = _PHONE_UPSAMPLE(coded)

    return final.squeeze()[:num_frames * 160 + 240]

### Add Music Noise

In [6]:
# Music recording that add_music_noise mixes into the speech.
MUSIC_PATH = "/kaggle/input/music-aug/music-fma-0000.wav"

# Decoded 16 kHz, single-channel music keyed by file path, so the file is read and
# resampled once rather than on every call.
_MUSIC_CACHE = {}


def _load_music_16k(path):
    """Load ``path`` with torchaudio, resample to 16 kHz if needed, return channel 0."""
    if path not in _MUSIC_CACHE:
        music_waveform, music_sr = torchaudio.load(path)
        if music_sr != 16000:
            resample = torchaudio.transforms.Resample(orig_freq=music_sr, new_freq=16000)
            music_waveform = resample(music_waveform)
        _MUSIC_CACHE[path] = music_waveform[0]  # first channel only
    return _MUSIC_CACHE[path]


def add_music_noise(origin_waveform: torch.Tensor, snr_range=(5, 15), music_waveform=None) -> torch.Tensor:
    """Mix music into ``origin_waveform`` at a random SNR drawn from ``snr_range``.

    Args:
        origin_waveform: speech waveform; its first dimension is the number of samples
            (assumed 1-D mono at 16 kHz — TODO confirm).
        snr_range: ``(low, high)`` bounds in dB for the uniformly sampled SNR.
        music_waveform: optional pre-loaded music tensor. When ``None`` (the default,
            matching the previous behaviour) the clip at ``MUSIC_PATH`` is used. A
            multi-dimensional tensor is reduced to its first channel.

    Returns:
        The mixture as a ``torch.float`` tensor with the same length as the input.

    Raises:
        ValueError: if the music has no samples but the speech does.
    """
    if music_waveform is None:
        music_waveform = _load_music_16k(MUSIC_PATH)
    elif music_waveform.ndim > 1:
        music_waveform = music_waveform[0]

    target_len = origin_waveform.shape[0]
    music_len = music_waveform.shape[0]

    # Fit the music to the speech length. Tiling (rather than appending a single
    # partial copy) keeps this correct when the music is much shorter than the speech.
    if music_len < target_len:
        if music_len == 0:
            raise ValueError("music waveform is empty; cannot mix it into the signal")
        repeats = -(-target_len // music_len)  # ceiling division
        music_waveform = music_waveform.repeat(repeats)
    music_waveform = music_waveform[:target_len]

    # Draw the SNR (dB) and convert it to a linear power ratio.
    snr_db = random.uniform(*snr_range)
    snr = 10 ** (snr_db / 10)

    # Mean power of speech and music.
    power_signal = origin_waveform.pow(2).mean()
    power_noise = music_waveform.pow(2).mean()

    # Gain that brings the music to the requested SNR; the epsilon guards against
    # division by zero for silent music.
    scale = torch.sqrt(power_signal / (power_noise * snr + 1e-8))

    mixed = origin_waveform + music_waveform * scale
    return mixed.type(torch.float)

### SETUP

In [7]:
# === SETUP ===
# Root of the source dataset; the export cell globs "id*/**/*.wav" beneath it.
INPUT_ROOT = "/kaggle/input/vsasv-train/vlsp_train/home4/vuhl/VSASV-Dataset/vlsp2025/train"
# Destination root for the exported crops and their augmented variants.
OUTPUT_ROOT = "/kaggle/working/augmented_output"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Index window into the file list, presumably for exporting the dataset in shards.
# NOTE(review): the export cell assigns selected_files = all_files, so this window
# does not appear to be applied — confirm intent.
START_INDEX = 50000
END_INDEX = 75000
# Alternative windows kept for reference:
# START_INDEX = 25_001
# END_INDEX = 50_000
# START_INDEX = 50_001
# END_INDEX = 75_000
# START_INDEX = 75_001
# END_INDEX = 100_000
# Each exported crop is NUM_FRAMES * 160 + 240 samples long.
NUM_FRAMES = 400
OUTPUT_ORIG = True  # also write the un-augmented crop (as *_orig.wav)


### Export to Folder

In [8]:
# === UTIL: collect every .wav file ===
def list_all_wavs_in_vsasv_structure(root):
    """Return the sorted list of ``.wav`` paths found under the ``id*`` folders of ``root``.

    Matches the glob pattern ``id*/**/*.wav`` relative to ``root``.
    """
    wav_paths = [*pathlib.Path(root).glob("id*/**/*.wav")]
    wav_paths.sort()
    return wav_paths

# === Create the output root if it does not exist yet ===
os.makedirs(OUTPUT_ROOT, exist_ok=True)


def _export_augmented_file(path):
    """Crop one recording, run the four augmentations and write the results as WAV files.

    Args:
        path: ``pathlib.Path`` of the source file, laid out as
            ``.../<id_folder>/<label>/<name>.wav``.

    Returns:
        True when files were written, False when the recording holds no samples and
        was skipped.
    """
    # NOTE(review): the file's own sample rate is not checked; everything is written
    # at SAMPLE_RATE — assumes the sources are already 16 kHz, TODO confirm.
    waveform_np, _sr = sf.read(path)

    # soundfile returns (frames, channels) for multi-channel audio; mix down to mono so
    # the padding and cropping below always act on the time axis.
    if waveform_np.ndim > 1:
        waveform_np = waveform_np.mean(axis=1)

    # An empty recording cannot be wrap-padded; skip it instead of aborting the run.
    if waveform_np.shape[0] == 0:
        return False

    length = NUM_FRAMES * 160 + 240

    # Wrap-pad recordings shorter than one crop.
    if waveform_np.shape[0] < length:
        shortage = length - waveform_np.shape[0]
        waveform_np = np.pad(waveform_np, (0, shortage), mode='wrap')

    # Random crop of `length` samples.
    start = int(random.random() * (waveform_np.shape[0] - length))
    segment = waveform_np[start:start + length]
    segment_tensor = torch.FloatTensor(segment)

    # === AUGMENT === (keys are the file-name suffixes)
    # NOTE(review): "BGNosie" looks like a typo for "BGNoise"; the spelling is kept so
    # file names stay identical to anything already exported — confirm before renaming.
    augmented = {
        "RIR": add_RIR(segment_tensor, NUM_FRAMES),
        "BGNosie": add_background_noise(segment_tensor),
        "SimPhoneCall": simulate_phone_call(segment_tensor, NUM_FRAMES),
        "MusicNoise": add_music_noise(segment_tensor),
    }

    # === SAVE ===
    # path: .../id00003/bonafide/abc.wav
    id_folder = path.parent.parent.name  # id00003
    label = path.parent.name             # bonafide or spoof
    filename = path.stem

    out_dir = os.path.join(OUTPUT_ROOT, id_folder, label)
    os.makedirs(out_dir, exist_ok=True)

    if OUTPUT_ORIG:
        out_orig = os.path.join(out_dir, f"{filename}_orig.wav")
        sf.write(out_orig, segment_tensor.numpy(), SAMPLE_RATE)

    for suffix, waveform in augmented.items():
        out_path = os.path.join(out_dir, f"{filename}_{suffix}.wav")
        sf.write(out_path, waveform.numpy(), SAMPLE_RATE)
    return True


# === RUN AUGMENT ===
all_files = list_all_wavs_in_vsasv_structure(INPUT_ROOT)
# To process a single shard instead, slice the list here, e.g. all_files[50000:75000].
selected_files = all_files
print(f"🔎 Tổng số file WAV sẽ xử lý: {len(selected_files)}")

skipped_files = [path for path in tqdm(selected_files) if not _export_augmented_file(path)]
if skipped_files:
    print(f"Skipped {len(skipped_files)} empty file(s), e.g. {skipped_files[0]}")
print("✅ DONE")

KeyboardInterrupt: 

### ZIP

In [None]:
import os
import zipfile

OUTPUT_DIR = "/kaggle/working/augmented_output"
ZIP_NAME = "/kaggle/working/augmented_full.zip"

# Stream every exported file into one deflate-compressed archive, storing each under its
# path relative to OUTPUT_DIR and deleting the source right after it has been added.
# NOTE(review): sources are removed before the archive is closed, so an interruption
# mid-run can leave an unfinished zip with the originals already gone — presumably a
# deliberate trade-off for disk space; confirm.
with zipfile.ZipFile(ZIP_NAME, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for current_dir, _subdirs, filenames in os.walk(OUTPUT_DIR):
        for filename in filenames:
            source_path = os.path.join(current_dir, filename)

            # Add the file to the archive under its relative path.
            archive.write(source_path, arcname=os.path.relpath(source_path, OUTPUT_DIR))

            # Remove the source once it has been written to the archive.
            os.remove(source_path)
