## Clone repo

In [2]:
%%capture
# NOTE: %%capture captures this cell's output instead of displaying it,
# so git/pip messages (including errors) are not shown.
# Clone the repository and make it the working directory for later cells.
!git clone https://github.com/taoruijie/ecapa-tdnn.git
%cd ecapa-tdnn

# Install the required libraries
%pip install -r requirements.txt
# Install librosa pinned to version 0.9.2
%pip install librosa==0.9.2  

# Create the folder where models are saved
!mkdir -p exps/


## Ghi đè dataLoader.py để phù hợp với Vietnam-Celeb: bỏ các hàm thêm nhiễu và tiếng vang, thay bằng white noise đơn giản

In [3]:
code = '''
import os
import torch
import torchaudio
import torch.nn.functional as F
from torch.utils.data import Dataset
import random

class trainDataset(Dataset):
    """Fixed-length waveform dataset driven by a tab-separated list file.

    Every non-blank line of the list file must be ``<speaker><TAB><utterance>``;
    the audio is looked up at ``data_path/<speaker>/<utterance>``. Speaker names
    are sorted and mapped to integer labels. Entries whose audio file does not
    exist are reported and skipped (their speaker still keeps its label slot).

    Attributes:
        speakers: sorted list of the speaker names found in the list file.
        spk2id: mapping from speaker name to integer label.
        data_list: ``[relative_path, label]`` pairs for the files that exist.
    """

    def __init__(self, data_list_path, data_path, max_length=16000*5, noise_level=0.005, add_noise=True):
        """Read the list file and index the utterances that exist on disk.

        Args:
            data_list_path: path of the tab-separated list file.
            data_path: root directory that contains one folder per speaker.
            max_length: number of samples every returned waveform has
                (the default presumably means 5 s at 16 kHz -- TODO confirm
                the sample rate of the data; it is not checked here).
            noise_level: scale applied to the Gaussian noise.
            add_noise: whether white noise is added in ``__getitem__``.

        Raises:
            ValueError: if a non-blank line does not have exactly two fields.
        """
        self.data_path = data_path
        self.max_length = max_length
        self.data_list = []
        self.noise_level = noise_level
        self.add_noise = add_noise

        entries = []
        with open(data_list_path, 'r') as f:
            for line_no, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    # Blank lines (e.g. at the end of the file) carry no entry.
                    continue
                fields = line.split('\t')
                if len(fields) != 2:
                    raise ValueError(
                        f"{data_list_path}:{line_no}: expected 'speaker<TAB>utterance', got {line!r}"
                    )
                entries.append((fields[0], fields[1]))

        # Labels come from every listed speaker, so they do not shift when
        # some audio files are missing.
        self.speakers = sorted(set(spk for spk, _ in entries))
        self.spk2id = {spk: idx for idx, spk in enumerate(self.speakers)}

        for spk, utt in entries:
            full_path = os.path.join(data_path, spk, utt)
            if os.path.exists(full_path):
                self.data_list.append([os.path.join(spk, utt), self.spk2id[spk]])
            else:
                print(f"[Warning] Missing file: {full_path}")

    def __len__(self):
        """Return the number of indexed utterances."""
        return len(self.data_list)

    def __getitem__(self, index):
        """Return ``(waveform, label)`` for ``index``.

        If the file cannot be loaded, the error is printed and the following
        entries are tried in turn (wrapping around), each at most once.

        Raises:
            IndexError: if ``index`` is out of range.
            RuntimeError: if no entry of the dataset could be loaded.
        """
        total = len(self.data_list)
        position = index
        # max(total, 1) keeps the IndexError of a plain list lookup when the
        # dataset is empty, which sequence-style iteration relies on.
        for _attempt in range(max(total, 1)):
            utt, label = self.data_list[position]
            audio_path = os.path.join(self.data_path, utt)
            try:
                audio, _ = torchaudio.load(audio_path)
            except Exception as e:
                print(f"[Error] Failed to load {audio_path}: {e}")
                position = (position + 1) % total
                continue
            return self._prepare_waveform(audio), label

        raise RuntimeError("None of the audio files in the dataset could be loaded")

    def _prepare_waveform(self, audio):
        """Down-mix to mono, pad/trim to ``max_length`` and optionally add noise."""
        # torchaudio.load returns (channels, time) by default; averaging the
        # channel axis equals squeeze(0) for mono and also handles stereo.
        if audio.dim() > 1:
            audio = audio.mean(dim=0)

        # Pad with zeros at the end, or keep only the first max_length samples.
        missing = self.max_length - audio.shape[0]
        if missing > 0:
            audio = F.pad(audio, (0, missing))
        else:
            audio = audio[:self.max_length]

        # White-noise augmentation, if enabled.
        if self.add_noise:
            noise = torch.randn_like(audio) * self.noise_level
            # Clamp so the noisy signal stays inside [-1, 1].
            audio = torch.clamp(audio + noise, -1.0, 1.0)

        return audio


'''

# Write the module to disk as UTF-8: the generated source contains non-ASCII
# comments, and Python reads source files as UTF-8 by default.
with open("dataLoader.py", "w", encoding="utf-8") as f:
    f.write(code)

## Ghi đè các file train, loss

In [4]:
train_code = '''
import torch
import torch.nn as nn
import torch.optim as optim
import os
import argparse
from dataLoader import trainDataset
from ECAPAModel import ECAPA_TDNN
from torch.utils.data import DataLoader
from lossFunction import AAMsoftmax
from torch.optim.lr_scheduler import CosineAnnealingLR

# Command-line interface of the training script.
parser = argparse.ArgumentParser()
parser.add_argument('--train_list', type=str, required=True)
parser.add_argument('--eval_list', type=str, default='')
parser.add_argument('--data_path', type=str, required=True)
parser.add_argument('--save_path', type=str, required=True)
parser.add_argument('--initial_model', type=str, default='')
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--epochs', type=int, default=15)
parser.add_argument('--noise_level', type=float, default=0.005, help="Mức độ noise trắng thêm vào")
parser.add_argument('--n_class', type=int, default=0,
                    help="Number of classes of the loss head; 0 infers it from the training list")
args = parser.parse_args()

os.makedirs(args.save_path, exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The dataset is built first so the classifier can be sized from its speakers
# instead of a hard-coded class count.
train_dataset = trainDataset(args.train_list, args.data_path, noise_level=args.noise_level)
if len(train_dataset) == 0:
    raise RuntimeError(f"No usable training files found from {args.train_list} under {args.data_path}")

n_speakers = len(train_dataset.speakers)
n_class = args.n_class if args.n_class > 0 else n_speakers
if n_class < n_speakers:
    # Labels go up to n_speakers - 1; a smaller head would get out-of-range indices.
    raise ValueError(f"--n_class={n_class} is smaller than the {n_speakers} speakers in the training list")

train_loader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=2,
    # Drop the trailing partial batch (it can have size 1, which presumably
    # breaks the encoder's BatchNorm layers in train mode -- TODO confirm),
    # unless the whole dataset fits in a single batch.
    drop_last=len(train_dataset) > args.batch_size,
)

model = ECAPA_TDNN(C=1024).to(device)
aamsoftmax_layer = AAMsoftmax(n_class=n_class).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(list(model.parameters()) + list(aamsoftmax_layer.parameters()),
                      lr=0.01, momentum=0.9, weight_decay=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=args.epochs)

if args.initial_model != "":
    print("Loading initial model:", args.initial_model)
    checkpoint = torch.load(args.initial_model, map_location=device)
    # The weights may be stored under 'state_dict' or 'model', or the
    # checkpoint may itself be the state dict.
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    elif 'model' in checkpoint:
        state_dict = checkpoint['model']
    else:
        state_dict = checkpoint

    # Remove the "speaker_encoder." prefix so the keys match the bare encoder.
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith("speaker_encoder."):
            new_key = k.replace("speaker_encoder.", "")
            new_state_dict[new_key] = v
        else:
            new_state_dict[k] = v

    # strict=False: mismatching keys are reported below instead of raising.
    missing, unexpected = model.load_state_dict(new_state_dict, strict=False)
    if missing:
        print("Missing keys:", missing)
    if unexpected:
        print("Unexpected keys:", unexpected)

print("Start Training")
for epoch in range(1, args.epochs + 1):
    model.train()
    aamsoftmax_layer.train()
    running_loss = 0.0
    for idx, (audios, labels) in enumerate(train_loader):
        audios = audios.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        embeddings = model(audios, aug=True)
        logits = aamsoftmax_layer(embeddings, labels)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if idx % 100 == 0:
            # Running mean of the loss over the batches seen so far this epoch.
            avg_loss = running_loss / (idx + 1)
            print(f"Epoch {epoch} [{idx}/{len(train_loader)}] Avg Loss: {avg_loss:.4f}")

    # The learning-rate schedule advances once per epoch.
    scheduler.step()
    torch.save({'model': model.state_dict(),
                'aamsoftmax': aamsoftmax_layer.state_dict()}, os.path.join(args.save_path, f'model_{epoch}.pt'))

# Final weights, saved in the same format as the per-epoch checkpoints.
torch.save({'model': model.state_dict(),
            'aamsoftmax': aamsoftmax_layer.state_dict()}, os.path.join(args.save_path, 'final.model'))


'''

# Write the script to disk as UTF-8: it contains a non-ASCII help string,
# and Python reads source files as UTF-8 by default.
with open("trainECAPAModel.py", "w", encoding="utf-8") as f:
    f.write(train_code)


In [5]:
loss_code = '''
import torch
import torch.nn as nn
import torch.nn.functional as F

class AAMsoftmax(nn.Module):
    """Additive angular margin softmax head.

    Produces scaled logits ``s * cos(theta)`` for every class, except that the
    target class uses ``s * cos(theta + m)``, where ``theta`` is the angle
    between the L2-normalised embedding and the L2-normalised class weight.
    The result is meant to be passed to a cross-entropy loss.

    Args:
        n_class: number of classes (rows of the weight matrix).
        m: angular margin in radians.
        s: scale applied to the logits.
        emb_dim: size of the input embeddings.
    """

    def __init__(self, n_class, m=0.2, s=30, emb_dim=192):
        import math  # local import: only the margin constants below need it

        super(AAMsoftmax, self).__init__()
        self.n_class = n_class
        self.m = m
        self.s = s
        self.weight = nn.Parameter(torch.empty(n_class, emb_dim))
        nn.init.xavier_uniform_(self.weight)

        # Constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        # They are plain floats, so the state dict still only holds "weight".
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # For theta + m > pi the cosine is no longer decreasing in theta; below
        # this threshold a linear penalty is used instead.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, x, label):
        """Return the margin-adjusted, scaled logits.

        Args:
            x: embeddings, one row per sample (second dim must be ``emb_dim``).
            label: integer class index per sample, used as a scatter index.

        Returns:
            Tensor with one row per sample and ``n_class`` columns.
        """
        x_norm = F.normalize(x, dim=1)
        w_norm = F.normalize(self.weight, dim=1)
        cosine = torch.matmul(x_norm, w_norm.t())

        # The lower clamp keeps sqrt away from 0, where its gradient is infinite.
        sine = torch.sqrt((1.0 - cosine * cosine).clamp(min=1e-12, max=1.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        # Apply the margin to the target class only.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        return output * self.s

'''
# Write the module to disk as UTF-8, the encoding Python assumes for source
# files, matching the other generated files in this notebook.
with open("lossFunction.py", "w", encoding="utf-8") as f:
    f.write(loss_code)


## Train mô hình với tập train từ Vietnam-Celeb

In [None]:
# Launch training, starting from the checkpoint given by --initial_model.
# --batch_size, --epochs and --noise_level are not passed, so the script's
# argparse defaults apply.
!python trainECAPAModel.py \
  --initial_model /kaggle/working/ecapa-tdnn/exps/pretrain.model \
  --train_list /kaggle/input/traindatasv/traindata.txt \
  --save_path exps/vietnamceleb \
  --data_path /kaggle/input/d/vnhiulv/data-sv/data
