<a href="https://colab.research.google.com/github/jjbmsda/EnsembleModel/blob/main/EnsembleModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# paths under /content/drive/... become readable by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# soundfile 설치
!pip -q install soundfile

In [3]:
# torchcodec 설치
!pip -q install torchcodec

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.1 MB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/2.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m1.5/2.1 MB[0m [31m12.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m15.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import torchaudio
from torchaudio.datasets import LIBRISPEECH

# LibriSpeech subsets read from the mounted Drive folder. download=False, so
# both subsets are expected to be present there already.
# NOTE(review): the training code visible further down builds its own dataset
# from "./data" and does not reference these two objects — confirm whether
# they are still needed.
train_raw = LIBRISPEECH("/content/drive/MyDrive/datasets", url="dev-clean", download=False)
test_raw = LIBRISPEECH("/content/drive/MyDrive/datasets", url="test-clean", download=False)

In [6]:
import os, random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, Subset
from torchvision.models import resnet18, densenet121

import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import top_k_accuracy_score


# =========================
# Colab RAM-safe settings
# =========================
# Batch size shared by the train, validation and test DataLoaders.
BATCH_SIZE = 8
# Number of training epochs run by main().
EPOCHS = 1
# Upper bound on how many speaker IDs (taken in ascending order) become classes.
MAX_SPEAKERS = 50
# Values of k for which the evaluation functions report "top{k}" accuracy.
TOPK = (1, 3)


def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed handed unchanged to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once at definition time so the code below starts from a known RNG state.
seed_everything(42)


def pad_trim_2d(spec: torch.Tensor, target_frames: int, pad_value: float = 0.0) -> torch.Tensor:
    """Force the last dimension of ``spec`` to have exactly ``target_frames`` entries.

    Args:
        spec: Tensor whose last dimension is cropped or extended.
        target_frames: Desired size of the last dimension.
        pad_value: Fill value used when ``spec`` is too short.

    Returns:
        ``spec`` itself when it already has the right length, a slice keeping the
        first ``target_frames`` entries when it is too long, or a right-padded
        copy when it is too short.
    """
    missing = target_frames - spec.size(-1)
    if missing == 0:
        return spec
    if missing < 0:
        # Too long: keep only the leading frames.
        return spec[..., :target_frames]
    # Too short: append `missing` frames of pad_value at the end of the last axis.
    return F.pad(spec, (0, missing), value=pad_value)


class LibriSpeechSpeakerDataset(Dataset):
    """Speaker-classification view over a LibriSpeech-style dataset.

    Each item of the wrapped dataset is unpacked as
    ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``.
    Indexing yields ``(spectrogram, label)`` where the spectrogram is a dB-scaled
    mel spectrogram padded/trimmed to ``target_frames`` and ``label`` is a
    ``torch.long`` scalar taken from ``spk2idx``. Items whose speaker is not a
    key of ``spk2idx`` yield ``None`` instead.
    """

    def __init__(self, dataset, spk2idx, sample_rate=16000, n_mels=64, target_frames=256):
        """Store the wrapped dataset, the speaker->index map and build the transforms."""
        self.dataset = dataset
        self.spk2idx = spk2idx
        self.sample_rate = sample_rate
        self.target_frames = target_frames
        # Feature pipeline: mel power spectrogram followed by conversion to dB.
        self.melspec = MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels)
        self.to_db = AmplitudeToDB(stype="power")

    def __len__(self):
        """Length of the wrapped dataset (skipped speakers are still counted)."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` for item ``idx``, or ``None`` if its speaker is unmapped."""
        audio, native_sr, _text, raw_speaker, _chapter, _utterance = self.dataset[idx]
        speaker = int(raw_speaker)

        # Speakers outside the label map (beyond the max-speakers cut) are skipped.
        if speaker not in self.spk2idx:
            return None

        # Down-mix multi-channel audio to mono.
        mono = audio if audio.size(0) <= 1 else audio.mean(dim=0, keepdim=True)

        # Resample when the clip's rate differs from the configured one.
        if native_sr != self.sample_rate:
            mono = torchaudio.functional.resample(mono, native_sr, self.sample_rate)

        # mel -> dB, then pad/trim the time axis.
        log_mel = self.to_db(self.melspec(mono))                # [1, n_mels, T]
        features = pad_trim_2d(log_mel, self.target_frames)     # [1, n_mels, target_frames]

        label = torch.tensor(self.spk2idx[speaker], dtype=torch.long)
        return features, label


def collate_skip_none(batch):
    """Collate ``(x, y)`` samples into stacked tensors, ignoring ``None`` samples.

    Args:
        batch: Sequence of ``(x, y)`` tensor pairs and/or ``None`` entries.

    Returns:
        ``(xs, ys)`` stacked along a new leading dimension, or ``None`` when no
        sample survives the filtering.
    """
    kept = [sample for sample in batch if sample is not None]
    if not kept:
        return None
    inputs = torch.stack([x for x, _ in kept], dim=0)
    labels = torch.stack([y for _, y in kept], dim=0)
    return inputs, labels


class ResNetModel(nn.Module):
    """ResNet-18 without pretrained weights, adapted to 1-channel input."""

    def __init__(self, num_classes):
        """Build the backbone with a ``num_classes``-way output and patch its stem."""
        super().__init__()
        backbone = resnet18(weights=None, num_classes=num_classes)
        # Replace conv1 with a 1-input-channel 3x3, stride-1 convolution and turn
        # the max-pool into a no-op.
        backbone.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False)
        backbone.maxpool = nn.Identity()
        self.net = backbone

    def forward(self, x):
        """Return the wrapped network's output for ``x``."""
        logits = self.net(x)
        return logits


class DenseNetModel(nn.Module):
    """DenseNet-121 without pretrained weights, adapted to 1-channel input."""

    def __init__(self, num_classes):
        """Build the backbone, patch its stem and attach a ``num_classes``-way classifier."""
        super().__init__()
        backbone = densenet121(weights=None)

        # Replace conv0 with a 1-input-channel 3x3, stride-1 convolution and turn
        # pool0 into a no-op.
        stem = backbone.features
        stem.conv0 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False)
        stem.pool0 = nn.Identity()

        # Swap the classifier for one with the requested number of outputs.
        feature_dim = backbone.classifier.in_features
        backbone.classifier = nn.Linear(feature_dim, num_classes)

        self.net = backbone

    def forward(self, x):
        """Return the wrapped network's output for ``x``."""
        logits = self.net(x)
        return logits


def train_one_epoch(model, loader, criterion, optimizer, device, use_amp, model_name="model"):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module to train; switched to training mode first.
        loader: Iterable of ``(x, y)`` batches; ``None`` batches are skipped.
            Must support ``len()`` for the progress message.
        criterion: Loss callable invoked as ``criterion(logits, y)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batch tensors are moved to.
        use_amp: When true, run the forward pass under CUDA autocast and step
            through a gradient scaler created for this call.
        model_name: Tag printed in the progress message.

    Returns:
        Average loss over the batches that were actually processed, or ``0.0``
        when none were.
    """
    model.train()
    scaler = torch.amp.GradScaler("cuda") if use_amp else None
    loss_sum = 0.0
    n_steps = 0

    for batch_no, batch in enumerate(loader, start=1):
        if batch is None:
            continue

        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad(set_to_none=True)

        if not use_amp:
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
        else:
            with torch.amp.autocast("cuda"):
                loss = criterion(model(inputs), labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        loss_sum += float(loss.item())
        n_steps += 1

        # Progress line every 50 loader positions (skipped batches count towards
        # the position but not towards the running average).
        if batch_no % 50 == 0:
            running_loss = loss_sum / n_steps
            print(f"[{model_name}] step {batch_no}/{len(loader)} loss={running_loss:.4f}")

    return loss_sum / max(1, n_steps)


def _gather_probabilities(logits_fn, loader, device):
    """Run ``logits_fn`` over ``loader`` and collect softmax probabilities and targets.

    Args:
        logits_fn: Callable mapping an input batch (already on ``device``) to logits.
        loader: Iterable of ``(x, y)`` batches; ``None`` batches are skipped.
        device: Device the inputs are moved to before calling ``logits_fn``.

    Returns:
        ``(probs, targets)`` as NumPy arrays concatenated over all batches, or
        ``None`` when no batch was processed.
    """
    prob_chunks, target_chunks = [], []
    for batch in loader:
        if batch is None:
            continue
        x, y = batch
        logits = logits_fn(x.to(device))
        prob_chunks.append(logits.softmax(dim=1).cpu().numpy())
        target_chunks.append(y.numpy())

    if not target_chunks:
        return None
    return np.concatenate(prob_chunks, axis=0), np.concatenate(target_chunks, axis=0)


def _classification_metrics(probs, targets, num_classes, topk):
    """Compute accuracy, macro-F1 and top-k accuracies from class probabilities.

    The result always contains ``"acc"``, ``"macro_f1"`` and ``"top1"``; a
    ``"top{k}"`` entry is added for every ``k`` in ``topk`` with
    ``k <= num_classes``. Top-1 is computed only once even if ``topk`` lists 1.
    """
    labels = list(range(num_classes))
    preds = probs.argmax(axis=1)
    metrics = {
        "acc": accuracy_score(targets, preds),
        "macro_f1": f1_score(targets, preds, average="macro"),
        "top1": top_k_accuracy_score(targets, probs, k=1, labels=labels),
    }
    for k in topk:
        key = f"top{k}"
        if k <= num_classes and key not in metrics:
            metrics[key] = top_k_accuracy_score(targets, probs, k=k, labels=labels)
    return metrics


@torch.no_grad()
def eval_model(model, loader, device, num_classes, topk=TOPK):
    """Evaluate a single classifier on ``loader``.

    Args:
        model: Module producing logits; switched to eval mode.
        loader: Iterable of ``(x, y)`` batches; ``None`` batches are skipped.
        device: Device used for the forward pass.
        num_classes: Number of classes (labels are ``0 .. num_classes - 1``).
        topk: Iterable of k values for additional top-k accuracies.

    Returns:
        Dict with ``"acc"``, ``"macro_f1"``, ``"top1"`` and ``"top{k}"`` entries,
        or ``None`` when no batch was processed.
    """
    model.eval()
    gathered = _gather_probabilities(model, loader, device)
    if gathered is None:
        return None
    probs, targets = gathered
    return _classification_metrics(probs, targets, num_classes, topk)


@torch.no_grad()
def eval_ensemble(rnet, dnet, loader, device, num_classes, topk=TOPK, alpha=0.8):
    """Evaluate the weighted-logit ensemble of ``rnet`` and ``dnet`` on ``loader``.

    The ensemble logits are ``alpha * rnet(x) + (1 - alpha) * dnet(x)``; softmax
    is applied after the blend.

    Args:
        rnet: First model; its logits are weighted by ``alpha``.
        dnet: Second model; its logits are weighted by ``1 - alpha``.
        loader: Iterable of ``(x, y)`` batches; ``None`` batches are skipped.
        device: Device used for the forward passes.
        num_classes: Number of classes (labels are ``0 .. num_classes - 1``).
        topk: Iterable of k values for additional top-k accuracies.
        alpha: Blend weight given to ``rnet``.

    Returns:
        Dict with the same keys as ``eval_model``, or ``None`` when no batch was
        processed.
    """
    rnet.eval()
    dnet.eval()

    def blended_logits(x):
        # Both sets of logits are computed first, then combined with the ensemble weight.
        logits_r = rnet(x)
        logits_d = dnet(x)
        return alpha * logits_r + (1 - alpha) * logits_d

    gathered = _gather_probabilities(blended_logits, loader, device)
    if gathered is None:
        return None
    probs, targets = gathered
    return _classification_metrics(probs, targets, num_classes, topk)


def split_indices(n, seed=42, train=0.8, val=0.1):
    """Shuffle ``range(n)`` deterministically and cut it into train/val/test index lists.

    Args:
        n: Number of items to split.
        seed: Seed for the private ``random.Random`` used for shuffling.
        train: Fraction of ``n`` (truncated to an int count) assigned to training.
        val: Fraction of ``n`` (truncated to an int count) assigned to validation.

    Returns:
        ``(train_idxs, val_idxs, test_idxs)``; the test list receives whatever
        remains after the first two cuts.
    """
    order = list(range(n))
    random.Random(seed).shuffle(order)

    train_end = int(n * train)
    val_end = train_end + int(n * val)

    return order[:train_end], order[train_end:val_end], order[val_end:]


def main():
    """Train ResNet and DenseNet speaker classifiers on dev-clean and report metrics.

    Steps: download/load LibriSpeech dev-clean into ``./data``, map up to
    ``MAX_SPEAKERS`` speaker IDs to class indices, split utterances 80/10/10,
    train both models for ``EPOCHS`` epochs (printing validation metrics for
    each model and their ensemble after every epoch), then print test metrics.
    """
    os.makedirs("./data", exist_ok=True)

    # Only dev-clean is used.
    raw = LIBRISPEECH("./data", url="dev-clean", download=True)

    # Speaker mapping (at most MAX_SPEAKERS speakers are used).
    # get_metadata() exposes the same fields as indexing but returns the file
    # path instead of the decoded waveform, so collecting speaker IDs does not
    # need to decode every audio file. Fall back to plain iteration when the
    # installed torchaudio does not provide it.
    if hasattr(raw, "get_metadata"):
        speaker_ids = (raw.get_metadata(i)[3] for i in range(len(raw)))
    else:
        speaker_ids = (sample[3] for sample in raw)
    all_speakers = sorted({int(spk) for spk in speaker_ids})
    speakers = all_speakers[:MAX_SPEAKERS]
    spk2idx = {spk: i for i, spk in enumerate(speakers)}
    num_classes = len(speakers)

    print(f"Using speakers: {num_classes}/{len(all_speakers)} (MAX_SPEAKERS={MAX_SPEAKERS})")

    full_ds = LibriSpeechSpeakerDataset(raw, spk2idx, n_mels=64, target_frames=256)

    # Split at utterance level.
    tr, va, te = split_indices(len(full_ds), seed=42, train=0.8, val=0.1)
    train_ds = Subset(full_ds, tr)
    val_ds = Subset(full_ds, va)
    test_ds = Subset(full_ds, te)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_amp = (device.type == "cuda")

    # Options shared by all three loaders; only the training loader shuffles.
    loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=0, collate_fn=collate_skip_none)
    train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

    rnet = ResNetModel(num_classes).to(device)
    dnet = DenseNetModel(num_classes).to(device)

    crit = nn.CrossEntropyLoss()
    opt_r = optim.Adam(rnet.parameters(), lr=1e-3)
    opt_d = optim.Adam(dnet.parameters(), lr=1e-3)

    # Weight given to the ResNet logits in the ensemble (validation and test).
    ensemble_alpha = 0.8

    for ep in range(1, EPOCHS + 1):
        loss_r = train_one_epoch(
            rnet, train_loader, crit, opt_r, device, use_amp,
            model_name="ResNet"
        )

        loss_d = train_one_epoch(
            dnet, train_loader, crit, opt_d, device, use_amp,
            model_name="DenseNet"
        )

        val_r = eval_model(rnet, val_loader, device, num_classes, topk=TOPK)
        val_d = eval_model(dnet, val_loader, device, num_classes, topk=TOPK)
        val_e = eval_ensemble(
            rnet, dnet, val_loader, device, num_classes, topk=TOPK, alpha=ensemble_alpha
        )

        print(f"\n[Epoch {ep}/{EPOCHS}]")
        print(f"  ResNet   loss={loss_r:.4f}  val={val_r}")
        print(f"  DenseNet loss={loss_d:.4f}  val={val_d}")
        print(f"  Ensemble           val={val_e}")

    test_r = eval_model(rnet, test_loader, device, num_classes, topk=TOPK)
    test_d = eval_model(dnet, test_loader, device, num_classes, topk=TOPK)
    test_e = eval_ensemble(
        rnet, dnet, test_loader, device, num_classes, topk=TOPK, alpha=ensemble_alpha
    )

    # The eval helpers return None when no test batch was processed.
    if test_r is None or test_d is None or test_e is None:
        print("\nTEST set이 비어있거나 모두 None으로 필터링됐어. (split/데이터 로딩 확인 필요)")
        return

    print("\n=== TEST RESULTS (dev-clean split) ===")
    print(f"ResNet   {test_r}")
    print(f"DenseNet {test_d}")
    print(f"Ensemble {test_e}")


if __name__ == "__main__":
    main()

Using speakers: 40/40 (MAX_SPEAKERS=50)
step 50/271 loss=3.3526
step 100/271 loss=2.9921
step 150/271 loss=2.7309
step 200/271 loss=2.4937
step 250/271 loss=2.2744
step 50/271 loss=3.6464
step 100/271 loss=3.3442


KeyboardInterrupt: 