In [1]:
!pip install torchaudio

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [8]:
import torch
import torch.nn as nn
import torchaudio
from torch.utils.data import random_split, DataLoader, Subset
from torchaudio.datasets import SPEECHCOMMANDS
import os
import random
from tqdm import tqdm

In [3]:
class SubsetSC(SPEECHCOMMANDS) :
  def __init__(self, subset) :
    super().__init__(".", download=True)

    def load_list(filename) :
      path = os.path.join(self._path, filename)
      with open(path) as f:
        return [os.path.join(self._path, line.strip()) for line in f]

    if subset == "training" :
      self._walker = load_list("training_list.txt")
    elif subset == "validation" :
      self._walker = load_list("validation_list.txt")
    elif subset == "testing" :
      self._walker = load_list("testing_list.txt")


In [4]:
wake_word = "yes"

def label_to_binary(label):
    return 1 if label == wake_word else 0

def collate_fn(batch):
    specs, targets = [], []
    for waveform, sr, label, *_ in batch:
        if label == wake_word:
            include = True
        else:
            include = random.random() < 0.3
        if include:
            try:
                mel = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_mels=80)(waveform)
                mel = mel.squeeze(0).transpose(0, 1)
                if mel.shape[0] < 10:
                    continue
                specs.append(mel)
                targets.append(label_to_binary(label))
            except Exception as e:
                continue
    if len(specs) == 0:
        return None
    return nn.utils.rnn.pad_sequence(specs, batch_first=True), torch.tensor(targets, dtype=torch.float32)


In [5]:
class TinyTransformer(nn.Module) :
    def __init__(self, input_dim=128, num_classes=2) :
      super().__init__()
      # encoder_layer 한 층 정의
      self.encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=2, dim_feedforward=256)

      # 위에서 정의한 한 층을 여러 층 쌓은 걸 정의
      self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

      # 분류
      self.classifier = nn.Linear(input_dim, 1)

    def forward(self, x) :
      x = self.transformer(x)
      x = x.mean(dim=1)
      return self.classifier(x).squeeze(-1)




In [9]:
full_dataset = torchaudio.datasets.SPEECHCOMMANDS(root=".", download=True)

# 전체 길이의 1/50
subset_size = len(full_dataset) // 50
indices = random.sample(range(len(full_dataset)), subset_size)

# 작은 서브셋 만들기
small_dataset = Subset(full_dataset, indices)

train_size = int(0.8 * len(small_dataset))
val_size = len(small_dataset) - train_size
train_set, val_set = random_split(small_dataset, [train_size, val_size])

In [10]:
# ✅ 5. 모델/손실/최적화
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = TinyTransformer(input_dim=80).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ✅ 6. DataLoader
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False, collate_fn=collate_fn)

# ✅ 7. 학습 루프 (validation 포함)
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    epoch_loss, epoch_acc, count = 0.0, 0.0, 0

    print(f"\n▶ Epoch {epoch+1}/{num_epochs}")
    for batch in tqdm(train_loader, desc="Training", leave=False):
        if batch is None:
            continue
        x, y = batch
        x, y = x.to(device), y.to(device)

        out = model(x)
        loss = criterion(out, y)

        preds = (torch.sigmoid(out) >= 0.5).float()
        acc = (preds == y).sum().item() / y.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc
        count += 1

    avg_loss = epoch_loss / count
    avg_acc = epoch_acc / count * 100
    print(f"🟢 Train: loss={avg_loss:.4f}, acc={avg_acc:.2f}%")

    # ✅ 8. validation 평가
    model.eval()
    val_acc, val_count = 0.0, 0
    with torch.no_grad():
        for batch in val_loader:
            if batch is None:
                continue
            x, y = batch
            x, y = x.to(device), y.to(device)
            out = model(x)
            preds = (torch.sigmoid(out) >= 0.5).float()
            acc = (preds == y).sum().item() / y.size(0)
            val_acc += acc
            val_count += 1
    print(f"🔵 Validation acc = {val_acc / val_count * 100:.2f}%")



cuda

▶ Epoch 1/10




🟢 Train: loss=0.3406, acc=89.11%
🔵 Validation acc = 92.17%

▶ Epoch 2/10




🟢 Train: loss=0.3331, acc=89.21%
🔵 Validation acc = 90.57%

▶ Epoch 3/10




🟢 Train: loss=0.3019, acc=89.63%
🔵 Validation acc = 91.27%

▶ Epoch 4/10




🟢 Train: loss=0.3194, acc=88.64%
🔵 Validation acc = 91.21%

▶ Epoch 5/10




🟢 Train: loss=0.3355, acc=87.39%
🔵 Validation acc = 91.54%

▶ Epoch 6/10




🟢 Train: loss=0.3034, acc=88.44%
🔵 Validation acc = 90.06%

▶ Epoch 7/10




🟢 Train: loss=0.2982, acc=88.63%
🔵 Validation acc = 87.12%

▶ Epoch 8/10




🟢 Train: loss=0.3212, acc=88.22%
🔵 Validation acc = 91.34%

▶ Epoch 9/10




🟢 Train: loss=0.2965, acc=88.78%
🔵 Validation acc = 91.66%

▶ Epoch 10/10




🟢 Train: loss=0.2618, acc=90.14%
🔵 Validation acc = 91.81%
