# Baseline_CNN_KWS.ipynb

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import DataLoader, random_split
import torchaudio.transforms as T
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

print("PyTorch:", torch.__version__)
print("Torchaudio:", torchaudio.__version__)
print("MPS Available:", torch.backends.mps.is_available())  # for M1 GPU

PyTorch: 2.0.1
Torchaudio: 2.0.2
MPS Available: True


In [2]:
# Path to your Speech Commands dataset
DATA_DIR = "../data"
assert os.path.exists(DATA_DIR), "Dataset folder not found!"

In [3]:
import os
print(os.listdir(DATA_DIR))

['.DS_Store', 'SpeechCommands']


In [4]:
print(os.listdir(os.path.join(DATA_DIR, 'SpeechCommands')))

['.Rhistory', 'speech_commands_v0.02']


In [5]:
# Choose a small subset for quicker training
CLASSES = ["yes", "no", "go", "stop", "down", "up"]

# MFCC parameters
SAMPLE_RATE = 16000
N_MFCC = 40

mfcc_transform = T.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=N_MFCC,
    melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 40, "center": False}
)

In [6]:
import torchaudio

In [7]:
from torchaudio.datasets import SPEECHCOMMANDS

dataset = SPEECHCOMMANDS(DATA_DIR, download=False)
print(f"Total samples: {len(dataset)}")

waveform, sample_rate, label, *_ = dataset[0]
print("Label:", label)
print("Sample rate:", sample_rate)
print("Waveform shape:", waveform.shape)

Total samples: 64721
Label: bed
Sample rate: 16000
Waveform shape: torch.Size([1, 16000])


In [8]:
from torchaudio.datasets import SPEECHCOMMANDS
import os

class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset, classes):
        super().__init__(root=DATA_DIR, download=False)
        self.subset = subset
        self.classes = classes
        self._walker = self._load_list(subset)

    def _load_list(self, subset):
        base = os.path.join(self._path)
        val_list = os.path.join(base, "validation_list.txt")
        test_list = os.path.join(base, "testing_list.txt")

        def read_list(path):
            with open(path, "r") as f:
                return set(line.strip() for line in f)

        val_files = read_list(val_list)
        test_files = read_list(test_list)

        if subset == "validation":
            return [os.path.join(base, f) for f in val_files if f.split("/")[0] in self.classes]
        elif subset == "testing":
            return [os.path.join(base, f) for f in test_files if f.split("/")[0] in self.classes]
        else:
            # training = all files not in val or test
            all_files = []
            for label in self.classes:
                folder = os.path.join(base, label)
                if os.path.isdir(folder):
                    for file in os.listdir(folder):
                        path = os.path.join(label, file)
                        if path not in val_files and path not in test_files:
                            all_files.append(os.path.join(base, path))
            return all_files

    def __getitem__(self, n):
        path = self._walker[n]
        waveform, sr = torchaudio.load(path)
        label = path.split("/")[-2]
        label_idx = self.classes.index(label)
        mfcc = mfcc_transform(waveform).squeeze(0)
        return mfcc, label_idx

In [9]:
train_set = SubsetSC("training", CLASSES)
val_set   = SubsetSC("validation", CLASSES)
test_set  = SubsetSC("testing", CLASSES)

print(len(train_set), len(val_set), len(test_set))

11144 1561 1533


In [10]:
import torch.nn as nn

class CNN_KWS(nn.Module):
    def __init__(self, num_classes=len(CLASSES)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(16 * 10 * 20, 64),   # adjust based on MFCC shape
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        # x: [batch, 40, time]
        x = x.unsqueeze(1)  # add channel dim â†’ [batch, 1, 40, time]
        return self.net(x)

In [11]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = CNN_KWS().to(device)
print(model)

CNN_KWS(
  (net): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=3200, out_features=64, bias=True)
    (8): ReLU()
    (9): Linear(in_features=64, out_features=6, bias=True)
  )
)


In [2]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
EPOCHS = 10

NameError: name 'nn' is not defined

In [13]:
from tqdm import tqdm

def train_epoch(loader):
    model.train()
    total_loss, correct = 0, 0
    for x, y in tqdm(loader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (out.argmax(1) == y).sum().item()
    return total_loss / len(loader), correct / len(loader.dataset)

def evaluate(loader):
    model.eval()
    total_loss, correct = 0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            total_loss += criterion(out, y).item()
            correct += (out.argmax(1) == y).sum().item()
    return total_loss / len(loader), correct / len(loader.dataset)

In [14]:
import torch.nn.functional as F

def pad_sequence(batch):
    # batch: list of tuples (mfcc, label)
    tensors, targets = zip(*batch)

    # Find max time dimension in batch
    max_len = max(t.shape[1] for t in tensors)

    # Pad each MFCC to max_len
    padded = []
    for t in tensors:
        pad_amount = max_len - t.shape[1]
        padded_t = F.pad(t, (0, pad_amount))  # pad on time axis
        padded.append(padded_t)

    padded = torch.stack(padded)  # [batch, 40, time]
    targets = torch.tensor(targets)
    return padded, targets

In [15]:
from torch.utils.data import DataLoader

BATCH_SIZE = 64

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_sequence)
val_loader   = DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=pad_sequence)
test_loader  = DataLoader(test_set, batch_size=BATCH_SIZE, collate_fn=pad_sequence)

print(f"Train batches: {len(train_loader)} | Val batches: {len(val_loader)} | Test batches: {len(test_loader)}")

Train batches: 175 | Val batches: 25 | Test batches: 24


In [1]:
train_acc, val_acc = [], []

for epoch in range(EPOCHS):
    tr_loss, tr_acc = train_epoch(train_loader)
    v_loss, v_acc = evaluate(val_loader)
    train_acc.append(tr_acc)
    val_acc.append(v_acc)
    print(f"Epoch {epoch+1}/{EPOCHS}: Train Acc={tr_acc:.3f}, Val Acc={v_acc:.3f}")

NameError: name 'EPOCHS' is not defined