# Baseline_CNN_KWS.ipynb

In [2]:
# %% [markdown]
# # Baseline CNN for 6-class Keyword Spotting
# Local training on sample_data/speech_commands_v0.02
# Compatible with SNN_Conversion + Loihi notebooks.

# %%
import copy
import os
from pathlib import Path

import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# If you run this notebook from baseline_cnn/, project root is its parent
PROJECT_ROOT = Path.cwd().resolve().parent
DATA_ROOT = PROJECT_ROOT / "sample_data" / "speech_commands_v0.02"
MODEL_DIR = PROJECT_ROOT / "saved_models"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_ROOT exists:", DATA_ROOT.exists())
print("MODEL_DIR exists:", MODEL_DIR.exists())

# Checkpoints are written here later, so make sure the folder exists.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Seed PyTorch's global RNG once, before any model or DataLoader is created,
# so that weight initialisation and training-set shuffling repeat on a
# fresh "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

PROJECT_ROOT: /Users/maddy/Desktop/PLEP/Project/CS-576-Final-Project
DATA_ROOT exists: True
MODEL_DIR exists: True
Using device: cpu


In [3]:
# %%
# Keyword vocabulary; the position of a word in this list is its class index
# (same as SNN + Loihi).
CLASSES = ["yes", "no", "go", "stop", "down", "up"]

# Audio front-end settings.
SAMPLE_RATE = 16000
N_MFCC = 40

# MFCC pipeline (no torchcodec): 400-sample FFT window, 160-sample hop,
# 40 mel bands, and no centre padding of the signal.
mfcc_transform = T.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=N_MFCC,
    melkwargs=dict(n_fft=400, hop_length=160, n_mels=40, center=False),
)

def wav_to_mfcc(path: Path) -> torch.Tensor:
    """
    Turn one WAV file into a normalised MFCC matrix.

    The file is decoded with soundfile, mixed down to mono if it has a channel
    axis, resampled to SAMPLE_RATE when its native rate differs, and passed
    through `mfcc_transform`.

    Output: [N_MFCC, T] float tensor, standardised per sample and clamped
    to [-2, 2].
    """
    samples, native_sr = sf.read(str(path))  # numpy array + file sample rate
    audio = torch.tensor(samples, dtype=torch.float32)

    # Bring the signal to shape [1, num_samples].
    if audio.ndim == 2:
        # 2-D input is averaged over dim 1 (the channel axis) -> [1, frames].
        audio = audio.mean(dim=1, keepdim=True).transpose(0, 1)
    elif audio.ndim == 1:
        audio = audio.unsqueeze(0)

    if native_sr != SAMPLE_RATE:
        audio = torchaudio.functional.resample(audio, native_sr, SAMPLE_RATE)

    feats = mfcc_transform(audio).squeeze(0)  # [N_MFCC, T]

    # Per-sample standardisation (matches SNN notebook style); the epsilon
    # guards against a zero standard deviation.
    feats = (feats - feats.mean()) / (feats.std() + 1e-6)

    # Clamp outliers for stability.
    return torch.clamp(feats, -2.0, 2.0)

In [5]:
# %%
# We will replicate the Speech Commands official split:
#   - validation_list.txt
#   - testing_list.txt
#   - train = remaining files

# Split manifests that live at the dataset root.
val_list_path = DATA_ROOT.joinpath("validation_list.txt")
test_list_path = DATA_ROOT.joinpath("testing_list.txt")

def read_split_list(list_path: Path, classes=None):
    """
    Read a split manifest and keep only the entries of the wanted classes.

    Args:
        list_path: text file with one relative path per line,
            e.g. 'yes/xxxx.wav'.
        classes: iterable of class (folder) names to keep. When None, the
            notebook-level CLASSES list is used, which is the original
            behaviour.

    Returns:
        A set of relative paths like 'yes/xxxx.wav'. Blank lines and entries
        whose first path component is not a wanted class are dropped.
    """
    # Resolve the filter once; a frozenset gives O(1) membership tests.
    wanted = frozenset(CLASSES if classes is None else classes)

    with open(list_path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return {
            rel
            for rel in stripped
            if rel and rel.split("/", 1)[0] in wanted
        }

# Relative paths ('cls/file.wav') that belong to the validation / test splits.
val_rel = read_split_list(val_list_path)
test_rel = read_split_list(test_list_path)

print("Num val entries in file:", len(val_rel))
print("Num test entries in file:", len(test_rel))

# Full Paths for each split; anything not listed as val or test is training data.
train_files, val_files, test_files = [], [], []

# Walk the class folders that actually exist under DATA_ROOT.
for cls in CLASSES:
    cls_dir = DATA_ROOT / cls
    if not cls_dir.exists():
        print(f"Missing class folder: {cls_dir}")
        continue

    for wav_path in cls_dir.glob("*.wav"):
        rel = f"{cls}/{wav_path.name}"
        # Validation membership is checked first, then test, else train.
        if rel in val_rel:
            bucket = val_files
        elif rel in test_rel:
            bucket = test_files
        else:
            bucket = train_files
        bucket.append(wav_path)

print(f"Train files: {len(train_files)}")
print(f"Val files:   {len(val_files)}")
print(f"Test files:  {len(test_files)}")

Num val entries in file: 2252
Num test entries in file: 2468
Train files: 18657
Val files:   2252
Test files:  2468


In [6]:
# %%
class KWS_Dataset(Dataset):
    """
    Map-style dataset over keyword WAV files.

    Each item is `(mfcc, y)`: the MFCC matrix produced by `wav_to_mfcc` for
    the file, and the index of the file's parent-folder name in `classes`.
    """

    def __init__(self, file_list, classes):
        # `classes` is kept by reference; the file list is copied into a list.
        self.classes = classes
        self.files = list(file_list)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        wav_path = self.files[idx]
        features = wav_to_mfcc(wav_path)  # [40, T]
        # The label is the name of the folder that contains the file.
        class_idx = self.classes.index(wav_path.parent.name)
        return features, class_idx


def pad_collate(batch, target_len=None):
    """
    Collate (mfcc, y) pairs into one zero-padded batch.

    Args:
        batch: non-empty list of (mfcc [n_mfcc, T_i], y) tuples.
        target_len: optional fixed number of time frames for the output.
            None (default) pads every item to the longest item in the batch,
            which is the original behaviour. With an integer, every item is
            right-padded with zeros or truncated to exactly `target_len`
            frames, so all batches share one width. This matters for CNN_KWS:
            its LazyLinear layer fixes the flattened input size on the first
            batch, so a later batch with a different width would fail. Pass it
            to a DataLoader via
            `functools.partial(pad_collate, target_len=...)`.

    Returns:
        xs_pad: float tensor [B, n_mfcc, T_out].
        ys: long tensor [B] with the labels.

    Raises:
        ValueError: if `batch` is empty.
    """
    if len(batch) == 0:
        # Fail with a clear message instead of an opaque tuple-unpacking error.
        raise ValueError("pad_collate received an empty batch")

    xs, ys = zip(*batch)

    if target_len is None:
        width = max(x.shape[1] for x in xs)
    else:
        width = int(target_len)

    # Crop items longer than `width`, then right-pad the time dim with zeros.
    xs_pad = torch.stack(
        [F.pad(x[:, :width], (0, max(width - x.shape[1], 0))) for x in xs]
    )  # [B, n_mfcc, width]
    ys = torch.tensor(ys, dtype=torch.long)
    return xs_pad, ys

In [7]:
# %%
BATCH_SIZE = 64

train_dataset = KWS_Dataset(train_files, CLASSES)
val_dataset   = KWS_Dataset(val_files, CLASSES)
test_dataset  = KWS_Dataset(test_files, CLASSES)


def _make_loader(dataset, shuffle):
    """Wrap a dataset in a DataLoader using the notebook-wide batch settings."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=0,
        collate_fn=pad_collate,
    )


# Only the training split is shuffled; val/test keep a fixed order.
train_loader = _make_loader(train_dataset, shuffle=True)
val_loader = _make_loader(val_dataset, shuffle=False)
test_loader = _make_loader(test_dataset, shuffle=False)

print("Train batches:", len(train_loader))
print("Val batches:  ", len(val_loader))
print("Test batches: ", len(test_loader))

Train batches: 292
Val batches:   36
Test batches:  39


In [8]:
# %%
class CNN_KWS(nn.Module):
    """
    Small 2-D CNN keyword classifier over MFCC "images".

    Two conv -> ReLU -> 2x2 max-pool stages feed a two-layer classifier head.
    The first head layer is lazy: its input size is inferred from the first
    batch that passes through the network.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 8, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        # LazyLinear infers the flattened feature size on the first forward pass.
        head = [
            nn.LazyLinear(64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map a batch x of shape [B, 40, T] to class logits [B, num_classes]."""
        feature_maps = self.features(x[:, None])  # add channel dim -> [B,1,40,T]
        flat = feature_maps.flatten(start_dim=1)  # [B, F]
        return self.classifier(flat)

# Build the network with one output per keyword class, then move it to the
# selected device.
model = CNN_KWS(num_classes=len(CLASSES))
model = model.to(device)
print(model)

CNN_KWS(
  (features): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): LazyLinear(in_features=0, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=6, bias=True)
  )
)


In [9]:
# %%
# Loss, optimiser, and a step schedule that halves the learning rate every
# 5 scheduler steps.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)

def train_one_epoch(model, loader, optimizer, device):
    """
    Run one optimisation pass over `loader`.

    Uses the notebook-level `criterion` as the loss. Returns
    `(avg_loss, acc)`: the sample-weighted mean loss and the fraction of
    correct predictions over the epoch.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for features, targets in tqdm(loader, desc="Train", leave=False):
        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is
        # per-sample.
        batch_n = targets.size(0)
        loss_sum += batch_loss.item() * batch_n
        n_correct += (logits.argmax(dim=1) == targets).sum().item()
        n_seen += batch_n

    # max(..., 1) avoids a division by zero when the loader is empty.
    denom = max(n_seen, 1)
    return loss_sum / denom, n_correct / denom


@torch.no_grad()
def evaluate(model, loader, device):
    """
    Score `model` on `loader` without tracking gradients.

    Uses the notebook-level `criterion` as the loss. Returns
    `(avg_loss, acc)`: the sample-weighted mean loss and the fraction of
    correct predictions.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for features, targets in tqdm(loader, desc="Eval", leave=False):
        features, targets = features.to(device), targets.to(device)

        logits = model(features)
        batch_loss = criterion(logits, targets)

        batch_n = targets.size(0)
        loss_sum += batch_loss.item() * batch_n
        n_correct += (logits.argmax(dim=1) == targets).sum().item()
        n_seen += batch_n

    # max(..., 1) avoids a division by zero when the loader is empty.
    denom = max(n_seen, 1)
    return loss_sum / denom, n_correct / denom

In [10]:
# %%
EPOCHS = 10
best_val_acc = 0.0
best_state = None  # snapshot of the weights with the best validation accuracy

for epoch in range(1, EPOCHS + 1):
    print(f"\n=== Epoch {epoch}/{EPOCHS} ===")
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, device)
    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    print(
        f"Train: loss={train_loss:.4f}, acc={train_acc*100:.2f}% | "
        f"Val: loss={val_loss:.4f}, acc={val_acc*100:.2f}%"
    )

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() returns references to the live parameter tensors, so a
        # plain assignment would be silently overwritten by later epochs.
        # Deep-copy to keep a true snapshot of the best weights in memory.
        best_state = copy.deepcopy(model.state_dict())
        torch.save(best_state, MODEL_DIR / "baseline_cnn_kws_vfinal.pt")
        print(f"Saved new best model (val acc = {best_val_acc*100:.2f}%)")

print("\nTraining finished.")
print(f"Best validation accuracy: {best_val_acc*100:.2f}%")


=== Epoch 1/10 ===


                                                                                                                        

Train: loss=1.1990, acc=52.74% | Val: loss=0.9640, acc=63.63%
✅ Saved new best model (val acc = 63.63%)

=== Epoch 2/10 ===


                                                                                                                        

Train: loss=0.8448, acc=68.64% | Val: loss=0.8051, acc=69.45%
✅ Saved new best model (val acc = 69.45%)

=== Epoch 3/10 ===


                                                                                                                        

Train: loss=0.7070, acc=73.96% | Val: loss=0.6788, acc=75.71%
✅ Saved new best model (val acc = 75.71%)

=== Epoch 4/10 ===


                                                                                                                        

Train: loss=0.6043, acc=77.82% | Val: loss=0.6452, acc=76.78%
✅ Saved new best model (val acc = 76.78%)

=== Epoch 5/10 ===


                                                                                                                        

Train: loss=0.5301, acc=80.69% | Val: loss=0.6137, acc=78.51%
✅ Saved new best model (val acc = 78.51%)

=== Epoch 6/10 ===


                                                                                                                        

Train: loss=0.4385, acc=84.46% | Val: loss=0.5875, acc=79.80%
✅ Saved new best model (val acc = 79.80%)

=== Epoch 7/10 ===


                                                                                                                        

Train: loss=0.4033, acc=85.74% | Val: loss=0.5830, acc=80.55%
✅ Saved new best model (val acc = 80.55%)

=== Epoch 8/10 ===


                                                                                                                        

Train: loss=0.3758, acc=86.56% | Val: loss=0.5866, acc=80.64%
✅ Saved new best model (val acc = 80.64%)

=== Epoch 9/10 ===


                                                                                                                        

Train: loss=0.3448, acc=87.99% | Val: loss=0.5790, acc=81.22%
✅ Saved new best model (val acc = 81.22%)

=== Epoch 10/10 ===


                                                                                                                        

Train: loss=0.3189, acc=88.69% | Val: loss=0.5861, acc=81.79%
✅ Saved new best model (val acc = 81.79%)

Training finished.
Best validation accuracy: 81.79%




In [11]:
# %%
best_path = MODEL_DIR / "baseline_cnn_kws_vfinal.pt"
assert best_path.exists(), f"Checkpoint not found: {best_path}"

best_model = CNN_KWS(num_classes=len(CLASSES)).to(device)

# The LazyLinear layer has no weights until it has seen an input, so push one
# training batch through the fresh model before loading the checkpoint.
warmup_batch = next(iter(train_loader), None)
if warmup_batch is not None:
    with torch.no_grad():
        best_model(warmup_batch[0].to(device))

best_state = torch.load(best_path, map_location=device)
best_model.load_state_dict(best_state)
best_model.eval()

test_loss, test_acc = evaluate(best_model, test_loader, device)
print(f"Final Test Accuracy: {test_acc*100:.2f}%")

                                                                                                                        

✅ Final Test Accuracy: 80.43%




In [12]:
# %%
# Save final CNN model
# This path is the same file the training loop uses for its best-validation
# checkpoint. `model` holds the last-epoch weights, which are not necessarily
# the best ones, so saving it here could replace the checkpoint with a worse
# model than the one evaluated on the test set. Save `best_model` (the
# reloaded best checkpoint) instead, so the file on disk always matches the
# reported test accuracy.
save_path = MODEL_DIR / "baseline_cnn_kws_vfinal.pt"
torch.save(best_model.state_dict(), save_path)

print(f"CNN model saved to: {save_path}")

CNN model saved to: /Users/maddy/Desktop/PLEP/Project/CS-576-Final-Project/saved_models/baseline_cnn_kws_vfinal.pt
