In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import os

import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import torchmetrics
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

from waveform_encoder import WaveformEncoder

In [2]:
TRAIN_DIR_PATH = 'voice-commands-classification-2025/train'
TEST_DIR_PATH = 'voice-commands-classification-2025/adv_test'

In [3]:
BATCH_SIZE = 512
N_WORKERS = 6
N_CLASSES = 35
EPOCHS = 50
LR = 0.01

N_MFCC = 120
NOISE_AMPLITUDE = 0.000
MASK_PROB = 0.1

DEVICE = torch.device('cpu')
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')

DEVICE

device(type='cuda', index=0)

In [4]:
# Dataset

def noise_waveform(waveform: torch.Tensor, noise_amplitude: float = 0.05) -> torch.Tensor:
    noise = noise_amplitude * torch.randn(waveform.shape).to(waveform.device)
    noisy_waveform = waveform + noise
    noisy_waveform = torch.clamp(noisy_waveform, -1.0, 1.0)
    return noisy_waveform

class SpeechCommandDataset(Dataset):
    def __init__(self, dir_path, data, labels=None, dict_label_to_index=None, transform=None, noise_amplitude=0.00):
        self.dir_path = dir_path
        self.data = data
        self.labels = labels
        self.dict_label_to_index = dict_label_to_index
        self.transform = transform
        self.noise_amplitude = noise_amplitude

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_name = self.data[idx]
        waveform = np.load(os.path.join(self.dir_path, file_name))
        if waveform.shape[1] < 16000:
            waveform = np.pad(
                waveform, pad_width=((0, 0), (0, 16000 - waveform.shape[1])),
                mode='constant',
                constant_values=0
            )

        waveform = torch.from_numpy(waveform).float()

        if self.transform != None:
            spectrogram = self.transform(waveform)
        else:
            spectrogram = None

        if self.noise_amplitude > 0:
            waveform = noise_waveform(waveform, self.noise_amplitude)
        
        out_labels = []
        if self.labels is not None:
            if self.labels[idx] in self.dict_label_to_index:
                out_labels = self.dict_label_to_index[self.labels[idx]]

        return waveform, spectrogram, out_labels, int(file_name.split('.')[0])

In [5]:
df_train = pd.read_csv(
    os.path.join(TRAIN_DIR_PATH, 'metadata.csv')
)
dict_label_to_index = {}
dict_index_to_label = {}
for index, key in enumerate(df_train['label'].unique()):
    dict_label_to_index[key] = index
    dict_index_to_label[index] = key

dict_label_to_index

{'stop': 0,
 'go': 1,
 'right': 2,
 'dog': 3,
 'left': 4,
 'yes': 5,
 'zero': 6,
 'four': 7,
 'bird': 8,
 'cat': 9,
 'five': 10,
 'off': 11,
 'learn': 12,
 'six': 13,
 'two': 14,
 'on': 15,
 'up': 16,
 'three': 17,
 'nine': 18,
 'one': 19,
 'follow': 20,
 'wow': 21,
 'seven': 22,
 'sheila': 23,
 'down': 24,
 'no': 25,
 'bed': 26,
 'eight': 27,
 'house': 28,
 'tree': 29,
 'visual': 30,
 'forward': 31,
 'marvin': 32,
 'backward': 33,
 'happy': 34}

In [6]:
df_train_data, df_val_data = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

train_data = df_train_data.file_name.values
train_labels = df_train_data.label.values

val_data = df_val_data.file_name.values
val_labels = df_val_data.label.values

In [7]:
# DataLoader, transform

train_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TRAIN_DIR_PATH,
        data=train_data,
        labels=train_labels,
        dict_label_to_index=dict_label_to_index,
        transform=torchaudio.transforms.MFCC(n_mfcc=N_MFCC, log_mels=True),
        noise_amplitude=NOISE_AMPLITUDE
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=N_WORKERS
)

valid_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TRAIN_DIR_PATH,
        data=val_data,
        labels=val_labels,
        dict_label_to_index=dict_label_to_index,
        transform=torchaudio.transforms.MFCC(n_mfcc=N_MFCC, log_mels=True),
        noise_amplitude=0.0
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=N_WORKERS
)


In [8]:
for item in train_dataloader:
    print(item[0].shape, item[1].shape)
    break

torch.Size([512, 1, 16000]) torch.Size([512, 1, 120, 81])


In [24]:
class M5(nn.Module):
    def __init__(self, n_input, n_output, stride=160, n_channel=32):
        super().__init__()
        self.wf_enc = WaveformEncoder(n_input, stride=stride, kernel_size=400, n_channel=n_channel)
        self.out = nn.Linear(n_channel, n_output)

    def forward(self, sg):
        sg = self.wf_enc(sg)
        
        sg = sg.transpose(-1, -2)
        sg = F.avg_pool1d(sg, sg.shape[-1])
        sg = sg.transpose(-1, -2)
        sg = self.out(sg)
        return F.log_softmax(sg, dim=2)

In [25]:
model = M5(n_input=1, n_output=35, stride=160, n_channel=N_MFCC).to(DEVICE)

In [26]:
input_image = torch.rand(4, 1, 16000).to(DEVICE)
model = model.to(DEVICE)
result = model(input_image)
print(result.size())

torch.Size([4, 1, 35])


In [27]:
def lr_lambda(current_step):
    return max(0.0, float(EPOCHS - current_step) / EPOCHS)


def train_model(model: nn.Module, train_data: DataLoader, valid_data: DataLoader):
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=2e-5)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    criterion = nn.NLLLoss()

    accuracy_train = torchmetrics.classification.Accuracy(task="multiclass", num_classes=N_CLASSES).to(DEVICE)
    accuracy_val = torchmetrics.classification.Accuracy(task="multiclass", num_classes=N_CLASSES).to(DEVICE)

    for epoch in range(EPOCHS):
        train_loss = 0.0
        val_loss = 0.0

        model.train()
        for x, x_sp, y, _ in train_data:
            x = x.to(DEVICE)
            x_sp = x_sp.to(DEVICE)
            y = y.to(DEVICE)

            optimizer.zero_grad()

            y_hat = model(x).squeeze()
            loss = criterion(y_hat, y)

            loss.backward()
            optimizer.step()

            train_loss += loss.item() * x.size(0)
            _, preds = torch.max(y_hat, 1)

            accuracy_train(
                y_hat,
                y
            )

        model.eval()
        for x, x_sp, y, _ in valid_data:
            x = x.to(DEVICE)
            x_sp = x_sp.to(DEVICE)
            y = y.to(DEVICE)

            y_hat = model(x).squeeze()
            loss = criterion(y_hat, y)

            val_loss += loss.item() * x.size(0)
            _, preds = torch.max(y_hat, 1)

            accuracy_val(
                y_hat,
                y
            )

        train_loss = train_loss / len(train_dataloader.dataset)
        val_loss = val_loss / len(valid_dataloader.dataset)

        scheduler.step()

        print(f"Epoch {epoch + 1}/{EPOCHS}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {accuracy_train.compute():.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {accuracy_val.compute():.4f}")
        

In [28]:
train_model(
    model=model,
    train_data=train_dataloader,
    valid_data=valid_dataloader
)
torch.save(model.state_dict(), 'waveform_encoder_1.pt')

Epoch 1/50
Train Loss: 2.3397, Train Acc: 0.3305
Val Loss: 1.6559, Val Acc: 0.5123
Epoch 2/50
Train Loss: 1.2853, Train Acc: 0.4718
Val Loss: 1.1106, Val Acc: 0.5939
Epoch 3/50
Train Loss: 0.9170, Train Acc: 0.5559
Val Loss: 0.8415, Val Acc: 0.6446
Epoch 4/50
Train Loss: 0.7404, Train Acc: 0.6107
Val Loss: 0.7632, Val Acc: 0.6764
Epoch 5/50
Train Loss: 0.6263, Train Acc: 0.6501
Val Loss: 0.7212, Val Acc: 0.6978
Epoch 6/50
Train Loss: 0.5729, Train Acc: 0.6789
Val Loss: 0.6916, Val Acc: 0.7135
Epoch 7/50
Train Loss: 0.5176, Train Acc: 0.7018
Val Loss: 0.6568, Val Acc: 0.7261
Epoch 8/50
Train Loss: 0.4734, Train Acc: 0.7206
Val Loss: 0.6151, Val Acc: 0.7372
Epoch 9/50
Train Loss: 0.4601, Train Acc: 0.7356
Val Loss: 0.6181, Val Acc: 0.7455
Epoch 10/50
Train Loss: 0.4118, Train Acc: 0.7491
Val Loss: 0.5990, Val Acc: 0.7529
Epoch 11/50
Train Loss: 0.3904, Train Acc: 0.7607
Val Loss: 0.5930, Val Acc: 0.7592
Epoch 12/50
Train Loss: 0.3635, Train Acc: 0.7710
Val Loss: 0.5791, Val Acc: 0.7650
E

In [29]:
train_model(
    model=model,
    train_data=train_dataloader,
    valid_data=valid_dataloader
)
torch.save(model.state_dict(), 'waveform_encoder_2.pt')

Epoch 1/50
Train Loss: 0.3769, Train Acc: 0.8847
Val Loss: 0.6035, Val Acc: 0.8340
Epoch 2/50
Train Loss: 0.2919, Train Acc: 0.8953
Val Loss: 0.6378, Val Acc: 0.8303
Epoch 3/50
Train Loss: 0.3055, Train Acc: 0.8974
Val Loss: 0.5844, Val Acc: 0.8325
Epoch 4/50
Train Loss: 0.2746, Train Acc: 0.9010
Val Loss: 0.5613, Val Acc: 0.8345
Epoch 5/50
Train Loss: 0.2684, Train Acc: 0.9034
Val Loss: 0.5738, Val Acc: 0.8360
Epoch 6/50
Train Loss: 0.2568, Train Acc: 0.9057
Val Loss: 0.5781, Val Acc: 0.8368
Epoch 7/50
Train Loss: 0.2467, Train Acc: 0.9075
Val Loss: 0.5614, Val Acc: 0.8382
Epoch 8/50
Train Loss: 0.2359, Train Acc: 0.9096
Val Loss: 0.5866, Val Acc: 0.8390
Epoch 9/50
Train Loss: 0.2284, Train Acc: 0.9114
Val Loss: 0.5762, Val Acc: 0.8394
Epoch 10/50
Train Loss: 0.2186, Train Acc: 0.9131
Val Loss: 0.5801, Val Acc: 0.8401
Epoch 11/50
Train Loss: 0.2018, Train Acc: 0.9151
Val Loss: 0.5679, Val Acc: 0.8408
Epoch 12/50
Train Loss: 0.2086, Train Acc: 0.9166
Val Loss: 0.5632, Val Acc: 0.8414
E

In [30]:
train_model(
    model=model,
    train_data=train_dataloader,
    valid_data=valid_dataloader
)
torch.save(model.state_dict(), 'waveform_encoder.pt')

Epoch 1/50
Train Loss: 0.3065, Train Acc: 0.9041
Val Loss: 0.5998, Val Acc: 0.8432
Epoch 2/50
Train Loss: 0.2419, Train Acc: 0.9133
Val Loss: 0.5784, Val Acc: 0.8431
Epoch 3/50
Train Loss: 0.2396, Train Acc: 0.9161
Val Loss: 0.5869, Val Acc: 0.8434
Epoch 4/50
Train Loss: 0.2221, Train Acc: 0.9191
Val Loss: 0.5635, Val Acc: 0.8445
Epoch 5/50
Train Loss: 0.2120, Train Acc: 0.9214
Val Loss: 0.5595, Val Acc: 0.8457
Epoch 6/50
Train Loss: 0.2055, Train Acc: 0.9234
Val Loss: 0.5485, Val Acc: 0.8467
Epoch 7/50
Train Loss: 0.2002, Train Acc: 0.9249
Val Loss: 0.5517, Val Acc: 0.8472
Epoch 8/50
Train Loss: 0.1911, Train Acc: 0.9265
Val Loss: 0.5588, Val Acc: 0.8481
Epoch 9/50
Train Loss: 0.1807, Train Acc: 0.9281
Val Loss: 0.5635, Val Acc: 0.8486
Epoch 10/50
Train Loss: 0.1697, Train Acc: 0.9298
Val Loss: 0.5802, Val Acc: 0.8492
Epoch 11/50
Train Loss: 0.1700, Train Acc: 0.9311
Val Loss: 0.5960, Val Acc: 0.8491
Epoch 12/50
Train Loss: 0.1710, Train Acc: 0.9321
Val Loss: 0.5670, Val Acc: 0.8498
E

In [31]:
df_test = pd.read_csv(
    os.path.join(TEST_DIR_PATH, 'metadata.csv')
)
test_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TEST_DIR_PATH,
        data=df_test.file_name.values,
        labels=None,
        dict_label_to_index=dict_label_to_index,
        transform=torchaudio.transforms.MFCC(n_mfcc=N_MFCC, log_mels=True),
        noise_amplitude=0.0
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=N_WORKERS
)


In [32]:
# ENSEMBLE PREDICTIONS AND SUBMIT
results = {
    'id': [],
    'label': []
}

model.eval()
for x, x_sp, y, ids in test_dataloader:
    x = x.to(DEVICE)
    x_sp = x_sp.to(DEVICE)
    with torch.no_grad():
        y_hat = model(x).squeeze()
        _, preds = torch.max(y_hat, 1)
        for i in range(len(preds)):
            results["id"].append(ids[i].item())
            results["label"].append(dict_index_to_label[int(preds[i].item())])
        

pd.DataFrame(results).to_csv(
    'submission_waveform_encoder.csv',
    columns=['id', 'label'],
    index=False
)