In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import os

import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import torchmetrics
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

from spectrogram_encoder import SpectrogramEncoder

In [2]:
TRAIN_DIR_PATH = 'voice-commands-classification-2025/train'
TEST_DIR_PATH = 'voice-commands-classification-2025/adv_test'

In [3]:
BATCH_SIZE = 512
N_WORKERS = 6
N_CLASSES = 35
EPOCHS = 50
LR = 0.01

N_MFCC = 120
NOISE_AMPLITUDE = 0.00
MASK_PROB = 0.1

DEVICE = torch.device('cpu')
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')

DEVICE

device(type='cuda', index=0)

In [4]:
# Dataset

def noise_waveform(waveform: torch.Tensor, noise_amplitude: float = 0.05) -> torch.Tensor:
    noise = noise_amplitude * torch.randn(waveform.shape).to(waveform.device)
    noisy_waveform = waveform + noise
    noisy_waveform = torch.clamp(noisy_waveform, -1.0, 1.0)
    return noisy_waveform

class SpeechCommandDataset(Dataset):
    def __init__(self, dir_path, data, labels=None, dict_label_to_index=None, transform=None, noise_amplitude=0.00):
        self.dir_path = dir_path
        self.data = data
        self.labels = labels
        self.dict_label_to_index = dict_label_to_index
        self.transform = transform
        self.noise_amplitude = noise_amplitude

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_name = self.data[idx]
        waveform = np.load(os.path.join(self.dir_path, file_name))
        if waveform.shape[1] < 16000:
            waveform = np.pad(
                waveform, pad_width=((0, 0), (0, 16000 - waveform.shape[1])),
                mode='constant',
                constant_values=0
            )

        waveform = torch.from_numpy(waveform)
        if self.noise_amplitude > 0:
            waveform = noise_waveform(waveform, self.noise_amplitude)

        if self.transform != None:
            spectrogram = self.transform(waveform.float())
        else:
            spectrogram = None
        
        out_labels = []
        if self.labels is not None:
            if self.labels[idx] in self.dict_label_to_index:
                out_labels = self.dict_label_to_index[self.labels[idx]]

        return waveform, spectrogram, out_labels, int(file_name.split('.')[0])

In [5]:
df_train = pd.read_csv(
    os.path.join(TRAIN_DIR_PATH, 'metadata.csv')
)
dict_label_to_index = {}
dict_index_to_label = {}
for index, key in enumerate(df_train['label'].unique()):
    dict_label_to_index[key] = index
    dict_index_to_label[index] = key

dict_label_to_index

{'stop': 0,
 'go': 1,
 'right': 2,
 'dog': 3,
 'left': 4,
 'yes': 5,
 'zero': 6,
 'four': 7,
 'bird': 8,
 'cat': 9,
 'five': 10,
 'off': 11,
 'learn': 12,
 'six': 13,
 'two': 14,
 'on': 15,
 'up': 16,
 'three': 17,
 'nine': 18,
 'one': 19,
 'follow': 20,
 'wow': 21,
 'seven': 22,
 'sheila': 23,
 'down': 24,
 'no': 25,
 'bed': 26,
 'eight': 27,
 'house': 28,
 'tree': 29,
 'visual': 30,
 'forward': 31,
 'marvin': 32,
 'backward': 33,
 'happy': 34}

In [6]:
df_train_data, df_val_data = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

train_data = df_train_data.file_name.values
train_labels = df_train_data.label.values

val_data = df_val_data.file_name.values
val_labels = df_val_data.label.values

In [7]:
# DataLoader, transform

train_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TRAIN_DIR_PATH,
        data=train_data,
        labels=train_labels,
        dict_label_to_index=dict_label_to_index,
        transform=torchaudio.transforms.MFCC(n_mfcc=N_MFCC, log_mels=True),
        noise_amplitude=NOISE_AMPLITUDE
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=N_WORKERS
)

valid_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TRAIN_DIR_PATH,
        data=val_data,
        labels=val_labels,
        dict_label_to_index=dict_label_to_index,
        transform=torchaudio.transforms.MFCC(n_mfcc=N_MFCC, log_mels=True),
        noise_amplitude=0.0
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=N_WORKERS
)


In [8]:
for item in train_dataloader:
    print(item[0].shape, item[1].shape)
    break

torch.Size([512, 1, 16000]) torch.Size([512, 1, 120, 81])


In [9]:
class M5(nn.Module):
    def __init__(self, n_class, hidden_dim=96, mask_prob=0.1):
        super().__init__()
        self.sg_enc = SpectrogramEncoder(n_layer=4, n_head=6, hidden_dim=hidden_dim, mask_prob=mask_prob)
        self.lm_head = nn.Linear(hidden_dim, n_class)

    def forward(self, sg):
        sg = self.sg_enc(sg)
        sg = sg.transpose(-1, -2)
        sg = F.avg_pool1d(sg, sg.shape[-1])
        sg = sg.transpose(-1, -2)
        sg = self.lm_head(sg)
        return F.log_softmax(sg, dim=2)

In [10]:
model = M5(hidden_dim=N_MFCC, n_class=35, mask_prob=MASK_PROB)
model = model.to(DEVICE)

In [11]:
input_image = torch.rand(4, 1, 16000)
input_sp = torchaudio.transforms.MFCC(n_mfcc=N_MFCC, log_mels=True)(input_image).squeeze(1).transpose(-1, -2).to(DEVICE)
model = model.to(DEVICE)
result = model(input_sp)
print(result.size())

torch.Size([4, 1, 35])


In [None]:
def lr_lambda(current_step):
    return max(0.0, float(EPOCHS - current_step) / EPOCHS)


def train_model(model: nn.Module, train_data: DataLoader, valid_data: DataLoader):
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    criterion = nn.NLLLoss()

    accuracy_train = torchmetrics.classification.Accuracy(task="multiclass", num_classes=N_CLASSES).to(DEVICE)
    accuracy_val = torchmetrics.classification.Accuracy(task="multiclass", num_classes=N_CLASSES).to(DEVICE)

    for epoch in range(EPOCHS):
        train_loss = 0.0
        val_loss = 0.0

        model.train()
        for x, x_sp, y, _ in train_data:
            x = x.to(DEVICE)
            x_sp = x_sp.to(DEVICE)
            y = y.to(DEVICE)

            optimizer.zero_grad()

            y_hat = model(x_sp.squeeze(1).transpose(-1, -2)).squeeze()
            loss = criterion(y_hat, y)

            loss.backward()
            optimizer.step()

            train_loss += loss.item() * x.size(0)
            _, preds = torch.max(y_hat, 1)

            accuracy_train(
                y_hat,
                y
            )

        model.eval()
        for x, x_sp, y, _ in valid_data:
            x = x.to(DEVICE)
            x_sp = x_sp.to(DEVICE)
            y = y.to(DEVICE)

            y_hat = model(x_sp.squeeze(1).transpose(-1, -2)).squeeze()
            loss = criterion(y_hat, y)

            val_loss += loss.item() * x.size(0)
            _, preds = torch.max(y_hat, 1)

            accuracy_val(
                y_hat,
                y
            )

        train_loss = train_loss / len(train_dataloader.dataset)
        val_loss = val_loss / len(valid_dataloader.dataset)

        scheduler.step()

        print(f"Epoch {epoch + 1}/{EPOCHS}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {accuracy_train.compute():.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {accuracy_val.compute():.4f}")
        

In [13]:
train_model(
    model=model,
    train_data=train_dataloader,
    valid_data=valid_dataloader
)

Epoch 1/50
Train Loss: 1.1797, Train Acc: 0.6603
Val Loss: 0.4985, Val Acc: 0.8515
Epoch 2/50
Train Loss: 0.4507, Train Acc: 0.7630
Val Loss: 0.3791, Val Acc: 0.8680
Epoch 3/50
Train Loss: 0.3559, Train Acc: 0.8065
Val Loss: 0.3408, Val Acc: 0.8784
Epoch 4/50
Train Loss: 0.3022, Train Acc: 0.8320
Val Loss: 0.2885, Val Acc: 0.8867
Epoch 5/50
Train Loss: 0.2778, Train Acc: 0.8488
Val Loss: 0.2735, Val Acc: 0.8929
Epoch 6/50
Train Loss: 0.2570, Train Acc: 0.8609
Val Loss: 0.2621, Val Acc: 0.8978
Epoch 7/50
Train Loss: 0.2364, Train Acc: 0.8706
Val Loss: 0.2462, Val Acc: 0.9016
Epoch 8/50
Train Loss: 0.2265, Train Acc: 0.8781
Val Loss: 0.2471, Val Acc: 0.9048
Epoch 9/50
Train Loss: 0.2163, Train Acc: 0.8843
Val Loss: 0.2282, Val Acc: 0.9080
Epoch 10/50
Train Loss: 0.2089, Train Acc: 0.8894
Val Loss: 0.2298, Val Acc: 0.9104
Epoch 11/50
Train Loss: 0.2001, Train Acc: 0.8939
Val Loss: 0.2309, Val Acc: 0.9124
Epoch 12/50
Train Loss: 0.1861, Train Acc: 0.8979
Val Loss: 0.2161, Val Acc: 0.9144
E

In [19]:
df_test = pd.read_csv(
    os.path.join(TEST_DIR_PATH, 'metadata.csv')
)
test_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TEST_DIR_PATH,
        data=df_test.file_name.values,
        labels=None,
        dict_label_to_index=dict_label_to_index,
        transform=torchaudio.transforms.MFCC(n_mfcc=N_MFCC, log_mels=True),
        noise_amplitude=0.0
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=N_WORKERS
)


In [23]:
# ENSEMBLE PREDICTIONS AND SUBMIT
results = {
    'id': [],
    'label': []
}

model.eval()
for x, x_sp, y, ids in test_dataloader:
    x = x.to(DEVICE)
    x_sp = x_sp.to(DEVICE)
    with torch.no_grad():
        y_hat = model(x_sp.squeeze(1).transpose(-1, -2)).squeeze()
        _, preds = torch.max(y_hat, 1)
        for i in range(len(preds)):
            results["id"].append(ids[i].item())
            results["label"].append(dict_index_to_label[int(preds[i].item())])
        

pd.DataFrame(results).to_csv(
    'submission.csv',
    columns=['id', 'label'],
    index=False
)

In [24]:
torch.save(model.state_dict(), 'spectrogram_encoder.pt')