In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import torchmetrics
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
TRAIN_DIR_PATH = 'voice-commands-classification-2025/train'
TEST_DIR_PATH = 'voice-commands-classification-2025/adv_test'


In [3]:
BATCH_SIZE = 512
N_WORKERS = 8
N_CLASSES = 35
EPOCHS = 50
LR = 0.01


In [4]:
DEVICE = torch.device('cpu')
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')

DEVICE


device(type='cuda', index=0)

In [5]:
class SpeechCommandDataset(Dataset):
    def __init__(self, dir_path, data, labels=None, dict_label_to_index=None, transform=None):
        self.dir_path = dir_path
        self.data = data
        self.labels = labels
        self.dict_label_to_index = dict_label_to_index
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_name = self.data[idx]
        waveform = np.load(os.path.join(self.dir_path, file_name))
        if waveform.shape[1] < 16000:
            waveform = np.pad(
                waveform, pad_width=((0, 0), (0, 16000 - waveform.shape[1])),
                mode='constant',
                constant_values=0
            )

        waveform = torch.from_numpy(waveform)

        if self.transform != None:
            spectrogram = self.transform(waveform)
        
        out_labels = []
        if self.labels is not None:
            if self.labels[idx] in self.dict_label_to_index:
                out_labels = self.dict_label_to_index[self.labels[idx]]

        return waveform, spectrogram, out_labels, int(file_name.split('.')[0])

In [6]:
df_train = pd.read_csv(
    os.path.join(TRAIN_DIR_PATH, 'metadata.csv')
)
dict_label_to_index = {}
dict_index_to_label = {}
for index, key in enumerate(df_train['label'].unique()):
    dict_label_to_index[key] = index
    dict_index_to_label[index] = key

dict_label_to_index

{'stop': 0,
 'go': 1,
 'right': 2,
 'dog': 3,
 'left': 4,
 'yes': 5,
 'zero': 6,
 'four': 7,
 'bird': 8,
 'cat': 9,
 'five': 10,
 'off': 11,
 'learn': 12,
 'six': 13,
 'two': 14,
 'on': 15,
 'up': 16,
 'three': 17,
 'nine': 18,
 'one': 19,
 'follow': 20,
 'wow': 21,
 'seven': 22,
 'sheila': 23,
 'down': 24,
 'no': 25,
 'bed': 26,
 'eight': 27,
 'house': 28,
 'tree': 29,
 'visual': 30,
 'forward': 31,
 'marvin': 32,
 'backward': 33,
 'happy': 34}

In [7]:
df_train_data, df_val_data = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

train_data = df_train_data.file_name.values
train_labels = df_train_data.label.values

val_data = df_val_data.file_name.values
val_labels = df_val_data.label.values



In [8]:
train_transforms = torch.nn.Sequential(
    # torchaudio.transforms.MelSpectrogram(f_min=125, f_max=7500, normalized=True, n_mels=32)
    torchaudio.transforms.MFCC(n_mfcc=32, log_mels=True)
)

val_transform = torch.nn.Sequential(
    torchaudio.transforms.MelSpectrogram(),
    # torchaudio.transforms.MFCC()
)

train_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TRAIN_DIR_PATH,
        data=train_data,
        labels=train_labels,
        dict_label_to_index=dict_label_to_index,
        transform=train_transforms
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=N_WORKERS
)

valid_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TRAIN_DIR_PATH,
        data=val_data,
        labels=val_labels,
        dict_label_to_index=dict_label_to_index,
        transform=train_transforms
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=N_WORKERS
)


In [None]:
for item in train_dataloader:
    # print(item)
    print(item[0].shape, item[1].shape)
    print(item[0].max(), item[0].min())
    break

torch.Size([512, 1, 16000]) torch.Size([512, 1, 32, 81])
tensor(1.0000) tensor(-1.)


In [10]:
class M5(nn.Module):
    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32, sp_channel=32):
        super().__init__()
        # self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        # self.bn1 = nn.BatchNorm1d(n_channel)
        # self.pool1 = nn.MaxPool1d(3)
        self.sp_conv2 = nn.Conv1d(n_channel, sp_channel, kernel_size=3, padding=1)
        self.sp_bn2 = nn.BatchNorm1d(sp_channel)
        self.sp_pool2 = nn.MaxPool1d(3)
        self.sp_conv3 = nn.Conv1d(sp_channel, 2 * sp_channel, kernel_size=3, padding=1)
        self.sp_bn3 = nn.BatchNorm1d(2 * sp_channel)
        self.sp_pool3 = nn.MaxPool1d(3)
        self.sp_conv4 = nn.Conv1d(2 * sp_channel, 2 * sp_channel, kernel_size=3)
        self.sp_bn4 = nn.BatchNorm1d(2 * sp_channel)
        self.sp_pool4 = nn.MaxPool1d(2)
        self.sp_fc1 = nn.Linear(2 * sp_channel, n_output)

        self.conv1 = nn.Conv1d(n_input, 64, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(64)
        # self.pool1 = nn.MaxPool1d(4)
        # self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        # self.bn2 = nn.BatchNorm1d(n_channel)
        # self.pool2 = nn.MaxPool1d(4)
        # self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        # self.bn3 = nn.BatchNorm1d(2 * n_channel)
        # self.pool3 = nn.MaxPool1d(4)
        # self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        # self.bn4 = nn.BatchNorm1d(2 * n_channel)
        # self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)
        self.lstm = nn.LSTM(64, 32, num_layers=2, batch_first=True, bidirectional=True)

    def forward(self, x, sp):
        # x = self.conv1(x)
        
        # x = F.relu(self.bn1(x))
        # x = self.pool1(x)
        sp = sp.squeeze(1)
        # print(sp.shape)
        sp = self.sp_conv2(sp)
        
        
        sp = F.relu(self.sp_bn2(sp))
        sp = self.sp_pool2(sp)
        
        
        sp = self.sp_conv3(sp)
        sp = F.relu(self.sp_bn3(sp))
        sp = self.sp_pool3(sp)
        # print(x.shape)
        sp = self.sp_conv4(sp)
        sp = F.relu(self.sp_bn4(sp))
        sp = self.sp_pool4(sp)
        # print(sp.shape)
        sp = F.avg_pool1d(sp, sp.shape[-1])
        # print(sp.shape)
        sp = sp.permute(0, 2, 1)
        # print(sp.shape)
        # sp = self.sp_fc1(sp)

        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = x.transpose(-1, -2)
        x, _ = self.lstm(x)
        # print(x.shape)
        x = x[:, -1, :].unsqueeze(1)
        # print(x.shape)



        # x = self.pool1(x)
        # x = self.conv2(x)
        # x = F.relu(self.bn2(x))
        # x = self.pool2(x)
        # x = self.conv3(x)
        # x = F.relu(self.bn3(x))
        # x = self.pool3(x)
        # x = self.conv4(x)
        # x = F.relu(self.bn4(x))
        # x = self.pool4(x)
        # x = F.avg_pool1d(x, x.shape[-1])
        # x = x.permute(0, 2, 1)

        # x = torch.stack((x, sp), dim=-1).reshape(x.shape[0], x.shape[1], 128)
        # print(x.shape)
        x = self.fc1(x)
        
        return F.log_softmax(x, dim=2)

In [11]:
model = M5(n_input=1, n_channel=32)
model = model.to(DEVICE)


In [12]:
input_image = torch.rand(4, 1, 16000)

# print((torchaudio.transforms.MFCC()(input_image).shape))

input_sp = train_transforms(input_image).to(DEVICE)
# print(input_sp.shape)
model = model.to(DEVICE)
result = model(input_image.to(DEVICE), input_sp)

print(result.size())


torch.Size([4, 1, 35])


In [13]:
def train_model(model: nn.Module, train_data: DataLoader, valid_data: DataLoader):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
    criterion = nn.NLLLoss()

    accuracy_train = torchmetrics.classification.Accuracy(task="multiclass", num_classes=N_CLASSES).to(DEVICE)
    accuracy_val = torchmetrics.classification.Accuracy(task="multiclass", num_classes=N_CLASSES).to(DEVICE)

    for epoch in range(EPOCHS):
        train_loss = 0.0
        val_loss = 0.0

        model.train()
        for x, x_sp, y, _ in train_data:
            x = x.to(DEVICE)
            x_sp = x_sp.to(DEVICE)
            y = y.to(DEVICE)

            optimizer.zero_grad()

            y_hat = model(x, x_sp).squeeze()
            loss = criterion(y_hat, y)

            loss.backward()
            optimizer.step()

            train_loss += loss.item() * x.size(0)
            _, preds = torch.max(y_hat, 1)

            accuracy_train(
                y_hat,
                y
            )

        model.eval()
        for x, x_sp, y, _ in valid_data:
            x = x.to(DEVICE)
            x_sp = x_sp.to(DEVICE)
            y = y.to(DEVICE)

            y_hat = model(x, x_sp).squeeze()
            loss = criterion(y_hat, y)

            val_loss += loss.item() * x.size(0)
            _, preds = torch.max(y_hat, 1)

            accuracy_val(
                y_hat,
                y
            )

        train_loss = train_loss / len(train_dataloader.dataset)
        val_loss = val_loss / len(valid_dataloader.dataset)

        scheduler.step()

        print(f"Epoch {epoch + 1}/{EPOCHS}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {accuracy_train.compute():.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {accuracy_val.compute():.4f}")
        

In [None]:
train_model(
    model=model,
    train_data=train_dataloader,
    valid_data=valid_dataloader
)

Epoch 1/50
Train Loss: 3.5005, Train Acc: 0.0363
Val Loss: 3.4996, Val Acc: 0.0357
Epoch 2/50
Train Loss: 3.4985, Train Acc: 0.0370
Val Loss: 3.4946, Val Acc: 0.0370
Epoch 3/50
Train Loss: 3.4957, Train Acc: 0.0374
Val Loss: 3.4890, Val Acc: 0.0377
Epoch 4/50
Train Loss: 3.4934, Train Acc: 0.0378
Val Loss: 3.4896, Val Acc: 0.0377
Epoch 5/50
Train Loss: 3.4907, Train Acc: 0.0380
Val Loss: 3.4914, Val Acc: 0.0380
Epoch 6/50
Train Loss: 3.4905, Train Acc: 0.0381
Val Loss: 3.4864, Val Acc: 0.0381
Epoch 7/50
Train Loss: 3.4813, Train Acc: 0.0386
Val Loss: 3.4236, Val Acc: 0.0413
Epoch 8/50
Train Loss: 3.3871, Train Acc: 0.0413
Val Loss: 3.3392, Val Acc: 0.0444
Epoch 9/50
Train Loss: 3.3400, Train Acc: 0.0438
Val Loss: 3.3311, Val Acc: 0.0470
Epoch 10/50
Train Loss: 3.3205, Train Acc: 0.0461
Val Loss: 3.3367, Val Acc: 0.0489
Epoch 11/50
Train Loss: 3.3233, Train Acc: 0.0480
Val Loss: 3.2939, Val Acc: 0.0505
Epoch 12/50
Train Loss: 3.3822, Train Acc: 0.0486
Val Loss: 3.4861, Val Acc: 0.0496
E

In [None]:
print(model.fc1.weight.shape)
print(torch.chunk(model.fc1.weight,2, dim=-1))
assert 0

torch.Size([35, 128])
(tensor([[ 1.2025e-02,  2.2407e-02,  1.5784e-01,  ..., -3.0753e-01,
          1.4213e-03,  1.3600e-01],
        [-3.2666e-03,  7.6779e-01,  4.3384e-02,  ..., -1.7164e-01,
          8.5756e-04,  3.5857e-02],
        [-4.5114e-03,  1.5476e-01, -1.0518e-01,  ..., -1.6196e-01,
          4.2925e-04,  3.3386e-01],
        ...,
        [-8.9999e-03,  6.1591e-02, -1.9614e-01,  ..., -6.4551e-01,
         -2.7019e-04, -4.4770e-01],
        [ 4.5856e-03,  2.4794e-01, -2.6212e-01,  ..., -4.3530e-01,
          1.1133e-03,  6.2596e-01],
        [-3.1596e-03, -3.0311e-01,  6.0339e-02,  ..., -9.0924e-03,
         -8.0231e-04, -4.6914e-01]], device='cuda:0', grad_fn=<SplitBackward0>), tensor([[ 1.6582e-04,  5.4839e-01,  4.0255e-06,  ...,  4.9793e-01,
          1.2330e-05, -1.2917e+00],
        [ 2.5950e-05, -9.4370e-02,  3.1954e-06,  ..., -2.7839e-01,
         -1.6746e-05, -3.2822e-01],
        [-2.6792e-05,  5.8419e-01,  1.4849e-06,  ..., -1.3326e-01,
         -3.5774e-05, -7.442

AssertionError: 

In [None]:
df_test = pd.read_csv(
    os.path.join(TEST_DIR_PATH, 'metadata.csv')
)
test_dataloader = DataLoader(
    SpeechCommandDataset(
        dir_path=TEST_DIR_PATH,
        data=df_test.file_name.values,
        labels=None,
        dict_label_to_index=dict_label_to_index,
        # transform=train_transforms
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=N_WORKERS
)


In [None]:
# ENSEMBLE PREDICTIONS AND SUBMIT
results = {
    'id': [],
    'label': []
}

model.eval()
for x, y, ids in test_dataloader:
    x = x.float().to(DEVICE)
    with torch.no_grad():
        y_hat = model(x).squeeze()
        _, preds = torch.max(y_hat, 1)
        for i in range(len(preds)):
            results["id"].append(ids[i].item())
            results["label"].append(dict_index_to_label[int(preds[i].item())])
        

pd.DataFrame(results).to_csv(
    'submission.csv',
    columns=['id', 'label'],
    index=False
)

In [None]:
from IPython.display import FileLink

FileLink(r'submission.csv')