In [25]:
import os
import random
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from python_speech_features import mfcc
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

from src.const import MAIN_LABELS, SEED

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

Model for predicting silence

In [None]:
class Silence_GRU(nn.Module):
    """Binary GRU classifier used in this notebook to detect silence.

    A stacked ``nn.GRU`` (``batch_first=True``) reads the sequence; the GRU
    output at the final time step goes through
    dropout -> batch norm -> linear -> dropout -> ReLU -> batch norm -> linear
    -> sigmoid, yielding one value in (0, 1) per sample.
    """

    def __init__(self, input_size, hidden_size, num_layers=4, units=64, dropout_rate=0.3):
        """Create the layers.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the GRU hidden state.
            num_layers: Number of stacked GRU layers.
            units: Width of the hidden fully connected layer.
            dropout_rate: Dropout probability, used both between GRU layers
                and by the dropout module applied in the head.
        """
        # Reseed the global torch / random / numpy generators so the weight
        # initialisation below is repeatable. This resets process-wide RNG
        # state every time a model is constructed.
        for reseed in (torch.manual_seed, random.seed, np.random.seed):
            reseed(SEED)

        super().__init__()

        # Attribute names and registration order define the state_dict layout
        # that saved checkpoints rely on; keep them stable.
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_size, units)
        self.fc2 = nn.Linear(units, 1)
        self.bc1 = nn.BatchNorm1d(hidden_size)
        self.bc2 = nn.BatchNorm1d(units)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one sigmoid score per sample, shape ``(batch, 1)``.

        Args:
            x: Batch-first input sequence (the GRU is built with
                ``batch_first=True``); the last axis must be ``input_size``.
        """
        sequence_out, _ = self.gru(x)
        # Only the GRU output at the final time step feeds the head.
        last_step = sequence_out[:, -1, :]

        hidden = self.fc1(self.bc1(self.dropout(last_step)))
        hidden = self.bc2(self.relu(self.dropout(hidden)))
        return self.sigmoid(self.fc2(hidden))

Model for predicting unknown 

In [3]:
class Unknown_BiLSTM(nn.Module):
    """Binary bidirectional-LSTM classifier used here to detect 'unknown' words.

    A stacked bidirectional ``nn.LSTM`` (``batch_first=True``) reads the
    sequence; its output at the final time step (width ``2 * hidden_size``)
    goes through dropout -> batch norm -> linear -> dropout -> ReLU ->
    batch norm -> linear -> sigmoid, yielding one value in (0, 1) per sample.
    """

    def __init__(self, input_size, hidden_size, num_layers=4, units=512, dropout_rate=0.3):
        """Create the layers.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the LSTM hidden state per direction.
            num_layers: Number of stacked LSTM layers.
            units: Width of the hidden fully connected layer.
            dropout_rate: Dropout probability, used both between LSTM layers
                and by the dropout module applied in the head.
        """
        # Reseed the global torch / random / numpy generators so the weight
        # initialisation below is repeatable. This resets process-wide RNG
        # state every time a model is constructed.
        for reseed in (torch.manual_seed, random.seed, np.random.seed):
            reseed(SEED)

        super().__init__()

        # Both directions are concatenated, so the head sees 2 * hidden_size.
        recurrent_width = 2 * hidden_size

        # Attribute names and registration order define the state_dict layout
        # that saved checkpoints rely on; keep them stable.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_rate,
            bidirectional=True,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(recurrent_width, units)
        self.fc2 = nn.Linear(units, 1)
        self.bc1 = nn.BatchNorm1d(recurrent_width)
        self.bc2 = nn.BatchNorm1d(units)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one sigmoid score per sample, shape ``(batch, 1)``.

        Args:
            x: Batch-first input sequence (the LSTM is built with
                ``batch_first=True``); the last axis must be ``input_size``.
        """
        sequence_out, _ = self.lstm(x)
        # NOTE(review): at the final time step the backward direction has only
        # processed the last frame, so half of these features carry little
        # sequence context. Using the backward state from step 0 (or h_n) would
        # be the usual choice, but that changes what the saved weights were
        # trained for, so it would require retraining.
        last_step = sequence_out[:, -1, :]

        hidden = self.fc1(self.bc1(self.dropout(last_step)))
        hidden = self.bc2(self.relu(self.dropout(hidden)))
        return self.sigmoid(self.fc2(hidden))

Model for classifying main labels

In [5]:
class Main_GRU(nn.Module):
    """GRU classifier over the main labels.

    A stacked ``nn.GRU`` (``batch_first=True``) reads the sequence and a single
    linear layer maps the GRU output at the final time step to ``num_class``
    raw scores (no softmax is applied here).
    """

    def __init__(self, input_size, hidden_size, num_class, dropout, num_layers):
        """Create the layers.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the GRU hidden state.
            num_class: Number of output classes.
            dropout: Dropout probability used between GRU layers.
            num_layers: Number of stacked GRU layers.
        """
        # This model uses its own fixed seed. Reseeding resets the global
        # torch / random / numpy RNG state every time a model is constructed.
        init_seed = 420
        for reseed in (torch.manual_seed, random.seed, np.random.seed):
            reseed(init_seed)

        super().__init__()

        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        # Registered but not applied in forward(); kept so the module layout
        # stays exactly as it was.
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_class)

    def forward(self, x):
        """Return raw class scores of shape ``(batch, num_class)``.

        Args:
            x: Batch-first input sequence (the GRU is built with
                ``batch_first=True``); the last axis must be ``input_size``.
        """
        sequence_out, _ = self.gru(x)
        # Classify from the GRU output at the final time step only.
        return self.fc(sequence_out[:, -1, :])

Load models

In [6]:
# Architecture hyperparameters; they must match the ones the checkpoint was
# saved with, otherwise load_state_dict reports mismatched keys/shapes.
input_size = 20
hidden_size = 32
num_layers = 4

unknown_bilstm_model = Unknown_BiLSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers).to(device)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU still loads when only the CPU is available.
unknown_bilstm_model.load_state_dict(torch.load("models/unknown_best_model", map_location=device))

<All keys matched successfully>

In [7]:
# Architecture hyperparameters; they must match the ones the checkpoint was
# saved with, otherwise load_state_dict reports mismatched keys/shapes.
input_size = 20
hidden_size = 16
num_layers = 2

silence_gru_model = Silence_GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers).to(device)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU still loads when only the CPU is available.
silence_gru_model.load_state_dict(torch.load("models/silence_best_model", map_location=device))

<All keys matched successfully>

In [8]:
# Architecture hyperparameters; they must match the ones the checkpoint was
# saved with, otherwise load_state_dict reports mismatched keys/shapes.
input_size = 20
hidden_size = 64
dropout = 0.5
num_layers = 4
num_class = 10

main_gru_model = Main_GRU(input_size, hidden_size, num_class, dropout, num_layers).to(device)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU still loads when only the CPU is available.
main_gru_model.load_state_dict(torch.load("models/main_best_model", map_location=device))

<All keys matched successfully>

In [None]:
# Put all three models in evaluation mode before predicting: dropout is
# disabled and the BatchNorm layers of the silence/unknown models use their
# running statistics instead of per-batch statistics.
silence_gru_model.eval()
unknown_bilstm_model.eval()
main_gru_model.eval()

### Evaluation on Kaggle test data

In [None]:
def final_prediction(X, silence_model, unknown_model, main_model, device):
    """Predict the label of a single observation with a three-stage cascade.

    The stages run in order and the first positive one decides the label:
    the silence model, then the unknown model, then the main-label classifier.

    Args:
        X: numpy array holding exactly one observation, with a leading batch
            axis of size 1.
        silence_model: Callable returning a single score; a value above 0.5
            means 'silence'.
        unknown_model: Callable returning a single score; a value above 0.5
            means 'unknown'.
        main_model: Callable returning per-class scores of shape
            ``(1, n_classes)``; only evaluated when both binary stages are
            negative.
        device: Device the input tensor is moved to before calling the models.

    Returns:
        ``'silence'``, ``'unknown'`` or the ``MAIN_LABELS`` entry selected by
        the arg-max of the main model's scores.
    """
    # Convert and transfer the input once instead of once per stage.
    features = torch.from_numpy(X).float().to(device)

    # Pure inference: disable autograd so no graph is recorded for each clip.
    with torch.no_grad():
        # Silence prediction
        if silence_model(features).item() > 0.5:
            return 'silence'

        # Unknown prediction
        if unknown_model(features).item() > 0.5:
            return 'unknown'

        # Main classes prediction
        class_index = int(torch.argmax(main_model(features), 1)[0])

    return MAIN_LABELS[class_index]

In [19]:
def predict_on_kaggle_data(
    test_path, 
    numcep, 
    silence_gru_model, 
    unknown_bilstm_model, 
    main_gru_model, 
    device,
    output_path='predictions.csv',
):
    """Generate predictions for the Kaggle competition and save them as CSV.

    Every file in ``test_path`` is loaded, resampled to 16 kHz, padded or
    trimmed to one second, converted to MFCC features, min-max scaled per
    clip, and classified with ``final_prediction``. One ``fname,label`` row
    is written per file.

    Args:
        test_path: Directory containing the audio files to classify.
        numcep: Number of cepstral coefficients passed to ``mfcc``.
        silence_gru_model: Model deciding whether a clip is silence.
        unknown_bilstm_model: Model deciding whether a clip is an unknown word.
        main_gru_model: Model classifying the main labels.
        device: Device the features are moved to for inference.
        output_path: Destination CSV file; any existing file is overwritten.
    """
    sample_rate = 16000

    # List the inputs before touching the output file, so a bad test_path does
    # not destroy a previous result. Sorting makes the row order reproducible.
    filenames = sorted(os.listdir(test_path))

    # 'w' truncates an existing file, so no separate delete step is needed.
    with open(output_path, 'w') as f:
        f.write("fname,label\n")

        for filename in tqdm(filenames, "Processing..."):
            # Perform the same preprocessing as in training
            # NOTE(review): librosa.load is called without `sr`, so the audio is
            # first resampled to librosa's default rate and then again to
            # 16 kHz. Left unchanged on the assumption that the models were
            # trained on features produced this way; confirm against the
            # training pipeline before simplifying.
            audio, samplerate = librosa.load(os.path.join(test_path, filename))
            audio = librosa.resample(audio, orig_sr=samplerate, target_sr=sample_rate)

            mfcc_feat = mfcc(
                # Pad/trim to exactly one second of samples.
                librosa.util.fix_length(audio, size=sample_rate),
                samplerate=sample_rate,
                numcep=numcep,
            )

            # The scaler is fitted on this clip only, so each coefficient is
            # scaled to [0, 1] over the clip's own frames.
            scaler = MinMaxScaler(feature_range=(0, 1))
            normalized = scaler.fit_transform(mfcc_feat)

            # Make prediction (final_prediction expects a leading batch axis)
            pred = final_prediction(
                np.expand_dims(normalized, axis=0), 
                silence_gru_model, 
                unknown_bilstm_model, 
                main_gru_model, 
                device
            )
            f.write(f"{filename},{pred}\n")

In [21]:
# Directory with the Kaggle test clips, and the number of MFCC coefficients.
# numcep matches the input_size = 20 the models above were built with.
test_path = "data/test_kaggle/test/audio/"
numcep = 20

predict_on_kaggle_data(
    test_path=test_path,
    numcep=numcep,
    silence_gru_model=silence_gru_model,
    unknown_bilstm_model=unknown_bilstm_model,
    main_gru_model=main_gru_model,
    device=device,
)

Processing...: 100%|██████████| 158538/158538 [22:06<00:00, 119.51it/s]


Count the occurrences of each class in our predictions

In [24]:
# Reload the CSV written by predict_on_kaggle_data and tally how many test
# clips were assigned to each label (labels and counts are returned as two
# aligned arrays).
y_pred = pd.read_csv("predictions.csv")
np.unique(y_pred["label"], return_counts=True)

(array(['down', 'go', 'left', 'no', 'off', 'on', 'right', 'silence',
        'stop', 'unknown', 'up', 'yes'], dtype=object),
 array([ 5499,  8581,  7680,  7063,  6347,  6375,  6424, 18500,  6786,
        70331,  8634,  6318], dtype=int64))