In [1]:
import json
import os
import random
import warnings
from argparse import ArgumentParser
from math import ceil

import librosa
import numpy as np
import pandas as pd
import torch
from datasets import concatenate_datasets, load_dataset
from torch import nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from datasets import Dataset

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
speech_data = load_dataset(
    "speech_commands", "v0.02", cache_dir="only_selected/data_here"
)

In [3]:
def compute_mfcc(data, sample_rate=16000, n_mfcc=12):
    # Extract MFCC features
    # https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html
    mfcc = librosa.feature.mfcc(
        y=data,
        sr=sample_rate,
        n_mfcc=n_mfcc,  # How many mfcc features to use? 12 at most.
        # https://dsp.stackexchange.com/questions/28898/mfcc-significance-of-number-of-features
    )
    return mfcc


def extract_fields(example):
    x = example["audio"]["array"]
    return {
        "label": example["label"],
        "array": np.pad(x, (0, 16000 - len(x)), constant_values=0),
    }

def cut_audio(example):

    if len(example["audio"]["array"]) >= 16000:
        example["audio"]["array"] = example["audio"]["array"][:16000]

    return example


def split_audio(example, duration=1):
    audio = example["audio"]["array"]
    sr = 16_000
    n_samples = len(audio)
    n_samples_per_chunk = sr * duration
    n_chunks = ceil(n_samples / n_samples_per_chunk)
    chunks = []
    for i in range(n_chunks):
        start = i * n_samples_per_chunk
        end = (i + 1) * n_samples_per_chunk
        chunk = audio[start:end]
        if len(chunk) < n_samples_per_chunk:
            chunk = np.pad(chunk, (0, n_samples_per_chunk - len(chunk)))
        chunks.append(chunk)
    return np.array(chunks)


def create_new_data(data):
    new_records = []
    for silence_record in data:
        temp = split_audio(silence_record)

        for record in temp:
            data_row = {
                "file": [silence_record["file"]],
                "audio": [{
                    "array": record,
                    "path": silence_record["audio"]["path"],
                    "sampling_rate": silence_record["audio"]["sampling_rate"],
                }],
                "label": [silence_record["label"]],
                "is_unknown": [silence_record["is_unknown"]],
                "speaker_id": [silence_record["speaker_id"]],
                "utterance_id": [silence_record["utterance_id"]]
            }
            new_records.append(data_row)
    return new_records


def swap_labels_for_data_split(data):
    # map the labels: 0 if <35 else 1
    data = data.map(lambda x: {"label": 0 if x["label"] < 35 else 1})
    # split audio when label is 1
    silence = data.filter(lambda x: x["label"] == 1)
    # create new data with split audio
    temp = create_new_data(silence)
    for el in temp:
        silence = concatenate_datasets([Dataset.from_dict(el, features=silence.features), silence])
    data = concatenate_datasets([data.filter(lambda x: x["label"] == 0), silence])
    data = data.map(cut_audio)
    return data


def preprocess(speech_data):
    train = speech_data["train"]
    validation = speech_data["validation"]
    test = speech_data["test"]

    train = swap_labels_for_data_split(train)
    validation = swap_labels_for_data_split(validation)
    test = swap_labels_for_data_split(test)


    train_silence = train.filter(lambda x: x["label"] == 1)
    train_no_silence = train.filter(lambda x: x["label"] == 0)
    train_no_silence = train_no_silence.shuffle(seed=42).select(range(len(train_silence) * 2))

    train = concatenate_datasets([train_no_silence, train_silence])

    train = train.map(
        extract_fields, remove_columns=["file", "audio", "speaker_id", "utterance_id"]
    )
    validation = validation.map(
        extract_fields, remove_columns=["file", "audio", "speaker_id", "utterance_id"]
    )
    test = test.map(
        extract_fields, remove_columns=["file", "audio", "speaker_id", "utterance_id"]
    )

    return (
        train.with_format("torch"),
        validation.with_format("torch"),
        test.with_format("torch"),
    )

In [4]:
train, validation, test = preprocess(speech_data)

In [5]:
class SilenceModel(nn.Module):
    def __init__(self, input_size=12, hidden_size=64, num_layers=3, output_size=2):
        super(SilenceModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers, batch_first=True, dropout=0.15
        )

        self.fc = nn.Linear(hidden_size, hidden_size // 2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size // 2, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))

        out = self.fc2(self.relu(self.fc(out[:, -1, :])))
        return out

def calculate_accuracy(preds, y):
    preds = torch.nn.functional.softmax(preds, dim=1)
    preds = torch.argmax(preds, dim=1)
    return (torch.sum(preds == y) / len(y)).item()


In [6]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_size = 12
hidden_size = 64
num_layers = 3
output_size = 2

model = SilenceModel(input_size, hidden_size, num_layers, output_size).to(device)

torch.cuda.empty_cache()
NUM_EPOCHS = 20
SEED = 42
BATCH_SIZE = 64
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optimizer, T_max=NUM_EPOCHS, eta_min=0
)
best_val_loss = float("inf")

results = pd.DataFrame(
                columns=[
                    "epoch",
                    "train_loss",
                    "train_accuracy",
                    "val_loss",
                    "val_accuracy",
                ]
            )

train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(validation, batch_size=BATCH_SIZE)
test_loader = DataLoader(test, batch_size=BATCH_SIZE)

for epoch in tqdm(range(1, NUM_EPOCHS + 1)):
    model.train()
    train_loss = 0
    train_accuracy = 0
    for batch in train_loader:
        x = batch["array"]
        x = (
            torch.Tensor(compute_mfcc(np.array(x), 16_000))
            .permute(0, 2, 1)
            .to(device)
        )
        y = batch["label"].to(device)
        y_pred = model(x.float())
        loss = criterion(y_pred, y)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_accuracy += calculate_accuracy(y_pred, y)
    train_loss /= len(train_loader)
    train_accuracy = train_accuracy / len(train_loader)

    model.eval()
    with torch.no_grad():
        val_loss = 0
        val_accuracy = 0
        for batch in valid_loader:
            x = batch["array"]
            x = (
                torch.Tensor(compute_mfcc(np.array(x), 16_000))
                .permute(0, 2, 1)
                .to(device)
            )
            y = batch["label"].to(device)
            y_pred = model(x.float())
            val_loss += criterion(y_pred, y).item()
            val_accuracy += calculate_accuracy(y_pred, y)
        val_loss /= len(valid_loader)
        val_accuracy = val_accuracy / len(valid_loader)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_model.pth")
        row = pd.DataFrame(
            {
                "epoch": [epoch],
                "train_loss": [loss.item()],
                "train_accuracy": [train_accuracy],
                "val_loss": [val_loss],
                "val_accuracy": [val_accuracy],
            }
        )
        results = pd.merge(results, row, how="outer")
    print(
        f"Epoch {epoch} train loss: {train_loss}, train accuracy: {train_accuracy} val loss: {val_loss}, val accuracy: {val_accuracy}"
    )
results.to_csv("results.csv", index=False)

  5%|▌         | 1/20 [00:20<06:29, 20.52s/it]

Epoch 1 train loss: 0.6971408619600183, train accuracy: 0.49131016170277314 val loss: 0.6404100292047877, val accuracy: 0.9805069002233873


 10%|█         | 2/20 [00:40<06:01, 20.09s/it]

Epoch 2 train loss: 0.6277366806479061, train accuracy: 0.8018048160216388 val loss: 0.4096057209998939, val accuracy: 0.9930334394904459


 15%|█▌        | 3/20 [01:00<05:41, 20.09s/it]

Epoch 3 train loss: 0.4973644824589, train accuracy: 0.8155915807275211 val loss: 0.20379060658679646, val accuracy: 0.9923367834394905


 20%|██        | 4/20 [01:20<05:19, 19.97s/it]

Epoch 4 train loss: 0.4148288176340215, train accuracy: 0.8551136395510506 val loss: 0.22696271329928355, val accuracy: 0.9906449044585988


 25%|██▌       | 5/20 [01:40<04:59, 19.96s/it]

Epoch 5 train loss: 0.3084099770468824, train accuracy: 0.8841911764705882 val loss: 0.1337330299103336, val accuracy: 0.9716361464968153


 30%|███       | 6/20 [01:59<04:37, 19.81s/it]

Epoch 6 train loss: 0.09974124162074398, train accuracy: 0.9715073529411765 val loss: 0.122590503685034, val accuracy: 0.9643710191082803


 35%|███▌      | 7/20 [02:19<04:18, 19.92s/it]

Epoch 7 train loss: 0.06237641146735234, train accuracy: 0.9871323529411765 val loss: 0.09236824759252511, val accuracy: 0.9786027070063694


 40%|████      | 8/20 [02:39<03:56, 19.74s/it]

Epoch 8 train loss: 0.05373248849611949, train accuracy: 0.9862132352941176 val loss: 0.07284360306017149, val accuracy: 0.9806926751592356


 45%|████▌     | 9/20 [02:58<03:36, 19.67s/it]

Epoch 9 train loss: 0.031512798029271996, train accuracy: 0.9926470588235294 val loss: 0.05261510595750467, val accuracy: 0.986265923566879


 50%|█████     | 10/20 [03:18<03:17, 19.72s/it]

Epoch 10 train loss: 0.02608255488688455, train accuracy: 0.9935661764705882 val loss: 0.04392614586806744, val accuracy: 0.9903463375796179


 55%|█████▌    | 11/20 [03:38<02:59, 19.90s/it]

Epoch 11 train loss: 0.02175150765106082, train accuracy: 0.9917279411764706 val loss: 0.05914299606173566, val accuracy: 0.9839769108280255


 60%|██████    | 12/20 [03:58<02:38, 19.82s/it]

Epoch 12 train loss: 0.02573767935802393, train accuracy: 0.9944852941176471 val loss: 0.04141574316349616, val accuracy: 0.991640127388535


 65%|██████▌   | 13/20 [04:18<02:18, 19.78s/it]

Epoch 13 train loss: 0.019618729992276606, train accuracy: 0.9917279411764706 val loss: 0.10248683459736455, val accuracy: 0.969081740992464


 70%|███████   | 14/20 [04:38<01:59, 19.92s/it]

Epoch 14 train loss: 0.01766585383345099, train accuracy: 0.9963235294117647 val loss: 0.058078505020220855, val accuracy: 0.9852707006369427


 75%|███████▌  | 15/20 [04:57<01:38, 19.76s/it]

Epoch 15 train loss: 0.009229441762299222, train accuracy: 0.9981617647058824 val loss: 0.062147755774218515, val accuracy: 0.9831940021342153


 80%|████████  | 16/20 [05:18<01:20, 20.05s/it]

Epoch 16 train loss: 0.007417183657012442, train accuracy: 0.9972426470588235 val loss: 0.05665045276144816, val accuracy: 0.9846802017040504


 85%|████████▌ | 17/20 [05:38<01:00, 20.00s/it]

Epoch 17 train loss: 0.00668832372791846, train accuracy: 0.9981617647058824 val loss: 0.07578076248184724, val accuracy: 0.9805069002233873


 90%|█████████ | 18/20 [05:58<00:40, 20.06s/it]

Epoch 18 train loss: 0.01557174013749532, train accuracy: 0.9928141727167017 val loss: 0.07148109919912105, val accuracy: 0.9815021231533236


 95%|█████████▌| 19/20 [06:19<00:20, 20.25s/it]

Epoch 19 train loss: 0.018609817143228343, train accuracy: 0.9898897058823529 val loss: 0.07163513208751324, val accuracy: 0.9746947983267961


100%|██████████| 20/20 [06:40<00:00, 20.01s/it]

Epoch 20 train loss: 0.013493874137673308, train accuracy: 0.9944852941176471 val loss: 0.039693933522705414, val accuracy: 0.9829087049053733





In [8]:
model.eval()
test_accuracy = 0.0
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        x = batch["array"]
        x = (
            torch.Tensor(compute_mfcc(np.array(x), 16_000))
            .permute(0, 2, 1)
            .to(device)
        )
        y = batch["label"].to(device)
        y_pred = model(x.float())
        test_accuracy += calculate_accuracy(y_pred, y)
        predictions.append(y_pred)
        true_labels.append(y)
test_accuracy = test_accuracy / len(test_loader)

print(f"Test accuracy: {test_accuracy}")

Test accuracy: 0.9654292169823704


In [11]:
model2 = SilenceModel(input_size, hidden_size, num_layers, output_size)

# load the model
model2.load_state_dict(torch.load("best_model.pth"))
model2.to(device)

SilenceModel(
  (lstm): LSTM(12, 64, num_layers=3, batch_first=True, dropout=0.15)
  (fc): Linear(in_features=64, out_features=32, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=2, bias=True)
)

In [12]:
model2.eval()
test_accuracy = 0.0
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        x = batch["array"]
        x = (
            torch.Tensor(compute_mfcc(np.array(x), 16_000))
            .permute(0, 2, 1)
            .to(device)
        )
        y = batch["label"].to(device)
        y_pred = model2(x.float())
        test_accuracy += calculate_accuracy(y_pred, y)
        predictions.append(y_pred)
        true_labels.append(y)
test_accuracy = test_accuracy / len(test_loader)

print(f"Test accuracy: {test_accuracy}")

Test accuracy: 0.9654292169823704
