In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
from scipy.io import wavfile

import matplotlib.pyplot as plt
import IPython.display as ipd

from tqdm import tqdm

from torchaudio.datasets import SPEECHCOMMANDS
import os
import glob
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import numpy as np

from ignite.engine import Engine, Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss, precision, recall, MetricsLambda, confusion_matrix
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import TensorboardLogger, global_step_from_engine, ProgressBar
from ignite.contrib.handlers.tensorboard_logger import GradsHistHandler, OutputHandler

# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
_backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_backend)
print(device)

cpu


In [2]:
# Target vocabulary. The position of a word in this list is its class index;
# the final entry, 'unknown', collects every word that is not listed here.
labels = [
    # directions
    'forward', 'backward', 'up', 'down',
    # digits
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'zero',
    # commands
    'left', 'right', 'go', 'stop', 'yes', 'no', 'on', 'off',
    # catch-all class
    'unknown',
]
# The following dataset labels are considered unknown
# unknown = ['bed', 'bird', 'cat', 'dog', 'follow', 'happy', 'house', 'learn', 'marvin',
#            'sheila', 'visual', 'wow', 'tree']

In [3]:
# Folder that holds the Speech Commands data. It can be overridden with the
# SPEECH_DATA_ROOT environment variable; the fallback is the original author's
# location. NOTE: SubsetSC roots the dataset at the *parent* of this folder, and
# the noise augmentation looks for "speech_commands_v0.02" inside it, so the
# last path component presumably has to stay "SpeechCommands" (torchaudio's
# default archive folder) -- confirm before pointing this somewhere else.
SPEECH_DATA_ROOT = os.environ.get(
    "SPEECH_DATA_ROOT", "/Users/invincibleo/Leo/Projects/Datasets/SpeechCommands"
)


# Load the speech command dataset from the torchaudio dataset
class SubsetSC(SPEECHCOMMANDS):
    """Speech Commands dataset restricted to one of the official splits.

    Args:
        subset: ``"training"``, ``"validation"``, ``"testing"`` or ``None``.
            ``None`` keeps every file found by the parent class. ``"training"``
            is everything that is listed in neither ``validation_list.txt``
            nor ``testing_list.txt``.

    Raises:
        ValueError: if ``subset`` is not one of the values above. Without this
            check a misspelled split name silently returned the full dataset,
            i.e. training data mixed with validation and test data.
    """

    _VALID_SUBSETS = (None, "training", "validation", "testing")

    def __init__(self, subset: str = None):
        # Validate first so a typo does not trigger a download and a full scan.
        if subset not in self._VALID_SUBSETS:
            raise ValueError(
                f"subset must be one of {self._VALID_SUBSETS}, got {subset!r}"
            )

        super().__init__(os.path.dirname(SPEECH_DATA_ROOT), download=True)

        def load_list(filename):
            # Each line of a split file is a path relative to the dataset folder;
            # normalise so entries compare equal to the ones in self._walker.
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = set(load_list("validation_list.txt") + load_list("testing_list.txt"))
            self._walker = [w for w in self._walker if w not in excludes]

In [4]:
# Build the three official splits: training data for optimisation, validation
# data for the per-epoch evaluation, and test data for the final evaluation.
train_set, valid_set, test_set = (
    SubsetSC(split) for split in ("training", "validation", "testing")
)

In [None]:
# Look at the first training example: every item is a 5-tuple of
# (waveform, sample_rate, label, speaker_id, utterance_number).
first_example = train_set[0]
waveform, sample_rate, label, speaker_id, utterance_number = first_example
print(f"Shape of waveform: {waveform.size()}")
print(f"Sample rate of waveform: {sample_rate}")

# Transpose so that samples run along the x-axis of the plot.
plt.plot(waveform.t().numpy())

In [None]:
# Summarise the vocabulary size and the size of each split.
for _caption, _size in (
    ("Number of labels:", len(labels)),
    ("Number of training examples:", len(train_set)),
    ("Number of validation examples:", len(valid_set)),
    ("Number of testing examples:", len(test_set)),
):
    print(_caption, _size)

In [5]:
def label_to_index(word):
    """Return the class index of ``word`` as a 0-d tensor.

    Words that are not part of ``labels`` are mapped to the index of "unknown".
    """
    target = word if word in labels else "unknown"
    return torch.tensor(labels.index(target))

def index_to_label(index):
    """Return the word stored at position ``index`` of ``labels``.

    This is the inverse of ``label_to_index`` for words that are in ``labels``.
    """
    word = labels[index]
    return word

In [17]:
# Number of training examples per class, in the same order as `labels`
# (the last entry is the 'unknown' class).
train_count = torch.tensor(
    [
        1251, 1342, 2935, 3121,
        3127, 3093, 2956, 2942, 3221, 3074, 3190, 3019, 3158, 3237,
        3022, 3006, 3091, 3099, 3221, 3121, 3076, 2951,
        20227,
    ],
    dtype=torch.float32,
)
# Customize a weighted random sampler for the training set
class WeightedRandomSampler(torch.utils.data.Sampler):
    """Draw training indices so that classes follow a chosen target distribution.

    The target distribution gives every class in ``labels`` the same
    probability, except the last one ('unknown'), which gets five times that
    probability. Every example receives the weight
    ``desired_probability / actual_probability`` of its class, and indices are
    drawn with ``torch.multinomial`` from those per-example weights.

    Per-example weights are cached in ``weights_per_sample.pt`` in the working
    directory because computing them iterates over the whole ``train_set``.

    Args:
        train_count (Tensor): number of training examples per class, ordered
            like ``labels``.
        num_samples (int): number of indices to draw per pass over the sampler.
        replacement (bool): if True, indices are drawn with replacement.
    """

    def __init__(self, train_count, num_samples, replacement=True):
        actual_dist = train_count / torch.sum(train_count)
        # One share per class plus four extra shares for 'unknown' (5x in total).
        desired_dist = torch.tensor([1/(len(labels)+4)] * len(labels))
        desired_dist[-1] = desired_dist[-1] * 5
        # This is the target distribution, not the one measured in the data.
        print("Desired distribution:", desired_dist)
        assert desired_dist[-1] == 5 * desired_dist[0]
        self.per_class_weights = desired_dist / actual_dist
        self.num_samples = num_samples
        self.replacement = replacement

        # Assign a weight to each example, reusing the cached weights when they
        # exist and still match the size of the training set.
        cache_path = "weights_per_sample.pt"
        weights = torch.load(cache_path) if os.path.exists(cache_path) else None
        if weights is None or len(weights) != len(train_set):
            weights = torch.tensor(
                [self.per_class_weights[label_to_index(word)] for _, _, word, *_ in train_set]
            )
            torch.save(weights, cache_path)
        self.weights = weights

    def __iter__(self):
        # Yield plain Python ints, as the built-in PyTorch samplers do.
        drawn = torch.multinomial(self.weights, self.num_samples, self.replacement)
        return iter(drawn.tolist())

    def __len__(self):
        return self.num_samples

In [18]:
import random
# Sampling rate (Hz) used by the feature extraction and the augmentations below.
sample_rate = 16_000
# FILEPATH: /Users/invincibleo/Library/Mobile Documents/com~apple~CloudDocs/Leo/Postdoc/Trips/1_SOUNDS_4TH_Seasonal_school/Project/SOUNDS_4th_school_practical/speech_command_recognition_ex_v3_ignite.ipynb
def pad_sequence(batch):
    """Zero-pad the tensors in ``batch`` to a common length and stack them.

    Each item is transposed, padded at the end along its (new) first dimension
    and stacked batch-first; the last two dimensions are then swapped back.
    """
    transposed = list(map(torch.t, batch))
    stacked = torch.nn.utils.rnn.pad_sequence(transposed, batch_first=True, padding_value=0.)
    return stacked.permute(0, 2, 1)

# MFCC feature extraction (computed on the fly; nothing is written to disk)
def extract_mfcc(waveform):
    """Compute 16 MFCCs for ``waveform`` on the device the waveform lives on.

    The spectrogram uses a 30 ms Hamming window (``n_fft``) with a hop of half
    a window, 64 mel bands and no centring of the frames.

    Args:
        waveform (Tensor): audio samples with time as the last dimension.

    Returns:
        Tensor: the input with its last dimension replaced by
        ``(n_mfcc, frames)``.
    """
    mfcc = torchaudio.transforms.MFCC(
        sample_rate=sample_rate,
        n_mfcc=16,
        # NOTE(review): "pad_mode" presumably has no effect while "center" is False.
        melkwargs={"n_fft": int(0.03*sample_rate), "hop_length": int(0.03*0.5*sample_rate), "n_mels": 64,
                   "window_fn": torch.hamming_window, "center": False, "pad_mode": "reflect"},
    ).to(waveform.device)  # the transform is created on CPU; follow the input (e.g. CUDA)
    return mfcc(waveform)

def add_exist_noise(waveform):
    """Mix a random excerpt of a recorded background-noise file into ``waveform``.

    A file is picked at random from the dataset's ``_background_noise_`` folder.
    Noise shorter than the waveform is zero-padded at the end; longer noise is
    cropped at a random offset. The excerpt is added at a random integer SNR
    drawn from [-5, 10) dB.

    Args:
        waveform (Tensor): audio with time as the last dimension.

    Returns:
        Tensor: the noisy waveform.

    Raises:
        FileNotFoundError: if no background-noise ``.wav`` file is found.
    """
    noise_dir = os.path.join(SPEECH_DATA_ROOT, "speech_commands_v0.02", "_background_noise_")
    noise_files = glob.glob(os.path.join(noise_dir, "*.wav"))
    if not noise_files:
        raise FileNotFoundError(f"No background noise files found in {noise_dir}")
    noise_file = noise_files[torch.randint(0, len(noise_files), (1,)).item()]
    noise_waveform, _ = torchaudio.load(noise_file)

    # Make waveform and noise the same length
    target_len = waveform.size(-1)
    noise_len = noise_waveform.size(-1)
    if noise_len < target_len:
        noise_waveform = F.pad(noise_waveform, (0, target_len - noise_len))
    else:
        # Randomly crop the noise. The upper bound of randint is exclusive, so
        # add 1: this also keeps it valid when both lengths are equal.
        max_offset = noise_len - target_len
        offset = torch.randint(0, max_offset + 1, (1,)).item()
        noise_waveform = noise_waveform[..., offset:offset + target_len]

    waveform = torchaudio.transforms.AddNoise()(waveform, noise_waveform, snr=torch.randint(-5, 10, (1,)))
    return waveform

def time_shift(waveform):
    """Randomly shift ``waveform`` by 0.3 s earlier or later, or leave it as is.

    The direction is drawn uniformly from {-1, 0, +1}. Samples pushed past the
    edge are dropped and the gap is zero-filled, so the length is preserved.
    The shift is limited to the waveform length so that short clips keep their
    length as well.

    Args:
        waveform (Tensor): audio with time as the last dimension.

    Returns:
        Tensor: the shifted waveform, same shape as the input.
    """
    max_shift = int(sample_rate * 0.3)
    # The upper bound of randint is exclusive: (-1, 2) yields -1, 0 or +1.
    direction = torch.randint(-1, 2, (1,)).item()
    num_samples = waveform.size(-1)
    shift_amount = max(-num_samples, min(num_samples, max_shift * direction))

    if shift_amount > 0:
        # Delay: drop the tail and zero-pad at the beginning.
        waveform = F.pad(waveform[..., :-shift_amount], (shift_amount, 0))
    elif shift_amount < 0:
        # Advance: drop the head and zero-pad at the end.
        waveform = F.pad(waveform[..., -shift_amount:], (0, -shift_amount))
    return waveform

# Data Augmentation
def data_augment(waveform):
    """Apply two distinct, randomly chosen augmentations to ``waveform``.

    The candidates are: a random gain in dB, time masking, additive white
    noise at a random SNR, additive recorded background noise, a random time
    shift, and the identity (no change).
    """
    def add_white_noise(w):
        return torchaudio.transforms.AddNoise()(w, torch.randn_like(w), snr=torch.randint(low=-5, high=10, size=(1,)))

    def identity(w):
        return w

    # The gain is drawn here, once per call, even when Vol is not selected.
    random_gain = torchaudio.transforms.Vol(gain=torch.randint(low=-10, high=10, size=(1,)), gain_type='db')
    time_mask = torchaudio.transforms.TimeMasking(time_mask_param=int(0.02*16000), p=0.1)

    candidates = [random_gain, time_mask, add_white_noise, add_exist_noise, time_shift, identity]
    for transform in random.sample(candidates, 2):
        waveform = transform(waveform)
    return waveform


def _collate_waveforms(batch, augment):
    """Turn a list of dataset items into ``(features, targets)``.

    Shared implementation of the training and evaluation collate functions.

    Args:
        batch: list of tuples of the form
            (waveform, sample_rate, label, speaker_id, utterance_number).
        augment (bool): when True, each waveform goes through ``data_augment``
            before padding.

    Returns:
        tuple: MFCC features with the last two dimensions swapped
        (presumably (batch, frames, n_mfcc) for mono input -- TODO confirm),
        and the stacked label indices.
    """
    tensors, targets = [], []
    # Gather in lists, and encode labels as indices
    for waveform, _, label, _, _ in batch:
        if augment:
            waveform = data_augment(waveform)
        tensors.append(waveform)
        targets.append(label_to_index(label))
    # Group the list of tensors into a batched, zero-padded tensor
    tensors = pad_sequence(tensors)
    # Extract MFCC features
    tensors = extract_mfcc(tensors)
    # Drop the singleton dimension 1 and swap the last two dimensions
    tensors = tensors.squeeze(1).permute(0, 2, 1)
    targets = torch.stack(targets)

    return tensors, targets


def collate_fn_extract_feature_train(batch):
    """Collate function for training: augment, pad, extract MFCCs, encode labels."""
    return _collate_waveforms(batch, augment=True)


def collate_fn_extract_feature_eval(batch):
    """Collate function for evaluation: same as training but without augmentation."""
    return _collate_waveforms(batch, augment=False)

batch_size = 512

# `device` is a torch.device, which does not compare equal to the plain string
# "cuda"; check its `type` so the GPU settings are actually selected on CUDA.
if device.type == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

# Settings shared by every loader.
_common_loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
# Evaluation loaders read the data in order, without augmentation, and keep the last partial batch.
_eval_loader_kwargs = dict(
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn_extract_feature_eval,
    **_common_loader_kwargs,
)

# Training: class-rebalancing sampler (used instead of shuffle=True) plus augmentation.
train_loader = torch.utils.data.DataLoader(
    train_set,
    sampler=WeightedRandomSampler(train_count, num_samples=len(train_set), replacement=True),
    collate_fn=collate_fn_extract_feature_train,
    **_common_loader_kwargs,
)
# Training data again, un-augmented and in order, for measuring training metrics.
train_eval_loader = torch.utils.data.DataLoader(train_set, **_eval_loader_kwargs)
valid_loader = torch.utils.data.DataLoader(valid_set, **_eval_loader_kwargs)

Actual distribution: tensor([0.0370, 0.0370, 0.0370, 0.0370, 0.0370, 0.0370, 0.0370, 0.0370, 0.0370,
        0.0370, 0.0370, 0.0370, 0.0370, 0.0370, 0.0370, 0.0370, 0.0370, 0.0370,
        0.0370, 0.0370, 0.0370, 0.0370, 0.1852])


In [None]:
# with torch.no_grad():
#     for tensors, targets in train_loader:
#         print(index_to_label(targets[0].item()))
#         print(tensors.size())
#         # # Plot the MFCC feature
#         # plt.figure(figsize=(10, 5))
#         # plt.imshow(tensors[0].squeeze().numpy(), cmap='hot', interpolation='nearest')
#         # plt.title("MFCC")
#         # plt.show()
#         # Play the audio file
#         break

In [None]:
# class LSTM(nn.Module):
#     def __init__(self, n_input=16, n_output=23, n_channel=64):
#         super().__init__()
#         self.LSTM1 = nn.LSTM(n_input, n_channel, num_layers=1, batch_first=True, bidirectional=True)
#         self.fc1 = nn.Linear(n_channel*2, n_output)
#         self.global_avg_pool = nn.AdaptiveAvgPool1d(1)

#     def forward(self, x):
#         x, (h_n, c_n) = self.LSTM1(x)
#         x = self.fc1(x)
#         x = self.global_avg_pool(torch.transpose(x, 1, 2)).squeeze(2)
#         return F.log_softmax(x, dim=1)

# model = LSTM(n_input=16, n_output=len(labels))
# model.to(device)
# print(model)

# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)

# n = count_parameters(model)
# print("Number of parameters: %s" % n)

In [None]:
class CNN(nn.Module):
    """Small 1-D CNN over the time axis of a feature sequence.

    Three blocks of Conv1d(kernel 3) -> BatchNorm -> ReLU -> MaxPool(3) widen
    the channels from ``n_input`` to ``4 * n_channel``; the remaining time
    steps are averaged and a linear layer produces the class scores.

    Args:
        n_input (int): number of features per time step.
        n_output (int): number of classes.
        n_channel (int): number of channels of the first convolution.
    """

    def __init__(self, n_input=16, n_output=23, n_channel=32):
        super().__init__()
        # Attribute names (cnn1, cnn2, conv3) are kept as they are because they
        # are the keys of saved state dicts.
        self.cnn1 = nn.Conv1d(n_input, n_channel, kernel_size=3)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(3)
        self.cnn2 = nn.Conv1d(n_channel, 2*n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(2*n_channel)
        self.pool2 = nn.MaxPool1d(3)
        self.conv3 = nn.Conv1d(2*n_channel, 4 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(4 * n_channel)
        self.pool3 = nn.MaxPool1d(3)
        self.fc1 = nn.Linear(4 * n_channel, n_output)

    def forward(self, x):
        """Return log-probabilities of shape (batch, n_output).

        Args:
            x (Tensor): input of shape (batch, time, n_input).
        """
        # Conv1d expects (batch, channels, time)
        x = torch.permute(x, [0, 2, 1])
        x = self.pool1(F.relu(self.bn1(self.cnn1(x))))
        x = self.pool2(F.relu(self.bn2(self.cnn2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        # Average over the remaining time steps. When exactly one step is left
        # this equals the former squeeze(-1); longer inputs now work as well
        # instead of failing in the linear layer.
        x = x.mean(dim=-1)
        x = self.fc1(x)
        return F.log_softmax(x, dim=1)

# One output per entry of `labels`; nn.Module.to returns the module itself.
model = CNN(n_input=16, n_output=len(labels)).to(device)
print(model)

def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the size of the model in trainable parameters.
n = count_parameters(model)
print("Number of parameters: %s" % (n,))

In [19]:
# class M5(nn.Module):
#     def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
#         super().__init__()
#         self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
#         self.bn1 = nn.BatchNorm1d(n_channel)
#         self.pool1 = nn.MaxPool1d(4)
#         self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
#         self.bn2 = nn.BatchNorm1d(n_channel)
#         self.pool2 = nn.MaxPool1d(4)
#         self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
#         self.bn3 = nn.BatchNorm1d(2 * n_channel)
#         self.pool3 = nn.MaxPool1d(4)
#         self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
#         self.bn4 = nn.BatchNorm1d(2 * n_channel)
#         self.pool4 = nn.MaxPool1d(4)
#         self.fc1 = nn.Linear(2 * n_channel, n_output)

#     def forward(self, x):
#         x = self.conv1(x)
#         x = F.relu(self.bn1(x))
#         x = self.pool1(x)
#         x = self.conv2(x)
#         x = F.relu(self.bn2(x))
#         x = self.pool2(x)
#         x = self.conv3(x)
#         x = F.relu(self.bn3(x))
#         x = self.pool3(x)
#         x = self.conv4(x)
#         x = F.relu(self.bn4(x))
#         x = self.pool4(x)
#         x = F.avg_pool1d(x, x.shape[-1])
#         x = x.permute(0, 2, 1)
#         x = self.fc1(x)
#         return F.log_softmax(x, dim=2).squeeze(1)


# model = M5(n_input=1, n_output=len(labels))
# model.to(device)
# print(model)


# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)


# n = count_parameters(model)
# print("Number of parameters: %s" % n)

M5(
  (conv1): Conv1d(1, 32, kernel_size=(80,), stride=(16,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=23, bias=True)
)
Numbe

In [None]:
# Adam with a small L2 penalty.
optimizer = optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=0.0001,
)
# Divide the learning rate by 10 every 20 scheduler steps (one step per epoch is intended).
# NOTE: StepLR only changes the learning rate when scheduler.step() is called.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=20,
    gamma=0.1,
)

In [None]:
def number_of_correct(pred, target):
    """Return how many entries of ``pred`` (singleton dimensions removed) equal ``target``."""
    matches = pred.squeeze() == target
    return matches.sum().item()

def get_likely_index(tensor):
    """Return the index of the largest value along the last dimension of ``tensor``."""
    return torch.argmax(tensor, dim=-1)

In [None]:
def train_step(engine, batch):
    """Run one optimisation step on ``batch`` and return the loss as a float.

    Args:
        engine: the ignite engine running the step (unused).
        batch: tuple ``(features, targets)`` produced by the training loader.
    """
    model.train()
    optimizer.zero_grad()
    x, y = batch
    x, y = x.to(device), y.to(device)
    y_pred = model(x)
    # The model outputs log-probabilities, hence the NLL loss.
    loss = F.nll_loss(y_pred, y)
    loss.backward()
    optimizer.step()
    return loss.item()

trainer = Engine(train_step)
ProgressBar().attach(trainer)

# Advance the learning-rate schedule once per epoch. Without this handler the
# StepLR scheduler is never stepped and the learning rate never decays.
@trainer.on(Events.EPOCH_COMPLETED)
def _step_lr_scheduler():
    scheduler.step()

In [None]:
def validation_step(engine, batch):
    """Evaluate ``batch`` without gradients and return ``(predictions, targets)``.

    The pair is what the metrics attached to the evaluators consume.
    """
    model.eval()
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    with torch.no_grad():
        return model(inputs), targets

# Separate engines so that training and validation metrics are tracked independently.
train_evaluator = Engine(validation_step)
valid_evaluator = Engine(validation_step)

In [None]:
# Attach all the evaluation metrics to the evaluators
def attach_metrics(evaluator):
    """Attach loss, accuracy, averaged recall/precision and a confusion matrix.

    The metrics are stored under the names "nll", "accuracy", "recall",
    "precision" and "cm" in ``evaluator.state.metrics``.
    """
    metrics = {
        "nll": Loss(F.nll_loss),
        "accuracy": Accuracy(),
        "recall": recall.Recall(average=True),
        "precision": precision.Precision(average=True),
        # An F1 score (harmonic mean of precision and recall) is not attached.
        "cm": confusion_matrix.ConfusionMatrix(num_classes=len(labels)),
    }
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

for _evaluator in (train_evaluator, valid_evaluator):
    attach_metrics(_evaluator)

In [None]:
def generate_CM_png(cm):
    """Draw a row-normalised confusion-matrix heatmap and return the figure.

    Args:
        cm: square matrix of counts with true classes on the rows and
            predicted classes on the columns, ordered like ``labels``. A torch
            tensor is converted to a NumPy array first.

    Returns:
        matplotlib.figure.Figure: the heatmap. Its title shows the overall
        accuracy, i.e. correct predictions divided by all predictions.
    """
    if hasattr(cm, "detach"):
        # torch tensor -> NumPy array on the CPU
        cm = cm.detach().cpu().numpy()
    counts = np.asarray(cm, dtype=float)

    # Accuracy must come from the raw counts. On the row-normalised matrix the
    # same formula gives the mean per-class recall instead.
    total = counts.sum()
    accuracy = np.trace(counts) / total if total > 0 else 0.0

    # Normalise each row (true class); the epsilon guards against empty rows.
    normalized = pd.DataFrame(
        counts / (counts.sum(axis=1)[:, None] + 1e-10), index=labels, columns=labels
    )

    fig = plt.figure(figsize=(25, 10))
    sn.heatmap(normalized, annot=True, cmap='Blues')
    # Put x-axis label and y-axis label
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title("Accuracy: {:.1f}%".format(accuracy*100))
    # Lay the figure out after everything has been drawn
    fig.tight_layout()
    return fig

In [None]:
def tb_log(model, trainer, evaluator, tag, root_dir="./tb_logs"):
    """Log one evaluator to TensorBoard under ``root_dir/tag``.

    Three things are logged: the optimizer parameters at the start of every
    training iteration, the evaluator's scalar metrics after each evaluation
    run, and a confusion-matrix figure after each evaluation run. The scalar
    metrics and the figure use the trainer's epoch as the global step.

    Args:
        model: currently unused.
        trainer: engine whose epoch counter provides the global step.
        evaluator: engine whose metrics are logged.
        tag (str): sub-directory name and tag of the confusion-matrix figure.
        root_dir (str): parent directory of the TensorBoard logs.
    """
    tb_logger = TensorboardLogger(log_dir=os.path.join(root_dir, tag))
    trainer_epoch = global_step_from_engine(trainer)

    # Optimizer parameters (uses the module-level `optimizer`)
    tb_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_STARTED,
        optimizer=optimizer,
    )

    # Scalar metrics of the evaluator
    scalar_metrics = ["nll", "accuracy", "recall", "precision"]
    tb_logger.attach_output_handler(
        evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="",
        metric_names=scalar_metrics,
        global_step_transform=trainer_epoch,
    )

    # Confusion matrix rendered as a figure
    def image_logger():
        figure = generate_CM_png(evaluator.state.metrics["cm"])
        step = trainer_epoch(evaluator, Events.EPOCH_COMPLETED)
        tb_logger.writer.add_figure(tag=tag, figure=figure, global_step=step)

    evaluator.add_event_handler(Events.EPOCH_COMPLETED, image_logger)

# How often (in epochs) both evaluation passes run.
validate_every = 1


def run_train_eval():
    """Evaluate on the (un-augmented) training set."""
    train_evaluator.run(train_eval_loader)


def run_valid_eval():
    """Evaluate on the validation set."""
    valid_evaluator.run(valid_loader)


# Handlers fire in registration order: training evaluation, validation
# evaluation, then the checkpoint.
trainer.add_event_handler(Events.EPOCH_COMPLETED(every=validate_every), run_train_eval)
trainer.add_event_handler(Events.EPOCH_COMPLETED(every=validate_every), run_valid_eval)

# Save the model's state dict after every epoch, keeping up to 50 checkpoints.
checkpointer = ModelCheckpoint(
    "./models",
    "speech_commands",
    n_saved=50,
    create_dir=True,
    save_as_state_dict=True,
    require_empty=False,
)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {"model": model})

# One TensorBoard log directory per evaluator.
for _evaluator, _tag in ((train_evaluator, "training"), (valid_evaluator, "validation")):
    tb_log(model, trainer, _evaluator, tag=_tag)

max_epochs = 60
trainer.run(train_loader, max_epochs)

In [None]:
# Load the model checkpoint saved at global step 47*166
# (presumably epoch 47 with 166 iterations per epoch -- confirm against ./models).
# The checkpoint belongs to the CNN defined above: M5 only exists as commented-out
# code and expects raw waveforms rather than the MFCC features used by predict().
model = CNN(n_input=16, n_output=len(labels)).to(device)
# map_location lets a checkpoint saved on another device load on this one.
model.load_state_dict(torch.load(f"./models/speech_commands_model_{47*166}.pt", map_location=device))

In [None]:
def predict(tensor, num_samples=16000):
    """Predict the label of a single waveform with the global ``model``.

    Args:
        tensor (Tensor): waveform with time as the last dimension, e.g. shape
            (1, num_samples) as returned by the dataset.
        num_samples (int): length the waveform is brought to before feature
            extraction. Shorter input is zero-padded at the end; longer input
            is cut to its first ``num_samples`` samples.

    Returns:
        str: the predicted word from ``labels``.
    """
    model.eval()

    length = tensor.shape[-1]
    if length < num_samples:
        tensor = F.pad(tensor, (0, num_samples - length), "constant", 0.0)
    elif length > num_samples:
        tensor = tensor[..., :num_samples]

    with torch.no_grad():
        # Extract the features on the waveform's own device and only then move
        # them to the model: the MFCC transform is not guaranteed to be on `device`.
        features = extract_mfcc(tensor)
        # (1, n_mfcc, frames) -> (1, frames, n_mfcc) for a mono waveform
        features = features.squeeze(0).t().unsqueeze(0).to(device)
        log_probs = model(features)

    index = get_likely_index(log_probs).squeeze().item()
    return index_to_label(index)

correct = 0
test_pred = []
test_target = []
for i, (waveform, sample_rate, label, speaker_id, utterance_number) in enumerate(test_set):
    output = predict(waveform)
    # Map the raw dataset word onto the model's vocabulary: words outside
    # `labels` belong to the 'unknown' class. Comparing the raw word with the
    # prediction would count every correctly rejected word as an error.
    target = index_to_label(label_to_index(label).item())
    test_pred.append(output)
    test_target.append(target)
    if output == target:
        correct += 1
    else:
        # An ipd.Audio object created inside a loop is never displayed, so only
        # the description of the misclassified example is printed.
        print(f"Data point spk {speaker_id} utt {utterance_number}. Expected: {target} (spoken word: {label}). Predicted: {output}.")

print(f"Accuracy: {correct}/{len(test_set)} ({100. * correct / len(test_set):.0f}%)")

In [None]:
import sounddevice as sd
print(sd.query_devices())

# Use the preferred microphone only when it is actually connected; otherwise
# keep the system default so the notebook also runs on other machines.
PREFERRED_INPUT_DEVICE = "Leo's iPhone 13 Microphone"
if any(
    PREFERRED_INPUT_DEVICE in dev["name"] and dev["max_input_channels"] > 0
    for dev in sd.query_devices()
):
    sd.default.device = PREFERRED_INPUT_DEVICE
else:
    print(f"Input device {PREFERRED_INPUT_DEVICE!r} not found; using the default device.")
def record(seconds=5, sample_rate=16000):
    """Record mono audio from the default input device and reload it as a tensor.

    The recording lasts ``seconds`` seconds, is written to "_audio.wav" in the
    working directory and read back with torchaudio.

    Returns:
        The result of ``torchaudio.load``: the waveform tensor and its sample rate.
    """
    print("Start recording.")
    num_frames = int(seconds * sample_rate)
    audio = sd.rec(num_frames, samplerate=sample_rate, channels=1)
    # Block until the recording has finished
    sd.wait()

    # Write the recording to a wav file using scipy, then load it again
    fileformat = "wav"
    filename = f"_audio.{fileformat}"
    wavfile.write(filename, sample_rate, audio)
    return torchaudio.load(filename)

# Record from the microphone and classify the recording
record_wav, sample_rate = record()
# sample_rate, record_wav = wavfile.read("_audio.wav")
# Convert to a float tensor when the audio did not come back as one
if not isinstance(record_wav, torch.Tensor):
    record_wav = torch.tensor(record_wav, dtype=torch.float32)
# Flatten to a single row: (1, samples)
record_wav = record_wav.reshape(1, -1)
predicted_word = predict(record_wav)
print(f"Predicted: {predicted_word}.")
ipd.Audio(record_wav, rate=sample_rate)

In [None]:
# Classify the current recording again without recording anew
if not isinstance(record_wav, torch.Tensor):
    record_wav = torch.tensor(record_wav, dtype=torch.float32)
record_wav = record_wav.reshape(1, -1)
predicted_word = predict(record_wav)
print(f"Predicted: {predicted_word}.")