In [43]:
Db = 'U8k'
Db = 'K20'

if Db == 'U8k':
    local_config = {
        'data_path'     : 'UrbanSound8K/audio',
        'metadata_file' : 'UrbanSound8K/metadata/UrbanSound8K.csv',
        'nb_classes'    :  10,}
else:
    local_config = {
        'data_path'     : '/',
        'metadata_file' : 'kitchen20b.csv',
        'nb_classes'    :  20,}  
        

In [44]:
# ----------------------------
# Prepare training data from Metadata file
# ----------------------------

import pandas as pd
from pathlib import Path

data_path = local_config['data_path']

# Read metadata file
metadata_file = local_config['metadata_file']

df = pd.read_csv(metadata_file)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'kitchen20b.csv'

In [3]:
# Construct file path by concatenating fold and file name
if Db == 'U8k':
    df['path'] = '/fold' + df['fold'].astype(str) + '/' + df['slice_file_name'].astype(str) # Urban8k

# Take relevant columns
df = df[['path', 'target']]

df.head()

Unnamed: 0,relative_path,classID
0,audio/0-158737-A-0.wav,0
1,audio/0-158737-B-0.wav,0
2,audio/0-158737-C-0.wav,0
3,audio/0-173319-A-0.wav,0
4,audio/0-173319-B-0.wav,0


In [4]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class AudioUtil():
    # ----------------------------
    # Load an audio file. Return the signal as a tensor and the sample rate
    # ----------------------------
    @staticmethod
    def open(audio_file):
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)
    # ----------------------------
    # Convert the given audio to the desired number of channels
    # ----------------------------
    @staticmethod
    def rechannel(aud, new_channel):
        sig, sr = aud

        if (sig.shape[0] == new_channel):
          # Nothing to do
          return aud

        if (new_channel == 1):
          # Convert from stereo to mono by selecting only the first channel
          resig = sig[:1, :]
        else:
          # Convert from mono to stereo by duplicating the first channel
          resig = torch.cat([sig, sig])

        return ((resig, sr))
    # ----------------------------
    # Since Resample applies to a single channel, we resample one channel at a time
    # ----------------------------
    @staticmethod
    def resample(aud, newsr):
        sig, sr = aud

        if (sr == newsr):
            # Nothing to do
            return aud

        num_channels = sig.shape[0]
        # Resample first channel
        resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
        if (num_channels > 1):
            # Resample the second channel and merge both channels
            retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
            resig = torch.cat([resig, retwo])

        return ((resig, newsr))
    # ----------------------------
    # Pad (or truncate) the signal to a fixed length 'max_ms' in milliseconds
    # ----------------------------
    @staticmethod
    def pad_trunc(aud, max_ms):
        sig, sr = aud
        num_rows, sig_len = sig.shape
        max_len = sr//1000 * max_ms

        if (sig_len > max_len):
            # Truncate the signal to the given length
            sig = sig[:,:max_len]

        elif (sig_len < max_len):
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len

            # Pad with 0s
            pad_begin = torch.zeros((num_rows, pad_begin_len))
            pad_end = torch.zeros((num_rows, pad_end_len))

            sig = torch.cat((pad_begin, sig, pad_end), 1)

        return (sig, sr)
    # ----------------------------
    # Shifts the signal to the left or right by some percent. Values at the end
    # are 'wrapped around' to the start of the transformed signal.
    # ----------------------------
    @staticmethod
    def time_shift(aud, shift_limit):
        sig,sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return (sig.roll(shift_amt), sr)
    # ----------------------------
    # Generate a Spectrogram
    # ----------------------------
    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        sig,sr = aud
        top_db = 80

        # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

        # Convert to decibels
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return (spec)
    # ----------------------------
    # Augment the Spectrogram by masking out some sections of it in both the frequency
    # dimension (ie. horizontal bars) and the time dimension (vertical bars) to prevent
    # overfitting and to help the model generalise better. The masked sections are
    # replaced with the mean value.
    # ----------------------------
    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

        return aug_spec

In [5]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
    def __init__(self, df, data_path):
        self.df = df
        self.data_path = str(data_path)
        self.duration = 4000
        self.sr = 44100
        self.channel = 2
        self.shift_pct = 0.4

    # ----------------------------
    # Number of items in dataset
    # ----------------------------
    def __len__(self):
        return len(self.df)    

    # ----------------------------
    # Get i'th item in dataset
    # ----------------------------
    def __getitem__(self, idx):
        # Absolute file path of the audio file - concatenate the audio directory with
        # the relative path
        audio_file = self.data_path + self.df.loc[idx, 'path']
        # Get the Class ID
        class_id = self.df.loc[idx, 'target']

        aud = AudioUtil.open(audio_file)
        # Some sounds have a higher sample rate, or fewer channels compared to the
        # majority. So make all sounds have the same number of channels and same 
        # sample rate. Unless the sample rate is the same, the pad_trunc will still
        # result in arrays of different lengths, even though the sound duration is
        # the same.
        reaud = AudioUtil.resample(aud, self.sr)
        rechan = AudioUtil.rechannel(reaud, self.channel)

        dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
        shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
        sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

        return aug_sgram, class_id

In [6]:
from torch.utils.data import random_split

myds = SoundDS(df, data_path)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)

In [7]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

# ----------------------------
# Audio Classification Model
# ----------------------------
class AudioClassifier (nn.Module):
    # ----------------------------
    # Build the model architecture
    # ----------------------------
    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Second Convolution Block
        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

#         # Second Convolution Block
#         self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
#         self.relu4 = nn.ReLU()
#         self.bn4 = nn.BatchNorm2d(64)
#         init.kaiming_normal_(self.conv4.weight, a=0.1)
#         self.conv4.bias.data.zero_()
#         conv_layers += [self.conv4, self.relu4, self.bn4]

#         # Linear Classifier
#         self.ap = nn.AdaptiveAvgPool2d(output_size=1)
#         self.lin = nn.Linear(in_features=64, out_features=local_config['nb_classes'])

        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=32, out_features=local_config['nb_classes'])

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)
 
    # ----------------------------
    # Forward pass computations
    # ----------------------------
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)

        # Final output
        return x


In [8]:
#pip install torchsummary 

In [12]:
from torchsummary import summary
myModel = AudioClassifier()
summary(myModel,(2, 4000, 64))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 8, 2000, 32]             408
            Conv2d-2          [-1, 8, 2000, 32]             408
              ReLU-3          [-1, 8, 2000, 32]               0
              ReLU-4          [-1, 8, 2000, 32]               0
       BatchNorm2d-5          [-1, 8, 2000, 32]              16
       BatchNorm2d-6          [-1, 8, 2000, 32]              16
            Conv2d-7         [-1, 16, 1000, 16]           1,168
            Conv2d-8         [-1, 16, 1000, 16]           1,168
              ReLU-9         [-1, 16, 1000, 16]               0
             ReLU-10         [-1, 16, 1000, 16]               0
      BatchNorm2d-11         [-1, 16, 1000, 16]              32
      BatchNorm2d-12         [-1, 16, 1000, 16]              32
           Conv2d-13           [-1, 32, 500, 8]           4,640
           Conv2d-14           [-1, 32,

In [2]:
# Create the model and put it on the GPU if available
myModel = AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)

# Check that it is on Cuda
next(myModel.parameters()).device

NameError: name 'local_config' is not defined

In [44]:
# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs):
    # Loss Function, Optimizer and Scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.05,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

    # Repeat for each epoch
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        # Repeat for each batch in the training set
        for i, data in enumerate(train_dl):
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Keep stats for Loss and Accuracy
            running_loss += loss.item()

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs,1)
            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

            #if i % 10 == 0:    # print every 10 mini-batches
            #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))

        # Print stats at the end of the epoch
        num_batches = len(train_dl)
        avg_loss = running_loss / num_batches
        acc = correct_prediction/total_prediction
        print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

    print('Finished Training')


In [None]:
num_epochs=500   # Just for demo, adjust this higher.
training(myModel, train_dl, num_epochs)

Epoch: 0, Loss: 2.80, Accuracy: 0.11
Epoch: 1, Loss: 2.55, Accuracy: 0.17
Epoch: 2, Loss: 2.36, Accuracy: 0.24
Epoch: 3, Loss: 2.18, Accuracy: 0.28
Epoch: 4, Loss: 2.09, Accuracy: 0.33
Epoch: 5, Loss: 1.96, Accuracy: 0.34
Epoch: 6, Loss: 1.88, Accuracy: 0.39
Epoch: 7, Loss: 1.82, Accuracy: 0.36
Epoch: 8, Loss: 1.79, Accuracy: 0.40
Epoch: 9, Loss: 1.74, Accuracy: 0.41
Epoch: 10, Loss: 1.70, Accuracy: 0.41
Epoch: 11, Loss: 1.64, Accuracy: 0.44
Epoch: 12, Loss: 1.63, Accuracy: 0.45
Epoch: 13, Loss: 1.61, Accuracy: 0.45
Epoch: 14, Loss: 1.47, Accuracy: 0.51
Epoch: 15, Loss: 1.53, Accuracy: 0.47
Epoch: 16, Loss: 1.44, Accuracy: 0.51
Epoch: 17, Loss: 1.40, Accuracy: 0.52
Epoch: 18, Loss: 1.45, Accuracy: 0.53
Epoch: 19, Loss: 1.37, Accuracy: 0.56
Epoch: 20, Loss: 1.36, Accuracy: 0.52
Epoch: 21, Loss: 1.29, Accuracy: 0.57
Epoch: 22, Loss: 1.28, Accuracy: 0.57
Epoch: 23, Loss: 1.35, Accuracy: 0.55
Epoch: 24, Loss: 1.21, Accuracy: 0.60
Epoch: 25, Loss: 1.26, Accuracy: 0.58
Epoch: 26, Loss: 1.26,

Epoch: 213, Loss: 0.59, Accuracy: 0.81
Epoch: 214, Loss: 0.52, Accuracy: 0.83
Epoch: 215, Loss: 0.57, Accuracy: 0.81
Epoch: 216, Loss: 0.67, Accuracy: 0.80
Epoch: 217, Loss: 0.70, Accuracy: 0.78
Epoch: 218, Loss: 0.68, Accuracy: 0.77
Epoch: 219, Loss: 0.60, Accuracy: 0.80
Epoch: 220, Loss: 0.57, Accuracy: 0.82
Epoch: 221, Loss: 0.58, Accuracy: 0.81
Epoch: 222, Loss: 0.62, Accuracy: 0.81
Epoch: 223, Loss: 0.57, Accuracy: 0.81
Epoch: 224, Loss: 0.54, Accuracy: 0.82
Epoch: 225, Loss: 0.47, Accuracy: 0.84
Epoch: 226, Loss: 0.60, Accuracy: 0.81
Epoch: 227, Loss: 0.50, Accuracy: 0.84
Epoch: 228, Loss: 0.51, Accuracy: 0.81
Epoch: 229, Loss: 0.57, Accuracy: 0.81
Epoch: 230, Loss: 0.66, Accuracy: 0.79
Epoch: 231, Loss: 0.58, Accuracy: 0.82
Epoch: 232, Loss: 0.58, Accuracy: 0.81
Epoch: 233, Loss: 0.55, Accuracy: 0.83
Epoch: 234, Loss: 0.61, Accuracy: 0.79
Epoch: 235, Loss: 0.58, Accuracy: 0.81
Epoch: 236, Loss: 0.63, Accuracy: 0.81
Epoch: 237, Loss: 0.58, Accuracy: 0.80
Epoch: 238, Loss: 0.49, A

Epoch: 424, Loss: 0.28, Accuracy: 0.91
Epoch: 425, Loss: 0.27, Accuracy: 0.90
Epoch: 426, Loss: 0.31, Accuracy: 0.90
Epoch: 427, Loss: 0.26, Accuracy: 0.92
Epoch: 428, Loss: 0.25, Accuracy: 0.92
Epoch: 429, Loss: 0.22, Accuracy: 0.93
Epoch: 430, Loss: 0.31, Accuracy: 0.89
Epoch: 431, Loss: 0.26, Accuracy: 0.92
Epoch: 432, Loss: 0.28, Accuracy: 0.91
Epoch: 433, Loss: 0.22, Accuracy: 0.93
Epoch: 434, Loss: 0.23, Accuracy: 0.92
Epoch: 435, Loss: 0.26, Accuracy: 0.92
Epoch: 436, Loss: 0.22, Accuracy: 0.92
Epoch: 437, Loss: 0.26, Accuracy: 0.91
Epoch: 438, Loss: 0.28, Accuracy: 0.91
Epoch: 439, Loss: 0.24, Accuracy: 0.91
Epoch: 440, Loss: 0.26, Accuracy: 0.91
Epoch: 441, Loss: 0.26, Accuracy: 0.92
Epoch: 442, Loss: 0.22, Accuracy: 0.92
Epoch: 443, Loss: 0.30, Accuracy: 0.90
Epoch: 444, Loss: 0.24, Accuracy: 0.92
Epoch: 445, Loss: 0.20, Accuracy: 0.93
Epoch: 446, Loss: 0.24, Accuracy: 0.92
Epoch: 447, Loss: 0.23, Accuracy: 0.93
Epoch: 448, Loss: 0.23, Accuracy: 0.93
Epoch: 449, Loss: 0.20, A

In [23]:
# ----------------------------
# Inference
# ----------------------------
def inference (model, val_dl):
    correct_prediction = 0
    total_prediction = 0

    # Disable gradient updates
    with torch.no_grad():
        for data in val_dl:
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s

            # Get predictions
            outputs = model(inputs)

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs,1)
            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

    acc = correct_prediction/total_prediction
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')


In [34]:
# Run inference on trained model with the validation set
inference(myModel, val_dl)

Accuracy: 0.58, Total items: 218


In [None]:
# Taux de reconnaissance par classe et confussion matrix