In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, random_split, DataLoader
import matplotlib.pyplot as plt
import scipy
import scipy.io
from scipy.signal import spectrogram
import os
import soundfile as sf
import librosa
import math
from librosa.feature import melspectrogram
from librosa.display import specshow
import cv2
import random
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report

In [2]:
# Try with 1D convolution

# Get dataset
# Class name -> integer label; labels are assigned in the order listed below.
# NOTE(review): class_dict is not referenced by any code visible in this
# notebook; presumably it documents the meaning of the integer labels that are
# parsed from the file names -- confirm.
class_dict = {
    name: index
    for index, name in enumerate((
        'siren',
        'jackhammer',
        'air_conditioner',
        'drilling',
        'children_playing',
        'street_music',
        'dog_bark',
        'engine_idling',
        'gun_shot',
        'car_horn',
    ))
}

class SoundDataset(Dataset):
    """In-memory dataset of raw audio clips read from a single directory.

    Every regular, non-hidden file in ``data_dir`` is decoded with
    ``librosa.load`` (library defaults) when the dataset is constructed. The
    integer class label is parsed from the file name: it is the last character
    of the part of the name that precedes the first ``'-'``.

    Each item is a ``(label, waveform, filename)`` tuple, where ``waveform`` is
    a ``torch.Tensor`` built from the array returned by ``librosa.load``.
    """

    def __init__(self, data_dir):
        """Eagerly load all clips found in ``data_dir``.

        Args:
            data_dir: Path of the directory that holds the audio files.

        Raises:
            ValueError: If the label character of a file name is not a digit.
        """
        self.sample_list = []

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any unshuffled evaluation) reproducible.
        for filename in sorted(os.listdir(data_dir)):
            path = os.path.join(data_dir, filename)
            # Skip hidden files and sub-directories (e.g. .DS_Store,
            # .ipynb_checkpoints): they are not audio clips and would break
            # decoding or label parsing.
            if filename.startswith('.') or not os.path.isfile(path):
                continue
            data, _ = librosa.load(path)  # the returned sample rate is not used
            label = int(filename.split('-')[0][-1])
            self.sample_list.append((label, torch.tensor(data), filename))

    def __len__(self):
        """Return the number of loaded clips."""
        return len(self.sample_list)

    def __getitem__(self, idx):
        """Return the ``(label, waveform, filename)`` tuple stored at ``idx``."""
        return self.sample_list[idx]

In [3]:
# Read the clean ("Raw") train / validation / test splits into memory
trainset, valset, testset = (
    SoundDataset('../Datasets/train/Raw'),
    SoundDataset('../Datasets/val/Raw'),
    SoundDataset('../Datasets/test/Raw'),
)

In [4]:
# Create Neural Network Model
class AudioNet(nn.Module):
    """1D-convolutional classifier that operates directly on raw waveforms.

    Five strided ``Conv1d`` layers (the first four followed by batch norm),
    two max-pooling steps and a two-layer linear head. ``fc1`` takes exactly
    ``64 * 22`` flattened features, so the input length has to reduce to 22
    time steps after ``conv5`` (an 88200-sample input does).

    The only non-linearities are the LeakyReLU after ``batchnorm2`` and the two
    max-pools; ``fc1`` feeds ``fc2`` without an activation in between.
    """

    def __init__(self, num_classes):
        """Create the layers.

        Args:
            num_classes: Number of logits produced by the final linear layer.
        """
        super().__init__()

        # Layers are registered in the same order in which forward() uses them.
        self.conv1 = nn.Conv1d(1, 16, 23, stride=5)
        self.batchnorm1 = nn.BatchNorm1d(16)
        self.conv2 = nn.Conv1d(16, 32, 7, stride=3)
        self.batchnorm2 = nn.BatchNorm1d(32)
        self.lrelu = nn.LeakyReLU()
        self.max2 = nn.MaxPool1d(3)
        self.conv3 = nn.Conv1d(32, 64, 5, stride=3)
        self.batchnorm3 = nn.BatchNorm1d(64)
        self.drop1 = nn.Dropout(0.5)
        self.max3 = nn.MaxPool1d(3)
        self.conv4 = nn.Conv1d(64, 64, 5, stride=3)
        self.batchnorm4 = nn.BatchNorm1d(64)
        self.conv5 = nn.Conv1d(64, 64, 7, stride=3)
        self.fc1 = nn.Linear(64 * 22, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of waveforms.

        Args:
            x: Tensor with a single input channel, as required by ``conv1``;
                the callers in this notebook pass ``(batch, 1, samples)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised logits.
        """
        # Block 1: convolution + batch norm
        feats = self.batchnorm1(self.conv1(x))

        # Block 2: convolution + batch norm + LeakyReLU, then pooling
        feats = self.lrelu(self.batchnorm2(self.conv2(feats)))
        feats = self.max2(feats)

        # Block 3: convolution + batch norm + dropout, then pooling
        feats = self.drop1(self.batchnorm3(self.conv3(feats)))
        feats = self.max3(feats)

        # Blocks 4 and 5: batch-normalised convolution followed by a plain one
        feats = self.batchnorm4(self.conv4(feats))
        feats = self.conv5(feats)

        # Flatten everything but the batch axis and apply the linear head
        flat = feats.view(feats.size(0), -1)
        return self.fc2(self.fc1(flat))
    
    
class AudioNetSpect(nn.Module):
    """2D-convolutional classifier for single-channel spectrogram images.

    Three unpadded ``Conv2d`` layers with batch norm; the first two are also
    followed by dropout, LeakyReLU and 3x3 max-pooling. ``fc1`` takes exactly
    ``64 * 8 * 13`` flattened features, so the input has to reduce to an
    8 x 13 map after ``conv3`` (a 128 x 173 input does). ``fc1`` feeds ``fc2``
    without an activation in between.
    """

    def __init__(self, num_classes):
        """Create the layers.

        Args:
            num_classes: Number of logits produced by the final linear layer.
        """
        super().__init__()

        # Layers are registered in the same order in which forward() uses them.
        self.conv1 = nn.Conv2d(1, 16, 7)
        self.batchnorm1 = nn.BatchNorm2d(16)
        self.drop1 = nn.Dropout(0.5)
        self.lrelu1 = nn.LeakyReLU()
        self.max1 = nn.MaxPool2d(3)
        self.conv2 = nn.Conv2d(16, 32, 5)
        self.batchnorm2 = nn.BatchNorm2d(32)
        self.drop2 = nn.Dropout(0.5)
        self.lrelu2 = nn.LeakyReLU()
        self.max2 = nn.MaxPool2d(3)
        self.conv3 = nn.Conv2d(32, 64, 5)
        self.batchnorm3 = nn.BatchNorm2d(64)
        self.fc1 = nn.Linear(64 * 8 * 13, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of spectrograms.

        Args:
            x: Tensor with a single input channel, as required by ``conv1``
                (e.g. ``(batch, 1, height, width)``).

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised logits.
        """
        # Block 1: conv -> batch norm -> dropout -> LeakyReLU -> pool
        feats = self.batchnorm1(self.conv1(x))
        feats = self.max1(self.lrelu1(self.drop1(feats)))

        # Block 2: same layout as block 1
        feats = self.batchnorm2(self.conv2(feats))
        feats = self.max2(self.lrelu2(self.drop2(feats)))

        # Block 3: conv -> batch norm only
        feats = self.batchnorm3(self.conv3(feats))

        # Flatten everything but the batch axis and apply the linear head
        flat = feats.view(feats.size(0), -1)
        return self.fc2(self.fc1(flat))

In [5]:
# Build the raw-audio model and move it to the GPU when one is available;
# fall back to the CPU instead of raising on machines without CUDA.
# NOTE(review): num_classes=9 although class_dict lists 10 names; the
# classification reports printed further down only contain labels 0-8 --
# confirm that label 9 never occurs in the data.
model = AudioNet(num_classes=9)
model.to('cuda' if torch.cuda.is_available() else 'cpu')

AudioNet(
  (conv1): Conv1d(1, 16, kernel_size=(23,), stride=(5,))
  (batchnorm1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(16, 32, kernel_size=(7,), stride=(3,))
  (batchnorm2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lrelu): LeakyReLU(negative_slope=0.01)
  (max2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(5,), stride=(3,))
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop1): Dropout(p=0.5, inplace=False)
  (max3): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(5,), stride=(3,))
  (batchnorm4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): Conv1d(64, 64, kernel_size=(7,), stride=(3,))
  (fc1): Linear(in_features=1408, out_features=64, bias=True)
  (fc2

In [6]:
# Loss function and optimiser for `model`
# (lr is spelled out for readability; 1e-3 is Adam's default learning rate)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [7]:
# Batch the three splits; only the training data is reshuffled every epoch
train_loader = DataLoader(trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(valset, batch_size=32, shuffle=False)
test_loader = DataLoader(testset, batch_size=32, shuffle=False)

In [8]:
# Train the model, keeping the checkpoint with the lowest validation loss
best_val_loss = math.inf
num_epochs = 50

# Follow the device the model already lives on instead of hard-coding CUDA.
device = next(model.parameters()).device

# torch.save() does not create missing directories.
os.makedirs('models', exist_ok=True)

for epoch in range(num_epochs):
    tot_train_loss = 0
    tot_train_samples = 0

    # Train Loop
    model.train()
    for label, data, _ in train_loader:
        # unsqueeze(1) inserts the channel axis that the model's Conv1d expects
        data = data.float().to(device).unsqueeze(1)
        label = label.long().to(device)

        optimizer.zero_grad() # clear gradients
        outputs = model(data) # get outputs
        train_loss = criterion(outputs, label) # get batch loss (mean over the batch)
        train_loss.backward() # get gradients
        optimizer.step() # update parameters

        # Weight the batch mean by the batch size so that a smaller final
        # batch does not skew the epoch average.
        tot_train_loss += train_loss.item() * data.shape[0]
        tot_train_samples += data.shape[0]
    avg_train_loss = tot_train_loss / tot_train_samples

    # Eval Loop
    tot_val_loss = 0
    tot_val_samples = 0

    model.eval()
    with torch.no_grad(): # validation needs no gradients; skip building the graph
        for label, data, _ in val_loader:
            data = data.float().to(device).unsqueeze(1)
            label = label.long().to(device)

            outputs = model(data) # get outputs
            val_loss = criterion(outputs, label) # get batch loss (mean over the batch)
            tot_val_loss += val_loss.item() * data.shape[0]
            tot_val_samples += data.shape[0]

    avg_val_loss = tot_val_loss / tot_val_samples
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'models/audio_net.pt')

    # print losses on every odd-numbered epoch and after the final epoch
    if (epoch + 1) % 2 == 1 or (epoch + 1) == num_epochs:
        print('Epoch {}'.format(epoch + 1))
        print('Training Loss: {}'.format(avg_train_loss))
        print('Val Loss: {} \n'.format(avg_val_loss))

Epoch 1
Training Loss: 1.8439535913260088
Val Loss: 1.9279757867688718 

Epoch 3
Training Loss: 1.5188240072001582
Val Loss: 2.0070491057375204 

Epoch 5
Training Loss: 1.3413328662298727
Val Loss: 2.060727837293044 

Epoch 7
Training Loss: 1.2941167043602986
Val Loss: 1.9427455624808436 

Epoch 9
Training Loss: 1.1509858542594358
Val Loss: 2.265263145060643 

Epoch 11
Training Loss: 1.0799903476583785
Val Loss: 2.034434985855351 

Epoch 13
Training Loss: 1.1053624533224797
Val Loss: 2.2580907370733176 

Epoch 15
Training Loss: 1.0555038188678632
Val Loss: 1.8991042790205583 

Epoch 17
Training Loss: 1.0577878209127896
Val Loss: 2.4970634488955787 

Epoch 19
Training Loss: 0.9782850457274396
Val Loss: 2.2641718180283257 

Epoch 21
Training Loss: 0.886574650372284
Val Loss: 2.567906541020974 

Epoch 23
Training Loss: 0.9144020026576691
Val Loss: 2.564814975080283 

Epoch 25
Training Loss: 0.853685880916706
Val Loss: 2.366185043816981 

Epoch 27
Training Loss: 0.8641154847607233
Val Loss

In [9]:
# Evaluate the best clean-audio checkpoint on the clean test set.
# Only the test split is used here, so the train/val splits and loaders that
# were created earlier from the same clean directories are left untouched.
testset = SoundDataset('../Datasets/test/Raw')
test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

# Use the GPU when available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AudioNet(num_classes=9)
# map_location lets a checkpoint saved from the GPU load on a CPU-only machine.
model.load_state_dict(torch.load('models/audio_net.pt', map_location=device))
model.to(device)
model.eval()

predictions = []
gt = []

# Inference only: no_grad() avoids building the autograd graph.
with torch.no_grad():
    for label, data, _ in test_loader:
        # unsqueeze(1) inserts the channel axis that the model's Conv1d expects
        data = data.float().to(device).unsqueeze(1)

        outputs = model(data) # get outputs

        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)
        predictions.extend(predicted.cpu().tolist())
        gt.extend(label.tolist())

# Print performance results
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.4638369886465714
recall: 0.445283359460947
f1: 0.4192510988346981
acc: 0.43365253077975374

              precision    recall  f1-score   support

           0       0.68      0.61      0.64       170
           1       0.40      0.63      0.49       162
           2       0.34      0.18      0.24       202
           3       0.63      0.51      0.56       179
           4       0.34      0.67      0.45       203
           5       0.35      0.38      0.36       184
           6       0.54      0.11      0.18       119
           7       0.51      0.28      0.36       204
           8       0.39      0.64      0.49        39

    accuracy                           0.43      1462
   macro avg       0.46      0.45      0.42      1462
weighted avg       0.46      0.43      0.42      1462



### Now evaluate the raw audio model on the noisy raw audio

In [10]:
# Evaluate the clean-audio checkpoint on the noisy test set.
# Only the test split is used in this cell, so only that split is loaded.
testset = SoundDataset('../Datasets/test/NoisyRaw')
test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

# Use the GPU when available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AudioNet(num_classes=9)
# map_location lets a checkpoint saved from the GPU load on a CPU-only machine.
model.load_state_dict(torch.load('models/audio_net.pt', map_location=device))
model.to(device)
model.eval()

predictions = []
gt = []

# Inference only: no_grad() avoids building the autograd graph. No loss is
# accumulated here: the previous tot_val_loss / tot_val_samples updates relied
# on variables left over from the training cell and their values were never read.
with torch.no_grad():
    for label, data, _ in test_loader:
        # unsqueeze(1) inserts the channel axis that the model's Conv1d expects
        data = data.float().to(device).unsqueeze(1)

        outputs = model(data) # get outputs

        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)
        predictions.extend(predicted.cpu().tolist())
        gt.extend(label.tolist())

# Print performance
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.1826003042068713
recall: 0.1438400646632264
f1: 0.06976997025924174
acc: 0.15937072503419972

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       170
           1       0.13      0.15      0.14       162
           2       1.00      0.00      0.01       202
           3       0.15      0.98      0.26       179
           4       0.37      0.16      0.22       203
           5       0.00      0.00      0.00       184
           6       0.00      0.00      0.00       119
           7       0.00      0.00      0.00       204
           8       0.00      0.00      0.00        39

    accuracy                           0.16      1462
   macro avg       0.18      0.14      0.07      1462
weighted avg       0.22      0.16      0.08      1462



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Now train a model on the noisy raw audio

In [11]:
# Load the data (noisy audio)
trainset = SoundDataset('../Datasets/train/NoisyRaw')
valset = SoundDataset('../Datasets/val/NoisyRaw')
testset = SoundDataset('../Datasets/test/NoisyRaw')

# Fresh model for the noisy-audio run: on the GPU when one is available,
# otherwise on the CPU (instead of raising on machines without CUDA).
model = AudioNet(num_classes=9)
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Get loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Create data loaders; only the training data is reshuffled every epoch
train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)


# Train the model, keeping the checkpoint with the lowest validation loss
best_val_loss = math.inf
num_epochs = 50

# Follow the device the model already lives on instead of hard-coding CUDA.
device = next(model.parameters()).device

# torch.save() does not create missing directories.
os.makedirs('models', exist_ok=True)

for epoch in range(num_epochs):
    tot_train_loss = 0
    tot_train_samples = 0

    # Train Loop
    model.train()
    for label, data, _ in train_loader:
        # unsqueeze(1) inserts the channel axis that the model's Conv1d expects
        data = data.float().to(device).unsqueeze(1)
        label = label.long().to(device)

        optimizer.zero_grad() # clear gradients
        outputs = model(data) # get outputs
        train_loss = criterion(outputs, label) # get batch loss (mean over the batch)
        train_loss.backward() # get gradients
        optimizer.step() # update parameters

        # Weight the batch mean by the batch size so that a smaller final
        # batch does not skew the epoch average.
        tot_train_loss += train_loss.item() * data.shape[0]
        tot_train_samples += data.shape[0]
    avg_train_loss = tot_train_loss / tot_train_samples

    # Eval Loop
    tot_val_loss = 0
    tot_val_samples = 0

    model.eval()
    with torch.no_grad(): # validation needs no gradients; skip building the graph
        for label, data, _ in val_loader:
            data = data.float().to(device).unsqueeze(1)
            label = label.long().to(device)

            outputs = model(data) # get outputs
            val_loss = criterion(outputs, label) # get batch loss (mean over the batch)
            tot_val_loss += val_loss.item() * data.shape[0]
            tot_val_samples += data.shape[0]

    avg_val_loss = tot_val_loss / tot_val_samples
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'models/audio_net_noisy.pt')

    # print losses on every odd-numbered epoch and after the final epoch
    if (epoch + 1) % 2 == 1 or (epoch + 1) == num_epochs:
        print('Epoch {}'.format(epoch + 1))
        print('Training Loss: {}'.format(avg_train_loss))
        print('Val Loss: {} \n'.format(avg_val_loss))

Epoch 1
Training Loss: 1.941097106622613
Val Loss: 2.1866784393787384 

Epoch 3
Training Loss: 1.6412584669348123
Val Loss: 2.1670641380807627 

Epoch 5
Training Loss: 1.4389490018720212
Val Loss: 2.2892929911613464 

Epoch 7
Training Loss: 1.3362101649028668
Val Loss: 2.132575697225073 

Epoch 9
Training Loss: 1.2542747954527538
Val Loss: 2.4883524607057157 

Epoch 11
Training Loss: 1.1896734678226968
Val Loss: 2.2489837395108263 

Epoch 13
Training Loss: 1.0794713449650917
Val Loss: 2.268553307522898 

Epoch 15
Training Loss: 1.0439229393107952
Val Loss: 2.3620149856028347 

Epoch 17
Training Loss: 0.9565788539922864
Val Loss: 2.581942242124806 

Epoch 19
Training Loss: 1.028797712015069
Val Loss: 2.4926840015079663 

Epoch 21
Training Loss: 0.8476414230274225
Val Loss: 2.4482350401256396 

Epoch 23
Training Loss: 0.8995547851790553
Val Loss: 3.226463768793189 

Epoch 25
Training Loss: 0.9616107189137003
Val Loss: 3.8544176467086957 

Epoch 27
Training Loss: 0.8147532140863114
Val Lo

### Now evaluate the noisy-trained model

### Evaluate on noisy raw data

In [12]:
# Evaluate the noisy-audio checkpoint on the noisy test set.
# Only the test split is used in this cell, so only that split is loaded.
testset = SoundDataset('../Datasets/test/NoisyRaw')
test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

# Use the GPU when available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AudioNet(num_classes=9)
# map_location lets a checkpoint saved from the GPU load on a CPU-only machine.
model.load_state_dict(torch.load('models/audio_net_noisy.pt', map_location=device))
model.to(device)
model.eval()

predictions = []
gt = []

# Inference only: no_grad() avoids building the autograd graph. No loss is
# accumulated here: the previous tot_val_loss / tot_val_samples updates relied
# on variables left over from the training cell and their values were never read.
with torch.no_grad():
    for label, data, _ in test_loader:
        # unsqueeze(1) inserts the channel axis that the model's Conv1d expects
        data = data.float().to(device).unsqueeze(1)

        outputs = model(data) # get outputs

        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)
        predictions.extend(predicted.cpu().tolist())
        gt.extend(label.tolist())

# Print performance
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.445975524527047
recall: 0.3260443096231014
f1: 0.33204761721043996
acc: 0.3454172366621067

              precision    recall  f1-score   support

           0       0.74      0.51      0.61       170
           1       0.34      0.18      0.23       162
           2       0.25      0.15      0.19       202
           3       0.37      0.51      0.43       179
           4       0.32      0.37      0.34       203
           5       0.22      0.58      0.32       184
           6       0.50      0.07      0.12       119
           7       0.45      0.34      0.38       204
           8       0.82      0.23      0.36        39

    accuracy                           0.35      1462
   macro avg       0.45      0.33      0.33      1462
weighted avg       0.40      0.35      0.34      1462



### Evaluate on clean raw data

In [13]:
# Evaluate the noisy-audio checkpoint on the clean test set.
# NOTE(review): only the test split is used in this cell. The clean train/val
# splits and loaders are still rebuilt because cells further down (not visible
# in this review) may expect them under these names -- drop them if not.
trainset = SoundDataset('../Datasets/train/Raw')
valset = SoundDataset('../Datasets/val/Raw')
testset = SoundDataset('../Datasets/test/Raw')

# Get data loaders
train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

# Use the GPU when available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AudioNet(num_classes=9)
# map_location lets a checkpoint saved from the GPU load on a CPU-only machine.
model.load_state_dict(torch.load('models/audio_net_noisy.pt', map_location=device))
model.to(device)
model.eval()

predictions = []
gt = []

# Inference only: no_grad() avoids building the autograd graph. No loss is
# accumulated here: the previous tot_val_loss / tot_val_samples updates relied
# on variables left over from the training cell and their values were never read.
with torch.no_grad():
    for label, data, _ in test_loader:
        # unsqueeze(1) inserts the channel axis that the model's Conv1d expects
        data = data.float().to(device).unsqueeze(1)

        outputs = model(data) # get outputs

        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)
        predictions.extend(predicted.cpu().tolist())
        gt.extend(label.tolist())

# Print performance
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.4775651011296691
recall: 0.2981091038890537
f1: 0.29072049488557145
acc: 0.32694938440492477

              precision    recall  f1-score   support

           0       0.41      0.64      0.50       170
           1       0.54      0.09      0.16       162
           2       0.26      0.29      0.27       202
           3       0.62      0.26      0.37       179
           4       0.32      0.30      0.31       203
           5       0.20      0.54      0.29       184
           6       1.00      0.05      0.10       119
           7       0.46      0.38      0.42       204
           8       0.50      0.13      0.20        39

    accuracy                           0.33      1462
   macro avg       0.48      0.30      0.29      1462
weighted avg       0.45      0.33      0.31      1462



# Everything below is scratch work