In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, random_split, DataLoader
import matplotlib.pyplot as plt
import scipy
import scipy.io
from scipy.signal import spectrogram
import os
import soundfile as sf
import librosa
import math
from librosa.feature import melspectrogram
from librosa.display import specshow
import cv2
import random
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report

# Demo Image Data Model

# Demo Audio Data Model

In [2]:
# Get spect_dataset
class SpectDataset(Dataset):
    """In-memory dataset of spectrogram images read from a single directory.

    Every entry of ``data_dir`` is read with ``plt.imread`` and reduced to a
    single 2-D channel. The class label is the last character of the filename
    part that precedes the first ``'-'``, parsed as an integer (so only
    single-digit labels are supported).

    Each sample is a ``(label, image_tensor, filename)`` tuple.
    """

    def __init__(self, data_dir):
        """Eagerly load every image found in ``data_dir``.

        Args:
            data_dir: Directory containing the spectrogram image files.

        Raises:
            ValueError: If a filename does not yield an integer label.
        """
        self.sample_list = []

        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample indices reproducible across runs and machines.
        for filename in sorted(os.listdir(data_dir)):
            image = plt.imread(os.path.join(data_dir, filename))
            if image.ndim == 3:
                # Multi-channel image: keep only the first channel.
                image = image[:, :, 0]
            label = int(filename.split('-')[0][-1])
            self.sample_list.append((label, torch.tensor(image), filename)) #class_label, data, filename

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.sample_list)

    def __getitem__(self, idx):
        """Return the ``(label, image_tensor, filename)`` tuple at ``idx``."""
        return self.sample_list[idx]

In [3]:
# Read the train / val / test spectrogram folders into memory, one
# SpectDataset per split.
trainset, valset, testset = (
    SpectDataset(split_dir)
    for split_dir in (
        '../Datasets/train/Spectrograms',
        '../Datasets/val/Spectrograms',
        '../Datasets/test/Spectrograms',
    )
)

In [5]:
# Create Neural Network Model 
class AudioNetSpect(nn.Module):
    """2-D convolutional classifier for single-channel spectrograms.

    Layout: two conv -> batch-norm -> dropout -> LeakyReLU -> max-pool stages,
    a third conv + batch-norm, then two stacked linear layers producing
    ``num_classes`` logits. The first linear layer is sized for a
    ``64 x 8 x 13`` feature map, so the input spatial size must reduce to
    8 x 13 after the convolutional stack.
    """

    def __init__(self, num_classes):
        """Create the layers.

        Args:
            num_classes: Number of output logits.
        """
        super().__init__()

        # Stage 1: 1 -> 16 channels, 7x7 kernel, then 3x3 pooling.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=7, stride=1, padding=0)
        self.batchnorm1 = nn.BatchNorm2d(num_features=16)
        self.drop1 = nn.Dropout(p=0.5)
        self.lrelu1 = nn.LeakyReLU()
        self.max1 = nn.MaxPool2d(kernel_size=3)

        # Stage 2: 16 -> 32 channels, 5x5 kernel, then 3x3 pooling.
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=0)
        self.batchnorm2 = nn.BatchNorm2d(num_features=32)
        self.drop2 = nn.Dropout(p=0.5)
        self.lrelu2 = nn.LeakyReLU()
        self.max2 = nn.MaxPool2d(kernel_size=3)

        # Stage 3: 32 -> 64 channels, 5x5 kernel, no activation or pooling.
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=0)
        self.batchnorm3 = nn.BatchNorm2d(num_features=64)

        # Classifier head (no non-linearity between the two linear layers).
        self.fc1 = nn.Linear(in_features=64 * 8 * 13, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        """Map a batch of ``(N, 1, H, W)`` inputs to ``(N, num_classes)`` logits."""
        feature_layers = (
            self.conv1, self.batchnorm1, self.drop1, self.lrelu1, self.max1,
            self.conv2, self.batchnorm2, self.drop2, self.lrelu2, self.max2,
            self.conv3, self.batchnorm3,
        )
        for layer in feature_layers:
            x = layer(x)

        # Collapse channel and spatial dimensions into one feature vector per sample.
        flat = x.view(x.size(0), -1)
        return self.fc2(self.fc1(flat))

In [5]:
# Build the nine-class spectrogram CNN, then move its parameters to the GPU.
# The trailing expression echoes the module summary as the cell output.
model = AudioNetSpect(9)
model.cuda()

AudioNetSpect(
  (conv1): Conv2d(1, 16, kernel_size=(7, 7), stride=(1, 1))
  (batchnorm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop1): Dropout(p=0.5, inplace=False)
  (lrelu1): LeakyReLU(negative_slope=0.01)
  (max1): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (batchnorm2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop2): Dropout(p=0.5, inplace=False)
  (lrelu2): LeakyReLU(negative_slope=0.01)
  (max2): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (batchnorm3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=6656, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=9, bias=True)
)

In [6]:
# Optimiser: Adam with its default hyper-parameters over all model weights.
# Objective: multi-class cross-entropy on the raw logits.
optimizer = torch.optim.Adam(params=model.parameters())
criterion = nn.CrossEntropyLoss()

In [7]:
# Create data loaders.
# Only the training data is shuffled: shuffling matters for SGD, whereas
# validation and test batches are just aggregated into metrics, so a fixed
# order keeps those runs reproducible and matches the evaluation cells.
val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

In [8]:
# Train the model, keeping the checkpoint with the lowest validation loss.
best_val_loss = math.inf
num_epochs = 50

# torch.save does not create missing parent directories.
os.makedirs('models', exist_ok=True)

for epoch in range(num_epochs):
    tot_train_loss = 0
    tot_train_samples = 0

    # Train Loop
    model.train()
    for label, data, filename in train_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        label = label.long().cuda()

        optimizer.zero_grad() # clear gradients
        outputs = model(data) # get outputs
        train_loss = criterion(outputs, label) # mean loss over the batch
        train_loss.backward() # get gradients
        optimizer.step() # update parameters

        # Weight each batch mean by its size so a smaller final batch does
        # not skew the epoch average.
        batch_size = data.shape[0]
        tot_train_loss += train_loss.item() * batch_size
        tot_train_samples += batch_size
    avg_train_loss = tot_train_loss / tot_train_samples

    # Eval Loop
    tot_val_loss = 0
    tot_val_samples = 0

    model.eval()
    with torch.no_grad(): # validation needs no autograd graph
        for label, data, filename in val_loader:
            data = data.float().cuda().unsqueeze(1)
            label = label.long().cuda()

            outputs = model(data) # get outputs
            val_loss = criterion(outputs, label) # mean loss over the batch

            batch_size = data.shape[0]
            tot_val_loss += val_loss.item() * batch_size
            tot_val_samples += batch_size

    avg_val_loss = tot_val_loss / tot_val_samples
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'models/audio_net_spect.pt')

    # print losses on epochs 1, 6, 11, ... and on the final epoch
    if (epoch + 1) % 5 == 1 or (epoch + 1) == num_epochs:
        print('Epoch {}'.format(epoch + 1))
        print('Training Loss: {}'.format(avg_train_loss))
        print('Val Loss: {} \n'.format(avg_val_loss))

Epoch 1
Training Loss: 2.2212227980295816
Val Loss: 4.0897000333213285 

Epoch 6
Training Loss: 0.9574459199456201
Val Loss: 3.2537564352235715 

Epoch 11
Training Loss: 0.6474790497534517
Val Loss: 2.046630311595357 

Epoch 16
Training Loss: 0.5200708242862121
Val Loss: 2.2128447222968806 

Epoch 21
Training Loss: 0.4108680505575475
Val Loss: 2.0192513517711475 

Epoch 26
Training Loss: 0.3892952429967514
Val Loss: 1.7079246279419116 

Epoch 31
Training Loss: 0.808049185913515
Val Loss: 3.92334936815314 

Epoch 36
Training Loss: 0.31203319824746123
Val Loss: 1.6939864286181072 

Epoch 41
Training Loss: 0.31455468402608583
Val Loss: 2.4970930934600206 

Epoch 46
Training Loss: 0.743855760689231
Val Loss: 3.5665388351063365 

Epoch 50
Training Loss: 0.39336921026309324
Val Loss: 3.502903089605515 



In [9]:
# Evaluate the model on the test dataset
# Load the data (spectrogram)
# NOTE: only testset / test_loader are read in this cell; the train and val
# objects are still rebuilt so the notebook-level names keep their bindings.
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

# Create data loaders
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Restore the checkpoint with the best validation loss.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_spect.pt'))
model.cuda()
model.eval()  # inference mode for dropout / batch-norm

predictions = []
gt = []

# Inference only: no loss is needed and no autograd graph should be built.
# This also removes the dependency on counters left over from the training cell.
with torch.no_grad():
    for label, data, filename in test_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        outputs = model(data)
        predictions += outputs.argmax(dim=1).cpu().tolist()
        gt += label.long().tolist()

# Print performance results
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.771919269354611
recall: 0.6570119411137009
f1: 0.678916937882043
acc: 0.6757865937072504

              precision    recall  f1-score   support

           0       0.95      0.54      0.68       170
           1       0.41      0.91      0.57       162
           2       0.67      0.77      0.72       202
           3       0.93      0.54      0.68       179
           4       0.63      0.64      0.64       203
           5       0.66      0.77      0.71       184
           6       0.81      0.55      0.65       119
           7       0.93      0.70      0.80       204
           8       0.95      0.51      0.67        39

    accuracy                           0.68      1462
   macro avg       0.77      0.66      0.68      1462
weighted avg       0.75      0.68      0.68      1462



### Evaluate on NoisyRawSpectrogram

In [10]:
# Evaluate the model on the test dataset
# Load the data (spectrogram)
# NOTE: only testset / test_loader are read in this cell; the train and val
# objects are still rebuilt so the notebook-level names keep their bindings.
trainset = SpectDataset('../Datasets/train/NoisyRawSpectrograms')
valset = SpectDataset('../Datasets/val/NoisyRawSpectrograms')
testset = SpectDataset('../Datasets/test/NoisyRawSpectrograms')

# Create data loaders
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Restore the checkpoint with the best validation loss.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_spect.pt'))
model.cuda()
model.eval()  # inference mode for dropout / batch-norm

predictions = []
gt = []

# Inference only: no loss is needed and no autograd graph should be built.
# This also removes the dependency on counters left over from the training cell.
with torch.no_grad():
    for label, data, filename in test_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        outputs = model(data)
        predictions += outputs.argmax(dim=1).cpu().tolist()
        gt += label.long().tolist()

# Print performance results
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.7655531621657431
recall: 0.6308606210189561
f1: 0.6558567668955364
acc: 0.6436388508891929

              precision    recall  f1-score   support

           0       0.97      0.49      0.65       170
           1       0.36      0.96      0.52       162
           2       0.72      0.67      0.70       202
           3       0.85      0.58      0.69       179
           4       0.62      0.61      0.62       203
           5       0.67      0.74      0.70       184
           6       0.85      0.52      0.65       119
           7       0.91      0.58      0.71       204
           8       0.95      0.51      0.67        39

    accuracy                           0.64      1462
   macro avg       0.77      0.63      0.66      1462
weighted avg       0.75      0.64      0.66      1462



### Evaluate on NoisySpectrogram

In [11]:
# Evaluate the model on the test dataset
# Load the data (spectrogram)
# NOTE: only testset / test_loader are read in this cell; the train and val
# objects are still rebuilt so the notebook-level names keep their bindings.
trainset = SpectDataset('../Datasets/train/NoisySpectrograms')
valset = SpectDataset('../Datasets/val/NoisySpectrograms')
testset = SpectDataset('../Datasets/test/NoisySpectrograms')

# Create data loaders
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Restore the checkpoint with the best validation loss.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_spect.pt'))
model.cuda()
model.eval()  # inference mode for dropout / batch-norm

predictions = []
gt = []

# Inference only: no loss is needed and no autograd graph should be built.
# This also removes the dependency on counters left over from the training cell.
with torch.no_grad():
    for label, data, filename in test_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        outputs = model(data)
        predictions += outputs.argmax(dim=1).cpu().tolist()
        gt += label.long().tolist()

# Print performance results
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.4258816994268162
recall: 0.21544657452650245
f1: 0.14958378745517759
acc: 0.20588235294117646

              precision    recall  f1-score   support

           0       1.00      0.01      0.01       170
           1       0.14      1.00      0.25       162
           2       0.11      0.03      0.05       202
           3       0.47      0.08      0.13       179
           4       0.57      0.23      0.32       203
           5       1.00      0.01      0.01       184
           6       0.55      0.59      0.57       119
           7       0.00      0.00      0.00       204
           8       0.00      0.00      0.00        39

    accuracy                           0.21      1462
   macro avg       0.43      0.22      0.15      1462
weighted avg       0.45      0.21      0.14      1462



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Train on NoisyRawSpectrogram

In [12]:
# Load the data (spectrogram)
trainset = SpectDataset('../Datasets/train/NoisyRawSpectrograms')
valset = SpectDataset('../Datasets/val/NoisyRawSpectrograms')
testset = SpectDataset('../Datasets/test/NoisyRawSpectrograms')

# Load model (fresh, randomly initialised weights)
model = AudioNetSpect(num_classes=9)
model.cuda()

# Get loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Create data loaders; only the training data is shuffled so that validation
# and test passes run in a fixed order.
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Train the model, keeping the checkpoint with the lowest validation loss.
best_val_loss = math.inf
num_epochs = 50

# torch.save does not create missing parent directories.
os.makedirs('models', exist_ok=True)

for epoch in range(num_epochs):
    tot_train_loss = 0
    tot_train_samples = 0

    # Train Loop
    model.train()
    for label, data, filename in train_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        label = label.long().cuda()

        optimizer.zero_grad() # clear gradients
        outputs = model(data) # get outputs
        train_loss = criterion(outputs, label) # mean loss over the batch
        train_loss.backward() # get gradients
        optimizer.step() # update parameters

        # Weight each batch mean by its size so a smaller final batch does
        # not skew the epoch average.
        batch_size = data.shape[0]
        tot_train_loss += train_loss.item() * batch_size
        tot_train_samples += batch_size
    avg_train_loss = tot_train_loss / tot_train_samples

    # Eval Loop
    tot_val_loss = 0
    tot_val_samples = 0

    model.eval()
    with torch.no_grad(): # validation needs no autograd graph
        for label, data, filename in val_loader:
            data = data.float().cuda().unsqueeze(1)
            label = label.long().cuda()

            outputs = model(data) # get outputs
            val_loss = criterion(outputs, label) # mean loss over the batch

            batch_size = data.shape[0]
            tot_val_loss += val_loss.item() * batch_size
            tot_val_samples += batch_size

    avg_val_loss = tot_val_loss / tot_val_samples
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'models/audio_net_noisy_raw_spect.pt')

    # print losses on epochs 1, 6, 11, ... and on the final epoch
    if (epoch + 1) % 5 == 1 or (epoch + 1) == num_epochs:
        print('Epoch {}'.format(epoch + 1))
        print('Training Loss: {}'.format(avg_train_loss))
        print('Val Loss: {} \n'.format(avg_val_loss))

Epoch 1
Training Loss: 2.2756702683974
Val Loss: 2.118216477010561 

Epoch 6
Training Loss: 1.0858948589235111
Val Loss: 3.3936241934804814 

Epoch 11
Training Loss: 0.8385121470344239
Val Loss: 1.6606252044439316 

Epoch 16
Training Loss: 0.6158556057059247
Val Loss: 1.7358367339424465 

Epoch 21
Training Loss: 0.4884058096702548
Val Loss: 1.1876676076132318 

Epoch 26
Training Loss: 0.49340139564710495
Val Loss: 1.5647293347865343 

Epoch 31
Training Loss: 0.422803268160509
Val Loss: 1.6660945479474638 

Epoch 36
Training Loss: 0.29783680719201977
Val Loss: 1.645279102148893 

Epoch 41
Training Loss: 0.3664660068006133
Val Loss: 1.5731258560781893 

Epoch 46
Training Loss: 0.5993773919948633
Val Loss: 1.0202833141969598 

Epoch 50
Training Loss: 0.43868957951232984
Val Loss: 1.93101949694202 



### Evaluate Noisy Raw Spect Model

In [13]:
################################################ EVALUATE ON RAW SPECT ##############################
# Load the data (spectrogram)
# NOTE: only testset / test_loader are read in this section; the train and val
# objects are still rebuilt so the notebook-level names keep their bindings.
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

# Create data loaders
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Restore the checkpoint with the best validation loss.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_noisy_raw_spect.pt'))
model.cuda()
model.eval()  # inference mode for dropout / batch-norm

predictions = []
gt = []

# Inference only: no loss is needed and no autograd graph should be built.
# This also removes the dependency on counters left over from the training cell.
with torch.no_grad():
    for label, data, filename in test_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        outputs = model(data)
        predictions += outputs.argmax(dim=1).cpu().tolist()
        gt += label.long().tolist()

# Print performance results
print('Evaluate on Raw Spectrograms')
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))
print()

################################################ EVALUATE ON NOISY RAW SPECT ##############################
# Evaluate the model on the test dataset
# Load the data (spectrogram)
# NOTE: only testset / test_loader are read in this section; the train and val
# objects are still rebuilt so the notebook-level names keep their bindings.
trainset = SpectDataset('../Datasets/train/NoisyRawSpectrograms')
valset = SpectDataset('../Datasets/val/NoisyRawSpectrograms')
testset = SpectDataset('../Datasets/test/NoisyRawSpectrograms')

# Create data loaders
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Restore the checkpoint with the best validation loss.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_noisy_raw_spect.pt'))
model.cuda()
model.eval()  # inference mode for dropout / batch-norm

predictions = []
gt = []

# Inference only: no loss is needed and no autograd graph should be built.
# This also removes the dependency on counters left over from the training cell.
with torch.no_grad():
    for label, data, filename in test_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        outputs = model(data)
        predictions += outputs.argmax(dim=1).cpu().tolist()
        gt += label.long().tolist()

# Print performance results
print('Evaluate on Noisy Raw Spectrograms')
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))
print()

################################################ EVALUATE ON NOISY SPECT ##############################
# Evaluate the model on the test dataset
# Load the data (spectrogram)
# NOTE: only testset / test_loader are read in this section; the train and val
# objects are still rebuilt so the notebook-level names keep their bindings.
trainset = SpectDataset('../Datasets/train/NoisySpectrograms')
valset = SpectDataset('../Datasets/val/NoisySpectrograms')
testset = SpectDataset('../Datasets/test/NoisySpectrograms')

# Create data loaders
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Restore the checkpoint with the best validation loss.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_noisy_raw_spect.pt'))
model.cuda()
model.eval()  # inference mode for dropout / batch-norm

predictions = []
gt = []

# Inference only: no loss is needed and no autograd graph should be built.
# This also removes the dependency on counters left over from the training cell.
with torch.no_grad():
    for label, data, filename in test_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        outputs = model(data)
        predictions += outputs.argmax(dim=1).cpu().tolist()
        gt += label.long().tolist()

# Print performance results
print('Evaluate on Noisy Spectrograms')
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))
print()

Evaluate on Raw Spectrograms
precision: 0.7552627526947132
recall: 0.6632216974235827
f1: 0.6769958631058497
acc: 0.6607387140902873

              precision    recall  f1-score   support

           0       0.93      0.56      0.70       170
           1       0.41      0.93      0.57       162
           2       0.78      0.66      0.72       202
           3       0.92      0.43      0.59       179
           4       0.65      0.55      0.60       203
           5       0.68      0.71      0.69       184
           6       0.76      0.57      0.65       119
           7       0.68      0.85      0.75       204
           8       1.00      0.72      0.84        39

    accuracy                           0.66      1462
   macro avg       0.76      0.66      0.68      1462
weighted avg       0.73      0.66      0.66      1462


Evaluate on Noisy Raw Spectrograms
precision: 0.7617444365724185
recall: 0.6477097388222236
f1: 0.6660113190078233
acc: 0.6450068399452804

              precis

### Train on Noisy Spectrogram

In [14]:
# Load the data (spectrogram)
trainset = SpectDataset('../Datasets/train/NoisySpectrograms')
valset = SpectDataset('../Datasets/val/NoisySpectrograms')
testset = SpectDataset('../Datasets/test/NoisySpectrograms')

# Load model (fresh, randomly initialised weights)
model = AudioNetSpect(num_classes=9)
model.cuda()

# Get loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Create data loaders; only the training data is shuffled so that validation
# and test passes run in a fixed order.
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Train the model, keeping the checkpoint with the lowest validation loss.
best_val_loss = math.inf
num_epochs = 50

# torch.save does not create missing parent directories.
os.makedirs('models', exist_ok=True)

for epoch in range(num_epochs):
    tot_train_loss = 0
    tot_train_samples = 0

    # Train Loop
    model.train()
    for label, data, filename in train_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        label = label.long().cuda()

        optimizer.zero_grad() # clear gradients
        outputs = model(data) # get outputs
        train_loss = criterion(outputs, label) # mean loss over the batch
        train_loss.backward() # get gradients
        optimizer.step() # update parameters

        # Weight each batch mean by its size so a smaller final batch does
        # not skew the epoch average.
        batch_size = data.shape[0]
        tot_train_loss += train_loss.item() * batch_size
        tot_train_samples += batch_size
    avg_train_loss = tot_train_loss / tot_train_samples

    # Eval Loop
    tot_val_loss = 0
    tot_val_samples = 0

    model.eval()
    with torch.no_grad(): # validation needs no autograd graph
        for label, data, filename in val_loader:
            data = data.float().cuda().unsqueeze(1)
            label = label.long().cuda()

            outputs = model(data) # get outputs
            val_loss = criterion(outputs, label) # mean loss over the batch

            batch_size = data.shape[0]
            tot_val_loss += val_loss.item() * batch_size
            tot_val_samples += batch_size

    avg_val_loss = tot_val_loss / tot_val_samples
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'models/audio_net_noisy_spect.pt')

    # print losses on epochs 1, 6, 11, ... and on the final epoch
    if (epoch + 1) % 5 == 1 or (epoch + 1) == num_epochs:
        print('Epoch {}'.format(epoch + 1))
        print('Training Loss: {}'.format(avg_train_loss))
        print('Val Loss: {} \n'.format(avg_val_loss))

Epoch 1
Training Loss: 2.4051252387572024
Val Loss: 1.8614115818687107 

Epoch 6
Training Loss: 1.1810212649297023
Val Loss: 1.5157391312329664 

Epoch 11
Training Loss: 0.9479295735017977
Val Loss: 1.4484348063883574 

Epoch 16
Training Loss: 0.6555044700702032
Val Loss: 1.3646319738548736 

Epoch 21
Training Loss: 0.6922377730030954
Val Loss: 1.2742882681929546 

Epoch 26
Training Loss: 0.530817932786717
Val Loss: 1.1741504137930663 

Epoch 31
Training Loss: 0.3877294404783111
Val Loss: 1.1015700665505037 

Epoch 36
Training Loss: 0.3456068868222444
Val Loss: 1.1038007937047793 

Epoch 41
Training Loss: 0.28511003691001213
Val Loss: 1.129645327511041 

Epoch 46
Training Loss: 0.398265616428377
Val Loss: 1.3364705268455588 

Epoch 50
Training Loss: 0.22197384241468976
Val Loss: 1.1470362051673557 



### Evaluate Noisy Spect Model

In [15]:
################################################ EVALUATE ON RAW SPECT ##############################
# Load the data (spectrogram)
# NOTE: only testset / test_loader are read in this section; the train and val
# objects are still rebuilt so the notebook-level names keep their bindings.
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

# Create data loaders
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Restore the checkpoint with the best validation loss.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_noisy_spect.pt'))
model.cuda()
model.eval()  # inference mode for dropout / batch-norm

predictions = []
gt = []

# Inference only: no loss is needed and no autograd graph should be built.
# This also removes the dependency on counters left over from the training cell.
with torch.no_grad():
    for label, data, filename in test_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        outputs = model(data)
        predictions += outputs.argmax(dim=1).cpu().tolist()
        gt += label.long().tolist()

# Print performance results
print('Evaluate on Raw Spectrograms')
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))
print()

################################################ EVALUATE ON NOISY RAW SPECT ##############################
# Evaluate the model on the test dataset
# Load the data (spectrogram)
# NOTE: only testset / test_loader are read in this section; the train and val
# objects are still rebuilt so the notebook-level names keep their bindings.
trainset = SpectDataset('../Datasets/train/NoisyRawSpectrograms')
valset = SpectDataset('../Datasets/val/NoisyRawSpectrograms')
testset = SpectDataset('../Datasets/test/NoisyRawSpectrograms')

# Create data loaders
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

# Restore the checkpoint with the best validation loss.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_noisy_spect.pt'))
model.cuda()
model.eval()  # inference mode for dropout / batch-norm

predictions = []
gt = []

# Inference only: no loss is needed and no autograd graph should be built.
# This also removes the dependency on counters left over from the training cell.
with torch.no_grad():
    for label, data, filename in test_loader:
        # unsqueeze(1) adds the single channel dimension expected by conv1.
        data = data.float().cuda().unsqueeze(1)
        outputs = model(data)
        predictions += outputs.argmax(dim=1).cpu().tolist()
        gt += label.long().tolist()

# Print performance results
print('Evaluate on Noisy Raw Spectrograms')
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))
print()

################################################ EVALUATE ON NOISY SPECT ##############################
# Evaluate the 'models/audio_net_noisy_spect.pt' checkpoint on the
# NoisySpectrograms test split and print classification metrics.

# Load the data (spectrogram).
# Only the test split is consumed in this section; the train/val datasets and
# loaders are still built so the same global names are rebound as before.
trainset = SpectDataset('../Datasets/train/NoisySpectrograms')
valset = SpectDataset('../Datasets/val/NoisySpectrograms')
testset = SpectDataset('../Datasets/test/NoisySpectrograms')

# Create data loaders
train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

# Rebuild the network, restore the trained weights and move it to the GPU.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_noisy_spect.pt'))
model.cuda()

# Inference mode: the dropout / batch-norm layers of AudioNetSpect must not
# behave as they do during training.
model.eval()

predictions = []  # predicted class index per test sample
gt = []           # ground-truth class index per test sample

# Inference only, so autograd is disabled: no graph is recorded per batch.
# The loss that used to be accumulated in this loop was never reported and
# depended on `criterion`, `tot_val_loss` and `tot_val_samples`, none of which
# are defined in this section, so it has been dropped.
with torch.no_grad():
    for i, (label, data, filename) in enumerate(test_loader):
        # unsqueeze(1) inserts the single-channel axis expected by Conv2d(1, ...)
        data = data.type(torch.FloatTensor).cuda().unsqueeze(1)
        label = label.long().cuda()

        outputs = model(data) # get outputs

        # Predictions: index of the highest score along the class axis
        _, predicted = torch.max(outputs, 1)
        predictions += list(np.array(predicted.cpu()).ravel())
        gt += list(np.array(label.cpu()).ravel())

# Print performance results
print('Evaluate on Noisy Spectrograms')
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))
print()

Evaluate on Raw Spectrograms
precision: 0.696816511488185
recall: 0.6366347036662606
f1: 0.6423070194735482
acc: 0.6272229822161423

              precision    recall  f1-score   support

           0       0.89      0.62      0.73       170
           1       0.79      0.73      0.76       162
           2       0.63      0.54      0.58       202
           3       0.73      0.50      0.60       179
           4       0.80      0.38      0.52       203
           5       0.49      0.77      0.60       184
           6       0.53      0.71      0.61       119
           7       0.51      0.81      0.62       204
           8       0.90      0.67      0.76        39

    accuracy                           0.63      1462
   macro avg       0.70      0.64      0.64      1462
weighted avg       0.68      0.63      0.63      1462


Evaluate on Noisy Raw Spectrograms
precision: 0.6943385086700133
recall: 0.6353098426709216
f1: 0.6413703269362652
acc: 0.6258549931600548

              precisi

### Train for 100 epochs for a fair comparison with the pruned and retrained network

In [6]:
# Train AudioNetSpect on the Spectrograms split for 100 epochs, keeping the
# checkpoint with the lowest validation loss.

# Load the data (spectrogram)
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

# Load model
model = AudioNetSpect(num_classes=9)
model.cuda()

# Get loss and optimizer (Adam with its default hyper-parameters)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Create data loaders
val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

# Not consumed in this cell; shuffling is off so the test order is fixed,
# matching the test loaders built by the evaluation cells.
test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

# Train the model
best_val_loss = math.inf
num_epochs = 100

for epoch in range(num_epochs):
    tot_train_loss = 0
    tot_train_samples = 0
    
    # Train Loop
    model.train()
    for i, (label, data, filename) in enumerate(train_loader):
        # unsqueeze(1) inserts the single-channel axis expected by Conv2d(1, ...)
        data = data.type(torch.FloatTensor).cuda().unsqueeze(1)
        label = label.long().cuda()
        
        optimizer.zero_grad() # clear gradients
        outputs = model(data) # get outputs
        train_loss = criterion(outputs, label) # get batch loss
        tot_train_loss += train_loss.item() # add batch loss to total loss
        tot_train_samples += data.shape[0] # add number of samples to total number of samples
        train_loss.backward() # get gradients
        optimizer.step() # update parameters
    # Mean of the per-batch losses (divides by the number of batches, not samples)
    avg_train_loss = tot_train_loss / len(train_loader)
    
    # Eval Loop
    tot_val_loss = 0
    tot_val_samples = 0
    pred_vec = []   # never filled in this cell
    label_vec = []  # never filled in this cell
    
    model.eval()
    # Validation needs no gradients: disabling autograd avoids recording a graph
    # for every validation batch.
    with torch.no_grad():
        for i, (label, data, filename) in enumerate(val_loader):
            data = data.type(torch.FloatTensor).cuda().unsqueeze(1)
            label = label.long().cuda()
            
            outputs = model(data) # get outputs
            val_loss = criterion(outputs, label) # get batch loss
            tot_val_loss += val_loss.item() # add batch loss to total loss
            tot_val_samples += data.shape[0] # add number of samples to total number of samples

    avg_val_loss = tot_val_loss / len(val_loader)
    # Checkpoint only when the validation loss improves, so the saved file holds
    # the best-validating weights rather than the last epoch's.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'models/audio_net_spect_100epoch.pt')
        
    # print losses on epochs 1, 6, 11, ... and on the final epoch
    if (epoch + 1) % 5 == 1 or (epoch + 1) == num_epochs:
        print('Epoch {}'.format(epoch + 1))
        print('Training Loss: {}'.format(avg_train_loss))
        print('Val Loss: {} \n'.format(avg_val_loss))

Epoch 1
Training Loss: 2.006156039842661
Val Loss: 4.392049090734319 

Epoch 6
Training Loss: 0.8998395737962447
Val Loss: 3.6370070893115 

Epoch 11
Training Loss: 0.7574916264966832
Val Loss: 5.223125362298046 

Epoch 16
Training Loss: 0.461643162952817
Val Loss: 2.02899823402581 

Epoch 21
Training Loss: 0.3770394570369651
Val Loss: 2.1836482844436946 

Epoch 26
Training Loss: 0.5138370963758316
Val Loss: 4.152632499812171 

Epoch 31
Training Loss: 0.39495099136146944
Val Loss: 2.83058318441592 

Epoch 36
Training Loss: 0.24477177371095488
Val Loss: 2.122712532463281 

Epoch 41
Training Loss: 0.6040730413751326
Val Loss: 2.779883631221626 

Epoch 46
Training Loss: 0.22652367899275344
Val Loss: 2.576101123071883 

Epoch 51
Training Loss: 0.5578013806995274
Val Loss: 2.085878755978268 

Epoch 56
Training Loss: 0.34497254798053834
Val Loss: 1.7634211011109469 

Epoch 61
Training Loss: 0.21479932074367403
Val Loss: 2.8806695084532965 

Epoch 66
Training Loss: 0.16350214592977494
Val Los

### Evaluate the spectrogram model trained for 100 epochs

In [7]:
# Evaluate the 'models/audio_net_spect_100epoch.pt' checkpoint on the
# Spectrograms test split and print classification metrics.

# Load the data (spectrogram).
# Only the test split is consumed in this cell; the train/val datasets and
# loaders are still built so the same global names are rebound as before.
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

# Create data loaders
train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

# Rebuild the network, restore the trained weights and move it to the GPU.
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_spect_100epoch.pt'))
model.cuda()

# Inference mode: the dropout / batch-norm layers of AudioNetSpect must not
# behave as they do during training.
model.eval()

predictions = []  # predicted class index per test sample
gt = []           # ground-truth class index per test sample

# Inference only, so autograd is disabled: no graph is recorded per batch.
# The loss that used to be accumulated in this loop was never reported and
# depended on `criterion`, `tot_val_loss` and `tot_val_samples`, none of which
# are defined in this cell, so it has been dropped and the cell can run from a
# saved checkpoint alone.
with torch.no_grad():
    for i, (label, data, filename) in enumerate(test_loader):
        # unsqueeze(1) inserts the single-channel axis expected by Conv2d(1, ...)
        data = data.type(torch.FloatTensor).cuda().unsqueeze(1)
        label = label.long().cuda()

        outputs = model(data) # get outputs

        # Predictions: index of the highest score along the class axis
        _, predicted = torch.max(outputs, 1)
        predictions += list(np.array(predicted.cpu()).ravel())
        gt += list(np.array(label.cpu()).ravel())

# Print performance results
print('Evaluate on Raw Spectrograms')
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))
print()

Evaluate on Raw Spectrograms
precision: 0.768389362301778
recall: 0.6563937956901786
f1: 0.6633992651070547
acc: 0.6326949384404925

              precision    recall  f1-score   support

           0       0.79      0.82      0.80       170
           1       0.82      0.63      0.71       162
           2       0.38      0.94      0.54       202
           3       1.00      0.41      0.58       179
           4       0.75      0.50      0.60       203
           5       0.56      0.76      0.64       184
           6       0.80      0.59      0.68       119
           7       0.96      0.37      0.53       204
           8       0.85      0.90      0.88        39

    accuracy                           0.63      1462
   macro avg       0.77      0.66      0.66      1462
weighted avg       0.75      0.63      0.64      1462




# Everything below is scratch work