In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, random_split, DataLoader
import matplotlib.pyplot as plt
import scipy
import scipy.io
from scipy.signal import spectrogram
import os
import soundfile as sf
import librosa
import math
from librosa.feature import melspectrogram
from librosa.display import specshow
import cv2
import random
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report
import torch.nn.utils.prune as prune

In [2]:
# Get spect_dataset
class SpectDataset(Dataset):
    def __init__(self, data_dir):
        self.sample_list = []
        
        for filename in os.listdir(data_dir):
            image = plt.imread(f'{data_dir}/{filename}')[:,:,0]
            label = int(filename.split('-')[0][-1])
            self.sample_list.append((label, torch.tensor(image), filename)) #class_label, data, filename
                
    
    def __len__(self):
        return len(self.sample_list)
    
    def __getitem__(self, idx):
        return self.sample_list[idx]

In [3]:
# Load the data (spectrogram)
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

In [4]:
# Create Neural Network Model 
class AudioNetSpect(nn.Module):  #2D convolutions on spectrograms
    def __init__(self, num_classes):
        super(AudioNetSpect, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 16, kernel_size=7, stride=1, padding=0) #try making kernels larger, fewer layers, less maxp
        self.batchnorm1 = nn.BatchNorm2d(16)
        self.drop1 = nn.Dropout(p=0.5)
        self.lrelu1 = nn.LeakyReLU()
        self.max1 = nn.MaxPool2d(kernel_size=3)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=0)
        self.batchnorm2 = nn.BatchNorm2d(32)
        self.drop2 = nn.Dropout(p=0.5)
        self.lrelu2 = nn.LeakyReLU()
        self.max2 = nn.MaxPool2d(kernel_size=3)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=0)
        self.batchnorm3 = nn.BatchNorm2d(64)
        self.fc1 = nn.Linear(64 * 8 * 13, 128)
        self.fc2 = nn.Linear(128, num_classes)
        
    def forward(self, x):
        #print(x.shape)
        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.drop1(x)
        x = self.lrelu1(x)
        x = self.max1(x)
        #print(x.shape)
        x = self.conv2(x)
        x = self.batchnorm2(x)
        x = self.drop2(x)
        x = self.lrelu2(x)
        #print(x.shape)
        x = self.max2(x)
        #print(x.shape)
        x = self.conv3(x)
        x = self.batchnorm3(x)
        #print(x.shape)
        x = x.view(x.size(0), -1)
        #print(x.shape)
        x = self.fc1(x)
        #print(x.shape)
        out = self.fc2(x)
        #print(out.shape)
        return out

In [5]:
# Load model
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/audio_net_spect.pt'))
model.cuda()

AudioNetSpect(
  (conv1): Conv2d(1, 16, kernel_size=(7, 7), stride=(1, 1))
  (batchnorm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop1): Dropout(p=0.5, inplace=False)
  (lrelu1): LeakyReLU(negative_slope=0.01)
  (max1): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (batchnorm2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop2): Dropout(p=0.5, inplace=False)
  (lrelu2): LeakyReLU(negative_slope=0.01)
  (max2): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (batchnorm3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=6656, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=9, bias=True)
)

### Network Pruning

In [6]:
from collections.abc import Iterable

def get_prunable_modules(model):
    modules = nn.Sequential(*list(model.children()))
    print(modules)

    # print(modules)
    parameters_to_prune = (
        (modules[0], 'weight'),
        (modules[5], 'weight'),
        (modules[10], 'weight')
        )

    modules_pruned = (
        (modules[0]),
        (modules[5]),
        (modules[10])
        )

    return parameters_to_prune, modules_pruned

def do_pruning(pruned_model, value = 0.35):
    parameters_to_prune, modules_to_prune = get_prunable_modules(model)
    print(parameters_to_prune)

    prune.global_unstructured(
        parameters_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=value)

    for m in modules_to_prune:
        prune.remove(m, 'weight')

    return pruned_model

In [7]:
prune_percentage = 0.35
pruned_model = model

pruned_model = do_pruning(pruned_model, prune_percentage)
torch.save(pruned_model.state_dict(), 'models/pruned_audio_net_spect.pt')

Sequential(
  (0): Conv2d(1, 16, kernel_size=(7, 7), stride=(1, 1))
  (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): LeakyReLU(negative_slope=0.01)
  (4): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (6): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): Dropout(p=0.5, inplace=False)
  (8): LeakyReLU(negative_slope=0.01)
  (9): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (11): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (12): Linear(in_features=6656, out_features=128, bias=True)
  (13): Linear(in_features=128, out_features=9, bias=True)
)
((Conv2d(1, 16, kernel_size=(7, 7), stride=(1, 1)), 'weight'), (Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1)), 'we

### Evaluate Pruned Model

In [9]:
# Evaluate the model on the test dataset
# Load the data (spectrogram)
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

# Create data loaders
train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

predictions = []
gt = []

model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/pruned_audio_net_spect.pt'))
model.cuda()

model.eval()



predictions = []
gt = []

for i, (label, data, filename) in enumerate(test_loader):
    data = data.type(torch.FloatTensor).cuda().unsqueeze(1)
    label = label.long().cuda()

    outputs = model(data) # get outputs

    # Predictions
    _, predicted = torch.max(outputs.data, 1)
    predictions += list(np.array(predicted.cpu()).ravel())
    gt += list(np.array(label.cpu()).ravel())
    
# Print performance results
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.765887648474086
recall: 0.6187144662593559
f1: 0.6381678339888097
acc: 0.6374829001367989

              precision    recall  f1-score   support

           0       0.96      0.52      0.68       170
           1       0.40      0.90      0.55       162
           2       0.58      0.83      0.69       202
           3       0.94      0.36      0.52       179
           4       0.61      0.66      0.63       203
           5       0.64      0.74      0.69       184
           6       0.83      0.50      0.62       119
           7       0.93      0.58      0.72       204
           8       1.00      0.49      0.66        39

    accuracy                           0.64      1462
   macro avg       0.77      0.62      0.64      1462
weighted avg       0.74      0.64      0.64      1462



### Retrain Pruned Model

In [10]:
# Load the data (spectrogram)
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

# Load the pruned model
model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/pruned_audio_net_spect.pt'))
model.cuda()

# Get loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Create data loaders
val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = True)

# Train the model
best_val_loss = math.inf
num_epochs = 50

for epoch in range(num_epochs):
    tot_train_loss = 0
    tot_train_samples = 0
    
    # Train Loop
    model.train()
    for i, (label, data, filename) in enumerate(train_loader):
        data = data.type(torch.FloatTensor).cuda().unsqueeze(1)
        label = label.long().cuda()
        
        optimizer.zero_grad() # clear gradients
        outputs = model(data) # get outputs
        train_loss = criterion(outputs, label) # get batch loss
        tot_train_loss += train_loss.item() # add batch loss to total loss
        tot_train_samples += data.shape[0] # add number of samples to total number of samples
        train_loss.backward() # get gradients
        optimizer.step() # update parameters
    avg_train_loss = tot_train_loss / len(train_loader)
    
    # Eval Loop
    tot_val_loss = 0
    tot_val_samples = 0
    pred_vec = []
    label_vec = []
    
    model.eval()
    for i, (label, data, filename) in enumerate(val_loader):
        data = data.type(torch.FloatTensor).cuda().unsqueeze(1)
        label = label.long().cuda()
        
        outputs = model(data) # get outputs
        val_loss = criterion(outputs, label) # get batch loss
        tot_val_loss += val_loss.item() # add batch loss to total loss
        tot_val_samples += data.shape[0] # add number of samples to total number of samples
        
    avg_val_loss = tot_val_loss / len(val_loader)
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'models/pruned_audio_net_spect_retrained.pt')
        
    # print losses every 5 epochs
    if (epoch + 1) % 5 == 1 or (epoch + 1) == num_epochs:
        print('Epoch {}'.format(epoch + 1))
        print('Training Loss: {}'.format(avg_train_loss))
        print('Val Loss: {} \n'.format(avg_val_loss))

Epoch 1
Training Loss: 0.49400416572672734
Val Loss: 1.8222012539272723 

Epoch 6
Training Loss: 0.5490526497984926
Val Loss: 2.8393055799609535 

Epoch 11
Training Loss: 0.3650395260124967
Val Loss: 2.4020306514272387 

Epoch 16
Training Loss: 0.223817454610664
Val Loss: 1.4553745991509894 

Epoch 21
Training Loss: 0.25075326082499133
Val Loss: 1.6434339783761813 

Epoch 26
Training Loss: 0.20384872219849215
Val Loss: 1.3925489543572716 

Epoch 31
Training Loss: 0.35777548047295515
Val Loss: 1.592189267196733 

Epoch 36
Training Loss: 0.23417536537770098
Val Loss: 1.4877949379303532 

Epoch 41
Training Loss: 0.2977689635143548
Val Loss: 1.7477763474250778 

Epoch 46
Training Loss: 0.18233761492360637
Val Loss: 1.5056956456771686 

Epoch 50
Training Loss: 0.790597375711777
Val Loss: 5.79589884523296 



In [11]:
# Evaluate the model on the test dataset
# Load the data (spectrogram)
trainset = SpectDataset('../Datasets/train/Spectrograms')
valset = SpectDataset('../Datasets/val/Spectrograms')
testset = SpectDataset('../Datasets/test/Spectrograms')

# Create data loaders
train_loader = DataLoader(dataset = trainset,
                      batch_size = 32,
                      shuffle = True)

val_loader = DataLoader(dataset = valset,
                      batch_size = 32,
                      shuffle = False)

test_loader = DataLoader(dataset = testset,
                      batch_size = 32,
                      shuffle = False)

predictions = []
gt = []

model = AudioNetSpect(num_classes=9)
model.load_state_dict(torch.load('models/pruned_audio_net_spect_retrained.pt'))
model.cuda()

model.eval()



predictions = []
gt = []

for i, (label, data, filename) in enumerate(test_loader):
    data = data.type(torch.FloatTensor).cuda().unsqueeze(1)
    label = label.long().cuda()

    outputs = model(data) # get outputs
    val_loss = criterion(outputs, label) # get batch loss
    tot_val_loss += val_loss.item() # add batch loss to total loss
    tot_val_samples += data.shape[0] # add number of samples to total number of samples

    # Predictions
    _, predicted = torch.max(outputs.data, 1)
    predictions += list(np.array(predicted.cpu()).ravel())
    gt += list(np.array(label.cpu()).ravel())
    
# Print performance results
print(f'precision: {precision_score(gt, predictions, average="macro")}')
print(f'recall: {recall_score(gt, predictions, average="macro")}')
print(f'f1: {f1_score(gt, predictions, average="macro")}')
print(f'acc: {accuracy_score(gt, predictions)}')
print()
print(classification_report(gt, predictions))

precision: 0.7196189941106613
recall: 0.7164806746565913
f1: 0.6950046780627795
acc: 0.6976744186046512

              precision    recall  f1-score   support

           0       0.91      0.70      0.79       170
           1       0.90      0.68      0.77       162
           2       0.54      0.89      0.67       202
           3       0.82      0.65      0.73       179
           4       0.73      0.33      0.45       203
           5       0.64      0.77      0.70       184
           6       0.63      0.72      0.67       119
           7       0.74      0.82      0.78       204
           8       0.56      0.90      0.69        39

    accuracy                           0.70      1462
   macro avg       0.72      0.72      0.70      1462
weighted avg       0.73      0.70      0.69      1462



# Everything below is scratch work