In [1]:
import os
import random
import math
import shutil
import time
import warnings
import soundfile

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim

import torch.utils.data
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

torchaudio.set_audio_backend('soundfile')

In [2]:
os.getcwd()

'/data/DataSci251_FinalProject'

In [3]:
import wandb
wandb.init(project="FinalProject_SimpleModel")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
# Reading Data and CSV

import pandas as pd
from pathlib import Path

download_path = Path('DataSet/ESC-50-master')

# Read metadata file
metadata_file = download_path/'meta'/'esc50.csv'
df = pd.read_csv(metadata_file)
df.head()

# Take relevant columns
df = df[['filename', 'target']]
df.head()


Unnamed: 0,filename,target
0,1-100032-A-0.wav,0
1,1-100038-A-14.wav,14
2,1-100210-A-36.wav,36
3,1-100210-B-36.wav,36
4,1-101296-A-19.wav,19


In [5]:
# Define Data Transformations

class AudioUtil():
    
  # Load an audio file. Return the signal as a tensor and the sample rate
  @staticmethod
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)
    return (sig, sr)

  # Convert the given audio to the desired number of channels
  @staticmethod
  def rechannel(aud, new_channel):
    sig, sr = aud

    if (sig.shape[0] == new_channel):
      # Nothing to do
      return aud

    if (new_channel == 1):
      # Convert from stereo to mono by selecting only the first channel
      resig = sig[:1, :]
    else:
      # Convert from mono to stereo by duplicating the first channel
      resig = torch.cat([sig, sig])

    return ((resig, sr))

  # Resample for proper channels
  @staticmethod
  def resample(aud, newsr):
    sig, sr = aud

    if (sr == newsr):
      # Nothing to do
      return aud

    num_channels = sig.shape[0]
    # Resample first channel
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      # Resample the second channel and merge both channels
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))

  # Pad the signal to a fixed length 'max_ms' in milliseconds
  @staticmethod
  def pad_trunc(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms

    if (sig_len > max_len):
      # Truncate the signal to the given length
      sig = sig[:,:max_len]

    elif (sig_len < max_len):
      # Length of padding to add at the beginning and end of the signal
      pad_begin_len = random.randint(0, max_len - sig_len)
      pad_end_len = max_len - sig_len - pad_begin_len

      # Pad with 0s
      pad_begin = torch.zeros((num_rows, pad_begin_len))
      pad_end = torch.zeros((num_rows, pad_end_len))

      sig = torch.cat((pad_begin, sig, pad_end), 1)
      
    return (sig, sr)

  # Shift the signal to the left or right by some percent
  @staticmethod
  def time_shift(aud, shift_limit):
    sig,sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    return (sig.roll(shift_amt), sr)

  # Generate a Spectrogram
  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  # Augment the Spectrogram to prevent overfitting
  @staticmethod
  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

In [6]:
# Data Path
data_path = Path('DataSet/ESC-50-master/audio')

In [7]:
torchaudio.load(data_path/'1-137-A-32.wav')

(tensor([[0.0089, 0.0089, 0.0089,  ..., 0.0034, 0.0038, 0.0038]]), 44100)

In [8]:
# Creating data set

from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

class SoundDS(Dataset):
  def __init__(self, df, data_path):
    self.df = df
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 44100
    self.channel = 2
    self.shift_pct = 0.4

  def __len__(self):
    return len(self.df)    

  def __getitem__(self, idx):
    audio_file = self.data_path + '/' + self.df.loc[idx, 'filename']
    # Get the Class ID
    class_id = self.df.loc[idx, 'target']

    aud = AudioUtil.open(audio_file)
    reaud = AudioUtil.resample(aud, self.sr)
    rechan = AudioUtil.rechannel(reaud, self.channel)

    dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
    shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
    sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

    return aug_sgram, class_id

In [9]:
#Creating data loaders

from torch.utils.data import random_split

myds = SoundDS(df, data_path)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=8)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=8)

In [None]:
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [10]:
import torch.nn.functional as F
from torch.nn import init

# Audio Classification Model

class AudioClassifier (nn.Module):

    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Second Convolution Block
        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        # Second Convolution Block
        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=50)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)
 
    # Forward pass
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)

        # Final output
        return x

# Create the model and put it on the GPU if available
myModel = AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)

# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

In [13]:
# Training

def training(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        


        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        
        
        acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        #if i % 50 == 0:    # print every 50 mini-batches
            #print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
    
    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    wandb.log({'Train Loss': loss, 'Train Acc': acc})
    print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Acc@1: {acc1[0].item():.2f}, Acc@5: {acc5[0].item():.2f}')


  print('Finished Training')
  
num_epochs=100
training(myModel, train_dl, num_epochs)

Epoch: 0, Loss: 3.91, Acc@1: 0.00, Acc@5: 12.50
Epoch: 1, Loss: 3.86, Acc@1: 3.12, Acc@5: 6.25
Epoch: 2, Loss: 3.79, Acc@1: 6.25, Acc@5: 34.38
Epoch: 3, Loss: 3.72, Acc@1: 18.75, Acc@5: 40.62
Epoch: 4, Loss: 3.64, Acc@1: 9.38, Acc@5: 21.88
Epoch: 5, Loss: 3.56, Acc@1: 6.25, Acc@5: 28.12
Epoch: 6, Loss: 3.49, Acc@1: 9.38, Acc@5: 34.38
Epoch: 7, Loss: 3.40, Acc@1: 18.75, Acc@5: 31.25
Epoch: 8, Loss: 3.33, Acc@1: 15.62, Acc@5: 34.38
Epoch: 9, Loss: 3.27, Acc@1: 21.88, Acc@5: 62.50
Epoch: 10, Loss: 3.19, Acc@1: 21.88, Acc@5: 65.62
Epoch: 11, Loss: 3.11, Acc@1: 15.62, Acc@5: 43.75
Epoch: 12, Loss: 3.04, Acc@1: 21.88, Acc@5: 56.25
Epoch: 13, Loss: 2.97, Acc@1: 18.75, Acc@5: 53.12
Epoch: 14, Loss: 2.89, Acc@1: 37.50, Acc@5: 75.00
Epoch: 15, Loss: 2.83, Acc@1: 18.75, Acc@5: 56.25
Epoch: 16, Loss: 2.74, Acc@1: 12.50, Acc@5: 56.25
Epoch: 17, Loss: 2.68, Acc@1: 43.75, Acc@5: 75.00
Epoch: 18, Loss: 2.60, Acc@1: 28.12, Acc@5: 53.12
Epoch: 19, Loss: 2.53, Acc@1: 46.88, Acc@5: 78.12
Epoch: 20, Loss: 

In [14]:
# Validation

def inference (model, val_dl):
  correct_prediction = 0
  total_prediction = 0

  # Disable gradient updates
  with torch.no_grad():
    for data in val_dl:
      # Get the input features and target labels, and put them on the GPU
      inputs, labels = data[0].to(device), data[1].to(device)

      # Normalize the inputs
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s

      # Get predictions
      outputs = model(inputs)

      # Get the predicted class with the highest score
      _, prediction = torch.max(outputs,1)
      # Count of predictions that matched the target label
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]
    
  acc = correct_prediction/total_prediction
  print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Run inference on trained model with the validation set
inference(myModel, val_dl)

Accuracy: 0.57, Total items: 400


In [16]:
# Export the model weights and parameters to be used on the edge

# Move the model to CPU
myModel_CPU = myModel.to('cpu')

#Export model in TorchScript Format
model_scripted = torch.jit.script(myModel_CPU)
model_scripted.save('CNN_Model_cpu.pt')