In [91]:
import torch
import torch.optim as optim
import numpy as np
import pandas as pd 
import cv2
import os
import seaborn as sns
import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt



In [92]:
# import dataset
datasetPath = '../input/urbansound8k/'
dataTable = pd.read_csv('../input/urbansound8k/UrbanSound8K.csv')

In [93]:
dataTable.head(5)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [94]:
# inspect audio file information
audio = dataTable[:1]
print(audio)

    slice_file_name    fsID  start       end  salience  fold  classID  \
0  100032-3-0-0.wav  100032    0.0  0.317551         1     5        3   

      class  
0  dog_bark  


In [95]:
# assign corresponding class values
dataTable['class number'] = pd.Categorical(dataTable['class']).codes 
dataTable[:5]

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,class number
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark,3
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing,2
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing,2
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing,2
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing,2


In [96]:
classLabels = dataTable.iloc[:,8]
print(classLabels[:15])
classLabels.shape

0     3
1     2
2     2
3     2
4     2
5     2
6     2
7     2
8     2
9     1
10    1
11    1
12    1
13    1
14    3
Name: class number, dtype: int8


(8732,)

In [97]:
# create MFCCs for each audio file in the dataset
Xdata = []
for audio in dataTable.iterrows():
    filePath = os.path.join(os.path.abspath(datasetPath), 'fold' + str(audio[1][5])+'/', str(audio[1][0]))
    data , sampleRate = librosa.load(filePath, res_type = 'kaiser_fast')
    MFCC = librosa.feature.mfcc(y = data, sr = sampleRate, n_mfcc = 40)
    MFCC = cv2.resize(MFCC, (173,40), interpolation= cv2.INTER_LINEAR)
    Xdata.append(MFCC)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [98]:
print(data.shape)

(55787,)


In [99]:
Xdata[:5]

[array([[-335.0899    , -335.0899    , -335.0899    , ..., -423.59576   ,
         -423.59576   , -423.59576   ],
        [ 123.720276  ,  123.720276  ,  123.720276  , ...,   92.23881   ,
           92.23881   ,   92.23881   ],
        [-107.11308   , -107.11308   , -107.11308   , ...,  -93.354294  ,
          -93.354294  ,  -93.354294  ],
        ...,
        [  -0.73185635,   -0.73185635,   -0.73185635, ...,   -3.3754263 ,
           -3.3754263 ,   -3.3754263 ],
        [  -1.2285597 ,   -1.2285597 ,   -1.2285597 , ...,   -2.5173354 ,
           -2.5173354 ,   -2.5173354 ],
        [   1.3734663 ,    1.3734663 ,    1.3734663 , ...,   -2.8940563 ,
           -2.8940563 ,   -2.8940563 ]], dtype=float32),
 array([[-4.92137604e+02, -4.55067932e+02, -4.49950378e+02, ...,
         -4.29086792e+02, -4.26203949e+02, -4.38663940e+02],
        [ 9.90694962e+01,  1.09913002e+02,  1.10224457e+02, ...,
          8.66482697e+01,  8.26320801e+01,  7.85325165e+01],
        [-2.53395996e+01, -2.32777

In [100]:
len(Xdata)

8732

In [110]:
#totalArray = np.empty([40,173])
Xdata = np.array(Xdata)
#for i in Xdata:
  #  npArray = np.array(i)
  #  totalArray = np.hstack((totalArray,npArray))

In [111]:
shape = Xdata.shape
width = shape[1]
height = shape[0]
print(width)
print(Xdata.shape)

40
(8732, 40, 173, 1)


In [112]:
Xdata = Xdata.reshape(Xdata.shape[0], Xdata.shape[1], Xdata.shape[2], 1)
print("Xdata Shape:", Xdata.shape)

Xdata Shape: (8732, 40, 173, 1)


In [114]:
classLabels = np.array(classLabels)
print("Ydata Shape: ", Ydata.shape)

Ydata Shape:  (8732, 1)


In [116]:
# split training and validation data
xTrain , xTest , yTrain , yTest = train_test_split(Xdata , Ydata ,train_size = 0.8, test_size = 0.2)

In [2]:
# create model
from matplotlib.projections.polar import OrderedDict
import torch.nn as nn
from collections import OrderedDict

seq_model = nn.Sequential(OrderedDict([
            ('hidden_conv1', nn.Conv2d(10, 32, 3, padding = 1)),
            ('hidden_activation1', nn.ReLU()),
            ('2D_Pooling1', nn.MaxPool2d(2)),
            ('hidden_conv2', nn.Conv2d(32, 64, 3)),
            ('hidden_activation2', nn.ReLU()),
            ('2D_Pooling2', nn.MaxPool2d(2)),
            ('output_conv', nn.Conv2d(64, 10, 3)),
            ]))
seq_model

Sequential(
  (hidden_conv1): Conv2d(10, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (hidden_activation1): ReLU()
  (2D_Pooling1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (hidden_conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (hidden_activation2): ReLU()
  (2D_Pooling2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (output_conv): Conv2d(64, 10, kernel_size=(3, 3), stride=(1, 1))
)

In [140]:
trainLossHist = torch.zeros(200)
valLossHist = torch.zeros(200)
def neuralTrain(numEpochs, optimizer, model, lossFunc, xTrain, xVal, yTrain, yVal):
  for epoch in range(1, numEpochs + 1):

    modelTrain = model(xTrain)
    lossTrain = lossFunc(modelTrain, yTrain)

    modelValid = model(xVal)

    lossVal = lossFunc(modelValid, yVal)
    optimizer.zero_grad()
    lossTrain.backward()
    optimizer.step()

    trainLossHist[epoch] = lossTrain.item()
    valLossHist[epoch] = lossVal.item()

    if epoch % 5 == 0:
      print(f"Epoch {epoch}, Training loss {lossTrain.item():.4f},"
            f"Validation loss {lossVal.item():.4f}")


In [None]:
# initialize training on model
optimizer = optim.Adam(seq_model.parameters(), lr=1e-3)

neuralTrain(numEpochs = 30, 
            optimizer = optimizer, 
            model = seq_model, 
            lossFunc = nn.CrossEntropyLoss(),
            xTrain = xTrain,
            xVal = xTest,
            yTrain = yTrain,
            yVal = yTest
            )