Simple LSTM with audio_mnist

In [None]:
#@article{audiomnist2023,
#    title = {AudioMNIST: Exploring Explainable Artificial Intelligence for audio analysis on a simple benchmark},
#    journal = {Journal of the Franklin Institute},
#    year = {2023},
#    issn = {0016-0032},
#    doi = {https://doi.org/10.1016/j.jfranklin.2023.11.038},
#    url = {https://www.sciencedirect.com/science/article/pii/S0016003223007536},
#    author = {Sören Becker and Johanna Vielhaben and Marcel Ackermann and Klaus-Robert Müller and Sebastian Lapuschkin and Wojciech Samek},
#    keywords = {Deep learning, Neural networks, Interpretability, Explainable artificial intelligence, Audio classification, Speech recognition},
#}

In [None]:
# use kaggle audio mnist
# https://www.kaggle.com/datasets/sripaadsrinivasan/audio-mnist
# we need to make dataset class

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the audio folders are read from
# /content/drive/MyDrive/data in the loading cell below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# check the data -> we need to make labels
import os
import torch
import torchaudio
# Walk the 60 numbered folders ('01' .. '60') under `path` and load every wav clip.
# File names look like '<label>_<speaker>_<repeat>.wav'; only label and speaker are kept.
path = '/content/drive/MyDrive/data'
dataset = []
for i in range(1,61):
  number = f'{i:02d}' # ex) 01
  audiofolder = os.path.join(path, number)
  # sorted() makes the load order reproducible; os.listdir returns entries in arbitrary order
  for audio in sorted(os.listdir(audiofolder)):
    # Skip anything that is not a wav clip so int(label) / torchaudio.load cannot fail on stray files
    if not audio.lower().endswith('.wav'):
      continue
    audiopath = os.path.join(audiofolder, audio) # full path to the audio file
    # Drop the extension, then split once: fields[0] = label, fields[1] = speaker
    fields = os.path.splitext(audio)[0].split('_')
    label, speaker = fields[0], fields[1]
    waveform, samplerate = torchaudio.load(audiopath)
    label = torch.tensor(int(label))
    dataset.append({'waveform':waveform, 'sr':samplerate,'speaker':speaker,'label':label})
  # Progress report every 10 folders
  if i % 10 == 0:
    print(f'Label {i}/60 complete')



  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Label 10/60 complete
Label 20/60 complete
Label 30/60 complete
Label 40/60 complete
Label 50/60 complete
Label 60/60 complete


In [None]:
# Watch data example
# Show the first loaded entry, then the total number of entries (one per line)
print(dataset[0], len(dataset), sep='\n')

{'waveform': tensor([[-0.0003, -0.0003, -0.0003,  ..., -0.0004, -0.0003, -0.0003]]), 'sr': 48000, 'speaker': '01', 'label': tensor(0)}
30000


In [None]:
# Check whether every waveform has the same number of samples.
# Each waveform tensor is 2-D (e.g. 1 x 14073), so shape[1] is its length in samples.
A = [entry['waveform'].shape[1] for entry in dataset]
print(f' Min : {min(A)}, Max: {max(A)}')

 Min : 14073, Max: 47998


In [None]:
# Bring every waveform to exactly 20000 samples (a sample count, not a frequency):
# shorter clips are zero-padded on the right with nn.functional.pad,
# longer clips are cut off after 20000 samples.
import torch.nn.functional as F
TARGET_LEN = 20000
for entry in dataset:
  w = entry['waveform']
  missing = TARGET_LEN - w.shape[1]
  if missing > 0:
    # (0, missing): pad nothing on the left and `missing` zeros on the right of the last dim
    w = F.pad(w, (0, missing))
  entry['waveform'] = w[:, :TARGET_LEN]
# Re-check the lengths after padding / truncation
A = [entry['waveform'].shape[1] for entry in dataset]
print(f' Min : {min(A)}, Max: {max(A)}')

 Min : 20000, Max: 20000


In [None]:
# make DATASET class to pytorch
from torch.utils.data import Dataset, DataLoader

# make init,len,getitem
class audiodataset(Dataset):
  """Map-style dataset over a collection of dicts holding a 'waveform' and a 'label'.

  Args:
    files: indexable collection of dicts; each must provide 'waveform' and 'label'.
    transform: optional callable applied to the waveform before it is returned.
  """

  def __init__(self, files, transform = None):
    self.transform = transform
    self.files = files

  def __len__(self):
    """Number of entries in the wrapped collection."""
    return len(self.files)

  def __getitem__(self,idx):
    """Return (waveform, label) for entry `idx`; other keys of the entry are ignored."""
    entry = self.files[idx]
    waveform, label = entry['waveform'], entry['label']
    # Apply the transform only when one was supplied (plain truthiness check)
    return (self.transform(waveform) if self.transform else waveform), label



In [None]:
# Wrap the raw waveforms in a DataLoader and inspect one batch
traindataset = audiodataset(dataset)
print(traindataset[0]) # waveform, label
train_loader = DataLoader(traindataset, batch_size = 32, shuffle = True)
for waveform, label in train_loader:
  # Only the first batch is needed to see the batched shapes
  print(waveform.shape, label.shape, sep='\n')
  break

(tensor([[-0.0003, -0.0003, -0.0003,  ...,  0.0085,  0.0085,  0.0087]]), tensor(0))
torch.Size([32, 1, 20000])
torch.Size([32])


In [None]:
# Pick the compute device: the CUDA GPU when PyTorch can see one, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')
# Report which device was chosen (the old CPU message was a double negative)
print('GPU is available' if use_gpu else 'GPU is not available, using CPU')


GPU is availble


In [None]:
# Make simple LSTM model
# the big difference of LSTM and RNN is Long term memory (cell state)
import torch.nn as nn
class LSTM(nn.Module):
  """Stacked LSTM followed by a linear layer that classifies the last time step.

  Args:
    input_dim: number of features per time step.
    hidden_size: size of the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
    num_classes: number of output logits.
  """
  def __init__ (self, input_dim, hidden_size, num_layers, num_classes):
    super(LSTM, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    # batch_first=True: forward() indexes the time axis at dim 1 and the training
    # loop feeds (batch, seq_len, input_dim). With the default seq-first layout the
    # batch axis would be treated as time, so samples in a batch would be mixed.
    self.lstm = nn.LSTM(input_dim, hidden_size, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    """Map x of shape (batch, seq_len, input_dim) to logits of shape (batch, num_classes)."""
    out, _ = self.lstm(x) # out: (batch, seq_len, hidden_size); the final (h_n, c_n) is not needed
    return self.fc(out[:, -1, :]) # classify from the last time step only


In [None]:
# Without transform, it has very low accuracy and so much time spent
# Using MFCC
import torchaudio.transforms as T
# Each entry stores the rate it was loaded at under 'sr' (48000 in the entry printed
# earlier), so the MFCC transform must be built for that rate rather than 16000.
# NOTE(review): assumes every clip shares the sample rate of dataset[0] - confirm.
sample_rate = dataset[0]['sr']
# 40 MFCC coefficients per frame; the raw waveform is too long to feed to the LSTM directly
mfcc = T.MFCC(sample_rate=sample_rate, n_mfcc=40)
traindataset = audiodataset(dataset,transform = mfcc)
print(traindataset[0]) # (MFCC features, label)
train_loader = DataLoader(traindataset, batch_size = 32, shuffle = True)
for batch in train_loader:
  waveform, label = batch
  print(waveform.shape)
  print(label.shape)
  break # watch just one example

(tensor([[[-782.5766, -723.1873, -759.4866,  ..., -611.1515, -597.2868,
          -587.9297],
         [  12.3939,  -27.6590,  -12.8135,  ...,  125.8783,  132.2648,
           109.6659],
         [  34.7157,   37.2915,   30.8310,  ...,   51.4044,   67.8076,
            37.6513],
         ...,
         [  -1.5433,   -9.1804,   -4.3317,  ...,   -8.1542,   -8.0709,
           -12.1726],
         [ -10.6302,   -8.4885,    1.1315,  ...,  -23.6739,  -12.7377,
           -11.2347],
         [  -5.1951,   -4.0581,   -4.2072,  ...,   -8.1796,   -5.6248,
           -11.8580]]]), tensor(0))
torch.Size([32, 1, 40, 101])
torch.Size([32])




In [None]:
# Build the classifier for the MFCC features: 40 input features per frame,
# 256 hidden units, 2 stacked LSTM layers, 10 output classes.
model1 = LSTM(input_dim=40, hidden_size=256, num_layers=2, num_classes=10).to(device)
# Cross-entropy on the raw logits, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model1.parameters(), lr=0.0001)
print(model1)

LSTM(
  (lstm): LSTM(40, 256, num_layers=2)
  (fc): Linear(in_features=256, out_features=10, bias=True)
)


In [None]:
# Training loop.
# Each batch from train_loader is (batch, 1, input_dim, seq_len); the model is fed
# (batch, seq_len, input_dim), so the size-1 axis is dropped and the last two are swapped.
epochs = 7

for epoch in range(1,epochs+1):
  loss_epoch = 0      # running sum of the per-batch losses
  accuracy_epoch = 0  # running count of correct predictions

  for wave, label in train_loader:
    wave = wave.squeeze(1).transpose(1,2).to(device) # -> (batch, seq_len, input_dim)
    label = label.to(device)

    # One optimisation step
    optimizer.zero_grad()
    output = model1(wave)
    loss = criterion(output,label)
    loss.backward()
    optimizer.step()

    # Bookkeeping for the epoch summary
    loss_epoch += loss.item()
    accuracy_epoch += (output.argmax(dim=1) == label).sum().item()

  # Loss is averaged over batches, accuracy over individual samples
  loss_avg = loss_epoch/len(train_loader)
  accuracy_avg = accuracy_epoch/len(traindataset)
  print(f'Epoch : {epoch}, Loss : {loss_avg:10.8f}, Accuracy : {accuracy_avg:10.8f}')

Epoch : 1, Loss : 1.97898130, Accuracy : 0.26900000
Epoch : 2, Loss : 1.71708978, Accuracy : 0.36246667
Epoch : 3, Loss : 1.64677118, Accuracy : 0.39673333
Epoch : 4, Loss : 1.60159504, Accuracy : 0.41486667
Epoch : 5, Loss : 1.56891212, Accuracy : 0.42770000
Epoch : 6, Loss : 1.53923784, Accuracy : 0.44056667
Epoch : 7, Loss : 1.51543413, Accuracy : 0.44550000


I wanted to compare this against the accuracy obtained with raw audio, but that required too many resources. Also, this model reaches only 0.44 accuracy on the training set. This project is just a simple LSTM applied to AudioMNIST, so I will stop here; later, I need to study data preprocessing and LSTM architecture in more depth.