# **Audio Classification with LSTM and Torch Audio**

## **Libraries Required**

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
import torchaudio
import pandas as pd
import torch.nn.functional as F

## **About Dataset**

This dataset contains 8732 labeled sound excerpts ( <= 4s ) of urban sounds from 10 classes :`air_conditioner`, `car_horn`, `children_playing`, `dog_bark`, `drilling`, `engine_idling`, `gun_shot`, `jackhammer`, `siren` and `street_music`. The classes are drawn from the urban sound taxonomy.

In [44]:
class UrbanSoundDataset(Dataset):
    # Wrapper for the UrbanSound8K dataset
    # Argument List
    # path to the UrbanSound8K csv file
    # path to the UrbanSound8K audio files
    # list of folders to use in the dataset

    def __init__(self, csv_path, file_path, folderList):
        csvData = pd.read_csv(csv_path)
        # initialize lists to hold file names, labels, and folder numbers
        self.file_names = []
        self.labels = []
        self.folders = []
        # loop through the csv entries and only add entries from folders in the folder list
        for i in range(0, len(csvData)):
            if csvData.iloc[i, 5] in folderList:
                self.file_names.append(csvData.iloc[i, 0]) # extracting & appending each element from column 1- filenames
                self.labels.append(csvData.iloc[i, 6]) # extracting & appending each element from column 7 - classID
                self.folders.append(csvData.iloc[i, 5]) # extracting & appending elements from column 6 -folderNumber

        self.file_path = file_path
        self.folderList = folderList

    def __getitem__(self, index):
        # format the file path and load the file
        path = self.file_path + "fold" + str(self.folders[index]) + "/" + self.file_names[index]
        sound, sample_rate = torchaudio.load(path, normalize=True)
        soundData = torch.mean(sound, dim=0, keepdim=True)
        tempData = torch.zeros([1, 160000])  # tempData accounts for audio clips that are too short

        if soundData.numel() < 160000: # checks the number of elements in soundData tensor
            tempData[:, :soundData.numel()] = soundData
        else:
            tempData = soundData[:, :160000]

        soundData = tempData

        mel_specgram = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)(soundData)  # (channel, n_mels, time)
        mel_specgram_norm = (mel_specgram - mel_specgram.mean()) / mel_specgram.std()
        mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate)(soundData)  # (channel, n_mfcc, time)
        mfcc_norm = (mfcc - mfcc.mean()) / mfcc.std()
        
        feature = torch.cat([mel_specgram, mfcc], axis=1)
        
        return feature[0].permute(1, 0), self.labels[index]

    def __len__(self):
        return len(self.file_names)

In [37]:
class AudioLSTM(nn.Module):

    def __init__(self, n_feature=5, out_feature=5, n_hidden=256, n_layers=2, drop_prob=0.5):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_feature = n_feature

        self.lstm = nn.LSTM(self.n_feature, self.n_hidden, self.n_layers, dropout=self.drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        self.fc = nn.Linear(n_hidden, out_feature)

    def forward(self, x, hidden):
        # x.shape (batch, seq_len, n_features)
        l_out, l_hidden = self.lstm(x, hidden)

        # out.shape (batch, seq_len, n_hidden*direction)
        out = self.dropout(l_out)

        # out.shape (batch, out_feature)
        out = self.fc(out[:, -1, :])

        # return the final output and the hidden state
        return out, l_hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data

        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        return hidden

In [38]:
def train(model, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)

        model.zero_grad()
        output, hidden_state = model(data, model.init_hidden(hyperparameters["batch_size"]))
        
        loss = criterion(output, target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if batch_idx % log_interval == 0: #print training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss))

In [39]:
def test(model, epoch):
    model.eval()
    correct = 0
    y_pred, y_target = [], []
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        
        output, hidden_state = model(data, model.init_hidden(hyperparameters["batch_size"]))
        
        pred = torch.max(output, dim=1).indices
        correct += pred.eq(target).cpu().sum().item()
        y_pred = y_pred + pred.tolist()
        y_target = y_target + target.tolist()
        
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [40]:
hyperparameters = {
    "lr": 0.01, 
    "weight_decay": 0.0001,
    "batch_size": 128, 
    "in_feature": 168, # number of features per timesteps (e.g., MFCCs, spectrogram bins)
    "out_feature": 10 # number of output units ( e.g., number of classes )
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

csv_path =  '../data/urbansound8k/UrbanSound8K.csv'
file_path = '../data/urbansound8k/'

train_set = UrbanSoundDataset(csv_path, file_path, range(1, 10))
test_set = UrbanSoundDataset(csv_path, file_path, [10])
print("Train set size: " + str(len(train_set)))
print("Test set size: " + str(len(test_set)))

cuda
Train set size: 7895
Test set size: 837


1. **num_workers**

    * What it means: Nu mber of subprocesses to use for data loading.
    * `num_workers=0`: data loading is done in the main process (slower).
    * `num_workers>0`: multiple worker processes load data in parallel (faster).

👉 Here it’s set to 1, meaning one extra worker process will load data in parallel to the training loop.

2. **pin_memory**

    * What it means: If True, the data loader will copy tensors into page-locked (pinned) memory.

    * Why? When using a GPU (device == 'cuda'), pinned memory allows faster and more efficient transfer of data from CPU → GPU.

    * Without it, transfers may be slower because regular memory can be swapped out by the OS.

👉 It’s only useful when training on GPU. That’s why it’s conditionally set only if device == 'cuda'.

In [45]:
kwargs = {'num_workers': 1, 'pin_memory': True} if device == 'cuda' else {}  # needed for using datasets on gpu

train_loader = torch.utils.data.DataLoader(train_set, batch_size=hyperparameters["batch_size"], shuffle=True, drop_last=True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=hyperparameters["batch_size"], shuffle=True, drop_last=True, **kwargs)

In [42]:
model = AudioLSTM(
    n_feature=hyperparameters["in_feature"],
    out_feature=hyperparameters["out_feature"])

model.to(device)
print(model)

optimizer = optim.Adam(
    model.parameters(),
    lr=hyperparameters['lr'],
    weight_decay=hyperparameters['weight_decay'])

criterion = nn.CrossEntropyLoss()
clip = 5  # gradient clipping

log_interval = 10

AudioLSTM(
  (lstm): LSTM(168, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=10, bias=True)
)


In [46]:
for epoch in range(1, 101):
    train(model, epoch)
    test(model, epoch)

  s = torchaudio.io.StreamReader(src, format, None, buffer_size)



Test set: Accuracy: 405/837 (48%)


Test set: Accuracy: 435/837 (52%)


Test set: Accuracy: 428/837 (51%)


Test set: Accuracy: 469/837 (56%)


Test set: Accuracy: 452/837 (54%)


Test set: Accuracy: 472/837 (56%)


Test set: Accuracy: 436/837 (52%)


Test set: Accuracy: 407/837 (49%)


Test set: Accuracy: 438/837 (52%)


Test set: Accuracy: 461/837 (55%)


Test set: Accuracy: 405/837 (48%)


Test set: Accuracy: 413/837 (49%)


Test set: Accuracy: 474/837 (57%)


Test set: Accuracy: 411/837 (49%)


Test set: Accuracy: 437/837 (52%)


Test set: Accuracy: 467/837 (56%)


Test set: Accuracy: 415/837 (50%)


Test set: Accuracy: 441/837 (53%)


Test set: Accuracy: 392/837 (47%)


Test set: Accuracy: 409/837 (49%)


Test set: Accuracy: 433/837 (52%)


Test set: Accuracy: 477/837 (57%)


Test set: Accuracy: 477/837 (57%)


Test set: Accuracy: 456/837 (54%)


Test set: Accuracy: 402/837 (48%)


Test set: Accuracy: 416/837 (50%)


Test set: Accuracy: 470/837 (56%)


Test set: Accuracy: 465/837