# Environmental Sound Classification (ESC)

## IMPORTS

In [16]:
import glob
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import torchaudio
import torch.optim as optim
import torch.nn.functional as F

## CONFIG

In [19]:
config = {
    "batch" : 64,
    "lr" : 1e-5,
    'epochs' : 20
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## DATA

In [3]:
esc50_list = [f.split("-")[-1].replace(".wav","")
for f in
glob.glob("ESC-50/audio/*.wav")]
Counter(esc50_list)
# data = "/kaggle/input/environmental-sound-classification-esc-dataset/ESC-50/audio"
train = "/kaggle/input/environmental-sound-classification-esc-dataset/ESC-50/audio/train"
test = "/kaggle/input/environmental-sound-classification-esc-dataset/ESC-50/audio/test"
valid = "/kaggle/input/environmental-sound-classification-esc-dataset/ESC-50/audio/valid"

In [4]:
class ESC50(Dataset):
    def __init__(self, path):
        # Get the directory listing from path
        files = Path(path).glob('*.wav')
        # Iterate through the listing and create a list of tuples (filename, label)
        self.items = [(f,int(f.name.split("-")[-1].replace(".wav",""))) for f in files]
        self.length = len(self.items)
    def __getitem__(self, index):
        filename, label = self.items[index]
        waveform, _ = torchaudio.load(filename)
        return waveform, label
    def __len__(self):
        return self.length

In [8]:
# dataset = ESC50(data)
# waveform, label = dataset[0]
# print(waveform.shape)  # Check the waveform shape
# print(label)           # Check the label

torch.Size([1, 220500])
15


In [5]:
trainset = ESC50(train)
testset = ESC50(test)
validset = ESC50(valid)

In [6]:
traindataloader = DataLoader(trainset, batch_size = config["batch"], shuffle = True)
testdataloader = DataLoader(testset, batch_size = config["batch"])
validdataloader = DataLoader(validset, batch_size = config["batch"])

In [11]:
for feature, labels in traindataloader:
    print(feature.shape, labels.shape)
    break

torch.Size([64, 1, 220500]) torch.Size([64])


## Model

In [15]:
class AudioNet(nn.Module):
    def __init__(self):
        super(AudioNet, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(256)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(30)
        self.fc1 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)


audioNet = AudioNet()
audioNet.to(device)

AudioNet(
  (conv1): Conv1d(1, 128, kernel_size=(80,), stride=(4,))
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=256, stride=256, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (avgPool): AvgPool1d(kernel_size=(30,), strid

## OPTIMIZERS & LOSSES

In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(audioNet.parameters(), lr = config["lr"])

## TRAINING LOOP

In [21]:
def train(model, dataloader, criterion, optimizer, epochs, scheduler=None):
    """
    Trains a PyTorch model for a specified number of epochs.

    Args:
        model (torch.nn.Module): The model to be trained.
        dataloader (torch.utils.data.DataLoader): The DataLoader providing the training data.
        criterion (callable): The loss function.
        optimizer (torch.optim.Optimizer): The optimizer used to update the model's parameters.
        epochs (int): Number of epochs to train the model.
        scheduler (torch.optim.lr_scheduler, optional): Learning rate scheduler (default is None).
    """
    model.train()  # Set model to training mode

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in dataloader:
            # Move data to the appropriate device
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate loss for logging
            running_loss += loss.item()
        
        # Adjust learning rate if scheduler is provided
        if scheduler:
            scheduler.step()

        # Logging epoch loss
        epoch_loss = running_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")


##  TRAINING & VALIDATION

In [22]:
train(audioNet, traindataloader, criterion, optimizer, config["epochs"])

RuntimeError: Given input size: (512x1x2). Calculated output size: (512x1x0). Output size is too small