In [1]:
!pip install opendatasets pandas -q


In [2]:
import opendatasets as od
import pandas
#{"username":"khwrali","key":"a2b23dfbac2443ab4db34e48318ce4ff"}
od.download(
    "https://www.kaggle.com/datasets/sripaadsrinivasan/audio-mnist")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: khwrali
Your Kaggle Key: ··········
Downloading audio-mnist.zip to ./audio-mnist


100%|██████████| 948M/948M [00:51<00:00, 19.4MB/s]





In [3]:
import librosa
import numpy as np
import os
from tqdm import tqdm


In [4]:
def extract_mfccs(file_path, num_mfcc=13):
    audio, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc)
    return mfccs


In [5]:
num_mfcc = 13  # Number of MFCC coefficients to extract
num_instances = 50  # Number of instances per digit per speaker
num_speakers = 60
num_digits = 10

In [6]:
data_folder = '/content/audio-mnist/data'

In [7]:
# Determine a fixed length for MFCC arrays (e.g., 100)
fixed_length = 100

# Initialize arrays to hold MFCCs and labels
mfccs_data = []
label = []

# Loop through each speaker
for speaker_id in tqdm(range(1, num_speakers + 1), desc='Speakers'):
    speaker_folder = os.path.join(data_folder, f'{speaker_id:02d}')

    # Loop through each digit
    for digit in range(num_digits):
        # Loop through each instance
        for instance in tqdm(range(num_instances), desc=f'Digit {digit}'):
            file_name = f'{digit}_{speaker_id:02d}_{instance}.wav'
            file_path = os.path.join(speaker_folder, file_name)

            mfccs = extract_mfccs(file_path, num_mfcc)

            # Pad or truncate MFCCs to the fixed length
            if mfccs.shape[1] < fixed_length:
                mfccs = np.pad(mfccs, ((0, 0), (0, fixed_length - mfccs.shape[1])))
            else:
                mfccs = mfccs[:, :fixed_length]

            mfccs_data.append(mfccs)
            label.append(digit)

# Convert data to numpy arrays
mfccs_data = np.array(mfccs_data)
label = np.array(label)



Speakers:   0%|          | 0/60 [00:00<?, ?it/s]
Digit 0:   0%|          | 0/50 [00:00<?, ?it/s][A
Digit 0:   2%|▏         | 1/50 [00:10<08:52, 10.87s/it][A
Digit 0:  12%|█▏        | 6/50 [00:10<00:59,  1.35s/it][A
Digit 0:  28%|██▊       | 14/50 [00:11<00:16,  2.17it/s][A
Digit 0:  42%|████▏     | 21/50 [00:11<00:07,  3.86it/s][A
Digit 0:  54%|█████▍    | 27/50 [00:11<00:03,  5.80it/s][A
Digit 0:  66%|██████▌   | 33/50 [00:11<00:02,  8.32it/s][A
Digit 0:  78%|███████▊  | 39/50 [00:11<00:00, 11.55it/s][A
Digit 0: 100%|██████████| 50/50 [00:11<00:00,  4.27it/s]

Digit 1:   0%|          | 0/50 [00:00<?, ?it/s][A
Digit 1:   8%|▊         | 4/50 [00:00<00:01, 36.93it/s][A
Digit 1:  20%|██        | 10/50 [00:00<00:00, 47.28it/s][A
Digit 1:  32%|███▏      | 16/50 [00:00<00:00, 50.64it/s][A
Digit 1:  44%|████▍     | 22/50 [00:00<00:00, 52.28it/s][A
Digit 1:  56%|█████▌    | 28/50 [00:00<00:00, 53.28it/s][A
Digit 1:  68%|██████▊   | 34/50 [00:00<00:00, 53.13it/s][A
Digit 1: 100%|

In [8]:
print("mfcc_data shape:",mfccs_data.shape)
print("labels shape:",label.shape)

mfcc_data shape: (30000, 13, 100)
labels shape: (30000,)


In [9]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
import numpy as np
import torch.nn.functional as F

In [10]:
input_size = 13  # Number of MFCC coefficients
hidden_size = 128
num_classes = 10  # Number of classes (digits)
batch_size = 32
learning_rate = 0.001
num_epochs = 10

In [11]:
class AudioDataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]

In [12]:
dataset = AudioDataset(mfccs_data, label)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [13]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)



In [14]:
import torch
import torch.nn as nn

class EncoderDecoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(EncoderDecoderLSTM, self).__init__()

        # Encoder LSTM
        self.encoder_lstm = nn.LSTM(input_size, hidden_size)

        # Decoder LSTM
        self.decoder_lstm = nn.LSTM(hidden_size, hidden_size)

        # Output layer
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Encode the input sequence
        encoder_outputs, (encoder_hidden, encoder_cell) = self.encoder_lstm(x)

        # Use the last hidden state of the encoder as the initial hidden state for the decoder
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        # Decode the sequence using the decoder
        decoder_outputs, _ = self.decoder_lstm(encoder_outputs, (decoder_hidden, decoder_cell))
        # Pass decoder outputs through the output layer
        output = self.output_layer(decoder_outputs)
        #output = torch.softmax(output, dim=2)
        output = torch.mean(output, dim=1)

        return output

# Define model parameters
input_size = 13
hidden_size = 128
num_classes = 10

# Create the model
model = EncoderDecoderLSTM(input_size, hidden_size, num_classes)


In [15]:
import torch.cuda as cuda

# Check if GPU is available
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f'Using {device} device')


Using cuda device


In [16]:
model.to('cuda')

EncoderDecoderLSTM(
  (encoder_lstm): LSTM(13, 128)
  (decoder_lstm): LSTM(128, 128)
  (output_layer): Linear(in_features=128, out_features=10, bias=True)
)

In [17]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs = inputs.permute(0, 2, 1).to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        num_classes = 10
        targets_one_hot = F.one_hot(targets, num_classes).float().to(device)

        loss = criterion(outputs, targets_one_hot)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    average_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_loss:.4f}")


Epoch [1/10], Batch [1/844], Loss: 0.6867
Epoch [1/10], Batch [101/844], Loss: 0.3223
Epoch [1/10], Batch [201/844], Loss: 0.3092
Epoch [1/10], Batch [301/844], Loss: 0.2955
Epoch [1/10], Batch [401/844], Loss: 0.2571
Epoch [1/10], Batch [501/844], Loss: 0.2597
Epoch [1/10], Batch [601/844], Loss: 0.2181
Epoch [1/10], Batch [701/844], Loss: 0.2029
Epoch [1/10], Batch [801/844], Loss: 0.1976
Epoch [1/10], Average Loss: 0.2683
Epoch [2/10], Batch [1/844], Loss: 0.2005
Epoch [2/10], Batch [101/844], Loss: 0.2068
Epoch [2/10], Batch [201/844], Loss: 0.1881
Epoch [2/10], Batch [301/844], Loss: 0.1762
Epoch [2/10], Batch [401/844], Loss: 0.1659
Epoch [2/10], Batch [501/844], Loss: 0.1685
Epoch [2/10], Batch [601/844], Loss: 0.1494
Epoch [2/10], Batch [701/844], Loss: 0.1282
Epoch [2/10], Batch [801/844], Loss: 0.1334
Epoch [2/10], Average Loss: 0.1576
Epoch [3/10], Batch [1/844], Loss: 0.1234
Epoch [3/10], Batch [101/844], Loss: 0.0962
Epoch [3/10], Batch [201/844], Loss: 0.1258
Epoch [3/10]

In [19]:
model.eval()  # Set model to evaluation mode

total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch_idx, (inputs, targets) in enumerate(val_loader):
        inputs = inputs.permute(0, 2, 1).to(device)
        targets = targets.to(device)

        outputs = model(inputs)
        predicted = torch.argmax(outputs, dim=1)  # Get predicted class indices

        total_correct += (predicted == targets).sum().item()
        total_samples += targets.size(0)

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.9880
