In [1]:
!pip install opendatasets pandas -q


In [2]:
import opendatasets as od
import pandas
#{"username":"khwrali","key":"a2b23dfbac2443ab4db34e48318ce4ff"}
od.download(
    "https://www.kaggle.com/datasets/sripaadsrinivasan/audio-mnist")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: khwrali
Your Kaggle Key: ··········
Downloading audio-mnist.zip to ./audio-mnist


100%|██████████| 948M/948M [00:09<00:00, 106MB/s]





In [3]:
import os
import librosa
import numpy as np
from tqdm import tqdm

In [4]:
data_folder = '/content/audio-mnist/data'

def extract_mfccs(file_path, num_mfcc=13):
    audio, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc)
    return mfccs


In [5]:
num_mfcc = 13
num_instances = 50
num_speakers = 60
num_digits = 10

In [6]:
fixed_length = 100

mfccs_data = []
labels = []

for speaker_id in tqdm(range(1, num_speakers + 1), desc='Speakers'):
    speaker_folder = os.path.join(data_folder, f'{speaker_id:02d}')

    for digit in range(num_digits):
        for instance in tqdm(range(num_instances), desc=f'Digit {digit}'):
            file_name = f'{digit}_{speaker_id:02d}_{instance}.wav'
            file_path = os.path.join(speaker_folder, file_name)
            mfccs = extract_mfccs(file_path, num_mfcc)
            if mfccs.shape[1] < fixed_length:
                mfccs = np.pad(mfccs, ((0, 0), (0, fixed_length - mfccs.shape[1])))
            else:
                mfccs = mfccs[:, :fixed_length]

            mfccs_data.append(mfccs)
            labels.append(digit)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Digit 9:  44%|████▍     | 22/50 [00:00<00:00, 36.88it/s][A
Digit 9:  52%|█████▏    | 26/50 [00:00<00:00, 35.85it/s][A
Digit 9:  62%|██████▏   | 31/50 [00:00<00:00, 37.35it/s][A
Digit 9:  70%|███████   | 35/50 [00:00<00:00, 37.89it/s][A
Digit 9:  78%|███████▊  | 39/50 [00:01<00:00, 38.15it/s][A
Digit 9:  86%|████████▌ | 43/50 [00:01<00:00, 38.05it/s][A
Digit 9: 100%|██████████| 50/50 [00:01<00:00, 37.79it/s]
Speakers:  25%|██▌       | 15/60 [02:18<06:20,  8.45s/it]
Digit 0:   0%|          | 0/50 [00:00<?, ?it/s][A
Digit 0:   8%|▊         | 4/50 [00:00<00:01, 36.86it/s][A
Digit 0:  16%|█▌        | 8/50 [00:00<00:01, 38.27it/s][A
Digit 0:  26%|██▌       | 13/50 [00:00<00:00, 42.44it/s][A
Digit 0:  38%|███▊      | 19/50 [00:00<00:00, 47.95it/s][A
Digit 0:  48%|████▊     | 24/50 [00:00<00:00, 47.64it/s][A
Digit 0:  62%|██████▏   | 31/50 [00:00<00:00, 54.51it/s][A
Digit 0:  76%|███████▌  | 38/50 [00:00<00:00, 58.87

In [7]:
mfccs_data = np.array(mfccs_data)
labels = np.array(labels)
print("MFCCs shape:", mfccs_data.shape)
print("Labels shape:", labels.shape)

MFCCs shape: (30000, 13, 100)
Labels shape: (30000,)


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    mfccs_data, labels, test_size=0.1, random_state=42, stratify=labels
)

print("Train set shapes:", X_train.shape, y_train.shape)
print("Test set shapes:", X_test.shape, y_test.shape)


Train set shapes: (27000, 13, 100) (27000,)
Test set shapes: (3000, 13, 100) (3000,)


In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Bidirectional

model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(num_mfcc, fixed_length)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(10, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test accuracy:", test_accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.9906666874885559


In [21]:
predictions = model.predict(X_test)




In [22]:
n=2000
print(y_test[n])
print(np.argmax(predictions[n]))

5
5
