In [71]:
!pip install opendatasets pandas -q


In [73]:
import opendatasets as od
import pandas
#{"username":"khwrali","key":"a2b23dfbac2443ab4db34e48318ce4ff"}
od.download(
    "https://www.kaggle.com/datasets/sripaadsrinivasan/audio-mnist")

Skipping, found downloaded files in "./audio-mnist" (use force=True to force download)


In [74]:
import os
import librosa
import numpy as np
from tqdm import tqdm

In [75]:
data_folder = '/content/audio-mnist/data'

def extract_mfccs(file_path, num_mfcc=13):
    audio, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc)
    return mfccs


In [76]:
num_mfcc = 13
num_instances = 50
num_speakers = 60
num_digits = 10

In [77]:
fixed_length = 100

mfccs_data = []
labels = []

for speaker_id in tqdm(range(1, num_speakers + 1), desc='Speakers'):
    speaker_folder = os.path.join(data_folder, f'{speaker_id:02d}')

    for digit in range(num_digits):
        for instance in tqdm(range(num_instances), desc=f'Digit {digit}'):
            file_name = f'{digit}_{speaker_id:02d}_{instance}.wav'
            file_path = os.path.join(speaker_folder, file_name)
            mfccs = extract_mfccs(file_path, num_mfcc)
            if mfccs.shape[1] < fixed_length:
                mfccs = np.pad(mfccs, ((0, 0), (0, fixed_length - mfccs.shape[1])))
            else:
                mfccs = mfccs[:, :fixed_length]

            mfccs_data.append(mfccs)
            labels.append(digit)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Digit 8:  24%|██▍       | 12/50 [00:00<00:00, 119.24it/s][A
Digit 8:  48%|████▊     | 24/50 [00:00<00:00, 98.31it/s] [A
Digit 8:  70%|███████   | 35/50 [00:00<00:00, 101.96it/s][A
Digit 8: 100%|██████████| 50/50 [00:00<00:00, 103.81it/s]

Digit 9:   0%|          | 0/50 [00:00<?, ?it/s][A
Digit 9:  24%|██▍       | 12/50 [00:00<00:00, 118.33it/s][A
Digit 9:  48%|████▊     | 24/50 [00:00<00:00, 111.31it/s][A
Digit 9:  72%|███████▏  | 36/50 [00:00<00:00, 106.59it/s][A
Digit 9: 100%|██████████| 50/50 [00:00<00:00, 103.99it/s]
Speakers:   7%|▋         | 4/60 [00:30<06:13,  6.67s/it]
Digit 0:   0%|          | 0/50 [00:00<?, ?it/s][A
Digit 0:  24%|██▍       | 12/50 [00:00<00:00, 111.95it/s][A
Digit 0:  48%|████▊     | 24/50 [00:00<00:00, 101.29it/s][A
Digit 0:  72%|███████▏  | 36/50 [00:00<00:00, 107.74it/s][A
Digit 0: 100%|██████████| 50/50 [00:00<00:00, 106.60it/s]

Digit 1:   0%|          | 0/50 [00:00<?, ?it/s][A


In [78]:
mfccs_data = np.array(mfccs_data)
labels = np.array(labels)
print("MFCCs shape:", mfccs_data.shape)
print("Labels shape:", labels.shape)

MFCCs shape: (30000, 13, 100)
Labels shape: (30000,)


In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    mfccs_data, labels, test_size=0.1, random_state=42, stratify=labels
)

print("Train set shapes:", X_train.shape, y_train.shape)
print("Test set shapes:", X_test.shape, y_test.shape)


Train set shapes: (27000, 13, 100) (27000,)
Test set shapes: (3000, 13, 100) (3000,)


In [91]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(128, input_shape=(13, 100)))
model.add(Dense(10, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test accuracy:", test_accuracy)

predictions = model.predict(X_test)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.9263333082199097


In [102]:
print(X_test[0].shape)
print(np.argmax(predictions[0]))

(13, 100)
1
