In [1]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [2]:
# Path to the IRMAS dataset
dataset_path = r'X:\CODING\PROJECTS\AUDIO_PROJECT\New folder\IRMAS-TrainingData'  # We use raw string to avoid issues with backslashes

# Function to load audio files and extract MFCC features
def extract_features(file_path):
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs_scaled

# Load data and labels
data = []
labels = []

# Iterate through the dataset
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.endswith('.wav'):
            file_path = os.path.join(root, file)  # Build the full file path
            label = root.split(os.sep)[-1]  # Extract the label (instrument name)
            features = extract_features(file_path)  # Extract MFCC features from the audio file
            data.append(features)
            labels.append(label)

# Convert to numpy arrays
data = np.array(data)
labels = np.array(labels)

# Encode labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

In [3]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels_categorical, test_size=0.2, random_state=42)

print(f'Tamaño del conjunto de entrenamiento: {X_train.shape}')
print(f'Tamaño del conjunto de prueba: {X_test.shape}')

Tamaño del conjunto de entrenamiento: (5364, 40)
Tamaño del conjunto de prueba: (1341, 40)


In [4]:
# Define the model
model = Sequential()
# First layer
model.add(Dense(512, input_shape=(40,), activation='relu'))
model.add(Dropout(0.5))

# Layer2
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
#Layer3
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

# Output layer
model.add(Dense(len(np.unique(labels)), activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [5]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
# Train the model
history = model.fit(X_train, y_train, epochs=75, batch_size=32, validation_data=(X_test, y_test), verbose = 0)

In [11]:
# Evaluate the model
score = model.evaluate(X_test, y_test, verbose=1)
print(f'Model accuracy: {score[1]*100:.2f}%')

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5575 - loss: 1.4456 
Model accuracy: 53.39%


In [12]:
# Function to predict the instrument in a new audio file
def predict_instrument(file_path, model, label_encoder):
    features = extract_features(file_path)
    features = np.expand_dims(features, axis=0)  # Expand dimensions to match the model's input
    prediction = model.predict(features)
    predicted_label = np.argmax(prediction, axis=1)
    predicted_instrument = label_encoder.inverse_transform(predicted_label)
    return predicted_instrument[0]

In [13]:
# Example prediction on a new audio file
new_file_path = r'X:\CODING\PROJECTS\AUDIO_PROJECT\New folder\audio1.mp3'  # Change this to the path of your new audio file
predicted_instrument = predict_instrument(new_file_path, model, label_encoder)
print(f'The predicted instrument is: {predicted_instrument}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
The predicted instrument is: pia
