In [94]:
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [95]:
# Definir parámetros
SAMPLE_RATE = 22050  # Frecuencia de muestreo estándar en audio
DURATION = 5  # Duración en segundos de cada audio
N_MELS = 128  # Número de bandas Mel para el espectrograma
FIXED_LENGTH = 216  # Longitud fija en frames del espectrograma

In [96]:
file_path = "datasets/ESC-50/meta/esc50.csv"
df = pd.read_csv(file_path)

In [97]:
# Definir sonidos críticos en la conducción
critical_sounds = [
    "siren",  # Sirenas de emergencia
    "car_horn",  # Bocina de automóvil
    "engine",  # Sonido del motor
    "brakes_squeaking",  # Frenos chirriando
    "jackhammer",  # Ruido fuerte de construcción
    "train",  # Tren (puede ser relevante en cruces ferroviarios)
    "fireworks",  # Explosiones fuertes (pueden distraer o asustar)
    "thunderstorm",  # Tormenta eléctrica (puede reducir visibilidad y generar distracción)
]

# Filtrar los datos con sonidos críticos
df_critical_sounds = df[df["category"].isin(critical_sounds)]
df_critical_sounds.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
5,1-101296-B-19.wav,1,19,thunderstorm,False,101296,B
16,1-115521-A-19.wav,1,19,thunderstorm,False,115521,A
17,1-115545-A-48.wav,1,48,fireworks,False,115545,A
18,1-115545-B-48.wav,1,48,fireworks,False,115545,B


In [98]:
# Mapear categorías a índices consecutivos
category_to_index = {category: idx for idx, category in enumerate(df_critical_sounds["category"].unique())}
index_to_category = {v: k for k, v in category_to_index.items()}  # Mapeo inverso para decodificar predicciones
df_critical_sounds["target_mapped"] = df_critical_sounds["category"].map(category_to_index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_critical_sounds["target_mapped"] = df_critical_sounds["category"].map(category_to_index)


In [99]:
df_critical_sounds.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take,target_mapped
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A,0
5,1-101296-B-19.wav,1,19,thunderstorm,False,101296,B,0
16,1-115521-A-19.wav,1,19,thunderstorm,False,115521,A,0
17,1-115545-A-48.wav,1,48,fireworks,False,115545,A,1
18,1-115545-B-48.wav,1,48,fireworks,False,115545,B,1


In [100]:
audio_files = df_critical_sounds["filename"].tolist()
labels = df_critical_sounds["target_mapped"].tolist()

In [101]:
# Función para cargar un archivo de audio y convertirlo en un espectrograma Mel con padding
def extract_mel_spectrogram(file_path, sample_rate=SAMPLE_RATE, n_mels=N_MELS, fixed_length=FIXED_LENGTH):
    y, sr = librosa.load(file_path, sr=sample_rate, duration=DURATION)
    
    # Data Augmentation
    if np.random.rand() < 0.5:
        y = librosa.effects.time_stretch(y, rate=np.random.uniform(0.8, 1.2))  # Cambio de velocidad
    if np.random.rand() < 0.5:
        y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=np.random.uniform(-3, 3))  # Cambio de tono
    
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)  # Convertir a escala logarítmica
    
    # Ajustar la longitud del espectrograma (padding o recorte)
    if mel_spec_db.shape[1] < fixed_length:
        pad_width = fixed_length - mel_spec_db.shape[1]
        mel_spec_db = np.pad(mel_spec_db, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mel_spec_db = mel_spec_db[:, :fixed_length]
    
    return mel_spec_db

In [None]:
# # Función para cargar un archivo de audio y convertirlo en un espectrograma Mel
# def extract_mel_spectrogram(file_path, sample_rate=SAMPLE_RATE, n_mels=N_MELS):
#     y, sr = librosa.load(file_path, sr=sample_rate, duration=DURATION)
#     mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
#     mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)  # Convertir a escala logarítmica
#     return mel_spec_db

In [102]:
# Crear una lista para almacenar espectrogramas Mel
mel_spectrograms = []

# Iterar sobre las filas del DataFrame de sonidos críticos
for index, row in df_critical_sounds.iterrows():
    # Crear la ruta completa del archivo de audio
    file_name = os.path.join("datasets", "ESC-50", "audio", row["filename"])
    
    # Generar y almacenar el espectrograma Mel
    mel_spectrograms.append(extract_mel_spectrogram(file_name))

# Convertir la lista a un array de NumPy
X = np.array(mel_spectrograms)
y = np.array(labels)



In [103]:
X = X[..., np.newaxis]  # Añadir dimensión de canal para la CNN
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [90]:

print("📊 Datos divididos en entrenamiento y prueba:")
print("   - Entrenamiento:", X_train.shape[0], "muestras")
print("   - Prueba:", X_test.shape[0], "muestras")


📊 Datos divididos en entrenamiento y prueba:
   - Entrenamiento: 192 muestras
   - Prueba: 48 muestras


In [104]:
num_classes = len(category_to_index)
model = keras.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(N_MELS, FIXED_LENGTH, 1)),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(num_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [106]:
# Compilar el modelo
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Entrenar el modelo
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluar el modelo
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Precisión en datos de prueba: {test_acc:.2f}")


Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1s/step - accuracy: 0.7031 - loss: 2.7577 - val_accuracy: 0.1042 - val_loss: 63.6223
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 926ms/step - accuracy: 0.7560 - loss: 1.8317 - val_accuracy: 0.1042 - val_loss: 56.3553
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 953ms/step - accuracy: 0.7921 - loss: 1.9539 - val_accuracy: 0.1042 - val_loss: 28.7580
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 991ms/step - accuracy: 0.8725 - loss: 0.9701 - val_accuracy: 0.1042 - val_loss: 37.2751
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 957ms/step - accuracy: 0.8665 - loss: 0.6775 - val_accuracy: 0.1042 - val_loss: 29.8580
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 952ms/step - accuracy: 0.9007 - loss: 0.7733 - val_accuracy: 0.1042 - val_loss: 54.1982
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━

In [119]:
# Función para predecir un sonido nuevo
def predict_sound(file_path):
    mel_spec = extract_mel_spectrogram(file_path)
    mel_spec = mel_spec[np.newaxis, ..., np.newaxis]  # Ajustar dimensiones para la CNN
    prediction = model.predict(mel_spec)
    predicted_class = np.argmax(prediction, axis=1)[0]
    return index_to_category[predicted_class]

# Ejemplo de predicción
file_to_predict = "datasets/UrbanSound8K/fold8/96657-8-0-1.wav"  # Reemplaza con el archivo real
predicted_category = predict_sound(file_to_predict)
print(f"El sonido detectado es: {predicted_category}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
El sonido detectado es: fireworks
