In [4]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

In [5]:
# Path to the UrbanSound8K metadata CSV, relative to the working directory.
file_path = "datasets/UrbanSound8K/UrbanSound8K.csv"

# Load the metadata table and preview its first rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [6]:
# Sound classes treated as critical; only these are kept for training.
critical_sounds = ['car_horn', 'siren', 'engine_idling']

# Keep only the metadata rows whose label is one of the critical classes.
is_critical = df['class'].isin(critical_sounds)
df_filtered = df[is_critical]

# Show how many slices remain per class.
df_filtered['class'].value_counts()

class
engine_idling    1000
siren             929
car_horn          429
Name: count, dtype: int64

In [7]:
# Preview the first rows of the filtered metadata.
df_filtered.head(n=5)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
9,100648-1-0-0.wav,100648,4.823402,5.471927,2,10,1,car_horn
10,100648-1-1-0.wav,100648,8.998279,10.052132,2,10,1,car_horn
11,100648-1-2-0.wav,100648,16.699509,17.104837,2,10,1,car_horn
12,100648-1-3-0.wav,100648,17.631764,19.253075,2,10,1,car_horn
13,100648-1-4-0.wav,100648,25.332994,27.197502,2,10,1,car_horn


# Extraer Características de Audio (MFCCs)
Cada archivo de audio se resume en un vector de 40 coeficientes MFCC (el promedio de cada coeficiente a lo largo del tiempo), que sirve como entrada de características para el reconocimiento de sonidos.

In [8]:
def extract_features(file_name, sr=44100, n_mfcc=40):
    """Summarise an audio file as a fixed-length vector of mean MFCCs.

    The file is loaded with ``librosa.load`` at sample rate ``sr``, its MFCC
    matrix is computed with ``librosa.feature.mfcc``, and every coefficient
    is averaged over all frames.

    Args:
        file_name: Path of the audio file to load.
        sr: Sample rate passed to ``librosa.load`` (and forwarded to the MFCC
            computation). Defaults to 44100, the value previously hard-coded.
        n_mfcc: Number of MFCCs to compute, which is also the length of the
            returned vector. Defaults to 40, the value previously hard-coded.
            It must match the feature count of any model that consumes the
            result.

    Returns:
        A 1-D NumPy array with ``n_mfcc`` values, or ``None`` when loading or
        feature extraction raises (the error is printed, not re-raised, so a
        batch run can skip unreadable files).
    """
    try:
        signal, sample_rate = librosa.load(file_name, sr=sr)
        mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
        # Transpose so frames are on axis 0, then average them away: one mean
        # value per coefficient.
        mfccs = np.mean(mfccs.T, axis=0)
        return mfccs
    except Exception as e:
        # Deliberate best-effort: report the failing file and let the caller
        # decide what to do with the None result.
        print("❌ Error en archivo:", file_name, str(e))
        return None


In [9]:
# Feature vectors and their class names, filled in parallel.
X, y = [], []

# Walk the filtered metadata; each clip lives at
# datasets/UrbanSound8K/fold<fold>/<slice_file_name> (adjust to your layout).
for fold, slice_name, label in zip(
    df_filtered["fold"], df_filtered["slice_file_name"], df_filtered["class"]
):
    file_name = os.path.join("datasets", "UrbanSound8K", f"fold{fold}", slice_name)
    features = extract_features(file_name)

    # extract_features returns None on failure; such clips are skipped.
    if features is None:
        continue

    X.append(features)
    y.append(label)

# Convert the accumulated lists to NumPy arrays.
X = np.array(X)
y = np.array(y)

print("✅ Características extraídas. Tamaño del dataset:", X.shape)

✅ Características extraídas. Tamaño del dataset: (2358, 40)


In [10]:
# Fit the encoder on the fixed list of target classes rather than on `y`.
# Fitting on `y` made this cell non-idempotent: on a second run `y` already
# holds integer codes, so the encoder's classes became integers and
# inverse_transform stopped returning class names.
label_encoder = LabelEncoder()
label_encoder.fit(critical_sounds)

# Encode only while `y` still holds class-name strings; on a re-run it is
# already an integer array and is left untouched.
if y.dtype.kind in "USO":
    y = label_encoder.transform(y)

print("🔠 Clases codificadas:", label_encoder.classes_)


🔠 Clases codificadas: ['car_horn' 'engine_idling' 'siren']


In [11]:
# Hold out 20% of the samples for testing. The classes are imbalanced, so the
# split is stratified on the labels to keep the same class proportions in
# both subsets; random_state keeps the split reproducible.
# NOTE(review): a random split can put slices cut from the same source
# recording (same fsID) into both subsets, which may inflate test accuracy.
# Consider splitting on the dataset's `fold` column instead; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("📊 Datos divididos en entrenamiento y prueba:")
print("   - Entrenamiento:", X_train.shape[0], "muestras")
print("   - Prueba:", X_test.shape[0], "muestras")


📊 Datos divididos en entrenamiento y prueba:
   - Entrenamiento: 1886 muestras
   - Prueba: 472 muestras


In [12]:
# Fully connected classifier over the per-clip feature vectors.
# The input is declared with an explicit Input object instead of passing
# `input_shape=` to the first Dense layer, which current Keras versions warn
# about in Sequential models. Its width is read from the training data so it
# always matches the extracted feature count.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),  # Hidden layer 1
    Dropout(0.3),
    Dense(128, activation='relu'),  # Hidden layer 2
    Dropout(0.3),
    Dense(64, activation='relu'),  # Hidden layer 3
    # Output layer: one softmax unit per encoded class.
    Dense(len(label_encoder.classes_), activation='softmax')
])

print("🔧 Modelo de IA construido.")

🔧 Modelo de IA construido.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# Labels are integer class codes, hence the sparse categorical loss;
# accuracy is tracked during training and evaluation.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
print("⚙️ Modelo compilado.")

⚙️ Modelo compilado.


In [14]:
print("🏋️‍♂️ Iniciando entrenamiento...")
# Train for 30 epochs in mini-batches of 32 and keep the per-epoch metrics.
# NOTE(review): the held-out test arrays are also passed as validation data,
# so the validation curves are not independent of the final test score.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=30,
)
print("✅ Entrenamiento finalizado.")

🏋️‍♂️ Iniciando entrenamiento...
Epoch 1/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.4686 - loss: 8.8861 - val_accuracy: 0.7564 - val_loss: 0.7224
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6194 - loss: 1.9654 - val_accuracy: 0.8284 - val_loss: 0.5067
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6446 - loss: 1.1968 - val_accuracy: 0.8114 - val_loss: 0.5136
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6913 - loss: 0.9525 - val_accuracy: 0.7585 - val_loss: 0.5719
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7208 - loss: 0.7340 - val_accuracy: 0.7691 - val_loss: 0.5007
Epoch 6/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7483 - loss: 0.6057 - val_accuracy: 0.8136 - val_loss: 0.4527
Epoch 

In [15]:
# Persist the trained model to a fixed location (.h5 file).
model_path = "datasets/UrbanSound8K/data/urban_sound_model.h5"
model.save(model_path)

print("💾 Modelo guardado como 'urban_sound_model.h5'.")



💾 Modelo guardado como 'urban_sound_model.h5'.


In [16]:
# Score the trained model on the held-out split; evaluate() yields the loss
# followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"📊 Precisión del modelo en datos de prueba: {accuracy:.2f}")

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9506 - loss: 0.1278 
📊 Precisión del modelo en datos de prueba: 0.95


In [17]:
def predict_sound(file_name):
    """Classify one audio file with the trained model and print the result.

    Features are obtained from ``extract_features``. The index of the largest
    value returned by ``model.predict`` is mapped back to a class name through
    ``label_encoder``, and that name is printed. If feature extraction
    returned ``None``, an error message is printed instead.

    Args:
        file_name: Path of the audio file to classify.

    Returns:
        None; the outcome is only printed.
    """
    features = extract_features(file_name)
    if features is None:
        print("❌ Error al procesar el archivo.")
        return

    # The model expects a batch, so wrap the single vector as one row.
    batch = np.array(features).reshape(1, -1)
    scores = model.predict(batch)
    best_index = np.argmax(scores)
    class_pred = label_encoder.inverse_transform([best_index])
    print(f"🔊 Predicción: {class_pred[0]}")

In [18]:
# Critical classes for reference: car_horn, siren, engine_idling.
# List the car_horn rows of the filtered metadata.
is_car_horn = df_filtered["class"].eq("car_horn")
df_filtered[is_car_horn]

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
9,100648-1-0-0.wav,100648,4.823402,5.471927,2,10,1,car_horn
10,100648-1-1-0.wav,100648,8.998279,10.052132,2,10,1,car_horn
11,100648-1-2-0.wav,100648,16.699509,17.104837,2,10,1,car_horn
12,100648-1-3-0.wav,100648,17.631764,19.253075,2,10,1,car_horn
13,100648-1-4-0.wav,100648,25.332994,27.197502,2,10,1,car_horn
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [19]:
# Try the classifier on one car_horn clip from fold 7.
sample_path = "datasets/UrbanSound8K/fold7/99812-1-4-0.wav"
predict_sound(sample_path)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
🔊 Predicción: car_horn
