In [1]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

In [2]:
file_path = "datasets/UrbanSound8K/UrbanSound8K.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [3]:
# Definir los sonidos críticos que queremos detectar
critical_sounds = ['car_horn', 'siren', 'engine_idling']

# Filtrar el dataset
df_filtered = df[df['class'].isin(critical_sounds)]

# Mostrar la cantidad de sonidos seleccionados
df_filtered['class'].value_counts()

class
engine_idling    1000
siren             929
car_horn          429
Name: count, dtype: int64

In [4]:
df_filtered.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
9,100648-1-0-0.wav,100648,4.823402,5.471927,2,10,1,car_horn
10,100648-1-1-0.wav,100648,8.998279,10.052132,2,10,1,car_horn
11,100648-1-2-0.wav,100648,16.699509,17.104837,2,10,1,car_horn
12,100648-1-3-0.wav,100648,17.631764,19.253075,2,10,1,car_horn
13,100648-1-4-0.wav,100648,25.332994,27.197502,2,10,1,car_horn


# Extraer Características de Audio (MFCCs)
Cada archivo de audio se convierte en un conjunto de coeficientes MFCC, que son características esenciales para el reconocimiento de sonidos.

In [5]:
def extract_features(file_name):
    try:
        y, sr = librosa.load(file_name, sr=44100)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        mfccs = np.mean(mfccs.T, axis=0)
        return mfccs
    except Exception as e:
        print("❌ Error en archivo:", file_name, str(e))
        return None


In [6]:
X, y = [], []

for index, row in df_filtered.iterrows():
    file_name = os.path.join(
        "datasets", "UrbanSound8K", f"fold{row['fold']}", row["slice_file_name"]
    ) 

    features = extract_features(file_name)

    if features is not None:
        X.append(features)
        y.append(row["class"])

# Convertir a arrays de NumPy
X = np.array(X)
y = np.array(y)

print("Características extraídas. Tamaño del dataset:", X.shape)

Características extraídas. Tamaño del dataset: (2358, 40)


In [7]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print("Clases codificadas:", label_encoder.classes_)


Clases codificadas: ['car_horn' 'engine_idling' 'siren']


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Datos divididos en entrenamiento y prueba:")
print("   - Entrenamiento:", X_train.shape[0], "muestras")
print("   - Prueba:", X_test.shape[0], "muestras")


Datos divididos en entrenamiento y prueba:
   - Entrenamiento: 1886 muestras
   - Prueba: 472 muestras


In [9]:
model = Sequential([
    Dense(256, activation='relu', input_shape=(40,)),  # Capa oculta 1
    Dropout(0.3),
    Dense(128, activation='relu'),  # Capa oculta 2
    Dropout(0.3),
    Dense(64, activation='relu'),  # Capa oculta 3
    Dense(len(label_encoder.classes_), activation='softmax')  # Capa de salida
])

print("Modelo de IA construido.")

Modelo de IA construido.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
print("Modelo compilado.")

Modelo compilado.


In [11]:
print("Iniciando entrenamiento...")
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))
print("Entrenamiento finalizado.")

Iniciando entrenamiento...
Epoch 1/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4546 - loss: 9.5446 - val_accuracy: 0.7648 - val_loss: 0.6805
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5718 - loss: 2.0033 - val_accuracy: 0.7839 - val_loss: 0.6154
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6490 - loss: 1.1382 - val_accuracy: 0.8178 - val_loss: 0.4974
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6826 - loss: 1.0009 - val_accuracy: 0.8517 - val_loss: 0.4683
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7130 - loss: 0.7592 - val_accuracy: 0.8686 - val_loss: 0.4119
Epoch 6/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7761 - loss: 0.5865 - val_accuracy: 0.8814 - val_loss: 0.3773
Epoch 7/30
[

In [12]:
model.save("datasets/UrbanSound8K/data/urban_sound_model.h5")

print("Modelo guardado como 'urban_sound_model.h5'.")



Modelo guardado como 'urban_sound_model.h5'.


In [1]:
import tensorflow as tf

model = tf.keras.models.load_model('datasets/UrbanSound8K/data/urban_sound_model.h5')
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

with open('datasets/UrbanSound8K/data/urban_sound_model.tflite', 'wb') as f:
    f.write(tflite_model)

print("Modelo convertido y guardado como sound_detector.tflite")



INFO:tensorflow:Assets written to: C:\Users\yordy\AppData\Local\Temp\tmpv_9psspo\assets


INFO:tensorflow:Assets written to: C:\Users\yordy\AppData\Local\Temp\tmpv_9psspo\assets


Saved artifact at 'C:\Users\yordy\AppData\Local\Temp\tmpv_9psspo'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 40), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 3), dtype=tf.float32, name=None)
Captures:
  2536270200272: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2536270199888: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2536272988112: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2536272987920: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2536272991376: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2536272992336: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2536272993296: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2536272993872: TensorSpec(shape=(), dtype=tf.resource, name=None)
Modelo convertido y guardado como sound_detector.tflite


In [13]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Precisión del modelo en datos de prueba: {accuracy:.2f}")

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9585 - loss: 0.1418 
Precisión del modelo en datos de prueba: 0.95


In [14]:
def predict_sound(file_name):
    features = extract_features(file_name)
    if features is not None:
        features = np.array(features).reshape(1, -1)
        prediction = model.predict(features)
        class_pred = label_encoder.inverse_transform([np.argmax(prediction)])
        print(f"🔊 Predicción: {class_pred[0]}")
    else:
        print("❌ Error al procesar el archivo.")

In [15]:
# critical_sounds = ["car_horn", "siren", "engine_idling"]
df_filtered[df_filtered["class"] == "siren"]

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
114,102853-8-0-0.wav,102853,0.000000,4.000000,2,7,8,siren
115,102853-8-0-1.wav,102853,0.500000,4.500000,2,7,8,siren
116,102853-8-0-2.wav,102853,1.000000,5.000000,2,7,8,siren
117,102853-8-0-3.wav,102853,1.500000,5.500000,2,7,8,siren
118,102853-8-0-4.wav,102853,2.000000,6.000000,2,7,8,siren
...,...,...,...,...,...,...,...,...
8574,96657-8-0-1.wav,96657,122.837051,126.837051,2,8,8,siren
8575,96657-8-0-2.wav,96657,123.337051,127.337051,2,8,8,siren
8576,96657-8-0-3.wav,96657,123.837051,127.837051,2,8,8,siren
8642,98525-8-0-0.wav,98525,0.000000,4.000000,1,7,8,siren


In [None]:
predict_sound("datasets/UrbanSound8K/fold7/99812-1-4-0.wav") 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
🔊 Predicción: car_horn
