In [None]:
import librosa
import soundfile as sf
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

# Cargar un sonido crítico (por ejemplo, sirena)
input_path = "datasets/UrbanSound8K/fold7/102853-8-0-0.wav"
samples, sample_rate = librosa.load(input_path, sr=None)

# Crear pipeline de aumentaciones
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_shift=-0.5, max_shift=0.5, p=0.5)

])

# Aplicar aumentaciones
augmented_samples = augment(samples=samples, sample_rate=sample_rate)

# Guardar el nuevo archivo
sf.write("sirena_aumentada.wav", augmented_samples, sample_rate)

In [None]:
import os
import numpy as np
import librosa
import soundfile as sf
import tensorflow_hub as hub
import tensorflow as tf
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# -----------------------------
# 1. Preparar YAMNet y Augmentaciones
# -----------------------------
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.01, p=0.5),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.2, p=0.5)
])

# -----------------------------
# 2. Función para extraer embeddings
# -----------------------------
def extract_embedding(audio, sr=16000):
    waveform = tf.convert_to_tensor(audio, dtype=tf.float32)
    scores, embeddings, spectrogram = yamnet_model(waveform)
    return tf.reduce_mean(embeddings, axis=0).numpy()

# -----------------------------
# 3. Cargar y procesar tus audios
# -----------------------------
def load_dataset(data_dir):
    X, y = [], []
    for label in os.listdir(data_dir):
        class_dir = os.path.join(data_dir, label)
        for file in os.listdir(class_dir):
            file_path = os.path.join(class_dir, file)
            audio, sr = librosa.load(file_path, sr=16000)
            
            # Aumentación
            augmented = augment(samples=audio, sample_rate=sr)
            
            # Extraer embeddings original y aumentado
            emb_original = extract_embedding(audio)
            emb_augmented = extract_embedding(augmented)
            
            X.append(emb_original)
            X.append(emb_augmented)
            y.append(label)
            y.append(label)
    return np.array(X), np.array(y)

# -----------------------------
# 4. Entrenar un clasificador
# -----------------------------
def train_classifier(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    clf = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=1)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    print("\n🔍 Evaluación del modelo:")
    print(classification_report(y_test, y_pred))
    print("\n📊 Matriz de confusión:")
    print(confusion_matrix(y_test, y_pred))
    
    return clf

# -----------------------------
# 5. Ejecutar todo
# -----------------------------
# Estructura esperada:
# dataset/
#   └── sirena/
#   └── bocina/
#   └── alarma/

DATASET_DIR = "dataset"  # Ruta a tus audios organizados por clase

X, y = load_dataset(DATASET_DIR)
model = train_classifier(X, y)


In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv

import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile




In [2]:
# Load the model.
model = hub.load('https://tfhub.dev/google/yamnet/1')













In [3]:
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
  """Returns list of class names corresponding to score vector."""
  class_names = []
  with tf.io.gfile.GFile(class_map_csv_text) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
      class_names.append(row['display_name'])

  return class_names

class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

In [4]:
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
  """Resample waveform if required."""
  if original_sample_rate != desired_sample_rate:
    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    waveform = scipy.signal.resample(waveform, desired_length)
  return desired_sample_rate, waveform

In [6]:
from scipy.io import wavfile
from IPython.display import Audio

# Ruta al archivo de audio
wav_file_name = 'datasets/UrbanSound8K/fold7/99812-1-4-0.wav'

# Leer el archivo WAV
sample_rate, wav_data = wavfile.read(wav_file_name)

# Asegurarse de que esté a 16000 Hz (YAMNet lo necesita así)
def ensure_sample_rate(sr, data, target_sr=16000):
    if sr != target_sr:
        import librosa
        data = librosa.resample(data.astype(float), orig_sr=sr, target_sr=target_sr)
        return target_sr, data
    return sr, data

sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

# Mostrar información del audio
duration = len(wav_data) / sample_rate
print(f'Sample rate: {sample_rate} Hz')
print(f'Total duration: {duration:.2f}s')
print(f'Size of the input: {len(wav_data)}')

# Reproducir el audio en Jupyter
Audio(wav_data, rate=sample_rate)


Sample rate: 16000 Hz
Total duration: 9.66s
Size of the input: 154614


error: ushort format requires 0 <= number <= 65535