<a href="https://colab.research.google.com/github/jomelsotelo/better_sense/blob/main/audio_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

# Cell purpose: load one audio clip, turn it into a log-scaled Mel spectrogram,
# plot it, and shape it as a single-channel image for the CNN built in a later cell.

# Load the audio file at a 16 kHz sample rate. Only `sr` is overridden here, so
# channel handling follows librosa.load's defaults.
# NOTE: the path below is a placeholder and must be replaced with a real file.
audio_file = 'path_to_your_audio_file.wav'
y, sr = librosa.load(audio_file, sr=16000)  # y is the signal, sr is the sample rate

# Convert audio to a Mel spectrogram with 128 Mel bands, capped at 8 kHz
# (half of the 16 kHz sample rate requested above).
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
log_S = librosa.power_to_db(S, ref=np.max)  # Convert to decibel units, referenced to the peak of S

# Display the Mel spectrogram
plt.figure(figsize=(12, 8))
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()
plt.show()

# Reshape spectrogram to feed into the CNN model: a trailing axis of size 1 is
# appended (assuming log_S is 2-D, this gives (mel_bands, frames, 1)).
input_data = np.expand_dims(log_S, axis=-1)  # Add channel dimension (for grayscale image)


KeyboardInterrupt: 

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

# Define the CNN model architecture
def build_cnn_model(input_shape, num_classes):
    """Assemble the (uncompiled) CNN classifier for spectrogram images.

    Layout: three Conv2D(3x3, relu) -> BatchNormalization -> MaxPooling2D(2x2)
    stages with 32, 64 and 128 filters, followed by Flatten, two
    Dense(relu) + Dropout(0.5) stages of 256 and 128 units, and a softmax
    output layer.

    Args:
        input_shape: Shape of one input sample, passed to the first Conv2D layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The Sequential model; compiling it is left to the caller.
    """
    net = Sequential()

    # Convolutional feature extractor. Only the first stage carries the
    # input_shape argument; the later stages infer their input from the
    # previous layer.
    first_stage_kwargs = {'input_shape': input_shape}
    for n_filters in (32, 64, 128):
        net.add(Conv2D(n_filters, (3, 3), activation='relu', **first_stage_kwargs))
        net.add(BatchNormalization())
        net.add(MaxPooling2D(pool_size=(2, 2)))
        first_stage_kwargs = {}

    # Classifier head: flatten, then two fully connected stages, each
    # followed by dropout for regularization.
    net.add(Flatten())
    for n_units in (256, 128):
        net.add(Dense(n_units, activation='relu'))
        net.add(Dropout(0.5))

    # Output layer (with softmax for classification)
    net.add(Dense(num_classes, activation='softmax'))

    return net

# Input shape: (Mel bands, time frames, channels). Both spatial sizes are read
# from the spectrogram itself so they stay in sync with the n_mels setting used
# to compute log_S; the trailing 1 is the single "grayscale" channel.
input_shape = (log_S.shape[0], log_S.shape[1], 1)

# Number of output classes (e.g., directions: front, back, left, right)
num_classes = 4  # Adjust based on the number of sound direction categories

model = build_cnn_model(input_shape, num_classes)

# Compile the model. categorical_crossentropy expects one-hot encoded labels.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()


In [None]:
# Fit the network for 30 epochs in mini-batches of 32, passing the validation
# split so Keras reports validation metrics alongside the training ones.
# NOTE(review): X_train, y_train, X_val and y_val are not created in the
# visible cells -- they must be prepared before this cell is run.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=30,
)


In [None]:
import sounddevice as sd
import queue

# FIFO that hands captured audio chunks from the stream callback to the consumer
q = queue.Queue()


def callback(indata, frames, time, status):
    """Queue a copy of each incoming audio chunk.

    Used as the `callback` of `sd.InputStream`. A truthy `status` is printed
    before the chunk is queued; `frames` and `time` are accepted but unused.
    """
    if status:
        print(status)
    # Store a copy rather than `indata` itself -- presumably because the
    # stream may reuse that buffer once the callback returns.
    chunk = indata.copy()
    q.put(chunk)

# Function to capture audio in real-time and predict the direction
def _fit_time_axis(spec, n_frames):
    """Crop or right-pad `spec` along axis 1 so it has exactly `n_frames` columns."""
    current = spec.shape[1]
    if current >= n_frames:
        return spec[:, :n_frames]
    # Pad with the quietest value already present so the padding reads as silence.
    return np.pad(
        spec,
        ((0, 0), (0, n_frames - current)),
        mode='constant',
        constant_values=spec.min(),
    )


def real_time_audio_predict(model, duration=5, sr=16000):
    """Record from the input device, classify the clip, and report the class.

    Audio chunks are delivered through the module-level queue `q` by
    `callback`. Recording stops as soon as `duration * sr` samples have been
    collected, independent of the block size the audio backend delivers.

    Args:
        model: Trained classifier exposing `predict`. If it also exposes a
            4-D `input_shape` with a fixed time dimension, the spectrogram
            is cropped or padded to that width.
        duration: Recording length in seconds.
        sr: Sample rate in Hz used for both capture and the spectrogram.

    Returns:
        int: Index of the highest-scoring class (also printed).

    Raises:
        ValueError: If `duration * sr` is less than one sample.
    """
    n_samples = int(sr * duration)
    if n_samples <= 0:
        raise ValueError("duration and sr must yield at least one sample")

    # Discard chunks left in the shared queue by an earlier (possibly
    # interrupted) run so they are not mixed into this recording.
    while True:
        try:
            q.get_nowait()
        except queue.Empty:
            break

    chunks = []
    captured = 0
    with sd.InputStream(samplerate=sr, channels=1, callback=callback):
        print("Recording...")
        # Count samples rather than chunks: the stream's block size is not
        # fixed, so a chunk-count loop would record an unpredictable length.
        while captured < n_samples:
            chunk = q.get()
            chunks.append(chunk)
            captured += len(chunk)
    # The stream is closed here, before the slower feature extraction and
    # inference, so the queue does not keep filling in the background.
    print("Recording complete.")

    # Each chunk is (frames, channels) with a single channel; librosa expects
    # a 1-D signal, so join the chunks and select that channel.
    y = np.concatenate(chunks, axis=0)[:n_samples, 0]

    # Convert real-time audio to spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    log_S = librosa.power_to_db(S, ref=np.max)

    # The network was built for a fixed number of time frames; align the live
    # spectrogram with it when the model reports that width.
    expected = getattr(model, 'input_shape', None)
    if isinstance(expected, tuple) and len(expected) == 4 and expected[2] is not None:
        log_S = _fit_time_axis(log_S, expected[2])

    # Reshape for model input: add batch and channel axes
    input_data = log_S[np.newaxis, ..., np.newaxis]

    # Make prediction
    prediction = model.predict(input_data)
    predicted_class = int(np.argmax(prediction))
    print(f"Predicted direction: {predicted_class}")
    return predicted_class

# Use the function to predict direction in real-time: records from the input
# device with the function's defaults (duration=5, sr=16000) and prints the
# predicted class index produced by the `model` built in an earlier cell.
real_time_audio_predict(model)
