In [13]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import models, layers
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import pyaudio
import queue
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
import time
import wave

In [14]:
# Chord labels the classifier can emit. LabelEncoder stores its classes in
# sorted order, so fitting on this list fixes the integer <-> chord-name
# mapping that label_encoder.inverse_transform() relies on later.
chords_list = ['a', 'am', 'bm', 'c', 'd', 'dm', 'e', 'em', 'f', 'g']

# Convert the list to a numpy array
chords_array = np.array(chords_list)

y = label_encoder.fit_transform(chords_array)

print(label_encoder.classes_)

# Specify the index of the microphone you want to use.
# NOTE(review): this value is not passed to any PyAudio call in this cell;
# it is kept only as a placeholder for manual device selection.
mic_index = 0  # Change this to the index of your microphone

# Use a single PyAudio instance for every query in this cell and release it
# when done. The original created two instances and terminated neither,
# leaking the underlying audio-backend handle each time the cell was run.
p = pyaudio.PyAudio()
try:
    # Get the number of available audio devices
    device_count = p.get_device_count()

    # Print information about each audio device
    for i in range(device_count):
        device_info = p.get_device_info_by_index(i)
        print(f"Device {i}: {device_info['name']}")

    # Get the index of the default input device
    default_input_device_index = p.get_default_input_device_info()['index']

    # Get information about the default input device
    default_input_device_info = p.get_device_info_by_index(default_input_device_index)

    # Print information about the default input device
    print("Default Input Device:")
    print(f"  Name: {default_input_device_info['name']}")
    print(f"  Index: {default_input_device_info['index']}")
    print(f"  Channels: {default_input_device_info['maxInputChannels']}")
    print(f"  Default Sample Rate: {default_input_device_info['defaultSampleRate']} Hz")
finally:
    # Always release the handle, even if a device query above raises.
    p.terminate()

['a' 'am' 'bm' 'c' 'd' 'dm' 'e' 'em' 'f' 'g']
Device 0: Microsoft Sound Mapper - Input
Device 1: Microphone (Yeti Stereo Microph
Device 2: Microsoft Sound Mapper - Output
Device 3: Speakers (Conexant ISST Audio)
Device 4: Speakers (Yeti Stereo Microphon
Device 5: Primary Sound Capture Driver
Device 6: Microphone (Yeti Stereo Microphone)
Device 7: Primary Sound Driver
Device 8: Speakers (Conexant ISST Audio)
Device 9: Speakers (Yeti Stereo Microphone)
Device 10: Speakers (Yeti Stereo Microphone)
Device 11: Speakers (Conexant ISST Audio)
Device 12: Microphone (Yeti Stereo Microphone)
Device 13: Output 1 (Conexant ISST Audio output)
Device 14: Output 2 (Conexant ISST Audio output)
Device 15: Input (Conexant ISST Audio output)
Device 16: Stereo Mix (Conexant ISST Stereo Mix)
Device 17: Microphone Array (Conexant ISST Audio capture)
Device 18: Speakers (Yeti Stereo Microphone)
Device 19: Microphone (Yeti Stereo Microphone)
Default Input Device:
  Name: Microphone (Yeti Stereo Microph
  Inde

In [17]:
# Restore the trained classifier from its saved Keras model file.
MODEL_PATH = 'cnn_attempt_98_acc.h5'
model = load_model(MODEL_PATH)

# Turn one captured audio clip into a model-ready batch
def preprocess_audio(audio_sample):
    """
    Compute the feature image for ``audio_sample`` and add a leading batch axis.

    :param audio_sample: audio samples forwarded unchanged to ``generate_features``
    :return: the ``generate_features`` output with a new axis of length 1 at position 0
    """
    return np.expand_dims(generate_features(audio_sample), axis=0)

def padding(array, xx, yy):
    """
    Zero-pad a 2-D array so it sits (roughly) centred in an ``xx`` x ``yy`` canvas.

    When the missing amount along an axis is odd, the extra row/column goes
    after the data. An axis that is already at least as large as requested is
    left untouched (the array is never cropped).

    :param array: 2-D numpy array
    :param xx: desired height (number of rows)
    :param yy: desired width (number of columns)
    :return: padded array
    """
    def _margins(target, current):
        # Split the shortfall into (before, after); never go negative.
        before = max((target - current) // 2, 0)
        after = max(target - before - current, 0)
        return before, after

    height, width = array.shape[0], array.shape[1]
    return np.pad(
        array,
        pad_width=(_margins(xx, height), _margins(yy, width)),
        mode='constant',
    )

# Function to generate features (replace with your own implementation)
def generate_features(audio_sample, sr=44100):
    """
    Build the three-channel feature image for one audio clip.

    The channels, stacked along the last axis, are:
      0. repeated rows of normalized spectral bandwidth, spectral centroid
         and chroma (2 leading rows, then 9 groups of bandwidth + centroid +
         chroma),
      1. the STFT magnitude,
      2. the MFCCs.
    Every channel is truncated to ``max_size`` columns and zero-padded with
    ``padding``; the STFT and MFCC channels are padded to 128 rows.

    :param audio_sample: 1-D array of audio samples (presumably float in
        [-1, 1], as produced by the stream callback - confirm for other callers)
    :param sr: sample rate in Hz passed to the librosa feature extractors
    :return: array produced by depth-stacking the three channels; expected to
        be (128, max_size, 3) given librosa's default chroma size - TODO confirm
    """
    max_size = 1000  # Define your max audio feature width
    n_mfcc = 13  # Number of MFCC coefficients

    # Extract features
    stft = librosa.stft(y=audio_sample, n_fft=255, hop_length=512)
    stft = stft[:, :max_size]  # Truncate stft to max_size
    stft = padding(np.abs(stft), 128, max_size)

    mfccs = librosa.feature.mfcc(y=audio_sample, sr=sr, n_mfcc=n_mfcc)
    mfccs = mfccs[:, :max_size]  # Truncate mfccs to max_size
    mfccs = padding(mfccs, 128, max_size)

    spec_centroid = librosa.feature.spectral_centroid(y=audio_sample, sr=sr)
    chroma_stft = librosa.feature.chroma_stft(y=audio_sample, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=audio_sample, sr=sr)

    # Normalize and pad each descriptor exactly once. These values do not
    # change between repetitions, so recomputing them per loop iteration (as
    # the original did) was pure overhead.
    bw_row = padding(normalize(spec_bw[:, :max_size]), 1, max_size).reshape(1, max_size)
    centroid_row = padding(normalize(spec_centroid[:, :max_size]), 1, max_size)
    chroma_rows = padding(normalize(chroma_stft[:, :max_size]), 12, max_size)

    # Create the image stack: one bandwidth/centroid pair, followed by nine
    # bandwidth/centroid/chroma groups. Collect the pieces in a list and stack
    # once instead of growing the array with repeated np.append copies.
    pieces = [bw_row, centroid_row]
    for _ in range(9):
        pieces.extend((bw_row, centroid_row, chroma_rows))
    image = np.vstack(pieces)

    # Stack STFT and MFCCs as additional channels along the depth axis
    image = np.dstack((image, stft, mfccs))

    return image

# Map a feature batch to a chord name
def predict_chord(features):
    """
    Run the loaded model on ``features`` and return the winning chord label.

    The index of the largest value in the model output (taken over the whole
    output array) is translated back to its string label via ``label_encoder``.

    :param features: input batch accepted by ``model.predict``
    :return: the decoded label for the highest-scoring class
    """
    class_scores = model.predict(features)
    best_index = np.argmax(class_scores)
    return label_encoder.inverse_transform([best_index])[0]

# --- Capture-window state shared with the PyAudio callback ---
# Length of one listening window, in seconds
LISTEN_DURATION = 5
# Index used to number the saved capture files
file_index = 0
# Audio chunks collected during the current window
buffer = list()
# Timestamp marking the start of the current window
start_time = time.time()
# Callback function for PyAudio
def callback(in_data, frame_count, time_info, status):
    """
    PyAudio stream callback: gate incoming chunks by loudness and classify each window.

    Every chunk whose RMS amplitude exceeds the threshold is appended to the
    module-level ``buffer``. Once ``LISTEN_DURATION`` seconds have passed since
    ``start_time``, any buffered audio is written to ``captured_audio_<n>.wav``,
    passed through ``preprocess_audio`` and ``predict_chord``, and the predicted
    chord is printed. The buffer and the timer are then reset, whether or not
    anything was captured.

    :param in_data: raw bytes of the recorded chunk, interpreted as int16 samples
    :param frame_count: unused
    :param time_info: unused
    :param status: unused
    :return: ``(in_data, pyaudio.paContinue)`` so the stream keeps running
    """
    global buffer, start_time, file_index

    # Convert byte data to numpy array and normalize to roughly [-1, 1]
    audio_data = np.frombuffer(in_data, dtype=np.int16) / 32767.0

    # Calculate the root mean square (RMS) amplitude of the audio signal
    rms_amplitude = np.sqrt(np.mean(np.square(audio_data)))

    # Define a threshold value for sound detection
    threshold = 0.1  # Adjust this threshold as needed

    # If RMS amplitude exceeds the threshold, capture audio
    if rms_amplitude > threshold:
        buffer.append(audio_data)

    # Check if the listening duration has elapsed
    if time.time() - start_time >= LISTEN_DURATION:
        if buffer:
            # Concatenate audio data from buffer
            audio_data = np.concatenate(buffer)

            # The buffered samples are floats scaled by 1/32767, but the WAV
            # header below declares the stream's 16-bit sample width. Writing
            # the float bytes directly (as the original did) produced a
            # corrupt file, so convert back to little-endian int16 PCM first.
            pcm16 = np.clip(np.rint(audio_data * 32767.0), -32768, 32767).astype('<i2')

            # Save the captured audio data to a WAV file
            filename = f"captured_audio_{file_index}.wav"
            with wave.open(filename, 'wb') as wf:
                wf.setnchannels(channels)
                wf.setsampwidth(p.get_sample_size(audio_format))
                wf.setframerate(sample_rate)
                wf.writeframes(pcm16.tobytes())
            print(f"Captured audio saved as: {filename}")
            file_index += 1

            # NOTE(review): file I/O and model inference run inside the audio
            # callback; if they take longer than one chunk, incoming audio may
            # be delayed or dropped. Consider handing the window to a worker.

            # Preprocess audio (the model still receives the float samples)
            features = preprocess_audio(audio_data)

            # Make prediction
            predicted_chord = predict_chord(features)
            print(f"Predicted chord: {predicted_chord}")

        # Reset buffer and start time
        buffer = []
        start_time = time.time()

    return (in_data, pyaudio.paContinue)

# Set up PyAudio
audio_format = pyaudio.paInt16  # Set to 16-bit integer format
channels = 1  # Mono
sample_rate = 44100  # 44100 Hz sample rate
chunk_size = 1024

p = pyaudio.PyAudio()
stream = None

try:
    # No input_device_index is given, so PyAudio picks the default input device.
    stream = p.open(format=audio_format,
                    channels=channels,
                    rate=sample_rate,
                    input=True,
                    frames_per_buffer=chunk_size,
                    stream_callback=callback)

    print("Listening...")

    # Start the stream
    stream.start_stream()

    # The callback does the work; this loop only keeps the cell alive.
    # Sleep between checks instead of spinning, which pinned a CPU core.
    while stream.is_active():
        time.sleep(0.1)
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop listening.
    pass
finally:
    # Release the audio resources on every exit path, not only on interrupt,
    # so a failed open or a stream that ends on its own does not leak them.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()
    print("Stream stopped.")



Listening...
Captured audio saved as: captured_audio_0.wav




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step
Predicted chord: bm
Stream stopped.
