In [7]:
import pyaudio
import wave
import sys
import threading
import os
import numpy as np
import time
import librosa
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder

# Trained chord classifier, loaded once when this module/cell runs and used
# as a module-level global by AudioRecorder.predict.
model = load_model('cnn_attempt_98_acc.h5')

# Chord names the classifier's predictions are decoded into.
chords_list = ['a', 'am', 'bm', 'c', 'd', 'dm', 'e', 'em', 'f', 'g']
chords_array = np.array(chords_list)

# Fit the encoder on the chord names so an integer class index can be mapped
# back to its name with inverse_transform.
# NOTE(review): assumes the model's output indices follow this encoding --
# confirm against the training code.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(chords_array)

print(label_encoder.classes_)

class AudioRecorder:
    """Record short microphone clips when a loudness threshold is crossed.

    The recorder opens a 16-bit PyAudio input stream, waits until one chunk
    contains a sample whose magnitude exceeds ``threshold``, records for
    ``duration`` seconds, writes the clip to ``filename`` as a WAV file and
    prints the chord predicted for it by the module-level ``model``.

    :param filename: path of the WAV file that each clip is written to
        (overwritten on every recording)
    :param chunk: number of frames read from the stream per call
    :param channels: number of input channels
    :param rate: sampling rate of the input stream in Hz
    :param threshold: sample magnitude (int16 units) that triggers recording
    :param duration: length of each recording in seconds
    """

    def __init__(self, filename='recording.wav', chunk=1024, channels=1, rate=44100, threshold=2000, duration=2):
        self.filename = filename
        self.chunk = chunk
        self.channels = channels
        self.rate = rate
        self.frames = []
        self.threshold = threshold
        self.duration = duration
        self.pa = pyaudio.PyAudio()
        self.stream = self.pa.open(format=pyaudio.paInt16,
                                   channels=self.channels,
                                   rate=self.rate,
                                   input=True,
                                   frames_per_buffer=self.chunk)
        # Guards self.frames. Nothing in this class touches the list from a
        # second thread, but the lock is kept so external users stay safe.
        self.lock = threading.Lock()

    def _read_chunk(self):
        """Read one chunk of raw bytes from the input stream.

        Overflow errors are suppressed: while a clip is being saved and
        classified nobody drains the stream, so the input buffer can overflow
        and the next read would otherwise raise and end the listening loop.
        """
        return self.stream.read(self.chunk, exception_on_overflow=False)

    def _is_loud(self, data):
        """Return True when any int16 sample in ``data`` exceeds the threshold.

        The magnitude is used so that negative-going peaks trigger as well.
        Samples are widened to int32 first because ``abs(-32768)`` does not
        fit in int16.
        """
        samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
        if samples.size == 0:
            return False
        return bool(np.max(np.abs(samples)) > self.threshold)

    def start_recording(self):
        """Listen and record in an endless loop.

        This method only returns by raising (for example
        ``KeyboardInterrupt``). The audio stream is closed on the way out, so
        the recorder cannot be reused afterwards.
        """
        try:
            while True:
                print("Listening for threshold level...")
                while not self._is_loud(self._read_chunk()):
                    pass
                print("Threshold level reached. Recording started...")
                self.record()
        finally:
            self.close()

    def record(self):
        """Capture ``duration`` seconds of audio, then save and classify it."""
        # Monotonic clock: unaffected by system clock adjustments.
        deadline = time.monotonic() + self.duration
        while time.monotonic() < deadline:
            data = self._read_chunk()
            with self.lock:
                self.frames.append(data)
        try:
            self.stop_recording()
        finally:
            # Always drop the clip so a failure cannot leak into the next one.
            with self.lock:
                self.frames.clear()
        time.sleep(0.5)  # Wait for a moment before restarting recording

    def stop_recording(self):
        """Write the buffered frames to ``filename`` and run a prediction."""
        print("Recording stopped...")
        with self.lock:
            audio = b''.join(self.frames)
        with wave.open(self.filename, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(self.pa.get_sample_size(pyaudio.paInt16))
            wf.setframerate(self.rate)
            wf.writeframes(audio)
        print(f"Recording saved as {self.filename}")
        self.predict(self.filename)

    def predict(self, filename):
        """Classify the audio file at ``filename`` and print the chord name."""
        # 22050 Hz is librosa's default and the rate generate_features
        # hard-codes; it is spelled out here so the two stay in step.
        audio_sample, _ = librosa.load(filename, sr=22050)

        # Preprocess the audio to generate features
        features = generate_features(audio_sample)

        # Add a leading batch axis to match the input shape of the model
        features = np.expand_dims(features, axis=0)

        # Make predictions and pick the highest-scoring class index
        predictions = model.predict(features)
        predicted_class = np.argmax(predictions)

        predicted_categorical_label = label_encoder.inverse_transform([predicted_class])[0]

        print(f"For audio file {filename}, predicted class: {predicted_categorical_label}")

    def close(self):
        """Stop the input stream and release PyAudio. Safe to call twice."""
        try:
            if self.stream is not None:
                self.stream.stop_stream()
                self.stream.close()
        finally:
            self.stream = None
            if self.pa is not None:
                self.pa.terminate()
                self.pa = None

def padding(array, xx, yy):
    """
    Zero-pad a 2-D array on all sides until it is xx rows by yy columns.

    The existing content is centred; when the padding needed along an axis is
    odd, the extra row/column goes at the end. An axis that is already at
    least as long as its target is left untouched (nothing is cropped).

    :param array: 2-D numpy array
    :param xx: desired height (number of rows)
    :param yy: desired width (number of columns)
    :return: padded array
    """
    def _margins(target, size):
        # Split the missing length into a leading and a trailing margin.
        before = max((target - size) // 2, 0)
        after = max(target - before - size, 0)
        return before, after

    row_margins = _margins(xx, array.shape[0])
    col_margins = _margins(yy, array.shape[1])
    return np.pad(array, pad_width=(row_margins, col_margins), mode='constant')

def normalize(feature):
    """
    Min-max scale a feature array into the range [0, 1].

    A constant input (max equals min) is returned as all zeros rather than
    dividing by zero, which would fill the result with NaNs.

    :param feature: non-empty numpy array
    :return: array of the same shape, scaled to [0, 1]
    """
    low = np.min(feature)
    span = np.max(feature) - low
    if span == 0:
        # Every shifted value is 0 here, so dividing by 1 yields zeros while
        # keeping the same result dtype as the normal path.
        span = 1
    return (feature - low) / span

def generate_features(audio_sample):
    """
    Build the three-plane feature image fed to the chord classifier.

    Plane 0 stacks normalised spectral bandwidth, spectral centroid and
    chroma rows; plane 1 is the STFT magnitude; plane 2 holds the MFCCs.
    Every plane is truncated/zero-padded to 128 rows by ``max_size`` columns.

    :param audio_sample: 1-D audio signal; features are computed assuming a
        22050 Hz sampling rate
    :return: numpy array with the three planes stacked along the last axis
    """
    max_size = 1000  # Maximum feature width (number of frames kept)
    n_mfcc = 13  # Number of MFCC coefficients
    sr = 22050

    # STFT magnitude: n_fft=255 gives 1 + 255 // 2 = 128 frequency bins.
    stft = np.abs(librosa.stft(y=audio_sample, n_fft=255, hop_length=512))
    stft = padding(stft[:, :max_size], 128, max_size)

    mfccs = librosa.feature.mfcc(y=audio_sample, sr=sr, n_mfcc=n_mfcc)
    mfccs = padding(mfccs[:, :max_size], 128, max_size)

    spec_centroid = librosa.feature.spectral_centroid(y=audio_sample, sr=sr)
    chroma_stft = librosa.feature.chroma_stft(y=audio_sample, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=audio_sample, sr=sr)

    # Normalise and pad each descriptor once; the rows are reused below.
    bw_row = padding(normalize(spec_bw[:, :max_size]), 1, max_size)
    centroid_row = padding(normalize(spec_centroid[:, :max_size]), 1, max_size)
    chroma_rows = padding(normalize(chroma_stft[:, :max_size]), 12, max_size)

    # Layout of plane 0: one bandwidth row, one centroid row, then nine
    # repetitions of (bandwidth, centroid, 12 chroma rows) -> 2 + 9 * 14 = 128.
    group = np.concatenate((bw_row, centroid_row, chroma_rows), axis=0)
    image = np.concatenate((bw_row, centroid_row, np.tile(group, (9, 1))), axis=0)

    # Stack the STFT and MFCC planes behind the descriptor plane.
    return np.dstack((image, stft, mfccs))

def main():
    """Create a recorder that writes to 'recording.wav' and listen forever."""
    recorder = AudioRecorder('recording.wav')

    # Blocks indefinitely: each triggered clip is saved, classified and the
    # loop goes back to listening.
    recorder.start_recording()


if __name__ == "__main__":
    main()




['a' 'am' 'bm' 'c' 'd' 'dm' 'e' 'em' 'f' 'g']
Listening for threshold level...
Threshold level reached. Recording started...
Recording stopped...
Recording saved as recording.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step
For audio file recording.wav, predicted class: c
Listening for threshold level...
Threshold level reached. Recording started...
Recording stopped...
Recording saved as recording.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
For audio file recording.wav, predicted class: d
Listening for threshold level...
Threshold level reached. Recording started...
Recording stopped...
Recording saved as recording.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
For audio file recording.wav, predicted class: am
Listening for threshold level...
Threshold level reached. Recording started...
Recording stopped...
Recording saved as recording.wav
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

KeyboardInterrupt: 