In [2]:
import sounddevice as sd
import numpy as np
import librosa
import tensorflow as tf
import time

In [3]:
# Load your trained model
model = tf.keras.models.load_model("audio_emotion_CNN_model.h5")



In [4]:
# Class labels (adjust based on your training labels order)
class_labels = ["neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

# Audio parameters
duration = 3  # seconds to record
fs = 22050

In [5]:
def extract_features(audio, sr=22050, n_mfcc=120, max_pad_len=174):
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    # Pad or truncate
    if mfccs.shape[1] < max_pad_len:
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode="constant")
    else:
        mfccs = mfccs[:, :max_pad_len]
    return mfccs

In [6]:
def record_and_predict():
    print("🎤 Recording...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype="float32")
    sd.wait()  # Wait until recording is finished
    audio = np.squeeze(audio)

    features = extract_features(audio, sr=fs)
    features = np.expand_dims(features, axis=-1)  # (120, 174, 1)
    features = np.expand_dims(features, axis=0)   # (1, 120, 174, 1)


    # Predict
    prediction = model.predict(features)
    predicted_label = class_labels[np.argmax(prediction)]

    print(f"Predicted Emotion: {predicted_label}")

In [7]:
if __name__ == "__main__":
    while True:
        cmd = input("Press Enter to record and predict (or type 'q' to quit): ")
        if cmd.lower() == "q":
            break
        record_and_predict()

🎤 Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step
Predicted Emotion: fearful
🎤 Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Predicted Emotion: surprised
🎤 Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Predicted Emotion: surprised
