<a href="https://colab.research.google.com/github/harsha-9977/AIML/blob/main/happy_voice_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install gdown
!gdown https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
!unzip -q Audio_Speech_Actors_01-24.zip -d ravdess_speech


Downloading...
From: https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
To: /content/Audio_Speech_Actors_01-24.zip
100% 208M/208M [03:02<00:00, 1.14MB/s]


In [3]:
import librosa
import numpy as np

def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=22050, mono=True, duration=4)
    if len(y) < sr * 4:
        y = np.pad(y, (0, sr * 4 - len(y)))

    melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    melspec_db = librosa.power_to_db(melspec, ref=np.max)

    if melspec_db.shape[1] < 128:
        melspec_db = np.pad(melspec_db, ((0, 0), (0, 128 - melspec_db.shape[1])), mode='constant')
    else:
        melspec_db = melspec_db[:, :128]

    melspec_db = (melspec_db - melspec_db.min()) / (melspec_db.max() - melspec_db.min() + 1e-6)
    return melspec_db[..., np.newaxis]  # Shape: (128, 128, 1)


In [4]:
import os
from tqdm import tqdm

emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",      # our target
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

X, y = [], []

for actor in sorted(os.listdir("ravdess_speech")):
    actor_path = os.path.join("ravdess_speech", actor)
    for file in tqdm(os.listdir(actor_path), desc=f"Processing {actor}"):
        if file.endswith(".wav"):
            emotion_id = file.split("-")[2]
            label = 1 if emotion_map[emotion_id] == "happy" else 0
            path = os.path.join(actor_path, file)
            features = extract_features(path)
            X.append(features)
            y.append(label)

X = np.array(X)
y = np.array(y)
print("✅ Dataset prepared:", X.shape, y.shape)


Processing Actor_01: 100%|██████████| 60/60 [00:23<00:00,  2.56it/s]
Processing Actor_02: 100%|██████████| 60/60 [00:01<00:00, 59.55it/s]
Processing Actor_03: 100%|██████████| 60/60 [00:00<00:00, 60.58it/s]
Processing Actor_04: 100%|██████████| 60/60 [00:00<00:00, 62.11it/s]
Processing Actor_05: 100%|██████████| 60/60 [00:01<00:00, 49.06it/s]
Processing Actor_06: 100%|██████████| 60/60 [00:03<00:00, 19.91it/s]
Processing Actor_07: 100%|██████████| 60/60 [00:01<00:00, 34.17it/s]
Processing Actor_08: 100%|██████████| 60/60 [00:01<00:00, 58.44it/s]
Processing Actor_09: 100%|██████████| 60/60 [00:01<00:00, 54.39it/s]
Processing Actor_10: 100%|██████████| 60/60 [00:00<00:00, 61.38it/s]
Processing Actor_11: 100%|██████████| 60/60 [00:00<00:00, 62.25it/s]
Processing Actor_12: 100%|██████████| 60/60 [00:00<00:00, 61.26it/s]
Processing Actor_13: 100%|██████████| 60/60 [00:01<00:00, 46.78it/s]
Processing Actor_14: 100%|██████████| 60/60 [00:01<00:00, 38.45it/s]
Processing Actor_15: 100%|████████

✅ Dataset prepared: (1440, 128, 128, 1) (1440,)


In [5]:
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CNN model
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 1)),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32,
                    validation_data=(X_test, y_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 1s/step - accuracy: 0.8115 - loss: 0.4655 - val_accuracy: 0.8785 - val_loss: 0.3761
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 954ms/step - accuracy: 0.8728 - loss: 0.3929 - val_accuracy: 0.8785 - val_loss: 0.3662
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 994ms/step - accuracy: 0.8513 - loss: 0.4311 - val_accuracy: 0.8785 - val_loss: 0.3635
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1s/step - accuracy: 0.8681 - loss: 0.3940 - val_accuracy: 0.8785 - val_loss: 0.3556
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 947ms/step - accuracy: 0.8678 - loss: 0.3781 - val_accuracy: 0.8785 - val_loss: 0.3390
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 960ms/step - accuracy: 0.8479 - loss: 0.4144 - val_accuracy: 0.8785 - val_loss: 0.3483
Epoch 7/10
[1m36/36[0m [3

In [6]:
# Evaluate accuracy on test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"✅ Test Accuracy: {test_acc:.2f}")


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 217ms/step - accuracy: 0.8836 - loss: 0.3222
✅ Test Accuracy: 0.89


In [7]:
# Save in Colab
model.save("happy_speech_classifier.h5")

# Optional: Save to Google Drive
model.save("/content/drive/MyDrive/happy_speech_classifier.h5")




In [27]:
import soundfile as sf
import numpy as np

# Create a happy sample (higher frequency)
happy_audio = np.sin(2 * np.pi * 880 * np.linspace(0, 1, 44100))
sf.write('happy_sample.wav', happy_audio, 44100)

# Create a sad sample (lower frequency)
sad_audio = np.sin(2 * np.pi * 220 * np.linspace(0, 1, 44100))
sf.write('not_happy_sample.wav', sad_audio, 44100)

In [28]:
def predict_audio(filename):
    try:
        y, sr = librosa.load(filename, sr=22050, mono=True, duration=4.0)
        if len(y) < sr * 4:
            y = np.pad(y, (0, sr * 4 - len(y)))

        melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
        melspec_db = librosa.power_to_db(melspec, ref=np.max)

        if melspec_db.shape[1] < 128:
            melspec_db = np.pad(melspec_db, ((0, 0), (0, 128 - melspec_db.shape[1])), mode='constant')
        else:
            melspec_db = melspec_db[:, :128]

        melspec_db = (melspec_db - np.min(melspec_db)) / (np.max(melspec_db) - np.min(melspec_db) + 1e-6)
        input_data = melspec_db[np.newaxis, ..., np.newaxis]

        prob = model.predict(input_data, verbose=0)[0][0]
        label = "😊 Happy Voice Detected!" if prob > 0.5 else "😐 Not a Happy Voice"
        confidence = f"Confidence: {prob:.2f}"
        return label, confidence

    except Exception as e:
        import traceback
        print("🔍 Full traceback:")
        traceback.print_exc()
        return f"⚠️ Error: {str(e)}", None


In [29]:
predict_audio("happy_sample.wav")


('😐 Not a Happy Voice', 'Confidence: 0.18')

In [30]:
print("Not Happy Sample:", predict_audio("not_happy_sample.wav"))

Not Happy Sample: ('😐 Not a Happy Voice', 'Confidence: 0.37')


In [31]:
import librosa
import soundfile as sf

# Check happy sample
happy_audio, sr = librosa.load("happy_sample.wav", sr=None)
print(f"Happy sample: Duration = {len(happy_audio)/sr:.2f}s, Sample Rate = {sr} Hz")

# Check sad sample
sad_audio, sr = librosa.load("not_happy_sample.wav", sr=None)
print(f"Sad sample: Duration = {len(sad_audio)/sr:.2f}s, Sample Rate = {sr} Hz")

Happy sample: Duration = 1.00s, Sample Rate = 44100 Hz
Sad sample: Duration = 1.00s, Sample Rate = 44100 Hz


In [43]:
predict_audio("/content/ravdess_speech/Actor_18/03-01-03-01-02-02-18.wav")

('😊 Happy Voice Detected!', 'Confidence: 0.51')

In [40]:
predict_audio("/content/ravdess_speech/Actor_03/03-01-04-01-02-02-03.wav")


('😐 Not a Happy Voice', 'Confidence: 0.01')

In [46]:
predict_audio("/content/ravdess_speech/Actor_18/03-01-08-02-02-02-18.wav")


('😐 Not a Happy Voice', 'Confidence: 0.09')

In [52]:
predict_audio("/content/ravdess_speech/Actor_24/03-01-08-02-02-02-24.wav")


('😐 Not a Happy Voice', 'Confidence: 0.01')