# Speech Emotion Recognition Capstone: Step-by-Step Notebook

This notebook demonstrates the end-to-end workflow for the capstone project, including data download, feature extraction, model training, inference, and MVP demo.

## 1. Data Acquisition: Download & Extract Datasets

In [1]:
# !pip install gdown
import gdown
import zipfile

# Download Ravdess_Tess.zip
url1 = 'https://drive.google.com/uc?id=1I5aRKBUb7bGUoB1SMWWQ4U4KGsxecpgZ'
gdown.download(url1, 'RavdessTess.zip', quiet=False)
with zipfile.ZipFile('RavdessTess.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# Download KaggleTestDataSet.zip
url2 = 'https://drive.google.com/uc?id=1UyEHeoXg6kFR47vh6pMCLqI3w043PDtu'
gdown.download(url2, 'KaggleTestDataSet.zip', quiet=False)
with zipfile.ZipFile('KaggleTestDataSet.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

Downloading...
From (original): https://drive.google.com/uc?id=1I5aRKBUb7bGUoB1SMWWQ4U4KGsxecpgZ
From (redirected): https://drive.google.com/uc?id=1I5aRKBUb7bGUoB1SMWWQ4U4KGsxecpgZ&confirm=t&uuid=95171dd2-443c-4788-8c68-23af5538e7cb
To: C:\iimk\capstoneassinment_14_secA\RavdessTess.zip
100%|█████████████████████████████████████████████████████████████████| 279M/279M [00:19<00:00, 14.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UyEHeoXg6kFR47vh6pMCLqI3w043PDtu
To: C:\iimk\capstoneassinment_14_secA\KaggleTestDataSet.zip
100%|███████████████████████████████████████████████████████████████| 14.0M/14.0M [00:02<00:00, 5.11MB/s]


## 2. Batch Feature Extraction (MFCC, Chroma, Spectrograms)

In [12]:
import os
import numpy as np
import librosa
import pandas as pd

DATA_DIR = 'audio_folder/'  # Update as needed
features = []
labels = []

for file in os.listdir(DATA_DIR):
    if file.endswith('.wav') or file.endswith('.mp3'):
        y, sr = librosa.load(os.path.join(DATA_DIR, file), sr=None)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
        contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
        feature_vec = np.hstack([mfccs, chroma, contrast])
        features.append(feature_vec)
        labels.append(file.split('_')[0])

X = np.array(features)
y = np.array(labels)
np.save('features.npy', X)
np.save('labels.npy', y)
pd.DataFrame({'filename': os.listdir(DATA_DIR), 'label': y}).to_csv('labels.csv', index=False)

## 3. Generate Spectrogram Images for CNN/CRNN Training

In [13]:
import librosa.display
import matplotlib.pyplot as plt

SPEC_DIR = 'spectrograms/'
os.makedirs(SPEC_DIR, exist_ok=True)

for file in os.listdir(DATA_DIR):
    if file.endswith('.wav') or file.endswith('.mp3'):
        y, sr = librosa.load(os.path.join(DATA_DIR, file), sr=22050)
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        S_DB = librosa.power_to_db(S, ref=np.max)
        plt.figure(figsize=(2,2))
        librosa.display.specshow(S_DB, sr=sr, cmap='magma')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(os.path.join(SPEC_DIR, file.replace('.wav', '.png').replace('.mp3', '.png')), bbox_inches='tight', pad_inches=0)
        plt.close()

## 4. Model Building: Baseline ML and Deep Learning Architectures 

Check your current working directory in the notebook (run !pwd or import os; print(os.getcwd())) and make sure it matches the directory where your extraction script saves features.npy and labels.npy.
Re-run the feature extraction script in the SAME directory as your notebook:
bash
python lalitnayyar_capstone14seca_steps_fixed.py

In [14]:
import numpy as np
X = np.load('features.npy')
y = np.load('labels.npy')
print("Features shape:", X.shape)
print("Labels shape:", y.shape)
print("First 5 labels:", y[:5])

Features shape: (1168, 59)
Labels shape: (1168,)
First 5 labels: ['03-01-01-01-01-01-01' '03-01-01-01-01-02-01' '03-01-01-01-02-01-01'
 '03-01-01-01-02-02-01' '03-01-03-01-01-01-01']


In [15]:
# Notebook Defensive Programming: Add Data Existence Check
# Update your notebook code so that you never proceed if data is missing:

In [16]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import os

# Check if files exist
if not (os.path.exists('features.npy') and os.path.exists('labels.npy')):
    raise FileNotFoundError("features.npy or labels.npy not found in current directory!")

X = np.load('features.npy')
y = np.load('labels.npy')

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")

if X.shape[0] == 0 or y.shape[0] == 0:
    raise ValueError("Feature or label array is empty! Please check your feature extraction step and ensure the files are not empty.")

# Proceed only if arrays are not empty
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Features shape: (1168, 59)
Labels shape: (1168,)
Accuracy: 0.0
                      precision    recall  f1-score   support

03-01-01-01-01-01-02       0.00      0.00      0.00       1.0
03-01-01-01-01-01-05       0.00      0.00      0.00       1.0
03-01-01-01-01-01-09       0.00      0.00      0.00       1.0
03-01-01-01-01-01-10       0.00      0.00      0.00       0.0
03-01-01-01-01-01-11       0.00      0.00      0.00       1.0
03-01-01-01-01-01-12       0.00      0.00      0.00       0.0
03-01-01-01-01-01-14       0.00      0.00      0.00       1.0
03-01-01-01-01-01-16       0.00      0.00      0.00       1.0
03-01-01-01-01-01-17       0.00      0.00      0.00       1.0
03-01-01-01-01-01-18       0.00      0.00      0.00       1.0
03-01-01-01-01-01-20       0.00      0.00      0.00       1.0
03-01-01-01-01-01-23       0.00      0.00      0.00       0.0
03-01-01-01-01-01-24       0.00      0.00      0.00       0.0
03-01-01-01-01-02-06       0.00      0.00      0.00       0.0
03-01-

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Baseline: RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X = np.load('features.npy')
y = np.load('labels.npy')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.0
                      precision    recall  f1-score   support

03-01-01-01-01-01-02       0.00      0.00      0.00       1.0
03-01-01-01-01-01-05       0.00      0.00      0.00       1.0
03-01-01-01-01-01-09       0.00      0.00      0.00       1.0
03-01-01-01-01-01-11       0.00      0.00      0.00       1.0
03-01-01-01-01-01-14       0.00      0.00      0.00       1.0
03-01-01-01-01-01-15       0.00      0.00      0.00       0.0
03-01-01-01-01-01-16       0.00      0.00      0.00       1.0
03-01-01-01-01-01-17       0.00      0.00      0.00       1.0
03-01-01-01-01-01-18       0.00      0.00      0.00       1.0
03-01-01-01-01-01-20       0.00      0.00      0.00       1.0
03-01-01-01-01-01-23       0.00      0.00      0.00       0.0
03-01-01-01-01-01-24       0.00      0.00      0.00       0.0
03-01-01-01-01-02-02       0.00      0.00      0.00       0.0
03-01-01-01-01-02-04       0.00      0.00      0.00       0.0
03-01-01-01-01-02-06       0.00      0.00      0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Deep Learning: CRNN Example (Keras)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, TimeDistributed, LSTM, Dense, Flatten

model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(128, 128, 1)),
    MaxPooling2D((2,2)),
    Dropout(0.3),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D((2,2)),
    Dropout(0.3),
    TimeDistributed(Flatten()),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NameError: name 'num_classes' is not defined

### Attention/Transformer Block (Keras)

In [None]:
from tensorflow.keras.layers import LayerNormalization, Dense, Dropout, Add
from tensorflow.keras.layers import MultiHeadAttention, Input
from tensorflow.keras.models import Model


#num_classes = 8  # Replace 8 with your actual number of classes
# Set these based on your input data shape:
timesteps = 100   # or X.shape[1] if X is your 3D feature array
features = 59     # or X.shape[2] if X is your 3D feature array
input_layer = Input(shape=(timesteps, features))
attn = MultiHeadAttention(num_heads=4, key_dim=features)(input_layer, input_layer)
attn = Dropout(0.1)(attn)
attn = Add()([input_layer, attn])
attn = LayerNormalization()(attn)
dense = Dense(128, activation='relu')(attn)
dense = Dropout(0.1)(dense)
output = Dense(num_classes, activation='softmax')(dense)
model = Model(input_layer, output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

## 5. MVP Demo: Streamlit App (Code Example)

In [None]:
import streamlit as st
import numpy as np
import librosa
import joblib
import os
import pandas as pd

st.title("Speech Emotion Recognition - Advanced MVP")

uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])

@st.cache_resource
def load_model():
    if os.path.exists("model.pkl"):
        return joblib.load("model.pkl")
    else:
        return None

model = load_model()
FEEDBACK_FILE = "feedback_log.csv"

def extract_features(file):
    y, sr = librosa.load(file, sr=None)
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
    features = np.hstack([mfccs, chroma, contrast])
    return features.reshape(1, -1)

if uploaded_file is not None:
    st.audio(uploaded_file)
    features = extract_features(uploaded_file)
    if model is not None:
        prediction = model.predict(features)[0]
        st.success(f"Predicted Emotion: {prediction}")
    else:
        st.warning("No trained model found. Please train and save a model as 'model.pkl'.")
        prediction = None
    feedback = st.text_input("Was this prediction correct? (Yes/No)")
    if feedback and prediction is not None:
        feedback_entry = pd.DataFrame([[str(uploaded_file.name), prediction, feedback]], columns=["filename","prediction","feedback"])
        if os.path.exists(FEEDBACK_FILE):
            feedback_entry.to_csv(FEEDBACK_FILE, mode='a', header=False, index=False)
        else:
            feedback_entry.to_csv(FEEDBACK_FILE, mode='w', header=True, index=False)
        st.write("Thank you for your feedback!")

## 6. Feedback Logging & Analysis (CSV)

In [None]:
import pandas as pd
feedback = pd.read_csv('feedback_log.csv')
feedback.head()