## XGB Base Model for Speech Emotion Recognition

### Import Required Libraries  

In [None]:
import pandas as pd 
import librosa
import numpy as np
from tqdm import tqdm
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder,  StandardScaler
import joblib
from utils import get_processed_data_dir, get_models_dir

### Import Dataset 
> Make sure that the dataset is downsampled and augmented using the scripts provided

In [None]:
# Locate the processed emotion dataset and load it into a DataFrame.
processed_dir = get_processed_data_dir("speech_sentiment")
train_csv = processed_dir / "emotion_dataset.csv"
df = pd.read_csv(filepath_or_buffer=train_csv)

### Encode emotions to numeric values

In [None]:
# Map each emotion name to an integer class id. The fitted encoder is kept
# so that predictions can be turned back into emotion names later on.
# The column name "emotion_lable" (sic) is read by later cells.
label_encoder = LabelEncoder()
label_encoder.fit(df["emotion"])
df["emotion_lable"] = label_encoder.transform(df["emotion"])

### Feature Extraction (Work in Progress)

> Currently considering MFCC, chroma, and spectral contrast features

In [None]:
def extract_features(file_path):
    """Summarise one audio clip as a flat dict of averaged features.

    The clip is loaded at its native sampling rate (``sr=None``), three
    librosa feature matrices are computed (13 MFCCs, chroma, spectral
    contrast), and each matrix is averaged along axis 1.

    Args:
        file_path: Location of the audio file, passed to ``librosa.load``.

    Returns:
        dict with keys ``mfccs_mean_0..12``, ``chroma_mean_0..11`` and
        ``contrast_mean_0..6``, inserted in that order.
    """
    signal, rate = librosa.load(file_path, sr=None)

    # (key prefix, number of entries to export, raw feature matrix)
    feature_specs = (
        ('mfccs_mean', 13, librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)),
        ('chroma_mean', 12, librosa.feature.chroma_stft(y=signal, sr=rate)),
        ('contrast_mean', 7, librosa.feature.spectral_contrast(y=signal, sr=rate)),
    )

    features = {}
    for prefix, count, matrix in feature_specs:
        means = np.mean(matrix, axis=1)
        for idx in range(count):
            features[f'{prefix}_{idx}'] = means[idx]
    return features

# Enable the progress-bar variant of apply on pandas objects.
tqdm.pandas(desc="Extracting Features")

# One feature dict per clip path, expanded into one column per feature.
per_clip_features = df['path'].progress_apply(extract_features)
features_list = per_clip_features.tolist()
features_df = pd.DataFrame(features_list)

# Append the feature columns next to the original metadata columns.
df = pd.concat(objs=[df, features_df], axis=1)

In [None]:
# Persist the metadata + extracted features so extraction need not be rerun.
features_dir = get_processed_data_dir("speech_sentiment")
features_path = features_dir / "xgb_features.csv"
df.to_csv(path_or_buf=features_path, index=False)

### Create training and testing sets

In [None]:
# Reload the saved feature table written by the previous cell.
features_dir = get_processed_data_dir("speech_sentiment")
features_path = features_dir / "xgb_features.csv"
df = pd.read_csv(filepath_or_buffer=features_path)

### Scaling the data

In [None]:
# Split BEFORE scaling so the scaler's mean/variance are estimated from the
# training rows only. Fitting the scaler on the full table would leak
# test-set statistics into training and bias the evaluation.
feature_frame = df.drop(columns=['path', 'emotion', 'emotion_lable'])
y = df['emotion_lable']

X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, y, test_size=0.2, random_state=42
)

# Fit on the training split, then apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full scaled matrix, kept under its original name for any cell that reads it.
X = scaler.transform(feature_frame)

### Create XGB model and train

In [None]:
# Hyper-parameter candidates explored by the grid search (3 x 3 x 3 combos).
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 500, 1000],
)

In [None]:
# Base estimator: multi-class soft-probability objective with one class per
# distinct encoded label.
n_classes = len(y.unique())
model = xgb.XGBClassifier(objective='multi:softprob', num_class=n_classes)

# Exhaustive search over param_grid; no cv/scoring is passed, so the
# GridSearchCV defaults apply. verbose=10 logs every fit.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, verbose=10)
grid_search.fit(X_train, y_train)

# Estimator with the best cross-validated parameter combination.
best_model = grid_search.best_estimator_

In [None]:
# Fit the base estimator `model` directly on the training split.
# NOTE(review): the artifact saved and evaluated below is `best_model` from
# the grid search, not this `model` — confirm this extra fit is intended.
model.fit(X_train, y_train, verbose=True)

In [None]:
# Persist the tuned model together with the preprocessing objects needed to
# reproduce its inputs (scaler) and decode its outputs (label encoder).
model_dir = get_models_dir("speech_sentiment/xgb")
artifacts_to_save = (
    ("model.pkl", best_model),
    ("scaler.pkl", scaler),
    ("label_encoder.pkl", label_encoder),
)
for artifact_name, artifact in artifacts_to_save:
    joblib.dump(artifact, model_dir / artifact_name)


### Model evaluation

In [None]:
# Reload the saved model, scaler and label encoder (in that order).
model_dir = get_models_dir("speech_sentiment/xgb")
best_model, scaler, label_encoder = (
    joblib.load(model_dir / artifact_name)
    for artifact_name in ("model.pkl", "scaler.pkl", "label_encoder.pkl")
)

In [None]:
# Predict class ids for the held-out test split with the reloaded model.
y_pred = best_model.predict(X_test)

In [None]:
# Overall accuracy on the test split, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1 summary.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Fix the class order explicitly so the matrix rows/columns always line up
# with the tick labels, even if a class is absent from y_test / y_pred.
class_names = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(class_names))))

fig = plt.figure(figsize=(10, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    cmap="Blues",
)
plt.ylabel('Actual')
plt.xlabel('Predicted')

# Save BEFORE show(): once show() has displayed the figure, a following
# plt.savefig() can write out a new, empty canvas instead of the heatmap.
fig.savefig(model_dir / 'confusion_matrix.png')
plt.show()

### Inference

In [None]:
import pandas as pd
import librosa
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
from utils import get_models_dir
import os

In [None]:
# Resolve the artifact locations first, then load each one for inference.
model_dir = get_models_dir("speech_sentiment/xgb")
model_path = model_dir / "model.pkl"
scaler_path = model_dir / "scaler.pkl"
label_encoder_path = model_dir / "label_encoder.pkl"

best_model = joblib.load(model_path)
scaler = joblib.load(scaler_path)
label_encoder = joblib.load(label_encoder_path)

In [None]:
# Imported here so the inference section also runs on a fresh kernel: this
# section's own import cell only brings in get_models_dir from utils.
from utils import get_processed_data_dir

# Position (in sorted filename order) of the clip used for the demo prediction.
TEST_CLIP_INDEX = 18000

clips_dir = get_processed_data_dir("speech_sentiment") / "downsampled_clips"

# os.listdir returns entries in arbitrary order; sort them so the same index
# always selects the same clip.
clip_names = sorted(os.listdir(clips_dir))
if TEST_CLIP_INDEX >= len(clip_names):
    raise IndexError(
        f"TEST_CLIP_INDEX={TEST_CLIP_INDEX} is out of range: "
        f"only {len(clip_names)} clips found in {clips_dir}"
    )
test_clip = clips_dir / clip_names[TEST_CLIP_INDEX]

# Feature extraction
def extract_features(file_path):
    """Build the feature dict for a single audio file.

    Mirrors the extraction used when building the training table: load the
    clip at its native sampling rate, compute MFCC (13 coefficients), chroma
    and spectral-contrast matrices, and average each along axis 1.

    Args:
        file_path: Location of the audio file, passed to ``librosa.load``.

    Returns:
        dict with keys ``mfccs_mean_0..12``, ``chroma_mean_0..11`` and
        ``contrast_mean_0..6``, inserted in that order.
    """
    y, sr = librosa.load(file_path, sr=None)

    mfcc_matrix = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfcc_matrix, axis=1)
    chroma_matrix = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = np.mean(chroma_matrix, axis=1)
    contrast_matrix = librosa.feature.spectral_contrast(y=y, sr=sr)
    contrast_mean = np.mean(contrast_matrix, axis=1)

    # Key order matters: it defines the column order fed to the scaler.
    features = {f'mfccs_mean_{k}': mfccs_mean[k] for k in range(13)}
    features.update({f'chroma_mean_{k}': chroma_mean[k] for k in range(12)})
    features.update({f'contrast_mean_{k}': contrast_mean[k] for k in range(7)})
    return features

In [None]:
# Extract the raw feature dict for the selected clip.
new_features = extract_features(file_path=test_clip)

# Wrap it as a single-row frame and apply the scaler loaded above.
new_features_df = pd.DataFrame(data=[new_features])
new_features_scaled = scaler.transform(new_features_df)

In [None]:
# Predict the encoded class id for the single scaled sample...
y_pred = best_model.predict(new_features_scaled)

# ...and map it back to the emotion name via the label encoder.
predicted_emotion = label_encoder.inverse_transform(y_pred)

print(f'Predicted Emotion: {predicted_emotion[0]}')