## Baseline Machine Learning for Emotion Detection

In [35]:
# dependencies
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [36]:
# data
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# a backslash is only a path separator on Windows.
df_english = pd.read_csv('../Data/English/features_english.csv')
df_german = pd.read_csv('../Data/German/features_german.csv')
df_italian = pd.read_csv('../Data/Italian/features_italian.csv')
df_urdu = pd.read_csv('../Data/Urdu/features_urdu.csv')

# Map each language name to its feature table so later cells can loop over them.
languages = {
    'english': df_english,
    'german': df_german,
    'italian': df_italian,
    'urdu': df_urdu
}

In [37]:
# List the distinct speaker ids present in each language's feature table.
for language in languages:
    data = languages[language]
    heading = f"{language.title()}:"
    print(heading, data["speaker_id"].unique())

English: [1 2 3 4]
German: [ 3  8  9 10 11 12 13 14 15 16]
Italian: [1 2 3 4 5 6]
Urdu: [12 13 14 15  1  2  3  4  5  6  7  8 28 29 30 16 17 18  9 31 32 33 10 11
 34 35 36 37 38]


### Data Preprocessing
* We have a small dataset with imbalanced classes, so we need to be careful about overfitting. To balance the classes, we use SMOTE to oversample the minority class (this usually works well for smaller datasets).
* We need GroupKFold cross-validation, because speakers that appear in the training set must not appear in the test set.

=> We do all of this together in the pipeline.

In [38]:
# Show how many samples fall into each valence class, per language.
for language, data in languages.items():
    class_counts = data['valence'].value_counts()
    print(language.title(), class_counts)

English valence
0    240
1    240
Name: count, dtype: int64
German valence
0    385
1    150
Name: count, dtype: int64
Italian valence
0    336
1    252
Name: count, dtype: int64
Urdu valence
0    200
1    200
Name: count, dtype: int64


### Model Training

In [39]:
def _resampled_pipeline(classifier):
    """Chain standardisation, SMOTE oversampling and ``classifier`` into one pipeline."""
    steps = [
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
        ("classifier", classifier),
    ]
    return Pipeline(steps)


# Classifiers under comparison, all seeded with the same random_state.
svm_model = SVC(probability=True, random_state=42)
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
xgb_model = XGBClassifier(random_state=42)
randomForest_model = RandomForestClassifier(random_state=42)

# One scaler -> SMOTE -> classifier pipeline per model.
svm_pipeline = _resampled_pipeline(svm_model)
logreg_pipeline = _resampled_pipeline(logreg_model)
xgb_pipeline = _resampled_pipeline(xgb_model)
randomForest_pipeline = _resampled_pipeline(randomForest_model)

# Random predictor: chance baseline weighted by the class distribution.
# This pipeline has no SMOTE step.
dummy_steps = [
    ("scaler", StandardScaler()),
    ("classifier", DummyClassifier(strategy="stratified", random_state=42)),
]
dummy_pipeline = Pipeline(dummy_steps)

### Evaluation
In the paper, they used the unweighted average recall (UAR), which gives equal weight to each class and is therefore well suited to imbalanced datasets. UAR is the same as balanced accuracy.
We also report other metrics that are suitable for imbalanced data, such as the F1 score.

In [40]:
# Separate features and labels, create groups
def separate_feats_labs(df, max_splits=5):
    """Split a feature table into features, labels, speaker groups and a CV splitter.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns "emotion", "speaker_id", "filename" and
        "valence"; every remaining column is used as a feature.
    max_splits : int, default 5
        Upper bound on the number of cross-validation folds.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without the four metadata/label columns.
    y : pandas.Series
        The "valence" column.
    groups : pandas.Series
        The "speaker_id" column, used as the grouping key for the splitter.
    gkf : GroupKFold
        Splitter with ``min(max_splits, number of distinct speakers)`` folds.
    """
    X = df.drop(["emotion", "speaker_id", "filename", "valence"], axis=1)  # Features
    y = df["valence"]  # Labels
    groups = df["speaker_id"]

    # Derive the fold count from the data itself rather than comparing against
    # one specific global frame: a dataset with fewer speakers than max_splits
    # (e.g. 4 speakers) gets one fold per speaker.
    n_splits = min(max_splits, int(groups.nunique()))
    gkf = GroupKFold(n_splits=n_splits)

    return X, y, groups, gkf

# Evaluate pipelines using different metrics
def evaluate_pipelines(X, y, groups, gkf):
    """Cross-validate every pipeline and return the mean score per model and metric.

    Parameters
    ----------
    X : features passed to each pipeline.
    y : labels.
    groups : group label per sample, forwarded to the splitter.
    gkf : cross-validation splitter used for every pipeline.

    Returns
    -------
    list of [model_name, metric, mean_score]
        Rows are ordered metric by metric; within each metric the order is
        SVM, Logistic Regression, Random Forest, XGBoost, Dummy.
    """
    scoring_metrics = ["accuracy", "balanced_accuracy", "f1_weighted", "roc_auc_ovr", "precision_weighted", "recall_weighted"]

    # Insertion order defines the row order within each metric.
    pipelines = {
        "SVM": svm_pipeline,
        "Logistic Regression": logreg_pipeline,
        "Random Forest": randomForest_pipeline,
        "XGBoost": xgb_pipeline,
        "Dummy": dummy_pipeline,
    }

    # One cross-validation run per model scores all metrics on the same fitted
    # folds, instead of re-fitting every model once per metric.
    cv_scores = {
        name: cross_validate(pipeline, X, y, groups=groups, cv=gkf, scoring=scoring_metrics)
        for name, pipeline in pipelines.items()
    }

    # cross_validate stores the per-fold scores of each metric under "test_<metric>".
    results = []
    for metric in scoring_metrics:
        for name in pipelines:
            results.append([name, metric, cv_scores[name][f"test_{metric}"].mean()])
    return results

In [41]:
# Create the results directory up front so the CSV export cannot fail
# (after a long evaluation run) just because the folder is missing.
output_dir = Path("../Evaluation/Baseline")
output_dir.mkdir(parents=True, exist_ok=True)

for language, data in languages.items():

    # Features, labels, speaker groups and the matching CV splitter
    X, y, groups, gkf = separate_feats_labs(data)

    results = evaluate_pipelines(X, y, groups, gkf)

    results_df = pd.DataFrame(results, columns=["Model", "Metric", "Score"])

    # save the data as csv
    results_df.to_csv(output_dir / f"{language}_results.csv", index=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
