## Basline machine Learning for Emotion Detection Replication
- no smote/oversampling
- test and trainign set: 
    - URDU:  consists of 38 speakers, so we selected 23 speakers for training and 6 remaining  for testing with five-foldcross-validation. 
    - other languages: Leave-One-Speaker-Out (LOSO)


In [1]:
# dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, LeaveOneGroupOut, cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier

In [2]:
# data
df_english = pd.read_csv(r'..\Data\English\features_english.csv')
df_german = pd.read_csv(r'..\Data\German\features_german.csv')
df_italian = pd.read_csv(r'..\Data\Italian\features_italian.csv')
df_urdu = pd.read_csv(r'..\Data\Urdu\features_urdu.csv')

languages = {
    'english': df_english,
    'german': df_german,
    'italian': df_italian,
    'urdu': df_urdu
}

In [3]:
for language, data in languages.items():
    print(f"{language.title()}:", data["speaker_id"].unique())

English: [2 3 4 5]
German: [ 3  8  9 10 11 12 13 14 15 16]
Italian: [1 2 3 4 5 6]
Urdu: [12 13 14 15  1  2  3  4  5  6  7  8 28 29 30 16 17 18  9 31 32 33 10 11
 34 35 36 37 38]


### Data Preprocessing
* we have a small dataset with unbalanced data. we need to be careful about overfitting. We want to have balanced data, so we use SMOTE to oversample the minority class (this usually works well for smaller datasets) 
* we need GroupKfold crossvalidation, as we do not want to have speakers from the training in the test dataset 

=> we do this all together in the pipepline

In [4]:
# check classes
for language, data in languages.items():
    print(f"{language.title()}", data['valence'].value_counts())

English valence
0    240
1    240
Name: count, dtype: int64
German valence
0    385
1    150
Name: count, dtype: int64
Italian valence
0    336
1    252
Name: count, dtype: int64
Urdu valence
0    200
1    200
Name: count, dtype: int64


### Model Training

In [5]:
# Initialize models
svm_model = SVC(kernel = "linear", C= 0.1, probability=True, random_state=42)
logreg_model = LogisticRegression(max_iter=500,solver="liblinear", random_state=42)
xgb_model = XGBClassifier(n_estimators=50, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=42)
randomForest_model = RandomForestClassifier(n_estimators=50, max_depth=5,random_state=42)

# Create pipelines
pipelines = {
    "SVM": Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", svm_model)
    ]),
    "Random Forest": Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", randomForest_model)
    ]),
    "Logistic Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", logreg_model)
    ]),
    "XGBoost": Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", xgb_model)
    ]),
    # random predictor
    # Zufallsbaseline mit gewichteter Klassenverteilung
    "Dummy": Pipeline([
        ("classifier", DummyClassifier(strategy="most_frequent"))
    ])
}

### Evaluation
in the paper they used unweighted average recall rate (UAR) which gives equal weight to each class, making it good for imbalanced datasets. UAR is the same as balanced accuracy.
we choose to check also other metrics, which are good for imbalanced date, like f1 score.

In [6]:
def separate_feats_labs(df, dataset_name):
    X = df.drop(["emotion", "speaker_id", "filename", "valence"], axis=1)  # Features
    y = df["valence"]  # Labels
    groups = df["speaker_id"]

    if dataset_name == "urdu":
        # URDU: 30 speaker for Training, 8 for Testing
        # dont have all data, therefore 21%  (= 6 speaker) for testing
        unique_speakers = np.random.permutation(df["speaker_id"].unique())

        train_speakers = unique_speakers[:22]
        test_speakers = unique_speakers[22:]

        train_idx = df[df["speaker_id"].isin(train_speakers)].index
        test_idx = df[df["speaker_id"].isin(test_speakers)].index

        # 5-fache Cross-Validation auf den Trainingsdaten
        gkf = GroupKFold(n_splits=5)

        return X.loc[train_idx], y.loc[train_idx], groups.loc[train_idx], gkf, X.loc[test_idx], y.loc[test_idx]
    else:
        # Leave-One-Speaker-Out (LOSO)
        logo = LeaveOneGroupOut()
        return X, y, groups, logo, None, None

def evaluate_pipelines(X, y, groups, cv, pipelines, test_X=None, test_y=None):
    scoring_metrics = ["accuracy", "balanced_accuracy", "f1_weighted", "precision_weighted", "recall_weighted"]
    results = []

    for name, pipeline in pipelines.items():
        for metric in scoring_metrics:
            if test_X is not None and test_y is not None:
                # URDU: 5-Fold auf Training + Testset separat
                scores = cross_val_score(pipeline, X, y, groups=groups, cv=cv, scoring=metric)
                pipeline.fit(X, y)
                test_score = pipeline.score(test_X, test_y)
                results.append([name, metric, scores.mean(), test_score])
            else:
                # Andere: LOSO
                scores = cross_val_score(pipeline, X, y, groups=groups, cv=cv, scoring=metric)
                results.append([name, metric, scores.mean()])
    return results

In [7]:
for language, data in languages.items():
    print(language)
    train_X, train_y, groups, gkf, test_X, test_y = separate_feats_labs(data, language)

    results = evaluate_pipelines(train_X, train_y, groups, gkf, pipelines, test_X, test_y)

    # Einheitliches Format sicherstellen
    for result in results:
        if len(result) == 3:  # LOSO-Fall ohne Test-Set
            result.append(None)  # Platzhalter für Test Score

    # DataFrame erstellen
    results_df = pd.DataFrame(results, columns=["Model", "Metric", "Score", "Test Score"])
    
    # save the data as csv
    results_df.to_csv(f"../Evaluation/Baseline/{language}_replication_results.csv", index=False)

english


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameter

german


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } ar

italian


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

urdu


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "u

In [8]:
results_df

Unnamed: 0,Model,Metric,Score,Test Score
0,SVM,accuracy,0.533333,0.825
1,SVM,balanced_accuracy,0.523114,0.825
2,SVM,f1_weighted,0.551115,0.825
3,SVM,precision_weighted,0.643557,0.825
4,SVM,recall_weighted,0.533333,0.825
5,Random Forest,accuracy,0.660833,0.85
6,Random Forest,balanced_accuracy,0.665204,0.85
7,Random Forest,f1_weighted,0.671397,0.85
8,Random Forest,precision_weighted,0.776636,0.85
9,Random Forest,recall_weighted,0.660833,0.85
