In [None]:
import argparse
import os
import random
from datetime import datetime
from typing import Any, Dict, List
import audeer
import audmetric
import joblib
import numpy as np
import pandas as pd
import yaml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Feature-set names follow the pattern "<source>_<unit>_<extractor>".
_extractors = ("facebook-wav2vec2", "opensmile")

datasets_interview = [
    f"interview_{unit}_{extractor}"
    for unit in ("question", "windowed")
    for extractor in _extractors
]

datasets_text = [
    f"text_{unit}_{extractor}"
    for unit in ("phrase", "word")
    for extractor in _extractors
]

datasets_combined = [
    f"combined_{unit}_{extractor}"
    for unit in ("question-phrase", "windowed-word")
    for extractor in _extractors
]

# Folder holding one "<feature set>.csv" file per feature-set name.
features_folder = "../data/final_datasets"

# Evaluation metrics, keyed by the name under which each score is reported.
# Every entry is called as metric(labels, predictions).
metrics = dict(
    accuracy=accuracy_score,
    UAR=audmetric.unweighted_average_recall,
    roc_auc=roc_auc_score,
)

# One complete evaluation run is performed per seed.
seeds = list(range(104, 107))

In [None]:
def SVM(df_train: pd.DataFrame, X_test: np.ndarray) -> np.ndarray:
    """Tune a class-balanced SVC by grid search and predict the test samples.

    Args:
        df_train: Training frame with a ``patient`` column, a ``label``
            column and the feature columns.
        X_test: Test features, laid out like the training feature columns.

    Returns:
        Predictions of the best grid-search estimator for ``X_test``.
    """
    train = df_train.set_index("patient")
    targets = train["label"]
    feats = train.drop("label", axis=1)

    # The min-max range is learned on the training features only and then
    # applied unchanged to the held-out features.
    minmax = MinMaxScaler().fit(feats)
    feats_scaled = minmax.transform(feats)
    test_scaled = minmax.transform(X_test)

    param_grid = {
        "kernel": ["rbf", "linear"],
        "C": [1e-4, 1e-3, 1e-1, 1, 5, 10],
        "gamma": ["auto", "scale"],
    }
    # NOTE(review): these folds are drawn over rows, not over patients, so rows
    # of one patient can land in both the fit and the validation part - confirm
    # this is intended.
    inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)
    tuner = GridSearchCV(SVC(class_weight="balanced"), param_grid, cv=inner_cv)
    tuner.fit(feats_scaled, targets)
    winner = tuner.best_estimator_

    train_predictions = winner.predict(feats_scaled)
    print("Train acc:", accuracy_score(train_predictions, targets))
    return winner.predict(test_scaled)


def XGBoost(df_train: pd.DataFrame, X_test: np.ndarray) -> np.ndarray:
    """Tune an XGBoost classifier by grid search and predict the test samples.

    Args:
        df_train: Training frame with a ``patient`` column, a ``label``
            column and the feature columns.
        X_test: Test features, laid out like the training feature columns.

    Returns:
        Predictions of the best grid-search estimator for ``X_test``.
    """
    df_train = df_train.set_index("patient")
    y = df_train["label"]
    X = df_train.drop("label", axis=1)

    # Scaling is fitted on the training features only and re-used for the test set.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    # Hyper-parameters selected by the inner cross-validation. They are set on
    # every candidate by the grid search, so they are not repeated in the
    # constructor below.
    grid = {"colsample_bytree": [0.2, 0.6], "max_depth": [2, 4], "n_estimators": [10, 30]}
    model = XGBClassifier(
        random_state=42,
        # Classification objective; the previous "reg:logistic" is the
        # regression variant of the same logistic loss.
        objective="binary:logistic",
        learning_rate=0.1,
        # L1 / L2 regularisation, spelled with the scikit-learn wrapper names.
        reg_alpha=10,
        reg_lambda=10,
    )
    search = GridSearchCV(model, grid, cv=KFold(3, shuffle=True, random_state=1))
    search.fit(X_train, y)
    best = search.best_estimator_

    print("Train acc:", accuracy_score(y, best.predict(X_train)))
    return best.predict(X_test_scaled)


def LR(df_train: pd.DataFrame, X_test: np.ndarray) -> np.ndarray:
    """Fit a cross-validated logistic regression and predict the test samples.

    Args:
        df_train: Training frame with a ``patient`` column, a ``label``
            column and the feature columns.
        X_test: Test features, laid out like the training feature columns.

    Returns:
        Predictions of the fitted model for ``X_test``.
    """
    indexed = df_train.set_index("patient")
    labels = indexed["label"]
    predictors = indexed.drop("label", axis=1)

    # Learn the min-max range on the training features, then apply it to both sets.
    rescaler = MinMaxScaler().fit(predictors)
    train_scaled = rescaler.transform(predictors)
    test_scaled = rescaler.transform(X_test)

    # The regularisation strength is picked internally from 10 candidates
    # using 3 cross-validation folds.
    classifier = LogisticRegressionCV(
        Cs=10,
        cv=3,
        penalty="l2",
        max_iter=100,
        solver="liblinear",
        random_state=42,
    )
    classifier.fit(train_scaled, labels)

    train_accuracy = classifier.score(train_scaled, labels)
    print("Train acc:", train_accuracy)
    return classifier.predict(test_scaled)


def DecisionTree(df_train: pd.DataFrame, X_test: np.ndarray) -> np.ndarray:
    """Tune a decision tree by grid search and predict the test samples.

    Args:
        df_train: Training frame with a ``patient`` column, a ``label``
            column and the feature columns.
        X_test: Test features, laid out like the training feature columns.

    Returns:
        Predictions of the best grid-search estimator for ``X_test``.
    """
    frame = df_train.set_index("patient")
    target = frame["label"]
    inputs = frame.drop("label", axis=1)

    # Scale with statistics of the training features only.
    normalizer = MinMaxScaler().fit(inputs)
    inputs_scaled = normalizer.transform(inputs)
    test_scaled = normalizer.transform(X_test)

    tree_grid = {
        "max_depth": [2, 4, 6],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }
    folds = KFold(n_splits=3, shuffle=True, random_state=1)
    tuner = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_grid, cv=folds)
    tuner.fit(inputs_scaled, target)
    chosen_tree = tuner.best_estimator_

    train_accuracy = chosen_tree.score(inputs_scaled, target)
    print("Train acc:", train_accuracy)
    return chosen_tree.predict(test_scaled)


def RandomForest(df_train: pd.DataFrame, X_test: np.ndarray) -> np.ndarray:
    """Tune a random forest with patient-grouped CV and predict the test samples.

    The grid search validates with Leave-One-Group-Out, using the ``patient``
    values of the training rows as groups.

    Args:
        df_train: Training frame with a ``patient`` column, a ``label``
            column and the feature columns.
        X_test: Test features, laid out like the training feature columns.

    Returns:
        Predictions of the best grid-search estimator for ``X_test``.
    """
    from sklearn.model_selection import LeaveOneGroupOut

    by_patient = df_train.set_index("patient")
    labels = by_patient["label"]
    feature_frame = by_patient.drop("label", axis=1)
    # After ``set_index`` the index carries the patient of every row.
    patient_groups = by_patient.index

    # Scale with statistics of the training features only.
    range_scaler = MinMaxScaler().fit(feature_frame)
    train_scaled = range_scaler.transform(feature_frame)
    test_scaled = range_scaler.transform(X_test)

    forest_grid = {
        "n_estimators": [10, 50, 100],
        "max_depth": [5, 10],
        "min_samples_split": [2, 5],
    }
    forest = RandomForestClassifier(random_state=42, n_estimators=100)
    tuner = GridSearchCV(forest, forest_grid, cv=LeaveOneGroupOut(), refit=True)
    tuner.fit(train_scaled, labels, groups=patient_groups)
    best_forest = tuner.best_estimator_

    train_predictions = best_forest.predict(train_scaled)
    print("Train acc:", accuracy_score(train_predictions, labels))
    return best_forest.predict(test_scaled)


In [None]:
# Leave-one-subject-out (LOSO) evaluation.
#
# For every seed, every combined feature set and every timepoint subset, each
# patient is held out once, the model is trained on the remaining patients and
# the predictions are written below ``../data/results``. Scores are reported
# per unit (every row) and per session (majority vote over a patient's rows).

# Classifier under evaluation. Its name also labels the output folder, so the
# results directory always matches the model that produced it.
model_fn = SVM


def _score_predictions(frame: pd.DataFrame) -> dict:
    """Evaluate every entry of ``metrics`` on ``frame``.

    Args:
        frame: Frame with a ``label`` and a ``prediction`` column.

    Returns:
        Mapping from metric name to score. Scores are cast to built-in
        ``float`` so that ``yaml.dump`` writes plain numbers rather than
        Python-specific tags for NumPy scalars.
    """
    return {
        key: float(metric(frame['label'], frame['prediction']))
        for key, metric in metrics.items()
    }


for seed in seeds:
    # pandas' ``sample`` (used for shuffling below) draws from NumPy's global
    # random state when no ``random_state`` is passed.
    np.random.seed(seed)
    random.seed(seed)
    for feature_set in datasets_combined:
        print(f"Runs for {feature_set} at {datetime.now()}")

        # The file does not depend on the timepoint subset: read it once.
        features_raw = pd.read_csv(
            os.path.join(features_folder, f"{feature_set}.csv"), index_col=False)

        for data in ['all', 'vor', 'nach']:
            suffix = "_results" if data == "all" else f"_results_only_{data}"
            results_path = os.path.join(
                "../data/results",
                f"{seed}_{model_fn.__name__}",
                f"{feature_set}{suffix}",
            )

            # Restrict to the requested timepoint, then drop the column so it
            # is not used as a feature, and discard incomplete rows.
            features = features_raw
            if data != 'all':
                features = features[features['timepoint'] == data]
            features = features.drop(['timepoint'], axis=1).dropna()

            all_subjects = features['patient'].unique()
            if len(all_subjects) == 0:
                # Nothing to evaluate; concatenating zero result frames below
                # would raise.
                print(f"No rows for subset '{data}' of {feature_set}, skipping.")
                continue

            all_results = []
            for subj in audeer.progress_bar(all_subjects, total=len(all_subjects), desc='LOSO'):
                is_test = features['patient'] == subj
                # ``set_index`` returns a new frame, so the filtered slice is
                # never modified in place.
                df_test = features[is_test].set_index('patient')
                X_test = df_test.drop('label', axis=1)
                # Shuffle the training rows (order depends on the seed).
                df_train = features[~is_test].sample(frac=1).reset_index(drop=True)

                experiment_folder = audeer.mkdir(
                    os.path.join(results_path, f'{subj}'))

                df_test = df_test[['label']].assign(
                    prediction=model_fn(df_train, X_test))
                all_results.append(df_test)

                df_test.reset_index().to_csv(
                    os.path.join(experiment_folder, 'results.csv'), index=False)

            results_df = pd.concat(all_results)
            print(results_df)

            results = _score_predictions(results_df)
            print("\nResults per unit:")
            print(results)
            with open(os.path.join(results_path, 'results.yaml'), 'w') as fp:
                yaml.dump(results, fp)

            # Results per subject: the first label of a patient and the most
            # frequent prediction over that patient's rows.
            results_df = results_df.reset_index(drop=False)
            session_rows = []
            for subj in all_subjects:
                subj_df = results_df[results_df['patient'] == subj]
                session_rows.append({
                    'subject': subj,
                    'label': subj_df['label'].values[0],
                    'prediction': subj_df['prediction'].value_counts().idxmax(),
                })
            session_df = pd.DataFrame(
                session_rows, columns=['subject', 'label', 'prediction'])
            # ``reset_index`` writes the running row number as an extra
            # ``index`` column; kept so the file layout stays unchanged.
            session_df.reset_index().to_csv(
                os.path.join(results_path, 'results_session.csv'), index=False)

            results = _score_predictions(session_df)
            print("\nResults per session:")
            print(results)
            with open(os.path.join(results_path, 'results_session.yaml'), 'w') as fp:
                yaml.dump(results, fp)
