In [None]:
import os
import random
from datetime import datetime
from typing import Tuple
import audeer
import audmetric
import numpy as np
import pandas as pd
import yaml
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC

In [None]:
# Feature-set names evaluated by the main loop; each is loaded from
# os.path.join(features_folder, f"{name}.csv").
datasets_opensmile = [
    "interview_question_opensmile",
    "interview_windowed_opensmile",
    "text_phrase_opensmile",
    "text_word_opensmile",
]
# Combined feature-set names. Not referenced by the cells visible here --
# presumably consumed by a later cell; TODO confirm.
datasets_combined = [
    "combined_question-phrase_opensmile",
    "combined_windowed-word_opensmile",
]

# Directory (relative path) containing one CSV per feature set.
features_folder = "../data/final_datasets"

# Evaluation metrics: output key -> callable invoked as fn(labels, predictions).
metrics = {
    'accuracy': accuracy_score,
    'UAR': audmetric.unweighted_average_recall,
    'roc_auc': roc_auc_score
}

# The whole evaluation is repeated once per seed; each value seeds NumPy and
# `random` and is embedded in the results directory name.
seeds = [104, 105, 106]

In [None]:
def SVM(df_train: pd.DataFrame, X_test: pd.DataFrame) -> np.ndarray:
    """Grid-search an SVC on the training frame and predict the test rows.

    Args:
        df_train: Training rows with a "patient" column, a "label" column
            and the feature columns.
        X_test: Feature columns of the held-out rows, laid out like the
            training features (no "label" column).

    Returns:
        Predictions of the best estimator found by the grid search for
        every row of ``X_test``.

    Side effects:
        Prints the best estimator's accuracy on the training rows.
    """
    # Shuffle the rows (no explicit random_state is passed), then move
    # "patient" into the index so it is not treated as a feature.
    shuffled = df_train.sample(frac=1)
    shuffled = shuffled.reset_index(drop=True)
    shuffled = shuffled.set_index("patient")
    targets = shuffled["label"]
    features = shuffled.drop("label", axis=1)

    # The scaler is fitted on training features only and re-used for the test rows.
    min_max = MinMaxScaler()
    features_scaled = min_max.fit_transform(features)
    test_scaled = min_max.transform(X_test)

    param_grid = {
        "kernel": ["rbf", "linear"],
        "C": [1e-4, 1e-3, 1e-1, 1, 5, 10],
        "gamma": ["auto", "scale"],
    }
    inner_cv = KFold(3, shuffle=True, random_state=1)
    tuner = GridSearchCV(SVC(class_weight="balanced"), param_grid, cv=inner_cv)
    tuner.fit(features_scaled, targets)
    winner = tuner.best_estimator_

    train_predictions = winner.predict(features_scaled)
    print("Train acc:", accuracy_score(train_predictions, targets))
    return winner.predict(test_scaled)

def k_means(
    df_clustering: pd.DataFrame,
    subj: pd.DataFrame,
    n_clusters: int = 2
) -> Tuple[np.ndarray, np.ndarray]:
    """Fit k-means on the training subjects and place the held-out subject.

    Args:
        df_clustering: Rows to cluster (the caller passes one aggregated
            row per training subject).
        subj: Row(s) of the held-out subject with the same columns.
        n_clusters: Number of clusters to fit.

    Returns:
        A pair ``(labels, subj_cluster)``: the cluster label of every row of
        ``df_clustering`` in row order, and the predicted cluster for every
        row of ``subj``.
    """
    # Standardisation statistics come from the training rows only.
    standardiser = StandardScaler()
    reference_points = standardiser.fit_transform(df_clustering)
    held_out_points = standardiser.transform(subj)

    clusterer = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
    clusterer.fit(reference_points)

    held_out_assignment = clusterer.predict(held_out_points)
    return clusterer.labels_, held_out_assignment

In [None]:
# Leave-one-subject-out (LOSO) evaluation with cluster-based personalisation:
# for every seed and feature set, each patient is held out in turn, the
# remaining patients are clustered on their mean feature vectors, and an SVM
# is trained only on the patients that share the held-out patient's cluster.
for seed in seeds:
    # Seed the global RNGs; SVM() shuffles its training rows without an
    # explicit random_state, so the shuffle depends on NumPy's global state.
    np.random.seed(seed)
    random.seed(seed)

    for feature_set in datasets_opensmile:
        print(f"{feature_set} @ {datetime.now()}")

        results_path = os.path.join(f"../data/results/{seed}_personalisation", f"{feature_set}_results")
        os.makedirs(results_path, exist_ok=True)

        # load & clean
        df = pd.read_csv(os.path.join(features_folder, f"{feature_set}.csv")).dropna()
        df = df.drop("timepoint", axis=1)
        subjects = df["patient"].unique()
        # First label recorded for each patient, looked up once instead of
        # re-filtering the whole frame per subject during session aggregation.
        subject_labels = df.drop_duplicates("patient").set_index("patient")["label"]

        all_results = []
        for subj in audeer.progress_bar(subjects, desc="LOSO"):
            df_test = df[df["patient"] == subj].set_index("patient")
            X_test = df_test.drop("label", axis=1)

            df_train = df[df["patient"] != subj]

            # cluster at subject-level (one mean feature vector per patient)
            agg_train = df_train.groupby("patient").mean().drop("label", axis=1)
            agg_subj = df_test.groupby("patient").mean().drop("label", axis=1)
            clusters, subj_cluster = k_means(agg_train, agg_subj)

            # keep only same-cluster patients.
            # `clusters` follows the row order of `agg_train`, which groupby
            # sorts by patient, so the mask must be applied to agg_train.index
            # rather than to the order-of-appearance list from .unique().
            keep = agg_train.index[clusters == subj_cluster[0]]
            df_train = df_train[df_train["patient"].isin(keep)]

            # str() so non-string patient ids (e.g. integers) form a valid path.
            folder = audeer.mkdir(os.path.join(results_path, str(subj)))
            preds = SVM(df_train, X_test)

            df_out = df_test.assign(prediction=preds)[["label", "prediction"]]
            df_out.reset_index().to_csv(os.path.join(folder, "results.csv"), index=False)
            all_results.append(df_out)

        # unit-level metrics (one row per predicted unit, pooled over subjects)
        results_df = pd.concat(all_results)
        # float() keeps NumPy scalars from being dumped as python-object tags.
        unit_scores = {
            k: float(fn(results_df["label"], results_df["prediction"])) for k, fn in metrics.items()
        }
        with open(os.path.join(results_path, "results.yaml"), "w") as f:
            yaml.dump(unit_scores, f)

        # session-level aggregation: majority vote of the unit predictions
        session = (
            results_df
            .reset_index()
            .groupby("patient")["prediction"]
            .agg(lambda x: x.value_counts().idxmax())
            .reset_index()
            .rename(columns={"patient": "subject"})
        )
        session["label"] = session["subject"].map(subject_labels)
        session.to_csv(os.path.join(results_path, "results_session.csv"), index=False)

        session_scores = {
            k: float(fn(session["label"], session["prediction"])) for k, fn in metrics.items()
        }
        with open(os.path.join(results_path, "results_session.yaml"), "w") as f:
            yaml.dump(session_scores, f)