In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import KFold, cross_validate, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

In [3]:
df = pd.read_csv("/Users/kirillkonca/Documents/dementia_prediction/data_filtered.csv")

In [5]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average="macro", zero_division=0)
stop_words = set(stopwords.words('english'))
stop_words = list(stop_words)
scoring = {
    'f1_macro': make_scorer(f1_score, average='macro', zero_division=0),
    'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
    'recall_macro': make_scorer(recall_score, average='macro', zero_division=0)
}

# Baselines

In [6]:
results = []

In [10]:
tasks = {
    "MCI vs. AD": df[df['diagnosis'].isin(['MCI', 'AD'])],
    "MCI vs. Control": df[df['diagnosis'].isin(['MCI', 'Control'])],
    "MCI vs. AD vs. Control": df
}

models = {
    "SVM": SVC(random_state=42, class_weight='balanced'),
}

In [17]:
import shap
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, FunctionTransformer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

# ----- PARAMETERS -----
numerical_cols_by_task = {
    "MCI vs. AD": ['stop_words', 'short_pause', 'mean_surprisal'],
    "MCI vs. Control": ['mean_surprisal'],
    "MCI vs. AD vs. Control": ['co', 'mean_surprisal']
}
output_dir = "shap_plots"
import os
os.mkdir("shap_plots")
n_tfidf_components = 1  # or more if desired
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ----- LOOP THROUGH TASKS -----
features_used = {}

for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    # Select relevant numerical columns
    numerical_cols = numerical_cols_by_task[task_name]
    features_used[task_name] = ['TF-IDF'] + numerical_cols

    X = task_df[['speech'] + numerical_cols].copy()
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    # --- Pipelines ---
    tfidf_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words)),
        ('svd', TruncatedSVD(n_components=n_tfidf_components, random_state=42))
    ])

    text_transformer = FunctionTransformer(lambda x: x['speech'], validate=False)

    preprocessor = ColumnTransformer([
        ('tfidf', Pipeline([
            ('extract', text_transformer),
            ('tfidf', tfidf_pipeline)
        ]), ['speech']),
        ('num', Pipeline([
            ('scaler', StandardScaler())
        ]), numerical_cols)
    ])

    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('clf', SVC(probability=True, kernel='linear', class_weight='balanced', random_state=42))
    ])

    # --- Train on full data (or optionally cross-val inside loop) ---
    pipeline.fit(X, y)

    # --- SHAP Analysis ---
    X_preprocessed = pipeline.named_steps['preprocess'].transform(X)
    explainer = shap.Explainer(pipeline.named_steps['clf'], X_preprocessed)
    shap_values = explainer(X_preprocessed)

    # --- Save SHAP Plot ---
    plt.figure()
    shap.summary_plot(
        shap_values,
        features=X_preprocessed,
        feature_names=features_used[task_name],
        show=False
    )
    plt.tight_layout()
    plot_path = f"{output_dir}/shap_{task_name.replace(' ', '_').replace('/', '_')}.png"
    plt.savefig(plot_path, dpi=300)
    plt.close()
    print(f"SHAP plot saved to: {plot_path}")




### MCI vs. AD ###

SHAP plot saved to: shap_plots/shap_MCI_vs._AD.png

### MCI vs. Control ###

SHAP plot saved to: shap_plots/shap_MCI_vs._Control.png

### MCI vs. AD vs. Control ###

SHAP plot saved to: shap_plots/shap_MCI_vs._AD_vs._Control.png


## MMSE

In [8]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df[['mmse']].values
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")
        cv_results = cross_validate(model, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "MMSE"
        })



### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

Model: SVM
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

Model: Logistic Regression
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5714 ± 0.1125
Precision-Macro: 0.6067
Recall-Macro: 0.6140

Model: SVM
F1-Macro: 0.6169 ± 0.1504
Precision-Macro: 0.6658
Recall-Macro: 0.6407

Model: Logistic Regression
F1-Macro: 0.6109 ± 0.1687
Precision-Macro: 0.6256
Recall-Macro: 0.6503

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9575 ± 0.0327
Precision-Macro: 0.9562
Recall-Macro: 0.9640

Model: SVM
F1-Macro: 0.9550 ± 0.0321
Precision-Macro: 0.9552
Recall-Macro: 0.9603

Model: Logistic Regression
F1-Macro: 0.9571 ± 0.0401
Precision-Macro: 0.9562
Recall-Macro: 0.9613

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.6755 ± 0.0696
Precision-Macro: 0.706

## TF-IDF

In [9]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df['speech']
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words)),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "TF-IDF + MMSE"
        })


### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: Logistic Regression
F1-Macro: 0.6586 ± 0.1372
Precision-Macro: 0.6785
Recall-Macro: 0.6610

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: SVM
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: Logistic Regression
F1-Macro: 0.4796 ± 0.0915
Precision-Macro: 0.4756
Recall-Macro: 0.4958

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.8078 ± 0.0699
Precision-Macro: 0.8111
Recall-Macro: 0.8112

Model: SVM
F1-Macro: 0.8082 ± 0.0576
Precision-Macro: 0.8120
Recall-Macro: 0.8151

Model: Logistic Regression
F1-Macro: 0.7814 ± 0.0580
Precision-Macro: 0.7875
Recall-Macro: 0.7906

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.5029 ± 0.0276
Precision-Macro: 0.491

## TF-IDF + MMSE

In [10]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df[['speech', 'mmse']]
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    # Define column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
            ('mmse', StandardScaler(), ['mmse'])
        ]
    )

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "TF-IDF + MMSE"
        })



### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
F1-Macro: 0.8170 ± 0.0721
Precision-Macro: 0.7851
Recall-Macro: 0.8916

Model: Logistic Regression
F1-Macro: 0.8357 ± 0.0816
Precision-Macro: 0.7966
Recall-Macro: 0.9386

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: SVM
F1-Macro: 0.6503 ± 0.1624
Precision-Macro: 0.7113
Recall-Macro: 0.6416

Model: Logistic Regression
F1-Macro: 0.6218 ± 0.1890
Precision-Macro: 0.6314
Recall-Macro: 0.6372

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9257 ± 0.0444
Precision-Macro: 0.9297
Recall-Macro: 0.9282

Model: SVM
F1-Macro: 0.9622 ± 0.0315
Precision-Macro: 0.9610
Recall-Macro: 0.9682

Model: Logistic Regression
F1-Macro: 0.9551 ± 0.0339
Precision-Macro: 0.9540
Recall-Macro: 0.9613

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.5972 ± 0.0315
Precision-Macro: 0.577

In [11]:
baselines_results = pd.DataFrame(results)

In [12]:
baselines_results.head()

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
0,MCI vs. AD,Random Forest,0.820021,0.778155,0.931896,0.074311,MMSE
1,MCI vs. AD,SVM,0.820021,0.778155,0.931896,0.074311,MMSE
2,MCI vs. AD,Logistic Regression,0.820021,0.778155,0.931896,0.074311,MMSE
3,MCI vs. Control,Random Forest,0.571365,0.606726,0.613954,0.112546,MMSE
4,MCI vs. Control,SVM,0.616925,0.665836,0.64067,0.150424,MMSE


In [13]:
baselines_results.loc[baselines_results.groupby('Task')['Mean F1 Macro'].idxmax()]

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
31,AD vs. Control,SVM,0.962242,0.961029,0.968169,0.031528,TF-IDF + MMSE
26,MCI vs. AD,Logistic Regression,0.835719,0.796607,0.938616,0.081554,TF-IDF + MMSE
10,MCI vs. AD vs. Control,SVM,0.695081,0.713441,0.76155,0.056706,MMSE
28,MCI vs. Control,SVM,0.650258,0.711317,0.641573,0.162386,TF-IDF + MMSE


# All Features

In [14]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    features_columns = task_df.select_dtypes(include=[np.number]).drop(columns=['speaking time (s)']).columns.tolist()
    # features_columns.remove('mmse')
    X = task_df[features_columns]
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "All Features"
        })


### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.6058 ± 0.1302
Precision-Macro: 0.6743
Recall-Macro: 0.5952

Model: SVM
F1-Macro: 0.7455 ± 0.1045
Precision-Macro: 0.7414
Recall-Macro: 0.7897

Model: Logistic Regression
F1-Macro: 0.8396 ± 0.0774
Precision-Macro: 0.8221
Recall-Macro: 0.9031

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.4561 ± 0.0199
Precision-Macro: 0.4251
Recall-Macro: 0.4955

Model: SVM
F1-Macro: 0.6109 ± 0.0998
Precision-Macro: 0.6205
Recall-Macro: 0.6576

Model: Logistic Regression
F1-Macro: 0.6025 ± 0.1099
Precision-Macro: 0.6027
Recall-Macro: 0.6737

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9426 ± 0.0341
Precision-Macro: 0.9416
Recall-Macro: 0.9471

Model: SVM
F1-Macro: 0.9210 ± 0.0394
Precision-Macro: 0.9206
Recall-Macro: 0.9251

Model: Logistic Regression
F1-Macro: 0.9473 ± 0.0390
Precision-Macro: 0.9467
Recall-Macro: 0.9522

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.6090 ± 0.0243
Precision-Macro: 0.587

# TF-IDF + All Features

In [15]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    features_columns = task_df.select_dtypes(include=[np.number]).drop(columns=['speaking time (s)']).columns.tolist()
    X = task_df[['speech'] + features_columns]
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    # Define column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
            ('num', StandardScaler(), features_columns)
        ]
    )

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "All Features"
        })


### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
F1-Macro: 0.7455 ± 0.1045
Precision-Macro: 0.7414
Recall-Macro: 0.7897

Model: Logistic Regression
F1-Macro: 0.8143 ± 0.0875
Precision-Macro: 0.7981
Recall-Macro: 0.8738

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: SVM
F1-Macro: 0.6063 ± 0.0971
Precision-Macro: 0.6217
Recall-Macro: 0.6507

Model: Logistic Regression
F1-Macro: 0.6080 ± 0.1050
Precision-Macro: 0.6050
Recall-Macro: 0.6620

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9309 ± 0.0314
Precision-Macro: 0.9319
Recall-Macro: 0.9326

Model: SVM
F1-Macro: 0.9210 ± 0.0394
Precision-Macro: 0.9206
Recall-Macro: 0.9251

Model: Logistic Regression
F1-Macro: 0.9498 ± 0.0368
Precision-Macro: 0.9490
Recall-Macro: 0.9550

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.6019 ± 0.0211
Precision-Macro: 0.583

# Linear Feature Addition to TF-IDF

In [16]:
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove("speaking time (s)")
numerical_features.remove("mmse")  # mmse will be added manually at the start
numerical_features.insert(0, "mmse")

In [17]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    for model_name, model in models.items():
        X_text = task_df[['speech']]  # Start with text only
        y = LabelEncoder().fit_transform(task_df['diagnosis'])

        # Define TF-IDF for baseline
        tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=5000, stop_words=stop_words)

        baseline_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        baseline_pipeline = Pipeline([
            ('preprocessor', baseline_preprocessor),
            ('clf', model)
        ])

        # Baseline: TF-IDF only
        baseline_score = cross_val_score(baseline_pipeline, X_text, y, cv=kf, scoring=f1_scorer).mean()
        print(f"\nModel: {model_name}")
        print(f"Baseline (TF-IDF only) F1-Macro: {baseline_score:.4f}")

        # Feature selection loop
        best_score = baseline_score
        selected_features = []

        for feature in numerical_features:

            current_features = selected_features + [feature]

            preprocessor = ColumnTransformer([
                ('tfidf', tfidf, 'speech'),
                ('num', StandardScaler(), current_features)
            ])

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('clf', model)
            ])

            X_all = task_df[['speech'] + current_features]
            new_score = cross_val_score(pipeline, X_all, y, cv=kf, scoring=f1_scorer).mean()

            if new_score - best_score >= best_score * 0.01:
                best_score = new_score
                selected_features.append(feature)
                print(f"✅ Added Feature: {feature}, New F1-Macro: {new_score:.4f}")
            else:
                print(f"❌ Skipped Feature: {feature}, F1-Macro: {new_score:.4f}")

        print(f"\nFinal Selected Features: {selected_features}, Final F1-Macro: {best_score:.4f}")

        # Final evaluation with all selected features
        final_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech'),
            ('num', StandardScaler(), selected_features)
        ]) if selected_features else ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        final_pipeline = Pipeline([
            ('preprocessor', final_preprocessor),
            ('clf', model)
        ])

        X_final = task_df[['speech'] + selected_features] if selected_features else task_df[['speech']]

        final_scores = cross_validate(final_pipeline, X_final, y, cv=kf, scoring=scoring)

        print(f"\n🔎 Final Evaluation with Selected Features:")
        print(f"F1-Macro: {final_scores['test_f1_macro'].mean():.4f} ± {final_scores['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {final_scores['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {final_scores['test_recall_macro'].mean():.4f}")
        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": final_scores['test_f1_macro'].mean(),
            "Std F1": final_scores['test_f1_macro'].std(),
            "Mean Precision Macro": final_scores['test_precision_macro'].mean(),
            "Mean Recall Macro": final_scores['test_recall_macro'].mean(),
            "Type": "+".join(["TF-IDF"] + selected_features) if selected_features else "TF-IDF only"
        })


### MCI vs. AD ###


Model: Random Forest
Baseline (TF-IDF only) F1-Macro: 0.4679
❌ Skipped Feature: mmse, F1-Macro: 0.4679
❌ Skipped Feature: on, F1-Macro: 0.4679
❌ Skipped Feature: co, F1-Macro: 0.4679
❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.4679
❌ Skipped Feature: mlu, F1-Macro: 0.4679
❌ Skipped Feature: stop_words, F1-Macro: 0.4679
❌ Skipped Feature: tree_depth, F1-Macro: 0.4679
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.4679
❌ Skipped Feature: nouns_with_determiners, F1-Macro: 0.4679
❌ Skipped Feature: sid, F1-Macro: 0.4679
❌ Skipped Feature: sid_efficiency, F1-Macro: 0.4679
❌ Skipped Feature: pid, F1-Macro: 0.4679
❌ Skipped Feature: pid_efficiency, F1-Macro: 0.4679
❌ Skipped Feature: maas, F1-Macro: 0.4679
❌ Skipped Feature: frazier_score, F1-Macro: 0.4679
❌ Skipped Feature: words_per_clause, F1-Macro: 0.4679
❌ Skipped Feature: short_pause, F1-Macro: 0.4679
❌ Skipped Feature: mid_pause, F1-Macro: 0.4679
❌ Skipped Feature: long_pause, F1-Macro: 0.4679
❌ Skipped

✅ Added Feature: short_pause, New F1-Macro: 0.9406
❌ Skipped Feature: mid_pause, F1-Macro: 0.9138
❌ Skipped Feature: long_pause, F1-Macro: 0.9207
❌ Skipped Feature: mean_surprisal, F1-Macro: 0.9256

Final Selected Features: ['mmse', 'short_pause'], Final F1-Macro: 0.9406

🔎 Final Evaluation with Selected Features:
F1-Macro: 0.9406 ± 0.0560
Precision-Macro: 0.9423
Recall-Macro: 0.9442

Model: SVM
Baseline (TF-IDF only) F1-Macro: 0.8082
✅ Added Feature: mmse, New F1-Macro: 0.9622
❌ Skipped Feature: on, F1-Macro: 0.9573
❌ Skipped Feature: co, F1-Macro: 0.9622
❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.9574
❌ Skipped Feature: mlu, F1-Macro: 0.9623
❌ Skipped Feature: stop_words, F1-Macro: 0.9598
❌ Skipped Feature: tree_depth, F1-Macro: 0.9623
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.9599
❌ Skipped Feature: nouns_with_determiners, F1-Macro: 0.9574
❌ Skipped Feature: sid, F1-Macro: 0.9598
❌ Skipped Feature: sid_efficiency, F1-Macro: 0.9550
❌ Skipped Feature: pid, F1-Macro: 0

# Linear Feature Deletion from TF-IDF

In [18]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    for model_name, model in models.items():
        X_text = task_df[['speech']]  # Start with text only
        y = LabelEncoder().fit_transform(task_df['diagnosis'])

        baseline_preprocessor = ColumnTransformer([
            ('tfidf', TfidfVectorizer(ngram_range=(1, 1), max_features=5000, stop_words=stop_words), 'speech')
        ])

        baseline_pipeline = Pipeline([
            ('preprocessor', baseline_preprocessor),
            ('clf', model)
        ])

        # Baseline: TF-IDF only
        baseline_score = cross_val_score(baseline_pipeline, X_text, y, cv=kf, scoring=f1_scorer).mean()
        print(f"\nModel: {model_name}")
        print(f"Baseline (TF-IDF only) F1-Macro: {baseline_score:.4f}")

        # Feature selection loop
        best_score = baseline_score
        selected_features = numerical_features.copy()
        current_features = selected_features.copy()
        for feature in numerical_features:

            current_features.remove(feature)

            preprocessor = ColumnTransformer([
                ('tfidf', tfidf, 'speech'),
                ('num', StandardScaler(), current_features)
            ])

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('clf', model)
            ])

            X_all = task_df[['speech'] + current_features]
            new_score = cross_val_score(pipeline, X_all, y, cv=kf, scoring=f1_scorer).mean()

            if new_score - best_score >= best_score * 0.01:
                best_score = new_score
                selected_features.remove(feature)
                print(f"✅ Removed Feature: {feature}, New F1-Macro: {new_score:.4f}")
            else:
                print(f"❌ Skipped Feature: {feature}, F1-Macro: {new_score:.4f}")

        print(f"\nFinal Selected Features: {selected_features}, Final F1-Macro: {best_score:.4f}")

        # Final evaluation with all selected features
        final_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech'),
            ('num', StandardScaler(), selected_features)
        ]) if selected_features else ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        final_pipeline = Pipeline([
            ('preprocessor', final_preprocessor),
            ('clf', model)
        ])

        X_final = task_df[['speech'] + selected_features] if selected_features else task_df[['speech']]

        final_scores = cross_validate(final_pipeline, X_final, y, cv=kf, scoring=scoring)

        print(f"\n🔎 Final Evaluation with Selected Features:")
        print(f"F1-Macro: {final_scores['test_f1_macro'].mean():.4f} ± {final_scores['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {final_scores['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {final_scores['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": final_scores['test_f1_macro'].mean(),
            "Std F1": final_scores['test_f1_macro'].std(),
            "Mean Precision Macro": final_scores['test_precision_macro'].mean(),
            "Mean Recall Macro": final_scores['test_recall_macro'].mean(),
            "Type": "+".join(["TF-IDF"] + selected_features) if selected_features else "TF-IDF only"
        })



### MCI vs. AD ###


Model: Random Forest
Baseline (TF-IDF only) F1-Macro: 0.4679
❌ Skipped Feature: mmse, F1-Macro: 0.4679
❌ Skipped Feature: on, F1-Macro: 0.4679
❌ Skipped Feature: co, F1-Macro: 0.4679
❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.4679
❌ Skipped Feature: mlu, F1-Macro: 0.4679
❌ Skipped Feature: stop_words, F1-Macro: 0.4679
❌ Skipped Feature: tree_depth, F1-Macro: 0.4679
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.4679
❌ Skipped Feature: nouns_with_determiners, F1-Macro: 0.4679
❌ Skipped Feature: sid, F1-Macro: 0.4679
❌ Skipped Feature: sid_efficiency, F1-Macro: 0.4679
❌ Skipped Feature: pid, F1-Macro: 0.4679
❌ Skipped Feature: pid_efficiency, F1-Macro: 0.4679
❌ Skipped Feature: maas, F1-Macro: 0.4679
❌ Skipped Feature: frazier_score, F1-Macro: 0.4679
❌ Skipped Feature: words_per_clause, F1-Macro: 0.4679
❌ Skipped Feature: short_pause, F1-Macro: 0.4679
❌ Skipped Feature: mid_pause, F1-Macro: 0.4679
❌ Skipped Feature: long_pause, F1-Macro: 0.4679
❌ Skipped

❌ Skipped Feature: short_pause, F1-Macro: 0.4874
❌ Skipped Feature: mid_pause, F1-Macro: 0.5066
❌ Skipped Feature: long_pause, F1-Macro: 0.5038
❌ Skipped Feature: mean_surprisal, F1-Macro: 0.4796

Final Selected Features: ['on', 'co', 'mean_sent_embs', 'mlu', 'stop_words', 'tree_depth', 'verbs_with_inflections', 'nouns_with_determiners', 'sid', 'sid_efficiency', 'pid', 'pid_efficiency', 'maas', 'frazier_score', 'words_per_clause', 'short_pause', 'mid_pause', 'long_pause', 'mean_surprisal'], Final F1-Macro: 0.5953

🔎 Final Evaluation with Selected Features:
F1-Macro: 0.5953 ± 0.1178
Precision-Macro: 0.5898
Recall-Macro: 0.6421

### AD vs. Control ###


Model: Random Forest
Baseline (TF-IDF only) F1-Macro: 0.8078
❌ Skipped Feature: mmse, F1-Macro: 0.8076
❌ Skipped Feature: on, F1-Macro: 0.7987
✅ Removed Feature: co, New F1-Macro: 0.8432
❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.8033
❌ Skipped Feature: mlu, F1-Macro: 0.8155
❌ Skipped Feature: stop_words, F1-Macro: 0.7938
❌ Skipped Fe

❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.5295
❌ Skipped Feature: mlu, F1-Macro: 0.5279
❌ Skipped Feature: stop_words, F1-Macro: 0.5204
❌ Skipped Feature: tree_depth, F1-Macro: 0.5377
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.5384
❌ Skipped Feature: nouns_with_determiners, F1-Macro: 0.5174
❌ Skipped Feature: sid, F1-Macro: 0.5190
❌ Skipped Feature: sid_efficiency, F1-Macro: 0.5198
❌ Skipped Feature: pid, F1-Macro: 0.5133
❌ Skipped Feature: pid_efficiency, F1-Macro: 0.5274
❌ Skipped Feature: maas, F1-Macro: 0.5326
❌ Skipped Feature: frazier_score, F1-Macro: 0.5412
❌ Skipped Feature: words_per_clause, F1-Macro: 0.5309
❌ Skipped Feature: short_pause, F1-Macro: 0.5257
❌ Skipped Feature: mid_pause, F1-Macro: 0.5221
❌ Skipped Feature: long_pause, F1-Macro: 0.5245
❌ Skipped Feature: mean_surprisal, F1-Macro: 0.5079

Final Selected Features: ['on', 'co', 'mean_sent_embs', 'mlu', 'stop_words', 'tree_depth', 'verbs_with_inflections', 'nouns_with_determiners', 'sid', 'sid_effici

# Sequential Feature Selection

In [19]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer
import numpy as np

f1_macro_scorer = make_scorer(f1_score, average='macro')

for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    all_numeric = task_df.select_dtypes(include=[np.number]).drop(columns=['speaking time (s)']).columns.tolist()

    X_num_all = task_df[all_numeric].values
    numeric_feature_names = all_numeric
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        sfs = SFS(
            estimator=clone(model),
            k_features='best',
            floating=True,
            scoring=f1_macro_scorer,
            cv=kf,
            n_jobs=-1
        )

        sfs = sfs.fit(X_num_all, y)
        selected_idx = list(sfs.k_feature_idx_)
        selected_features = [numeric_feature_names[i] for i in selected_idx]
        X_selected = task_df[selected_features]
        print("Selected numeric features: ", selected_features)

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), selected_features)
            ]
        )

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X_selected, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "+".join(selected_features),
        })

        ### TF-IDF + Selected Features
        preprocessor = ColumnTransformer(
            transformers=[
                ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
                ('mmse', StandardScaler(), ['mmse'])
            ]
        )

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        X_selected = task_df[['speech'] + selected_features]

        cv_results = cross_validate(pipeline, X_selected, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "+".join(["TF-IDF"] + selected_features)
        })



### MCI vs. AD ###


Model: Random Forest
Selected numeric features:  ['mmse', 'short_pause']
F1-Macro: 0.8273 ± 0.0572
Precision-Macro: 0.8103
Recall-Macro: 0.8914
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
Selected numeric features:  ['mmse']
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319
F1-Macro: 0.8170 ± 0.0721
Precision-Macro: 0.7851
Recall-Macro: 0.8916

Model: Logistic Regression
Selected numeric features:  ['on', 'co', 'mean_sent_embs', 'stop_words', 'tree_depth', 'verbs_with_inflections', 'nouns_with_determiners', 'sid', 'sid_efficiency', 'pid', 'pid_efficiency', 'maas', 'frazier_score', 'mmse', 'short_pause', 'mean_surprisal']
F1-Macro: 0.7955 ± 0.0773
Precision-Macro: 0.7750
Recall-Macro: 0.8672
F1-Macro: 0.8357 ± 0.0816
Precision-Macro: 0.7966
Recall-Macro: 0.9386

### MCI vs. Control ###


Model: Random Forest
Selected numeric features:  ['stop_words', 'pid', 'pid_efficiency', 'mmse', 'mean_surprisal']
F1-Ma

# Results

In [20]:
import pandas as pd

In [21]:
all_results = pd.DataFrame(results)
# all_results.to_csv("all_results.csv", index=False)

In [22]:
all_results.head()

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
0,MCI vs. AD,Random Forest,0.820021,0.778155,0.931896,0.074311,MMSE
1,MCI vs. AD,SVM,0.820021,0.778155,0.931896,0.074311,MMSE
2,MCI vs. AD,Logistic Regression,0.820021,0.778155,0.931896,0.074311,MMSE
3,MCI vs. Control,Random Forest,0.571365,0.606726,0.613954,0.112546,MMSE
4,MCI vs. Control,SVM,0.616925,0.665836,0.64067,0.150424,MMSE


In [23]:
all_results = all_results[all_results["Model"] == "SVM"]

In [24]:
all_results.loc[all_results.groupby("Task")["Mean F1 Macro"].idxmax()]

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
31,AD vs. Control,SVM,0.962242,0.961029,0.968169,0.031528,TF-IDF + MMSE
61,MCI vs. AD,SVM,0.849255,0.821257,0.922562,0.081628,TF-IDF+mmse+stop_words+verbs_with_inflections+...
70,MCI vs. AD vs. Control,SVM,0.754181,0.7597,0.772782,0.083009,TF-IDF+mmse+co+mean_surprisal
64,MCI vs. Control,SVM,0.704522,0.744655,0.688144,0.184661,TF-IDF+mmse+mean_surprisal
