In [24]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [25]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import KFold, cross_validate, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

In [26]:
df = pd.read_csv("/Users/kirillkonca/Documents/dementia_prediction/data_filtered.csv")

In [27]:
df.head()

Unnamed: 0,id,diagnosis,speech,annotation,speaking time (s),on,co,mean_sent_embs,mlu,stop_words,...,pid,pid_efficiency,maas,frazier_score,words_per_clause,mmse,short_pause,mid_pause,long_pause,mean_surprisal
0,138-1,Control,there's a cookie jar on the shelf . and the li...,# sent_id = 1\n# text = there's a cookie jar o...,46.2,0.0,0.105263,0.802693,9.571429,0.589552,...,0.123539,0.324675,0.025319,0.727537,6.090909,28.0,0,0,1,4.843843
1,631-0,Control,the kids are in the cookies . the stool is fal...,# sent_id = 1\n# text = the kids are in the co...,17.15,0.0,0.0,0.853938,7.25,0.534483,...,0.059091,0.233236,0.024072,0.868304,4.461538,29.0,0,0,0,5.742492
2,121-0,Control,the boy is taking a cookie out of the cookie j...,# sent_id = 1\n# text = the boy is taking a co...,128.05,0.0,0.153846,0.801815,11.666667,0.567857,...,0.21821,0.554471,0.026686,0.719372,6.666667,30.0,0,1,0,4.826125
3,142-3,Control,the water's running over on the floor . the st...,# sent_id = 1\n# text = the water's running ov...,16.82,0.0,0.285714,0.838216,7.0,0.5,...,0.280952,0.653983,0.019465,1.004314,6.0,30.0,0,0,0,6.550135
4,267-2,Control,mother is drying the dishes and looking out th...,# sent_id = 1\n# text = mother is drying the d...,39.71,0.0,0.0,0.795138,10.090909,0.54955,...,0.304233,0.956938,0.023439,0.910656,6.529412,30.0,2,0,0,5.018967


In [28]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average="macro", zero_division=0)
stop_words = set(stopwords.words('english'))
stop_words = list(stop_words)
scoring = {
    'f1_macro': make_scorer(f1_score, average='macro', zero_division=0),
    'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
    'recall_macro': make_scorer(recall_score, average='macro', zero_division=0)
}

# Baselines

In [29]:
results = []

In [30]:
tasks = {
    "MCI vs. AD": df[df['diagnosis'].isin(['MCI', 'AD'])],
    "MCI vs. Control": df[df['diagnosis'].isin(['MCI', 'Control'])],
    "AD vs. Control": df[df['diagnosis'].isin(['AD', 'Control'])],
    "MCI vs. AD vs. Control": df
}

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    "SVM": SVC(random_state=42, class_weight='balanced'),
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
}

## MMSE

In [31]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df[['mmse']].values
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")
        cv_results = cross_validate(model, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "MMSE"
        })



### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

Model: SVM
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

Model: Logistic Regression
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5714 ± 0.1125
Precision-Macro: 0.6067
Recall-Macro: 0.6140

Model: SVM
F1-Macro: 0.6169 ± 0.1504
Precision-Macro: 0.6658
Recall-Macro: 0.6407

Model: Logistic Regression
F1-Macro: 0.6109 ± 0.1687
Precision-Macro: 0.6256
Recall-Macro: 0.6503

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9575 ± 0.0327
Precision-Macro: 0.9562
Recall-Macro: 0.9640

Model: SVM
F1-Macro: 0.9550 ± 0.0321
Precision-Macro: 0.9552
Recall-Macro: 0.9603

Model: Logistic Regression
F1-Macro: 0.9571 ± 0.0401
Precision-Macro: 0.9562
Recall-Macro: 0.9613

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.6755 ± 0.0696
Precision-Macro: 0.706

## TF-IDF

In [32]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df['speech']
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words)),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "TF-IDF + MMSE"
        })


### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: Logistic Regression
F1-Macro: 0.6586 ± 0.1372
Precision-Macro: 0.6785
Recall-Macro: 0.6610

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: SVM
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: Logistic Regression
F1-Macro: 0.4796 ± 0.0915
Precision-Macro: 0.4756
Recall-Macro: 0.4958

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.8078 ± 0.0699
Precision-Macro: 0.8111
Recall-Macro: 0.8112

Model: SVM
F1-Macro: 0.8082 ± 0.0576
Precision-Macro: 0.8120
Recall-Macro: 0.8151

Model: Logistic Regression
F1-Macro: 0.7814 ± 0.0580
Precision-Macro: 0.7875
Recall-Macro: 0.7906

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.5029 ± 0.0276
Precision-Macro: 0.491

## TF-IDF + MMSE

In [33]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df[['speech', 'mmse']]
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    # Define column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
            ('mmse', StandardScaler(), ['mmse'])
        ]
    )

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "TF-IDF + MMSE"
        })



### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
F1-Macro: 0.8170 ± 0.0721
Precision-Macro: 0.7851
Recall-Macro: 0.8916

Model: Logistic Regression
F1-Macro: 0.8357 ± 0.0816
Precision-Macro: 0.7966
Recall-Macro: 0.9386

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: SVM
F1-Macro: 0.6503 ± 0.1624
Precision-Macro: 0.7113
Recall-Macro: 0.6416

Model: Logistic Regression
F1-Macro: 0.6218 ± 0.1890
Precision-Macro: 0.6314
Recall-Macro: 0.6372

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9257 ± 0.0444
Precision-Macro: 0.9297
Recall-Macro: 0.9282

Model: SVM
F1-Macro: 0.9622 ± 0.0315
Precision-Macro: 0.9610
Recall-Macro: 0.9682

Model: Logistic Regression
F1-Macro: 0.9551 ± 0.0339
Precision-Macro: 0.9540
Recall-Macro: 0.9613

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.5972 ± 0.0315
Precision-Macro: 0.577

In [34]:
baselines_results = pd.DataFrame(results)

In [35]:
baselines_results.head()

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
0,MCI vs. AD,Random Forest,0.820021,0.778155,0.931896,0.074311,MMSE
1,MCI vs. AD,SVM,0.820021,0.778155,0.931896,0.074311,MMSE
2,MCI vs. AD,Logistic Regression,0.820021,0.778155,0.931896,0.074311,MMSE
3,MCI vs. Control,Random Forest,0.571365,0.606726,0.613954,0.112546,MMSE
4,MCI vs. Control,SVM,0.616925,0.665836,0.64067,0.150424,MMSE


In [36]:
baselines_results.loc[baselines_results.groupby('Task')['Mean F1 Macro'].idxmax()]

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
31,AD vs. Control,SVM,0.962242,0.961029,0.968169,0.031528,TF-IDF + MMSE
26,MCI vs. AD,Logistic Regression,0.835719,0.796607,0.938616,0.081554,TF-IDF + MMSE
10,MCI vs. AD vs. Control,SVM,0.695081,0.713441,0.76155,0.056706,MMSE
28,MCI vs. Control,SVM,0.650258,0.711317,0.641573,0.162386,TF-IDF + MMSE


# All Features

In [37]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    features_columns = task_df.select_dtypes(include=[np.number]).drop(columns=['speaking time (s)']).columns.tolist()
    features_columns.remove('mmse')
    X = task_df[features_columns]
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "All Features"
        })


### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4855 ± 0.0475
Precision-Macro: 0.4916
Recall-Macro: 0.5100

Model: SVM
F1-Macro: 0.6089 ± 0.1301
Precision-Macro: 0.6064
Recall-Macro: 0.6609

Model: Logistic Regression
F1-Macro: 0.5714 ± 0.1224
Precision-Macro: 0.5769
Recall-Macro: 0.6327

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5072 ± 0.1652
Precision-Macro: 0.4749
Recall-Macro: 0.5475

Model: SVM
F1-Macro: 0.5535 ± 0.1232
Precision-Macro: 0.5554
Recall-Macro: 0.5913

Model: Logistic Regression
F1-Macro: 0.5651 ± 0.1167
Precision-Macro: 0.5787
Recall-Macro: 0.6356

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.7306 ± 0.0842
Precision-Macro: 0.7439
Recall-Macro: 0.7344

Model: SVM
F1-Macro: 0.7665 ± 0.0683
Precision-Macro: 0.7707
Recall-Macro: 0.7756

Model: Logistic Regression
F1-Macro: 0.7577 ± 0.0702
Precision-Macro: 0.7605
Recall-Macro: 0.7668

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.4772 ± 0.0345
Precision-Macro: 0.464

# TF-IDF + All Features

In [38]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    features_columns = task_df.select_dtypes(include=[np.number]).drop(columns=['speaking time (s)']).columns.tolist()
    X = task_df[['speech'] + features_columns]
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    # Define column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
            ('num', StandardScaler(), features_columns)
        ]
    )

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "All Features"
        })


### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
F1-Macro: 0.7708 ± 0.1157
Precision-Macro: 0.7653
Recall-Macro: 0.8059

Model: Logistic Regression
F1-Macro: 0.8218 ± 0.1078
Precision-Macro: 0.8095
Recall-Macro: 0.8735

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: SVM
F1-Macro: 0.6071 ± 0.1220
Precision-Macro: 0.6242
Recall-Macro: 0.6531

Model: Logistic Regression
F1-Macro: 0.5945 ± 0.1063
Precision-Macro: 0.6008
Recall-Macro: 0.6348

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9262 ± 0.0351
Precision-Macro: 0.9264
Recall-Macro: 0.9282

Model: SVM
F1-Macro: 0.9256 ± 0.0449
Precision-Macro: 0.9248
Recall-Macro: 0.9310

Model: Logistic Regression
F1-Macro: 0.9449 ± 0.0393
Precision-Macro: 0.9440
Recall-Macro: 0.9499

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.5971 ± 0.0223
Precision-Macro: 0.577

# Linear Feature Addition to TF-IDF

In [39]:
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove("speaking time (s)")
numerical_features.remove("mmse")  # mmse will be added manually at the start
numerical_features.insert(0, "mmse")

In [40]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    for model_name, model in models.items():
        X_text = task_df[['speech']]  # Start with text only
        y = LabelEncoder().fit_transform(task_df['diagnosis'])

        # Define TF-IDF for baseline
        tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=5000, stop_words=stop_words)

        baseline_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        baseline_pipeline = Pipeline([
            ('preprocessor', baseline_preprocessor),
            ('clf', model)
        ])

        # Baseline: TF-IDF only
        baseline_score = cross_val_score(baseline_pipeline, X_text, y, cv=kf, scoring=f1_scorer).mean()
        print(f"\nModel: {model_name}")
        print(f"Baseline (TF-IDF only) F1-Macro: {baseline_score:.4f}")

        # Feature selection loop
        best_score = baseline_score
        selected_features = []

        for feature in numerical_features:

            current_features = selected_features + [feature]

            preprocessor = ColumnTransformer([
                ('tfidf', tfidf, 'speech'),
                ('num', StandardScaler(), current_features)
            ])

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('clf', model)
            ])

            X_all = task_df[['speech'] + current_features]
            new_score = cross_val_score(pipeline, X_all, y, cv=kf, scoring=f1_scorer).mean()

            if new_score - best_score >= best_score * 0.01:
                best_score = new_score
                selected_features.append(feature)
                print(f"✅ Added Feature: {feature}, New F1-Macro: {new_score:.4f}")
            else:
                print(f"❌ Skipped Feature: {feature}, F1-Macro: {new_score:.4f}")

        print(f"\nFinal Selected Features: {selected_features}, Final F1-Macro: {best_score:.4f}")

        # Final evaluation with all selected features
        final_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech'),
            ('num', StandardScaler(), selected_features)
        ]) if selected_features else ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        final_pipeline = Pipeline([
            ('preprocessor', final_preprocessor),
            ('clf', model)
        ])

        X_final = task_df[['speech'] + selected_features] if selected_features else task_df[['speech']]

        final_scores = cross_validate(final_pipeline, X_final, y, cv=kf, scoring=scoring)

        print(f"\n🔎 Final Evaluation with Selected Features:")
        print(f"F1-Macro: {final_scores['test_f1_macro'].mean():.4f} ± {final_scores['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {final_scores['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {final_scores['test_recall_macro'].mean():.4f}")
        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": final_scores['test_f1_macro'].mean(),
            "Std F1": final_scores['test_f1_macro'].std(),
            "Mean Precision Macro": final_scores['test_precision_macro'].mean(),
            "Mean Recall Macro": final_scores['test_recall_macro'].mean(),
            "Type": "+".join(["TF-IDF"] + selected_features) if selected_features else "TF-IDF only"
        })


### MCI vs. AD ###


Model: Random Forest
Baseline (TF-IDF only) F1-Macro: 0.4679
❌ Skipped Feature: mmse, F1-Macro: 0.4679
❌ Skipped Feature: on, F1-Macro: 0.4679
❌ Skipped Feature: co, F1-Macro: 0.4679
❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.4679
❌ Skipped Feature: mlu, F1-Macro: 0.4679
❌ Skipped Feature: stop_words, F1-Macro: 0.4679
❌ Skipped Feature: tree_depth, F1-Macro: 0.4679
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.4679
❌ Skipped Feature: nouns_with_determiners, F1-Macro: 0.4679
❌ Skipped Feature: sid, F1-Macro: 0.4679
❌ Skipped Feature: sid_efficiency, F1-Macro: 0.4679
❌ Skipped Feature: pid, F1-Macro: 0.4679
❌ Skipped Feature: pid_efficiency, F1-Macro: 0.4679
❌ Skipped Feature: maas, F1-Macro: 0.4679
❌ Skipped Feature: frazier_score, F1-Macro: 0.4679
❌ Skipped Feature: words_per_clause, F1-Macro: 0.4679
❌ Skipped Feature: short_pause, F1-Macro: 0.4679
❌ Skipped Feature: mid_pause, F1-Macro: 0.4679
❌ Skipped Feature: long_pause, F1-Macro: 0.4679
❌ Skipped

❌ Skipped Feature: short_pause, F1-Macro: 0.9307
❌ Skipped Feature: mid_pause, F1-Macro: 0.9253
❌ Skipped Feature: long_pause, F1-Macro: 0.9186
❌ Skipped Feature: mean_surprisal, F1-Macro: 0.9283

Final Selected Features: ['mmse', 'on', 'mean_sent_embs', 'tree_depth'], Final F1-Macro: 0.9428

🔎 Final Evaluation with Selected Features:
F1-Macro: 0.9428 ± 0.0474
Precision-Macro: 0.9428
Recall-Macro: 0.9457

Model: SVM
Baseline (TF-IDF only) F1-Macro: 0.8082
✅ Added Feature: mmse, New F1-Macro: 0.9622
❌ Skipped Feature: on, F1-Macro: 0.9573
❌ Skipped Feature: co, F1-Macro: 0.9622
❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.9574
❌ Skipped Feature: mlu, F1-Macro: 0.9623
❌ Skipped Feature: stop_words, F1-Macro: 0.9598
❌ Skipped Feature: tree_depth, F1-Macro: 0.9623
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.9598
❌ Skipped Feature: nouns_with_determiners, F1-Macro: 0.9574
❌ Skipped Feature: sid, F1-Macro: 0.9598
❌ Skipped Feature: sid_efficiency, F1-Macro: 0.9550
❌ Skipped Feat

# Linear Feature Deletion from TF-IDF

In [41]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    for model_name, model in models.items():
        X_text = task_df[['speech']]  # Start with text only
        y = LabelEncoder().fit_transform(task_df['diagnosis'])

        baseline_preprocessor = ColumnTransformer([
            ('tfidf', TfidfVectorizer(ngram_range=(1, 1), max_features=5000, stop_words=stop_words), 'speech')
        ])

        baseline_pipeline = Pipeline([
            ('preprocessor', baseline_preprocessor),
            ('clf', model)
        ])

        # Baseline: TF-IDF only
        baseline_score = cross_val_score(baseline_pipeline, X_text, y, cv=kf, scoring=f1_scorer).mean()
        print(f"\nModel: {model_name}")
        print(f"Baseline (TF-IDF only) F1-Macro: {baseline_score:.4f}")

        # Feature selection loop
        best_score = baseline_score
        selected_features = numerical_features.copy()
        current_features = selected_features.copy()
        for feature in numerical_features:

            current_features.remove(feature)

            preprocessor = ColumnTransformer([
                ('tfidf', tfidf, 'speech'),
                ('num', StandardScaler(), current_features)
            ])

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('clf', model)
            ])

            X_all = task_df[['speech'] + current_features]
            new_score = cross_val_score(pipeline, X_all, y, cv=kf, scoring=f1_scorer).mean()

            if new_score - best_score >= best_score * 0.01:
                best_score = new_score
                selected_features.remove(feature)
                print(f"✅ Removed Feature: {feature}, New F1-Macro: {new_score:.4f}")
            else:
                print(f"❌ Skipped Feature: {feature}, F1-Macro: {new_score:.4f}")

        print(f"\nFinal Selected Features: {selected_features}, Final F1-Macro: {best_score:.4f}")

        # Final evaluation with all selected features
        final_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech'),
            ('num', StandardScaler(), selected_features)
        ]) if selected_features else ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        final_pipeline = Pipeline([
            ('preprocessor', final_preprocessor),
            ('clf', model)
        ])

        X_final = task_df[['speech'] + selected_features] if selected_features else task_df[['speech']]

        final_scores = cross_validate(final_pipeline, X_final, y, cv=kf, scoring=scoring)

        print(f"\n🔎 Final Evaluation with Selected Features:")
        print(f"F1-Macro: {final_scores['test_f1_macro'].mean():.4f} ± {final_scores['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {final_scores['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {final_scores['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": final_scores['test_f1_macro'].mean(),
            "Std F1": final_scores['test_f1_macro'].std(),
            "Mean Precision Macro": final_scores['test_precision_macro'].mean(),
            "Mean Recall Macro": final_scores['test_recall_macro'].mean(),
            "Type": "+".join(["TF-IDF"] + selected_features) if selected_features else "TF-IDF only"
        })



### MCI vs. AD ###


Model: Random Forest
Baseline (TF-IDF only) F1-Macro: 0.4679
❌ Skipped Feature: mmse, F1-Macro: 0.4679
❌ Skipped Feature: on, F1-Macro: 0.4679
❌ Skipped Feature: co, F1-Macro: 0.4679
❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.4679
❌ Skipped Feature: mlu, F1-Macro: 0.4679
❌ Skipped Feature: stop_words, F1-Macro: 0.4679
❌ Skipped Feature: tree_depth, F1-Macro: 0.4679
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.4679
❌ Skipped Feature: nouns_with_determiners, F1-Macro: 0.4679
❌ Skipped Feature: sid, F1-Macro: 0.4679
❌ Skipped Feature: sid_efficiency, F1-Macro: 0.4679
❌ Skipped Feature: pid, F1-Macro: 0.4679
❌ Skipped Feature: pid_efficiency, F1-Macro: 0.4679
❌ Skipped Feature: maas, F1-Macro: 0.4679
❌ Skipped Feature: frazier_score, F1-Macro: 0.4679
❌ Skipped Feature: words_per_clause, F1-Macro: 0.4679
❌ Skipped Feature: short_pause, F1-Macro: 0.4679
❌ Skipped Feature: mid_pause, F1-Macro: 0.4679
❌ Skipped Feature: long_pause, F1-Macro: 0.4679
❌ Skipped

❌ Skipped Feature: long_pause, F1-Macro: 0.5038
❌ Skipped Feature: mean_surprisal, F1-Macro: 0.4796

Final Selected Features: ['on', 'co', 'mean_sent_embs', 'mlu', 'stop_words', 'tree_depth', 'verbs_with_inflections', 'nouns_with_determiners', 'sid', 'sid_efficiency', 'pid', 'pid_efficiency', 'maas', 'frazier_score', 'words_per_clause', 'short_pause', 'mid_pause', 'long_pause', 'mean_surprisal'], Final F1-Macro: 0.5827

🔎 Final Evaluation with Selected Features:
F1-Macro: 0.5827 ± 0.1110
Precision-Macro: 0.5811
Recall-Macro: 0.6308

### AD vs. Control ###


Model: Random Forest
Baseline (TF-IDF only) F1-Macro: 0.8078
❌ Skipped Feature: mmse, F1-Macro: 0.7865
❌ Skipped Feature: on, F1-Macro: 0.7944
✅ Removed Feature: co, New F1-Macro: 0.8244
❌ Skipped Feature: mean_sent_embs, F1-Macro: 0.8194
❌ Skipped Feature: mlu, F1-Macro: 0.8086
❌ Skipped Feature: stop_words, F1-Macro: 0.8021
❌ Skipped Feature: tree_depth, F1-Macro: 0.7993
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.7913


❌ Skipped Feature: tree_depth, F1-Macro: 0.5356
❌ Skipped Feature: verbs_with_inflections, F1-Macro: 0.5384
❌ Skipped Feature: nouns_with_determiners, F1-Macro: 0.5174
❌ Skipped Feature: sid, F1-Macro: 0.5190
❌ Skipped Feature: sid_efficiency, F1-Macro: 0.5198
❌ Skipped Feature: pid, F1-Macro: 0.5133
❌ Skipped Feature: pid_efficiency, F1-Macro: 0.5274
❌ Skipped Feature: maas, F1-Macro: 0.5326
❌ Skipped Feature: frazier_score, F1-Macro: 0.5412
❌ Skipped Feature: words_per_clause, F1-Macro: 0.5309
❌ Skipped Feature: short_pause, F1-Macro: 0.5257
❌ Skipped Feature: mid_pause, F1-Macro: 0.5221
❌ Skipped Feature: long_pause, F1-Macro: 0.5245
❌ Skipped Feature: mean_surprisal, F1-Macro: 0.5079

Final Selected Features: ['co', 'mean_sent_embs', 'mlu', 'stop_words', 'tree_depth', 'verbs_with_inflections', 'nouns_with_determiners', 'sid', 'sid_efficiency', 'pid', 'pid_efficiency', 'maas', 'frazier_score', 'words_per_clause', 'short_pause', 'mid_pause', 'long_pause', 'mean_surprisal'], Final F1-

# Sequential Feature Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer
import numpy as np

f1_macro_scorer = make_scorer(f1_score, average='macro')

for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    all_numeric = task_df.select_dtypes(include=[np.number]).drop(columns=['speaking time (s)']).columns.tolist()

    X_num_all = task_df[all_numeric].values
    numeric_feature_names = all_numeric
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        sfs = SFS(
            estimator=clone(model),
            k_features='best',
            floating=True,
            scoring=f1_macro_scorer,
            cv=kf,
            n_jobs=-1
        )

        sfs = sfs.fit(X_num_all, y)
        selected_idx = list(sfs.k_feature_idx_)
        selected_features = [numeric_feature_names[i] for i in selected_idx]
        X_selected = task_df[selected_features]
        print("Selected numeric features: ", selected_features)

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), selected_features)
            ]
        )

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X_selected, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "+".join(selected_features),
        })

        ### TF-IDF + Selected Features
        preprocessor = ColumnTransformer(
            transformers=[
                ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
                ('num', StandardScaler(), selected_features)
            ]
        )

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        X_selected = task_df[['speech'] + selected_features]

        cv_results = cross_validate(pipeline, X_selected, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "+".join(["TF-IDF"] + selected_features)
        })



### MCI vs. AD ###


Model: Random Forest
Selected numeric features:  ['mmse', 'short_pause']
F1-Macro: 0.8273 ± 0.0572
Precision-Macro: 0.8103
Recall-Macro: 0.8914
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
Selected numeric features:  ['mmse']
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319
F1-Macro: 0.8170 ± 0.0721
Precision-Macro: 0.7851
Recall-Macro: 0.8916

Model: Logistic Regression
Selected numeric features:  ['on', 'co', 'mean_sent_embs', 'stop_words', 'tree_depth', 'verbs_with_inflections', 'nouns_with_determiners', 'sid', 'sid_efficiency', 'pid', 'pid_efficiency', 'maas', 'frazier_score', 'mmse', 'short_pause', 'mean_surprisal']
F1-Macro: 0.8074 ± 0.0685
Precision-Macro: 0.7798
Recall-Macro: 0.8925
F1-Macro: 0.7985 ± 0.0819
Precision-Macro: 0.7744
Recall-Macro: 0.8697

### MCI vs. Control ###


Model: Random Forest
Selected numeric features:  ['verbs_with_inflections', 'pid', 'mmse', 'mean_surprisal']
F1-Macro: 0

# Results

In [19]:
import pandas as pd

In [2]:
all_results = pd.read_csv("/Users/kirillkonca/Documents/dementia_prediction/src/classification/all_results.csv")

In [20]:
all_results = pd.DataFrame(results)
# all_results.to_csv("all_results.csv", index=False)

In [21]:
all_results.head()

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
0,MCI vs. AD,Random Forest,0.820021,0.778155,0.931896,0.074311,MMSE
1,MCI vs. AD,SVM,0.820021,0.778155,0.931896,0.074311,MMSE
2,MCI vs. AD,Logistic Regression,0.820021,0.778155,0.931896,0.074311,MMSE
3,MCI vs. Control,Random Forest,0.571365,0.606726,0.613954,0.112546,MMSE
4,MCI vs. Control,SVM,0.616925,0.665836,0.64067,0.150424,MMSE


In [22]:
all_results = all_results[all_results["Model"] == "SVM"]

In [23]:
all_results.loc[all_results.groupby("Task")["Mean F1 Macro"].idxmax()]

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
31,AD vs. Control,SVM,0.962242,0.961029,0.968169,0.031528,TF-IDF + MMSE
73,MCI vs. AD,SVM,0.843426,0.809513,0.9201,0.086444,TF-IDF+mmse+on+stop_words+frazier_score
82,MCI vs. AD vs. Control,SVM,0.740903,0.751815,0.773452,0.079807,TF-IDF+mmse+co+verbs_with_inflections+short_pause
76,MCI vs. Control,SVM,0.685734,0.711041,0.673876,0.195618,TF-IDF+mmse+sid+mean_surprisal
