In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import KFold, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

In [3]:
df = pd.read_csv("/Users/kirillkonca/Documents/dementia_prediction/data_filtered.csv")

In [4]:
df.head()

Unnamed: 0,id,diagnosis,speech,annotation,speaking time (s),on,co,mean_sent_embs,mlu,stop_words,...,sid_efficiency,pid,pid_efficiency,maas,frazier_score,words_per_clause,mmse,short_pause,mid_pause,long_pause
0,138-1,Control,there's a cookie jar on the shelf . and the li...,# sent_id = 1\n# text = there's a cookie jar o...,46.2,0.0,0.105263,0.802693,9.571429,0.589552,...,0.974026,0.123539,0.324675,0.025319,0.727537,6.090909,28.0,0,0,1
1,631-0,Control,the kids are in the cookies . the stool is fal...,# sent_id = 1\n# text = the kids are in the co...,17.15,0.0,0.0,0.853938,7.25,0.534483,...,1.107872,0.059091,0.233236,0.024072,0.868304,4.461538,29.0,0,0,0
2,121-0,Control,the boy is taking a cookie out of the cookie j...,# sent_id = 1\n# text = the boy is taking a co...,128.05,0.0,0.153846,0.801815,11.666667,0.567857,...,0.788754,0.21821,0.554471,0.026686,0.719372,6.666667,30.0,0,1,0
3,142-3,Control,the water's running over on the floor . the st...,# sent_id = 1\n# text = the water's running ov...,16.82,0.0,0.285714,0.838216,7.0,0.5,...,1.070155,0.280952,0.653983,0.019465,1.004314,6.0,30.0,0,0,0
4,267-2,Control,mother is drying the dishes and looking out th...,# sent_id = 1\n# text = mother is drying the d...,39.71,0.0,0.0,0.795138,10.090909,0.54955,...,1.133216,0.304233,0.956938,0.023439,0.910656,6.529412,30.0,2,0,0


In [5]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average="macro")
stop_words = set(stopwords.words('english'))
stop_words = list(stop_words)
scoring = {
    'f1_macro': make_scorer(f1_score, average='macro', zero_division=0),
    'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
    'recall_macro': make_scorer(recall_score, average='macro', zero_division=0)
}

# Baselines

In [6]:
results = []

In [7]:
tasks = {
    "MCI vs. AD": df[df['diagnosis'].isin(['MCI', 'AD'])],
    "MCI vs. Control": df[df['diagnosis'].isin(['MCI', 'Control'])],
    "AD vs. Control": df[df['diagnosis'].isin(['AD', 'Control'])],
    "MCI vs. AD vs. Control": df
}

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    "SVM": SVC(random_state=42, class_weight='balanced'),
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
}

## MMSE

In [7]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df[['mmse']].values
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")
        cv_results = cross_validate(model, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "MMSE"
        })



### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

Model: SVM
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

Model: Logistic Regression
F1-Macro: 0.8200 ± 0.0743
Precision-Macro: 0.7782
Recall-Macro: 0.9319

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5714 ± 0.1125
Precision-Macro: 0.6067
Recall-Macro: 0.6140

Model: SVM
F1-Macro: 0.6169 ± 0.1504
Precision-Macro: 0.6658
Recall-Macro: 0.6407

Model: Logistic Regression
F1-Macro: 0.6109 ± 0.1687
Precision-Macro: 0.6256
Recall-Macro: 0.6503

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9575 ± 0.0327
Precision-Macro: 0.9562
Recall-Macro: 0.9640

Model: SVM
F1-Macro: 0.9550 ± 0.0321
Precision-Macro: 0.9552
Recall-Macro: 0.9603

Model: Logistic Regression
F1-Macro: 0.9571 ± 0.0401
Precision-Macro: 0.9562
Recall-Macro: 0.9613

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.6755 ± 0.0696
Precision-Macro: 0.706

## TF-IDF

In [8]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df['speech']
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words)),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "TF-IDF + MMSE"
        })


### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: Logistic Regression
F1-Macro: 0.6586 ± 0.1372
Precision-Macro: 0.6785
Recall-Macro: 0.6610

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: SVM
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: Logistic Regression
F1-Macro: 0.4796 ± 0.0915
Precision-Macro: 0.4756
Recall-Macro: 0.4958

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.8078 ± 0.0699
Precision-Macro: 0.8111
Recall-Macro: 0.8112

Model: SVM
F1-Macro: 0.8082 ± 0.0576
Precision-Macro: 0.8120
Recall-Macro: 0.8151

Model: Logistic Regression
F1-Macro: 0.7814 ± 0.0580
Precision-Macro: 0.7875
Recall-Macro: 0.7906

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.5029 ± 0.0276
Precision-Macro: 0.491

## TF-IDF + MMSE

In [9]:
for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df[['speech', 'mmse']]
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    # Define column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
            ('mmse', StandardScaler(), ['mmse'])
        ]
    )

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "TF-IDF + MMSE"
        })



### MCI vs. AD ###


Model: Random Forest
F1-Macro: 0.4679 ± 0.0115
Precision-Macro: 0.4400
Recall-Macro: 0.5000

Model: SVM
F1-Macro: 0.8170 ± 0.0721
Precision-Macro: 0.7851
Recall-Macro: 0.8916

Model: Logistic Regression
F1-Macro: 0.8357 ± 0.0816
Precision-Macro: 0.7966
Recall-Macro: 0.9386

### MCI vs. Control ###


Model: Random Forest
F1-Macro: 0.5085 ± 0.1649
Precision-Macro: 0.4751
Recall-Macro: 0.5500

Model: SVM
F1-Macro: 0.6503 ± 0.1624
Precision-Macro: 0.7113
Recall-Macro: 0.6416

Model: Logistic Regression
F1-Macro: 0.6218 ± 0.1890
Precision-Macro: 0.6314
Recall-Macro: 0.6372

### AD vs. Control ###


Model: Random Forest
F1-Macro: 0.9257 ± 0.0444
Precision-Macro: 0.9297
Recall-Macro: 0.9282

Model: SVM
F1-Macro: 0.9622 ± 0.0315
Precision-Macro: 0.9610
Recall-Macro: 0.9682

Model: Logistic Regression
F1-Macro: 0.9551 ± 0.0339
Precision-Macro: 0.9540
Recall-Macro: 0.9613

### MCI vs. AD vs. Control ###


Model: Random Forest
F1-Macro: 0.5972 ± 0.0315
Precision-Macro: 0.577

In [10]:
baselines_results = pd.DataFrame(results)

In [11]:
baselines_results.head()

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
0,MCI vs. AD,Random Forest,0.820021,0.778155,0.931896,0.074311,MMSE
1,MCI vs. AD,SVM,0.820021,0.778155,0.931896,0.074311,MMSE
2,MCI vs. AD,Logistic Regression,0.820021,0.778155,0.931896,0.074311,MMSE
3,MCI vs. Control,Random Forest,0.571365,0.606726,0.613954,0.112546,MMSE
4,MCI vs. Control,SVM,0.616925,0.665836,0.64067,0.150424,MMSE


In [12]:
baselines_results.loc[baselines_results.groupby('Task')['Mean F1 Macro'].idxmax()]

Unnamed: 0,Task,Model,Mean F1 Macro,Mean Precision Macro,Mean Recall Macro,Std F1,Type
31,AD vs. Control,SVM,0.962242,0.961029,0.968169,0.031528,TF-IDF + MMSE
26,MCI vs. AD,Logistic Regression,0.835719,0.796607,0.938616,0.081554,TF-IDF + MMSE
10,MCI vs. AD vs. Control,SVM,0.695081,0.713441,0.76155,0.056706,MMSE
28,MCI vs. Control,SVM,0.650258,0.711317,0.641573,0.162386,TF-IDF + MMSE


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import KFold, cross_validate
import pandas as pd

# --- Setup ---
text_column = 'speech'
always_include_num = ['mmse']
candidate_num = df.select_dtypes(include=[np.number]).columns.tolist()
candidate_num.remove('mmse') # Replace with your actual feature names
target = 'diagnosis'

# --- Prepare your dataset ---
# Ensure X is a DataFrame (important!)
X = df[[text_column] + always_include_num + candidate_num].copy()
y = df[target]

# --- Step 1: Select features using SFS on numeric data only ---
from sklearn.linear_model import LogisticRegression

# Use only candidate features for SFS
X_sfs = df[candidate_num]

sfs = SequentialFeatureSelector(
    SVC(random_state=42, class_weight='balanced'),
    n_features_to_select='auto',
    direction='forward',
    scoring='f1_macro',
    tol = 0.005,
    cv=kf,
    n_jobs=-1,
)
sfs.fit(X_sfs, y)

selected_features = [name for name, selected in zip(candidate_num, sfs.get_support()) if selected]
print("Selected features:", selected_features)

# --- Step 2: Build final pipeline using selected features ---
all_numeric = always_include_num + selected_features

preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words), text_column),
    ('num', StandardScaler(), all_numeric)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVC(random_state=42, class_weight='balanced'))
])

# --- Step 3: Cross-validation ---
kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_results = cross_validate(
    pipeline, X, y,
    cv=kf,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    error_score='raise'  # <- optional: force showing errors
)

import numpy as np
print("Accuracy: ", np.mean(cv_results['test_accuracy']))
print("F1 Macro: ", np.mean(cv_results['test_f1_macro']))

Selected features: ['mean_sent_embs', 'nouns_with_determiners']
Accuracy:  0.8441414141414141
F1 Macro:  0.651820770959275


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import KFold, cross_validate
import pandas as pd
import numpy as np

# --- Setup ---
text_column = 'speech'
always_include_num = ['mmse']
candidate_num = df.select_dtypes(include=[np.number]).columns.tolist()
candidate_num.remove('mmse')  # Remove mmse from candidate features, as it's in always_include_num
target = 'diagnosis'

# --- Prepare your dataset ---
X = df[[text_column] + always_include_num + candidate_num].copy()
y = df[target]

# --- Step 1: Preprocess text and numeric features together ---
# Define KFold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# ColumnTransformer to handle both TF-IDF and numeric columns together
# We'll treat the entire TF-IDF as a single feature, so no need to break it down into words.
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words), text_column),  # For text column
    ('num', StandardScaler(), always_include_num + candidate_num)  # For numeric features (including mmse)
])

# --- Step 2: Apply Sequential Feature Selection (SFS) to all features ---
# SFS will be applied to the full combined feature set (TF-IDF as one feature, plus numeric ones)
sfs = SequentialFeatureSelector(
    estimator=SVC(random_state=42, class_weight='balanced'),
    n_features_to_select='auto',  # Choose an optimal number of features
    direction='forward',  # Forward selection (could also use 'backward')
    scoring='f1_macro',
    # tol=0.005,
    cv=kf,
    n_jobs=-1
)

# Create the pipeline including preprocessing and feature selection
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('sfs', sfs),
    ('clf', SVC(random_state=42, class_weight='balanced'))
])

# --- Step 3: Cross-validation ---
cv_results = cross_validate(
    pipeline, X, y,
    cv=kf,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    error_score='raise'  # Optional: show errors
)

# Output results
print("Accuracy: ", np.mean(cv_results['test_accuracy']))
print("F1 Macro: ", np.mean(cv_results['test_f1_macro']))


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold, cross_validate
import pandas as pd
import numpy as np

# --- Setup ---
text_column = 'speech'
always_include_num = ['mmse']
candidate_num = df.select_dtypes(include=[np.number]).columns.tolist()
candidate_num.remove('mmse')  # Replace with your actual feature names
target = 'diagnosis'

# --- Prepare your dataset ---
# Ensure X is a DataFrame (important!)
X = df[[text_column] + always_include_num + candidate_num].copy()
y = df[target]

# --- Step 1: Select features using RFECV on numeric data only ---
from sklearn.svm import SVC

# Use only candidate features for RFECV
X_sfs = df[candidate_num]

# Create a pipeline with a StandardScaler and SVC estimator
estimator = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('svc', SVC(random_state=42, class_weight='balanced'))  # Model
])

# Set up the RFECV for feature selection
rfecv = RFECV(
    estimator=estimator,
    step=1,  # Number of features to remove at each iteration
    min_features_to_select=1,  # Minimum number of features to select
    cv=kf,  # Cross-validation splitting strategy
    scoring='f1_macro',  # Use f1_macro as the scoring metric
    n_jobs=-1  # Parallelize across all CPUs
)

# Fit RFECV to the data
rfecv.fit(X_sfs, y)

# Get the selected features
selected_features = [name for name, selected in zip(candidate_num, rfecv.support_) if selected]
print("Selected features:", selected_features)

# --- Step 2: Build final pipeline using selected features ---
all_numeric = always_include_num + selected_features

# Create the preprocessing pipeline
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words), text_column),
    ('num', StandardScaler(), all_numeric)
])

# Build the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVC(random_state=42, class_weight='balanced'))
])

# --- Step 3: Cross-validation ---
kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_results = cross_validate(
    pipeline, X, y,
    cv=kf,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    error_score='raise'  # <- optional: force showing errors
)

import numpy as np
print("Accuracy: ", np.mean(cv_results['test_accuracy']))
print("F1 Macro: ", np.mean(cv_results['test_f1_macro']))


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold, cross_validate
import pandas as pd
import numpy as np

# --- Setup ---
text_column = 'speech'
always_include_num = ['mmse']
candidate_num = df.select_dtypes(include=[np.number]).columns.tolist()
candidate_num.remove('mmse')  # Replace with your actual feature names
target = 'diagnosis'

# --- Prepare your dataset ---
# Ensure X is a DataFrame (important!)
X = df[[text_column] + always_include_num + candidate_num].copy()
y = df[target]

# --- Step 1: Select features using RFECV on numeric data only ---
from sklearn.svm import SVC

# Use only candidate features for RFECV
X_sfs = df[candidate_num]

# Set up the RFECV for feature selection
rfecv = RFECV(
    estimator=SVC(random_state=42, class_weight='balanced'),
    step=1,  # Number of features to remove at each iteration
    min_features_to_select=1,  # Minimum number of features to select
    cv=kf,  # Cross-validation splitting strategy
    scoring='f1_macro',  # Use f1_macro as the scoring metric
    n_jobs=-1  # Parallelize across all CPUs
)

# Fit RFECV to the data
rfecv.fit(X_sfs, y)

# Get the selected features
selected_features = [name for name, selected in zip(candidate_num, rfecv.support_) if selected]
print("Selected features:", selected_features)

# --- Step 2: Build final pipeline using selected features ---
all_numeric = always_include_num + selected_features

# Create the preprocessing pipeline
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words), text_column),
    ('num', StandardScaler(), all_numeric)
])

# Build the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVC(random_state=42, class_weight='balanced'))
])

# --- Step 3: Cross-validation ---
kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_results = cross_validate(
    pipeline, X, y,
    cv=kf,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    error_score='raise'  # <- optional: force showing errors
)

import numpy as np
print("Accuracy: ", np.mean(cv_results['test_accuracy']))
print("F1 Macro: ", np.mean(cv_results['test_f1_macro']))


ERROR! Session/line number was not unique in database. History logging moved to new session 808


In [44]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_validate
from sklearn.feature_selection import RFECV
import pandas as pd

# --- Setup ---
text_column = 'speech'
always_include_num = ['mmse']
candidate_num = df.select_dtypes(include=[np.number]).columns.tolist()
candidate_num.remove('mmse')
target = 'diagnosis'

# --- Prepare your dataset ---
# Ensure X is a DataFrame (important!)
X = df[[text_column] + always_include_num + candidate_num].copy()
y = df[target]

# --- Step 1: Setup RFECV for feature selection ---
from sklearn.linear_model import LogisticRegression

# Use all numeric features for RFECV selection
X_rfecv = df[always_include_num + candidate_num]

# Initialize the model for RFECV
model = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)

# Initialize RFECV with the model, step=1 removes one feature at a time, cv=5 is the number of cross-validation folds
rfecv = RFECV(estimator=model, step=1, cv=KFold(10), scoring='f1_macro')
X_rfecv_selected = rfecv.fit_transform(X_rfecv, y)

# Get the selected feature names
selected_features = [name for name, selected in zip(always_include_num + candidate_num, rfecv.support_) if selected]
print("Selected features by RFECV:", selected_features)

# --- Step 2: Build final pipeline using selected features ---
all_numeric = always_include_num + selected_features

# Column transformer with TF-IDF for text and standard scaler for numeric features
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words=stop_words), text_column),
    ('num', StandardScaler(), all_numeric)
])

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42))
])

# --- Step 3: Cross-validation ---
kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_results = cross_validate(
    pipeline, X, y,
    cv=kf,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    error_score='raise'  # <- optional: force showing errors
)

import numpy as np
print("Accuracy: ", np.mean(cv_results['test_accuracy']))
print("F1 Macro: ", np.mean(cv_results['test_f1_macro']))


Selected features by RFECV: ['mmse', 'co', 'mean_sent_embs', 'tree_depth', 'nouns_with_determiners', 'sid_efficiency', 'pid_efficiency', 'frazier_score']
Accuracy:  0.861868686868687
F1 Macro:  0.6930257986614607
