In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
stop_words = set(stopwords.words('english'))

In [2]:
stop_words = list(stop_words)

In [13]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

# Define custom scorers
scoring = {
    'f1_macro': make_scorer(f1_score, average='macro'),
    'precision_macro': make_scorer(precision_score, average='macro'),
    'recall_macro': make_scorer(recall_score, average='macro')
}

for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    task_df = task_df.dropna(subset=['speech', 'mmse', 'diagnosis'])  # Clean up
    X = task_df[['speech', 'mmse']]
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    # Define column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
            ('mmse', StandardScaler(), ['mmse'])
        ]
    )

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

        print(f"F1-Macro: {cv_results['test_f1_macro'].mean():.4f} ± {cv_results['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {cv_results['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {cv_results['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": cv_results['test_f1_macro'].mean(),
            "Mean Precision Macro": cv_results['test_precision_macro'].mean(),
            "Mean Recall Macro": cv_results['test_recall_macro'].mean(),
            "Std F1": cv_results['test_f1_macro'].std(),
            "Type": "TF-IDF + MMSE"
        })


NameError: name 'tasks' is not defined

In [33]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

# Load your data
# df = pd.read_csv("your_data.csv")
df = result  # Assuming 'result' is already your DataFrame

# Define classification tasks
tasks = {
    "MCI vs. AD": df[df['diagnosis'].isin(['MCI', 'AD'])],
    "MCI vs. Control": df[df['diagnosis'].isin(['MCI', 'Control'])],
    "AD vs. Control": df[df['diagnosis'].isin(['AD', 'Control'])],
    "MCI vs. AD vs. Control": df
}

# Define models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    "SVM": SVC(random_state=42, class_weight='balanced'),
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average="macro")
stop_words = set(stopwords.words('english'))

# Numerical features
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove("mmse")  # mmse will be added manually at the start
numerical_features.insert(0, "mmse")

results = []

for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    for model_name, model in models.items():
        X_text = task_df[['speech']]  # Start with text only
        y = LabelEncoder().fit_transform(task_df['diagnosis'])

        # Define TF-IDF for baseline
        tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=5000, stop_words=stop_words)

        baseline_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        baseline_pipeline = Pipeline([
            ('preprocessor', baseline_preprocessor),
            ('clf', model)
        ])

        # Baseline: TF-IDF only
        baseline_score = cross_val_score(baseline_pipeline, X_text, y, cv=kf, scoring=f1_scorer).mean()
        print(f"\nModel: {model_name}")
        print(f"Baseline (TF-IDF only) F1-Macro: {baseline_score:.4f}")

        # Feature selection loop
        best_score = baseline_score
        selected_features = []

        for feature in numerical_features:
            if feature == "speaking time (s)":
                continue

            current_features = selected_features + [feature]

            preprocessor = ColumnTransformer([
                ('tfidf', tfidf, 'speech'),
                ('num', StandardScaler(), current_features)
            ])

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('clf', model)
            ])

            X_all = task_df[['speech'] + current_features]
            new_score = cross_val_score(pipeline, X_all, y, cv=kf, scoring=f1_scorer).mean()

            if new_score - best_score >= best_score * 0.001:
                best_score = new_score
                selected_features.append(feature)
                print(f"✅ Added Feature: {feature}, New F1-Macro: {new_score:.4f}")
            else:
                print(f"❌ Skipped Feature: {feature}, F1-Macro: {new_score:.4f}")

        print(f"\nFinal Selected Features: {selected_features}, Final F1-Macro: {best_score:.4f}")

        # Final evaluation with all selected features
        final_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech'),
            ('num', StandardScaler(), selected_features)
        ]) if selected_features else ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        final_pipeline = Pipeline([
            ('preprocessor', final_preprocessor),
            ('clf', model)
        ])

        X_final = task_df[['speech'] + selected_features] if selected_features else task_df[['speech']]

        scoring_final = {
            'f1_macro': make_scorer(f1_score, average='macro'),
            'precision_macro': make_scorer(precision_score, average='macro'),
            'recall_macro': make_scorer(recall_score, average='macro')
        }

        final_scores = cross_validate(final_pipeline, X_final, y, cv=kf, scoring=scoring_final)

        print(f"\n🔎 Final Evaluation with Selected Features:")
        print(f"F1-Macro: {final_scores['test_f1_macro'].mean():.4f} ± {final_scores['test_f1_macro'].std():.4f}")
        print(f"Precision-Macro: {final_scores['test_precision_macro'].mean():.4f}")
        print(f"Recall-Macro: {final_scores['test_recall_macro'].mean():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": final_scores['test_f1_macro'].mean(),
            "Std F1": final_scores['test_f1_macro'].std(),
            "Mean Precision Macro": final_scores['test_precision_macro'].mean(),
            "Mean Recall Macro": final_scores['test_recall_macro'].mean(),
            "Type": "+".join(selected_features) if selected_features else "TF-IDF only"
        })



### MCI vs. AD ###



ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 1001, in fit_transform
    result = self._call_func_on_transformers(
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 910, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/feature_extraction/text.py", line 2104, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {"wasn't", 'any', 'his', "aren't", 'i', 'her', "hadn't", 's', 'or', 'have', 'he', 'which', 'yourselves', "he'd", 'd', 'again', 'himself', 'our', "she's", 'at', "didn't", "i'd", 'through', "we'll", 'while', 'few', 'been', 'being', "it'd", 'once', 'this', 'am', 'what', 'no', 'own', 'a', 'll', 'herself', "won't", 'off', 'those', 'same', 'ours', 'now', 'into', 'during', 'after', 'is', 'other', 'out', 'didn', 'doesn', 'between', 'further', 'that', 'themselves', 'had', 'over', "isn't", 'by', 'from', 'too', "haven't", 'not', 'can', 'ourselves', 'we', 'wouldn', "mustn't", "needn't", 'these', 'ma', 'both', 'up', "we've", 'won', "it's", 'needn', "wouldn't", 'me', "they'd", 'so', 'hasn', 'about', 'shan', 'where', 'but', 'if', 'before', "i'm", 'yourself', 'doing', 'such', 'than', 'y', 'will', 'was', 'why', 'they', "don't", 'because', "doesn't", "couldn't", 'them', 'having', 'all', 'don', "i've", 't', "we'd", 'below', 'you', 'for', 'ain', 'weren', 'in', 'then', 'my', "he'll", 'hers', 'it', 'when', 'haven', "it'll", 'only', 'be', "hasn't", 'm', 'each', "he's", "should've", 'were', "they're", 'who', 'him', 'most', "weren't", 'the', 've', 'hadn', 'under', "she'll", 'on', 'until', 'of', 'mightn', 'did', 'shouldn', 'isn', "you're", 'mustn', "they'll", "shan't", 'whom', "you'd", 'here', 'there', 'does', 'she', 'its', 'do', 'down', "i'll", "mightn't", 'are', 'just', 'as', 'against', 'and', 'nor', 'how', 'with', 'above', "shouldn't", 'your', 'their', "they've", "that'll", "she'd", 'has', 'wasn', 'more', "you'll", 'myself', 'o', "you've", 'very', 'should', 'an', 'couldn', 'itself', 'to', 'theirs', "we're", 'some', 're', 'yours', 'aren'} instead.


In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

# Load data
# df = pd.read_csv("/Users/kirillkonca/Documents/dementia_prediction/data.csv")
df = result

# Define classification tasks
tasks = {
    "MCI vs. AD": df[df['diagnosis'].isin(['MCI', 'AD'])],
    "MCI vs. Control": df[df['diagnosis'].isin(['MCI', 'Control'])],
    "AD vs. Control": df[df['diagnosis'].isin(['AD', 'Control'])],
    "MCI vs. AD vs. Control": df
}

# Define models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    "SVM": SVC(random_state=42, class_weight='balanced'),
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average="macro")

stop_words = set(stopwords.words('english'))
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove("mmse")
numerical_features.insert(0, "mmse")

results = []

for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")

    for model_name, model in models.items():
        X_text = task_df[['speech']]  # Keep as DataFrame
        y = LabelEncoder().fit_transform(task_df['diagnosis'])

        # Define baseline TF-IDF transformation
        tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=5000, stop_words=stop_words)

        preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech')  # Start with only TF-IDF
        ])

        print(f"\nModel: {model_name}")

        # Baseline model with only TF-IDF
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        baseline_score = cross_val_score(pipeline, X_text, y, cv=kf, scoring=f1_scorer).mean()
        print(f"Baseline (TF-IDF only) F1-Macro: {baseline_score:.4f}")

        best_score = baseline_score
        selected_features = []

        # Try adding numerical features one by one
        for feature in numerical_features:
            if feature == "speaking time (s)":
                continue

            if len(selected_features) == 0:
                f = [feature]
            else:
                f = selected_features + [feature]

            preprocessor = ColumnTransformer([
                ('tfidf', tfidf, 'speech'),
                ('num', StandardScaler(), f)
            ])

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('clf', model)
            ])

            new_score = cross_val_score(pipeline, task_df, y, cv=kf, scoring=f1_scorer).mean()

            if new_score - best_score >= best_score * 0.001:  # Only keep features that improve performance
                best_score = new_score
                selected_features.append(feature)
                print(f"✅ Added Feature: {feature}, New F1-Macro: {new_score:.4f}")
            else:
                print(f"❌ Skipped Feature: {feature}, F1-Macro: {new_score:.4f}")

        print(f"\nFinal Selected Features: {selected_features}, Final F1-Macro: {best_score:.4f}")

        # Rebuild final pipeline with selected features
        final_preprocessor = ColumnTransformer([
            ('tfidf', tfidf, 'speech'),
            ('num', StandardScaler(), selected_features)
        ]) if selected_features else ColumnTransformer([
            ('tfidf', tfidf, 'speech')
        ])

        final_pipeline = Pipeline([
            ('preprocessor', final_preprocessor),
            ('clf', model)
        ])

        # Prepare input for prediction
        if selected_features:
            X_all = task_df[['speech'] + selected_features]
        else:
            X_all = task_df[['speech']]

        y_pred = cross_val_predict(final_pipeline, X_all, y, cv=kf)

        precision = precision_score(y, y_pred, average='macro')
        recall = recall_score(y, y_pred, average='macro')

        print(f"Precision (Macro): {precision:.4f}")
        print(f"Recall (Macro): {recall:.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": best_score,
            "Precision": precision,
            "Recall": recall,
            "Type": "+".join([feature for feature in selected_features])
        })


In [None]:
results = pd.DataFrame(results)

In [None]:
results = results.T.to_dict().values()

In [None]:
results = list(results)

In [None]:
results

In [None]:
results.to_csv("models_results.csv", index=False)

In [None]:
results

In [None]:
results_df = pd.DataFrame(results)

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=results_df,
    x="Task",
    y="Mean F1 Macro",
    hue="Model",
    capsize=0.1,
    errwidth=1.5
)

plt.ylim(0, 1)
plt.ylabel("Best F1-Macro")
plt.xlabel("Classification Task")
plt.title("Model Performance Across Tasks")
plt.xticks(rotation=15)
plt.legend(title="Model")

# Add labels to bars
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f", padding=3)

plt.show()

In [1]:
import pandas as pd
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key): self.key = key
    def fit(self, X, y=None): return self
    def transform(self, X): return X[self.key].values  # Convert to 1D array

class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, keys): self.keys = keys
    def fit(self, X, y=None): return self
    def transform(self, X): return X[self.keys]  # Keep multiple numerical columns as DataFrame

df = pd.read_csv("/Users/kirillkonca/Documents/dementia_prediction/data.csv")

X = df[['speech'] + numerical_features]  # Speech (text) + Multiple numerical features
y = LabelEncoder().fit_transform(df['diagnosis'])  # Encode labels

feature_union = FeatureUnion([
    ('text_features', Pipeline([
        ('selector', TextSelector('speech')), 
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3), max_features=5000))
    ])),
    ('numerical_features', Pipeline([
        ('selector', NumberSelector(numerical_features)), 
        ('scaler', StandardScaler())  # Scale all numerical features together
    ]))
])

svm_model = SVC(kernel="sigmoid", random_state=42, class_weight='balanced')

pipeline = Pipeline([
    ('features', feature_union),  # Process text and numerical features separately
    ('clf', svm_model)  # Classification model
])

param_grid = {
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['linear', 'rbf', 'sigmoid'],
    'clf__gamma': ['scale', 'auto']
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best F1-macro score:", grid_search.best_score_)


NameError: name 'numerical_features' is not defined

In [1]:
import pandas as pd

# Load the Excel file and select the "data" sheet
excel_file = pd.read_excel('/Users/kirillkonca/Downloads/Pitt/PItt-data.xlsx', sheet_name='match')

# Extract the 'id' and 'mms' columns
excel_data = excel_file[['id', 'mms']]


KeyError: "['mms'] not in index"

In [None]:
df = pd.read_csv("/Users/kirillkonca/Documents/dementia_prediction/data.csv")
merged_df = pd.merge(df, excel_data, on='id', how='left')

print(merged_df)

In [1]:
import pandas as pd

# Load the "match" sheet from the Excel file
excel_df = pd.read_excel('/Users/kirillkonca/Downloads/Pitt/PItt-data.xlsx', sheet_name='match')
df = pd.read_csv('/Users/kirillkonca/Documents/dementia_prediction/data.csv')

# Extract the main ID and sub-ID
df['main_id'] = df['id'].apply(lambda x: int(x.split('-')[0]))  # Main ID as integer
df['sub_id'] = df['id'].apply(lambda x: x.split('-')[1])  # Sub-ID as a string

# Function to get the relevant mmse value
def get_mmse(row):
    main_id = row['main_id']
    sub_id = row['sub_id']
    mmse_column = f"mmse{sub_id}"
    
    # Check if the column exists in the "match" sheet
    if mmse_column in excel_df.columns:
        result = excel_df.loc[excel_df['id'] == main_id, mmse_column]
        return result.values[0] if not result.empty else None
    else:
        return None

# Apply the function to extract mmse values
df['mmse'] = df.apply(get_mmse, axis=1)

# Drop helper columns if not needed
df.drop(columns=['main_id', 'sub_id'], inplace=True)


In [2]:
filtered_df = df[df['mmse'].isna()]

In [3]:
len(filtered_df)

91

In [4]:
result = df.merge(filtered_df, on=df.columns.tolist(), how='left', indicator=True)
result = result[result['_merge'] == 'left_only'].drop(columns='_merge')

In [5]:
len(result)

449

In [7]:
len(df)

540

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

df = df.dropna(subset=['diagnosis', 'mmse'])  # Drop rows where 'diagnosis' or 'mmse' is None

tasks = {
    "MCI vs. AD": df[df['diagnosis'].isin(['MCI', 'AD'])],
    "MCI vs. Control": df[df['diagnosis'].isin(['MCI', 'Control'])],
    "AD vs. Control": df[df['diagnosis'].isin(['AD', 'Control'])],
    "MCI vs. AD vs. Control": df
}

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    "SVM": SVC(random_state=42, class_weight='balanced'),
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average="macro")

results = []

for task_name, task_df in tasks.items():
    print(f"\n### {task_name} ###\n")
    X = task_df[['mmse']].values  # Use 'mmse' as the feature
    y = LabelEncoder().fit_transform(task_df['diagnosis'])

    for model_name, model in models.items():
        print(f"\nModel: {model_name}")
        scores = cross_val_score(model, X, y, cv=kf, scoring=f1_scorer)

        print(f"F1-Macro: {scores.mean():.4f} ± {scores.std():.4f}")

        results.append({
            "Task": task_name,
            "Model": model_name,
            "Mean F1 Macro": scores.mean(),
            "Std F1": scores.std(),
            "Type": "mmse"
        })

In [None]:
results_df = pd.DataFrame(results)

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=results_df,
    x="Task",
    y="Mean F1 Macro",
    hue="Model",
    capsize=0.1,
    errwidth=1.5
)

plt.ylim(0, 1)
plt.ylabel("Best F1-Macro")
plt.xlabel("Classification Task")
plt.title("Model Performance Across Tasks (MMSE)")
plt.xticks(rotation=15)
plt.legend(title="Model")

# Add labels to bars
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f", padding=3)

plt.show()

In [None]:
len(result[result['diagnosis'] == 'Control'])

In [6]:
df2 = pd.read_csv("/Users/kirillkonca/Documents/dementia_prediction/data_pauses.csv")

In [7]:
df2

Unnamed: 0,id,diagnosis,speech,annotation,speaking time (s),on,co,short_pause,mid_pause,long_pause
0,138-1,Control,there's a cookie jar on the shelf . and the li...,# sent_id = 1\n# text = there's a cookie jar o...,46.200,0.0,0.105263,0,0,1
1,631-0,Control,the kids are in the cookies . the stool is fal...,# sent_id = 1\n# text = the kids are in the co...,17.150,0.0,0.000000,0,0,0
2,182-3,Control,Johnny's falling off the stool . the boy's fal...,# sent_id = 1\n# text = Johnny's falling off t...,17.800,0.0,0.076923,0,0,0
3,121-0,Control,the boy is taking a cookie out of the cookie j...,# sent_id = 1\n# text = the boy is taking a co...,128.050,0.0,0.153846,0,1,0
4,142-3,Control,the water's running over on the floor . the st...,# sent_id = 1\n# text = the water's running ov...,16.820,0.0,0.285714,0,0,0
...,...,...,...,...,...,...,...,...,...,...
535,270-1,AD,a boy is getting a cookie from the cookie jar ...,# sent_id = 1\n# text = a boy is getting a coo...,30.679,0.0,0.214286,0,1,1
536,213-2,AD,I see this woman here . and she's carrying som...,# sent_id = 1\n# text = I see this woman here ...,34.100,0.0,0.296296,1,1,0
537,091-1,AD,there's a boy getting in the cookie jar . and ...,# sent_id = 1\n# text = there's a boy getting ...,25.530,0.0,0.052632,1,1,0
538,579-0,AD,woman doing dishes . climbing up to get some c...,# sent_id = 1\n# text = woman doing dishes .\n...,11.810,0.0,0.200000,0,1,0


In [8]:
import pandas as pd

# Assuming df and df2 are your dataframes, and both contain an 'id' column

# Select only the relevant columns from df2
pause_columns = ['id', 'short_pause', 'mid_pause', 'long_pause']

# Merge into df based on 'id'
result = result.merge(df2[pause_columns], on='id', how='left')


In [9]:
result.to_csv("/Users/kirillkonca/Documents/dementia_prediction/data_filtered.csv", index=False)

In [None]:
result.to_csv("data_mmse.csv", index=False)

In [15]:
model = LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')

In [46]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SequentialFeatureSelector

# We'll apply SFS on the output of preprocessing
sfs = SequentialFeatureSelector(model, direction='forward', cv=kf, n_features_to_select="auto", tol=0.001)



In [47]:
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove("mmse")  # mmse will be added manually at the start
numerical_features.insert(0, "mmse")

In [48]:
preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
            ('mmse', StandardScaler(), ['mmse'])
        ]
    )

In [49]:
pipeline_with_sfs = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', sfs),
    ('clf', model)
])

In [50]:
task = result[result['diagnosis'].isin(['MCI', 'AD'])]

In [51]:
preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), 'speech'),
            ('mmse', StandardScaler(), ['mmse'])
        ]
    )

In [52]:
numerical_features

['mmse',
 'speaking time (s)',
 'on',
 'co',
 'mean_sent_embs',
 'mlu',
 'stop_words',
 'tree_depth',
 'verbs_with_inflections',
 'nouns_with_determiners',
 'sid',
 'sid_efficiency',
 'pid',
 'pid_efficiency',
 'maas',
 'frazier_score',
 'words_per_clause']

In [53]:
features = numerical_features + ['speech']

In [54]:
X = task[features]
y = LabelEncoder().fit_transform(task['diagnosis'])

In [55]:
scores = cross_val_score(pipeline_with_sfs, X, y, cv=kf, scoring=make_scorer(f1_score, average='macro'))

print("Cross-validated scores:", scores)
print("Mean accuracy:", scores.mean())

Cross-validated scores: [0.75454545 0.88979592 0.58966565 0.71458774 0.88979592 0.55978261
 0.8125     0.75238095 0.79844961 0.75238095]
Mean accuracy: 0.7513884808479798


In [None]:
import sys
print(sys.executable)

In [None]:
!/Users/kirillkonca/miniforge3/bin/python3 -m pip install -U scikit-learn

In [19]:
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove("mmse")  # mmse will be added manually at the start

NameError: name 'np' is not defined

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score

# ==== CONFIGURATION ====
tfidf_max_features = 5000
sfs_inner_folds = 5
cv_outer_folds = 10
random_seed = 42
metric = make_scorer(f1_score, average='macro')

# ==== YOUR DATA ====
# df: your DataFrame
# numerical_features: list of additional numerical column names (excluding 'mmse')
# target column name: 'target'

# ==== STEP 1: TF-IDF Fit ====
vectorizer = TfidfVectorizer(max_features=tfidf_max_features)
X_tfidf = vectorizer.fit_transform(df['speech']).toarray()

# ==== STEP 2: Prepare Extra Features ====
extra_features = df[['mmse'] + numerical_features].reset_index(drop=True)
scaler = StandardScaler()
X_extra = scaler.fit_transform(extra_features)

# ==== STEP 3: Combine TF-IDF + Extra Features ====
X_full = np.hstack((X_tfidf, X_extra))
y = df['diagnosis'].values

# ==== STEP 4: One-time SFS on Entire Dataset ====
print("Running Sequential Feature Selection on full training data...")
selector = SequentialFeatureSelector(
    estimator=LogisticRegression(max_iter=1000),
    n_features_to_select="auto",
    direction="forward",
    scoring='f1_macro',
    cv=StratifiedKFold(n_splits=sfs_inner_folds, shuffle=True, random_state=random_seed),
    n_jobs=-1
)

selector.fit(X_full, y)
selected_features = selector.get_support()
print(f"Selected {np.sum(selected_features)} features out of {X_full.shape[1]}")

# ==== STEP 5: Cross-validate using selected features ====
X_selected = X_full[:, selected_features]

clf = LogisticRegression(max_iter=1000)
kfold = StratifiedKFold(n_splits=cv_outer_folds, shuffle=True, random_state=random_seed)
scores = cross_val_score(clf, X_selected, y, scoring=metric, cv=kfold)

# ==== FINAL RESULTS ====
print("\n=== Cross-Validation Results ===")
print(f"F1-macro scores: {scores}")
print(f"Average F1-macro: {scores.mean():.4f}")
print(f"Standard deviation: {scores.std():.4f}")


Running Sequential Feature Selection on full training data...
Selected 258 features out of 517

=== Cross-Validation Results ===
F1-macro scores: [nan nan nan nan nan nan nan nan nan nan]
Average F1-macro: nan
Standard deviation: nan


Traceback (most recent call last):
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/utils/_response.py", line 207, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of ['AD' 'Control' 'MCI']

Traceback (most recent call last):
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
  File "/Users/kirillkonca/miniforge3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
text_feature = 'speech'
fixed_num_features = ['mmse']  # Always included
candidate_num_features = numerical_features  # Features for SFS

NameError: name 'numerical_features' is not defined

In [5]:
cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [319]:
scoring = {
    'f1_macro': make_scorer(f1_score, average='macro'),
    'precision_macro': make_scorer(precision_score, average='macro'),
    'recall_macro': make_scorer(recall_score, average='macro')
}

In [336]:
from sklearn.feature_selection import RFECV


preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=5000, stop_words=stop_words), text_feature),
        ('mmse', StandardScaler(), fixed_num_features),
        ('sfs_features', Pipeline([
            ('scaler', StandardScaler()),
            ('sfs', SequentialFeatureSelector(
                estimator=LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000),
                n_features_to_select='auto',  # Select top 2 features
                direction='forward',
                cv=cv_strategy,  # Use StratifiedKFold here
                scoring=make_scorer(f1_score, average='macro'),
                tol=0.05
            ))
        ]), candidate_num_features)
    ],
    remainder='drop'
)

In [337]:
task_df = result[result['diagnosis'].isin(['MCI', 'AD'])]

In [338]:
X = task_df[['speech', 'mmse'] + numerical_features]

In [339]:
y = LabelEncoder().fit_transform(task_df['diagnosis'])

In [340]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(random_state=42, class_weight='balanced'))
])

from sklearn.model_selection import cross_val_score

scores = cross_validate(
    estimator=pipeline,
    X=X,  # Your full training DataFrame
    y=y,
    cv=cv_strategy,  # Stratified K-Fold
    scoring=scoring,
    return_train_score=True,  # Optional: Compare train/test performance
    n_jobs=-1  # Parallelize if possible
)

In [341]:
scores

{'fit_time': array([0.95617104, 1.04173613, 0.94189072, 0.99636197, 1.04253221,
        1.01032495, 1.03534317, 0.98465419, 0.59779501, 0.60730076]),
 'score_time': array([0.01384091, 0.01288605, 0.01362705, 0.00716329, 0.00718307,
        0.00678992, 0.00560284, 0.01393509, 0.00538301, 0.00527406]),
 'test_f1_macro': array([0.75379939, 0.85326087, 0.88979592, 0.91793313, 0.8       ,
        0.85326087, 0.87727273, 0.75238095, 0.79844961, 0.79844961]),
 'train_f1_macro': array([0.85930172, 0.85930172, 0.86567901, 0.85930172, 0.85930172,
        0.85621089, 0.85621089, 0.87228405, 0.86578215, 0.87228405]),
 'test_precision_macro': array([0.72826087, 0.8       , 0.98      , 0.875     , 0.75      ,
        0.85326087, 0.83333333, 0.71428571, 0.75      , 0.75      ]),
 'train_precision_macro': array([0.80851064, 0.80851064, 0.81521739, 0.80851064, 0.80851064,
        0.80434783, 0.80434783, 0.82222222, 0.81521739, 0.82222222]),
 'test_recall_macro': array([0.79166667, 0.95833333, 0.8333333

In [342]:
scores['test_f1_macro'].mean()

0.8294603084754015

In [343]:
pipeline.fit(X, y)

# Extract selected features
sfs = pipeline.named_steps['preprocessor'].named_transformers_['sfs_features'].named_steps['sfs']
selected_indices = sfs.get_support(indices=True)
selected_features = [candidate_num_features[i] for i in selected_indices]

print("Selected features:", selected_features)

Selected features: ['tree_depth']


In [344]:
task_df.columns

Index(['id', 'diagnosis', 'speech', 'annotation', 'speaking time (s)', 'on',
       'co', 'mean_sent_embs', 'mlu', 'stop_words', 'tree_depth',
       'verbs_with_inflections', 'nouns_with_determiners', 'sid',
       'sid_efficiency', 'pid', 'pid_efficiency', 'maas', 'frazier_score',
       'words_per_clause', 'mmse', 'short_pause', 'mid_pause', 'long_pause'],
      dtype='object')