In [None]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from confidenceinterval import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, tnr_score
from scipy.stats import ttest_ind
import warnings
from joblib import Parallel, delayed
import os
import shap
import pickle
from sklearn.exceptions import ConvergenceWarning
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Do not report invalid floating-point operations (e.g. NaN results) raised by NumPy.
np.seterr(invalid='ignore')

# NOTE(review): assigning PYTHONHASHSEED at runtime presumably only reaches processes started
# afterwards, not the already-running kernel — confirm this is the intended effect.
os.environ['PYTHONHASHSEED'] = '0'
# Seed passed to every MLPClassifier constructed in this notebook.
random_state = 42

In [5]:
# === FILE PATHS ===
# Project root. Override it with the AI_MBS_PROJECT_PATH environment variable to run the
# notebook on another machine; the default is the original location.
project_path = os.environ.get('AI_MBS_PROJECT_PATH', '/Users/labneuro2/Documents/lab/AI_MBS/AI_MBS')

# === LOAD DATA ===

# Clinical table, ordered by subject so that it lines up with the feature table below.
df_clinic = pd.read_excel(f'{project_path}/clinical_data.xlsx') \
    .sort_values(by='subject')

# Connectivity features, ordered by subject and re-indexed from 0.
df_features_corr = pd.read_csv(f'{project_path}/FC_combined_atlas.csv') \
    .sort_values(by='Subject').reset_index(drop=True)

# Only the columns whose name contains '_to_' are used as model features.
X_corr = df_features_corr[[col for col in df_features_corr.columns if '_to_' in col]].to_numpy()

# === LABELS: weight loss success (>50% overweight lost within a year) ===
# Single .loc call (row mask + column) instead of chained indexing; same values, same order.
y = (df_clinic.loc[df_clinic['post_MBS'] == 1, 'overweight_delta_proc_to_baseline_kg'] < -50).astype(int).to_numpy()

# Rows of X_corr and entries of y are paired purely by position, so a length mismatch would
# silently assign labels to the wrong subjects (or fail much later inside a CV fold).
if len(X_corr) != len(y):
    raise ValueError(
        f"Feature rows ({len(X_corr)}) and labels ({len(y)}) differ in length; "
        "check that both tables cover the same subjects."
    )

In [None]:
# FeatureSelector selects top k features based on t-test statistics between classes
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the ``k`` columns with the largest absolute two-sample t-statistic.

    The t-test compares the rows labelled 0 against the rows labelled 1, column by column.

    Parameters
    ----------
    k : int, default=10
        Number of columns to keep.

    Attributes
    ----------
    selected_features_ : ndarray of int
        Column indices chosen by ``fit``, ordered by ascending absolute t-statistic.
        Only present after ``fit`` has been called.
    """

    def __init__(self, k=10):
        # Only hyper-parameters are stored here; fitted state is created in fit().
        self.k = k

    def fit(self, X, y):
        """Rank columns by |t| between class 0 and class 1 and remember the top ``k``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Binary labels; rows with ``y == 0`` and ``y == 1`` form the two groups.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        t_values, _ = ttest_ind(X[y == 0], X[y == 1], axis=0)
        scores = np.abs(t_values)
        # argsort places NaN last, which would make columns with an undefined t-statistic
        # (e.g. constant columns) look like the strongest features. Rank them lowest instead.
        scores = np.where(np.isnan(scores), -np.inf, scores)
        self.selected_features_ = np.argsort(scores)[-self.k:]
        return self

    def transform(self, X):
        """Return only the columns selected during ``fit``.

        Raises
        ------
        ValueError
            If the selector has not been fitted.
        """
        selected = getattr(self, 'selected_features_', None)
        if selected is None:
            raise ValueError("The FeatureSelector has not been fitted yet.")
        return np.asarray(X)[:, selected]

In [None]:
def process_fold(train_idx, test_idx, X, y, option):
    """Tune, fit and evaluate the selector -> scaler -> classifier pipeline on one CV fold.

    Parameters
    ----------
    train_idx, test_idx : array-like of int
        Row indices of the training and held-out samples for this fold.
    X : ndarray
        Feature matrix, indexed with the fold indices.
    y : ndarray
        Labels, indexed with the same fold indices as ``X``.
    option : dict
        Must contain ``'param_grid'`` and ``'cv'``, both forwarded to ``GridSearchCV``.
        May contain ``'classifier'``, an estimator used (as a fresh clone) for the final
        pipeline step; when missing, ``MLPClassifier(random_state=random_state)`` is used.

    Returns
    -------
    tuple
        ``(y_true, y_proba, selected_features, shap_values, explainer)``. ``y_true`` and
        ``y_proba`` are taken from the first held-out sample only, so the function expects
        single-sample test folds (leave-one-out).
    """
    # Local import: the notebook's import cell does not bring `clone` into scope.
    from sklearn.base import clone

    # Suppress warnings for cleaner output
    warnings.filterwarnings("ignore", category=UserWarning)
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    param_grid = option['param_grid']
    cv = option['cv']

    # Use the classifier configured in the option (it was previously ignored); fall back to
    # the default MLP when the option does not provide one.
    configured_classifier = option.get('classifier')
    if configured_classifier is None:
        classifier = MLPClassifier(random_state=random_state)
    else:
        classifier = clone(configured_classifier)

    # Split data into train and test for this fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Build pipeline: feature selection, scaling, classifier
    pipeline = Pipeline([
        ('selector', FeatureSelector()),
        ('scaler', StandardScaler()),
        ('classifier', classifier)
    ])

    # Perform grid search cross-validation (on the training part only) to find best parameters
    best_pipeline = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
    best_pipeline.fit(X_train, y_train)

    # Extract the fitted steps of the refitted best pipeline
    best_selector = best_pipeline.best_estimator_['selector']
    best_scaler = best_pipeline.best_estimator_['scaler']
    best_model = best_pipeline.best_estimator_['classifier']

    # Apply the fitted selector and scaler by hand so the intermediate matrices are
    # available for SHAP below.
    X_train_selected = best_selector.transform(X_train)
    X_test_selected = best_selector.transform(X_test)
    selected_features = best_selector.selected_features_

    X_train_scaled = best_scaler.transform(X_train_selected)
    X_test_scaled = best_scaler.transform(X_test_selected)

    # Predicted probability of class 1 for the held-out sample(s)
    y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

    # Compute SHAP values for the test sample, using the scaled training data as background.
    # NOTE(review): the explainer is given `predict` (hard labels), not `predict_proba` —
    # confirm that label-level attributions are what the downstream analysis expects.
    explainer = shap.KernelExplainer(best_model.predict, X_train_scaled)
    shap_values = explainer.shap_values(X_test_scaled)

    # Return true label, predicted probability, selected features, SHAP values, and explainer
    return y_test[0], y_pred_proba[0], selected_features, shap_values, explainer

In [None]:
def run_loo(X, y, options):
    """Run leave-one-out cross-validation for every entry of ``options``.

    For each option, every fold is handled by ``process_fold`` (in parallel), the per-fold
    outputs are gathered, performance metrics with 95% confidence intervals are computed and
    a short summary is printed.

    Parameters
    ----------
    X : ndarray
        Feature matrix.
    y : ndarray
        Binary labels, one per row of ``X``.
    options : iterable of dict
        Search configurations; each needs ``'param_grid'`` and ``'cv'``.

    Returns
    -------
    tuple
        ``(results, y_pred_proba_all, selected_features_all, shapley_values_all,
        explainers_all)`` for the LAST option only; outputs of earlier options are printed
        but not returned.

    Raises
    ------
    ValueError
        If ``options`` is empty (previously this surfaced as an UnboundLocalError at return).
    """
    options = list(options)
    if not options:
        raise ValueError("run_loo requires at least one option, but `options` is empty.")

    for option in options:
        loo = LeaveOneOut()
        # Run each fold in parallel for efficiency
        fold_results = Parallel(n_jobs=-1)(
            delayed(process_fold)(train_idx, test_idx, X, y, option) for train_idx, test_idx in loo.split(X, y)
        )

        # Collect results from all folds
        y_true_all = np.array([res[0] for res in fold_results])
        y_pred_proba_all = np.array([res[1] for res in fold_results])
        # Hard predictions use a fixed 0.5 probability threshold
        y_pred = (y_pred_proba_all >= .5).astype(int)
        selected_features_all = [res[2] for res in fold_results]
        shapley_values_all = [res[3] for res in fold_results]
        explainers_all = [res[4] for res in fold_results]

        # Calculate metrics with confidence intervals
        # NOTE(review): confirm which `average` mode recall_score / precision_score / f1_score
        # default to in this library; if it is not the binary (positive-class) definition,
        # 'sensitivity', 'precision' and 'f1' below are not the per-class quantities their
        # names suggest.
        auc, auc_ci = roc_auc_score(y_true_all, y_pred_proba_all, confidence_level=0.95)
        accuracy, accuracy_ci = accuracy_score(y_true_all, y_pred, confidence_level=0.95)
        sensitivity, sensitivity_ci = recall_score(y_true_all, y_pred, confidence_level=0.95)
        specificity, specificity_ci = tnr_score(y_true_all, y_pred, confidence_level=0.95)
        precision, precision_ci = precision_score(y_true_all, y_pred, confidence_level=0.95)
        f1, f1_ci = f1_score(y_true_all, y_pred, confidence_level=0.95)

        results = {
            'model': 'MLP',
            'param_grid': option['param_grid'],
            'cv': option['cv'],
            'accuracy': round(accuracy, 4),
            'auc': round(auc, 4),
            'sensitivity': round(sensitivity, 4),
            'specificity': round(specificity, 4),
            'precision': round(precision, 4),
            'f1': round(f1, 4),
            'accuracy_ci': [round(c, 4) for c in accuracy_ci],
            'auc_ci': [round(c, 4) for c in auc_ci],
            'sensitivity_ci': [round(c, 4) for c in sensitivity_ci],
            'specificity_ci': [round(c, 4) for c in specificity_ci],
            'precision_ci': [round(c, 4) for c in precision_ci],
            'f1_ci': [round(c, 4) for c in f1_ci],
        }
        # Print summary for this option
        print(f"Model: {results['model']}, Param Grid: {results['param_grid']}, CV: {results['cv']}")
        print(f" Accuracy: {results['accuracy']:.4f}, AUC: {results['auc']:.4f}")
    return results, y_pred_proba_all, selected_features_all, shapley_values_all, explainers_all

In [None]:
def get_classifiers_and_param_sets(random_state):
    """Return the classifier catalogue used to build the search options.

    The result maps a short classifier name to a dict holding the estimator
    (``'classifier'``) and a list of parameter grids (``'param_sets'``). At the moment the
    only entry is an ``MLPClassifier`` with several hidden-layer layouts.
    """
    hidden_layer_grid = {'classifier__hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 50, 50)]}
    mlp_entry = dict(
        classifier=MLPClassifier(random_state=random_state),
        param_sets=[hidden_layer_grid],
    )
    return {'mlp': mlp_entry}

def generate_options():
    """Build one search option per (classifier, parameter grid, k-range) combination.

    Every option carries the parameter grid extended with the candidate ``selector__k``
    values, the number of inner CV folds, the classifier instance and a descriptive name.
    """
    k_ranges = [np.arange(10, 21, 1)]
    catalogue = get_classifiers_and_param_sets(random_state)
    return [
        {
            'param_grid': {**grid, 'selector__k': list(k_values)},
            'cv': 10,
            'classifier': spec['classifier'],
            'name': f"{label}_t_stat",
        }
        for label, spec in catalogue.items()
        for grid in spec['param_sets']
        for k_values in k_ranges
    ]
# Build the list of search configurations once; it is passed to run_loo further down.
options = generate_options()

In [14]:
# Run the leave-one-out evaluation on the connectivity features and unpack the metrics,
# per-fold probabilities, selected feature indices, SHAP values and explainers.
results, y_pred_proba_all, selected_features_all, shapley_values_all, explainers_all = run_loo(X_corr, y, options)

100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
100%|██████████| 1/1 [00:00<00:00,  1.28it/s]
100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
100%|██████████| 1/1 [00:00<00:00,  3.70it/s]
100%|██████████| 1/1 [00:00<00:00,  3.13it/s]
100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
100%|██████████| 1/1 [00:01<00:00,  1.37s/it]
100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
100%|██████████| 1/1 [00:01<00:00,  1.69s/it]
100%|██████████| 1/1 [00:01<00:00,  1.16s/it]
100%|██████████| 1/1 [00:00<00:00,  5.46it/s]
100%|██████████| 1/1 [00:00<00:00,  1.67it/s]
100%|██████████| 1/1 [00:00<00:00,  1.40it/s]
100%|██████████| 1/1 [00:00<00:00,  2.27it/s]
100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
100%|██████████| 1/1 [00:01<00:00,  1.52s/it]
100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
100%|██████████| 1/1 [00:00<00:00,

Model: MLP, Param Grid: {'classifier__hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 50, 50)], 'selector__k': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]}, CV: 10
 Accuracy: 0.8000, AUC: 0.8480


In [15]:
# Display the metrics dictionary returned by run_loo.
results

{'model': 'MLP',
 'param_grid': {'classifier__hidden_layer_sizes': [(50,),
   (100,),
   (100, 50),
   (100, 50, 50)],
  'selector__k': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]},
 'cv': 10,
 'accuracy': 0.8,
 'auc': 0.848,
 'sensitivity': 0.8,
 'specificity': 0.8,
 'precision': 0.8,
 'f1': 0.8,
 'accuracy_ci': [0.6618, 0.891],
 'auc_ci': [0.7338, 0.9622],
 'sensitivity_ci': [0.6831, 0.9169],
 'specificity_ci': [0.584, 0.9193],
 'precision_ci': [0.6831, 0.9169],
 'f1_ci': [0.6831, 0.9169]}

In [None]:
# Saving results
def save_results(y_pred_proba_all, shapley_values_all, selected_features_all, explainers_all, file_path):
    """Pickle the per-fold outputs as one dictionary into ``file_path``.

    The dictionary keys are ``'y_pred_proba_all'``, ``'shapley_values_all'``,
    ``'selected_features_all'`` and ``'explainers_all'``.
    """
    payload = dict(
        y_pred_proba_all=y_pred_proba_all,
        shapley_values_all=shapley_values_all,
        selected_features_all=selected_features_all,
        explainers_all=explainers_all,
    )
    with open(file_path, 'wb') as handle:
        pickle.dump(payload, handle)

# Pickle file for this run, placed inside the project directory.
output_file = f'{project_path}/results_corr.pkl'

# Persist the per-fold predictions, SHAP values, selected features and explainers.
save_results(y_pred_proba_all, shapley_values_all, selected_features_all, explainers_all, output_file)

In [None]:
# Loading results
def load_results(file_path):
    """Read a results pickle and return ``(y_pred_proba_all, shapley_values_all, selected_features_all)``.

    Any other entry stored in the file (such as the explainers) is not returned.
    """
    # Caution: unpickling can execute arbitrary code, so only open files from a trusted source.
    with open(file_path, 'rb') as handle:
        stored = pickle.load(handle)
    wanted_keys = ('y_pred_proba_all', 'shapley_values_all', 'selected_features_all')
    return tuple(stored[key] for key in wanted_keys)

# Read the saved predictions, SHAP values and selected features back from the pickle file
# (load_results does not return the stored explainers).
loaded_y_pred_proba_all, loaded_shapley_values_all, loaded_selected_features_all = load_results(output_file)