### Import Modules

In [None]:
import os
import numpy as np
import pandas as pd
from functools import reduce
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel, SelectPercentile, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from xgboost import XGBRegressor
import shap
import warnings
warnings.filterwarnings('ignore')

### Read Training Data

In [None]:
# Locations of the training inputs: one CSV per modality plus the subject table.
train_data_path = 'train'
predictors_paths = {
    name: os.path.join(train_data_path, f'{name}.csv')
    for name in ('GM', 'WM', 'ReHo', 'PCGcorr', 'FA', 'MD')
}
additional_variables_path = os.path.join(train_data_path, 'Subjects.csv')

# Predictors
modalities = ['GM', 'ReHo', 'MD']
# Keep the requested modalities that have a known path, in the requested order.
selected_modalities = {
    name: predictors_paths[name]
    for name in modalities
    if name in predictors_paths
}
def read_and_rename(modality, path):
    """Read one modality's CSV and tag its columns with the modality name.

    Every column except 'ID' gets the suffix ``_<modality>`` so that tables
    from different modalities can be merged on 'ID' without name clashes.

    Args:
        modality: Suffix appended to each non-'ID' column label.
        path: Anything ``pd.read_csv`` accepts (file path or buffer).

    Returns:
        The loaded DataFrame with renamed columns.
    """
    def tag(label):
        # 'ID' is the join key and must keep its name.
        return label if label == 'ID' else f"{label}_{modality}"

    return pd.read_csv(path).rename(columns=tag)
# Load each selected modality, then fold the tables together on the shared 'ID' key.
dfs = [read_and_rename(name, csv_path) for name, csv_path in selected_modalities.items()]
predictors_df = reduce(lambda merged, nxt: merged.merge(nxt, on='ID'), dfs)

# Response and confounding variables
additional_variables = ['Memory', 'Age', 'Sex', 'EducationYear']
df = pd.read_csv(additional_variables_path)
# Only request the variables that are actually present in the subject table.
selected_variables = [name for name in additional_variables if name in df.columns]
additional_variables_df = df[['ID', *selected_variables]]

# Merge predictors with response and confounding variables on 'ID'
df = additional_variables_df.merge(predictors_df, on='ID')

# Prepare X and y: 'Memory' is the response, 'ID' is only a join key.
y = df['Memory']
X = df.drop(columns=['ID', 'Memory'])

# Split into training and test datasets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Sample size for training: {len(X_train)}')
print(f'Sample size for test: {len(X_test)}')

### Select Features

In [None]:
feature_selection_method = 'Lasso' # 'Lasso' or 'MI'

if feature_selection_method == 'Lasso':
    # Lasso regression: fit on standardized predictors so the L1 penalty
    # treats every feature on the same scale.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    lasso = LassoCV(cv=5, random_state=42).fit(X_train_scaled, y_train)
    print(f'Regularization strength hyperparameter for Lasso: {lasso.alpha_:.3f}')
    # prefit=True reuses the fitted Lasso. The selector only masks columns,
    # so the unscaled frames are passed on purpose: downstream tree models
    # receive the original feature values.
    selector = SelectFromModel(lasso, prefit=True)
    X_train_selected = selector.transform(X_train)
elif feature_selection_method == 'MI':
    # Mutual information: the estimator is randomized, so seed it like the
    # rest of the pipeline to make the selected feature set reproducible.
    selector = SelectPercentile(
        lambda features, target: mutual_info_regression(features, target, random_state=42),
        percentile=50,
    )
    X_train_selected = selector.fit_transform(X_train, y_train)
else:
    # Fail here with a clear message instead of a NameError on `selector` below.
    raise ValueError(
        f"Unknown feature_selection_method {feature_selection_method!r}; expected 'Lasso' or 'MI'"
    )

selected_mask = selector.get_support()
print(f"Selected features by {feature_selection_method}: {selected_mask.sum()} out of {X_train.shape[1]} total features")
if not selected_mask.any():
    # An empty design matrix would only surface later as an opaque grid-search failure.
    raise ValueError(f"{feature_selection_method} feature selection kept no features")
selected_features = X_train.columns[selected_mask]
# Re-wrap the selected arrays as DataFrames so the feature names travel with the data.
X_train_selected = pd.DataFrame(X_train_selected, columns=selected_features)
X_test_selected = pd.DataFrame(selector.transform(X_test), columns=selected_features)

### Train and Test Model

In [None]:
ml_method = 'ET' # 'ET', 'RF' or 'XGB'

# NOTE: max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# For regressors it meant "use all features", which is spelled 1.0 here so the
# grids remain valid on both old and current scikit-learn releases.

# Hyperparameter grid for ExtraTreesRegressor
param_grid_et = {
    'n_estimators': [100, 200, 300, 400],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 7],  # Minimum number of samples required to be at a leaf node
    'max_features': [1.0, 'sqrt', 'log2', 0.2, 0.5, 0.8], # Number (or fraction) of features to consider when looking for the best split
    'bootstrap': [False] # Whether bootstrap samples are used when building trees
}
# Hyperparameter grid for RandomForestRegressor
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'max_features': [1.0, 'sqrt', 'log2'],  # Number (or fraction) of features to consider when looking for the best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}
# Hyperparameter grid for XGBRegressor
param_grid_xgb = {
    'n_estimators': [100, 200, 300],  # Number of gradient boosted trees (boosting rounds)
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage used to prevent overfitting
    'max_depth': [3, 6, 9, 12],  # Maximum depth of each tree
    'min_child_weight': [1, 5, 10],  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': [0.5, 0.7, 1.0],  # Subsample ratio of the training instances
    'colsample_bytree': [0.3, 0.5, 0.7, 1.0],  # Subsample ratio of columns when constructing each tree
    'reg_alpha': [0, 0.1, 1],  # L1 regularization term on weights
    'reg_lambda': [1, 2, 5]  # L2 regularization term on weights
}

# Supported methods mapped to their estimator class and hyperparameter grid.
search_spaces = {
    'ET': (ExtraTreesRegressor, param_grid_et),
    'RF': (RandomForestRegressor, param_grid_rf),
    'XGB': (XGBRegressor, param_grid_xgb),
}
if ml_method not in search_spaces:
    # Fail with a clear message instead of a NameError on `grid_search` below.
    raise ValueError(f"Unknown ml_method {ml_method!r}; expected one of {sorted(search_spaces)}")
estimator_class, param_grid = search_spaces[ml_method]

# Exhaustive 5-fold grid search, scored by (negated) mean absolute error.
regressor = estimator_class(random_state=42)
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error')

grid_search.fit(X_train_selected, y_train)
best_params = grid_search.best_params_
print(f'Best cross-validation score {-grid_search.best_score_:.3f} by {ml_method} for hyperparameters: ', best_params)

# GridSearchCV refits the best configuration on the full training set by
# default (refit=True). Using that estimator keeps random_state=42, so the
# final model is reproducible and matches the configuration that was
# cross-validated; rebuilding from best_params alone would drop the seed.
regressor = grid_search.best_estimator_
predictions = regressor.predict(X_test_selected)
mae = mean_absolute_error(y_test, predictions)
print(f"MAE by {ml_method}: {mae:.3f}")

### Feature Importances

In [None]:
# Pair each selected feature with the fitted model's importance score,
# rank the pairs from most to least important and report the top nine.
feature_importances = regressor.feature_importances_
features_and_importances = zip(selected_features, feature_importances)
sorted_features_and_importances = sorted(
    features_and_importances,
    key=lambda pair: pair[1],
    reverse=True,
)
top_features_and_importances = sorted_features_and_importances[:9]
print(f"Top features' importances by {ml_method}:")
for rank, (feature, importance) in enumerate(top_features_and_importances, start=1):
    print(f"{rank}. {feature}: {importance:.3f}")

### SHAP (SHapley Additive exPlanations)

In [None]:
# Load SHAP's JavaScript support into the notebook before any plotting.
shap.initjs()
# Build an explainer for the fitted regressor and explain the held-out split.
explainer = shap.Explainer(regressor)
shap_values = explainer(X_test_selected)
# Summary plot limited to the nine features shown at most.
shap.summary_plot(shap_values, X_test_selected, max_display=9)

### Inference

In [None]:
# Locations of the external inputs: same file layout as training, under 'test'.
test_data_path = 'test'
predictors_paths = {
    name: os.path.join(test_data_path, f'{name}.csv')
    for name in ('GM', 'WM', 'ReHo', 'PCGcorr', 'FA', 'MD')
}
additional_variables_path = os.path.join(test_data_path, 'Subjects.csv')

# Predictors: same modalities as in training, now resolved against the external paths.
selected_modalities = {
    name: predictors_paths[name]
    for name in modalities
    if name in predictors_paths
}
dfs = [read_and_rename(name, csv_path) for name, csv_path in selected_modalities.items()]
predictors_df = reduce(lambda merged, nxt: merged.merge(nxt, on='ID'), dfs)

# Confounding variables (only those present in the external subject table)
df = pd.read_csv(additional_variables_path)
selected_variables = [name for name in additional_variables if name in df.columns]
additional_variables_df = df[['ID', *selected_variables]]

# Merge predictors with confounding variables on 'ID'
df = additional_variables_df.merge(predictors_df, on='ID')

# Apply trained model
# Drop the join key and, if the external subject table happens to carry it,
# the response column: 'Memory' was not a training feature, and leaving it in
# would misalign the selector's positional column mask.
X_ext = df.drop(columns=['ID', 'Memory'], errors='ignore')
missing_columns = [column for column in X_train.columns if column not in X_ext.columns]
if missing_columns:
    raise ValueError(f"External data is missing features used in training: {missing_columns}")
# The selector masks columns by position, so enforce the training column order
# (this also discards any extra columns the external files may contain).
X_ext = X_ext[X_train.columns]
X_ext_selected = pd.DataFrame(selector.transform(X_ext), columns=selected_features)
predictions_ext = regressor.predict(X_ext_selected)

# Save predictions (one value per row, in the row order of the merged table)
np.savetxt(os.path.join(test_data_path, "Predictions.txt"), predictions_ext)

# SHAP: explain the external predictions with the explainer built for the fitted model
shap_values = explainer(X_ext_selected)
shap.initjs()
shap.summary_plot(shap_values, X_ext_selected, max_display=9)