# Step 1: Feature Engineering & Model Comparisons


### This first part does the following
- Configure project paths and load input files (CSV/PKL).
- Prepare connectivity features from time series data (optionally restricted to nodes from a single network).
- Apply participant exclusion. 
- Residualize features on covariates (e.g., Age/Sex) to remove nuisance variance.
- Standardize features within a modeling pipeline.
- Define and run ML models (Ridge/Lasso/Elastic Net) with cross-validation and hyperparameter tuning.
- Save results (R², MAE, RMSE, Pearson and Spearman correlations).
- Compare models.

In [6]:
#import relevant libraries
import os, sys, json, math, random, pathlib, itertools, functools, warnings
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import statsmodels.api as sm
from nilearn.connectome import ConnectivityMeasure
from scipy.stats import spearmanr
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split, GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


# Set directory that has the fmri data
# NOTE: os.chdir changes the working directory for the rest of the session, so the
# bare file names used in later cells (np.load, open, pd.read_csv without a
# directory component) are resolved relative to input_dir.
input_dir = 'C:/Users/mk7kc/Desktop/connectivity/vcap/vcap_Shen/dat'
os.chdir(input_dir)

In [None]:
# Specify output dir (created if it doesn't already exist).
# Leaving it as '' means "write into the current working directory": in that case
# nothing is created, because os.makedirs('') raises FileNotFoundError.
output_dir=''

if output_dir:
    # exist_ok=True replaces the separate isdir() check and is safe to re-run
    os.makedirs(output_dir, exist_ok=True)

## Read in characterization data

In [None]:
# Load in characterization data, remove participants with too high of a mean FD value (motion artifact too high)
# See exclusion.txt for more info
char_dir = 'C:/Users/mk7kc/Desktop/connectivity/vcap/vcap_MSDL/char_data/'
# Build the file path from char_dir so the two locations cannot drift apart
char_data = pd.read_csv(os.path.join(char_dir, 'char.csv'))
# Zero-based ROW POSITIONS in the characterization tables (they are passed to
# np.delete / .iloc), not participant IDs.
exclude = [10, 43, 59, 78, 80, 83, 93]

# Demographic covariates (excluded rows removed)
age = np.delete(char_data['Age'].values, exclude)
sex = np.delete(char_data['Sex'].values, exclude)

# Create function for faster extraction
def load_char(file, column, exclude_rows=None):
    """Read one column of a CSV file and drop the excluded rows.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``.
    column : str
        Name of the column to extract.
    exclude_rows : sequence of int, optional
        Zero-based row positions to remove. When omitted, the module-level
        ``exclude`` list is used, which keeps existing two-argument calls
        working unchanged.

    Returns
    -------
    numpy.ndarray
        The column values with the excluded positions removed.
    """
    if exclude_rows is None:
        exclude_rows = exclude
    column_vals = pd.read_csv(file)[column].values
    return np.delete(column_vals, exclude_rows)

# Social Network metrics: one composite factor score and one subscale score
factor_scores_csv = os.path.join(char_dir, 'faOut1mlPromax_tenBergescores.csv')
subscale_scores_csv = os.path.join(char_dir, 'socialnetwork_subscale_scores.csv')

SNComposite = load_char(factor_scores_csv, 'ML1')
AnticipatedSupport = load_char(subscale_scores_csv, 'PercSupp_Anticip')
SN_metrics = np.column_stack((SNComposite, AnticipatedSupport))


# Cognitive metrics: three columns of the same file
cog_csv = os.path.join(char_dir, 'vcap_cog_MK.csv')

vocab = load_char(cog_csv, 'vocab')
proc_speed = load_char(cog_csv, 'proc_speed')
mem = load_char(cog_csv, 'memory')
cog_metrics = np.column_stack((vocab, proc_speed, mem))


# Collect outcomes and covariates in one table; insertion order fixes the column order
variables = {}
variables["Social Support"] = SN_metrics[:, 0]
variables["Anticipated Support"] = SN_metrics[:, 1]
variables["Vocab"] = cog_metrics[:, 0]
variables["Processing Speed"] = cog_metrics[:, 1]
variables["Memory"] = cog_metrics[:, 2]
variables["Age"] = age
variables["Sex"] = sex
df = pd.DataFrame(variables)
# `shape` is an attribute (a tuple), not a method: calling df.shape() raises TypeError
print(df.shape)
# Standardize values (z-scores every column, including the Age/Sex covariates)
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df),columns=df.columns)

# Visualize pairplots and distributions
# NOTE(review): `sns` is not imported in the visible import cell; this line needs
# `import seaborn as sns` to have been run first - confirm.
_ = sns.pairplot(df_scaled, vars=df.columns,kind="reg", diag_kind="kde",height=1.7)



### Choose fMRI Input

#### Option 1: Dictionary Learning output from high performance cluster processing of CPAC 4D standard to func outputs
Spatial maps were generated with a map-learning algorithm based on spatial component sparsity; time series were then extracted for the connectivity measures. For the model and grid-search parameters, see the VCAP_CPAC folder for the Python scripts run on the HPC for network & region extraction and connectome generation. **Note: participants who needed to be excluded were already excluded in the HPC step, so there is no need to do it here.**



In [None]:
# Load the partial-correlation matrices, keep each subject's strict upper triangle
# as the feature vector, then apply Fisher's z-transform (arctanh).
fc_noGM_noLP = np.load('12comp_pcorr_noGM_noLP.npy').reshape(92, 71, 71)
upper_tri = np.triu_indices(71, k=1)  # k=1 excludes diagonal values
fc_noGM_noLP_feature = [fc_noGM_noLP[subj][upper_tri] for subj in range(92)]
x = np.arctanh(fc_noGM_noLP_feature)  # 71*70/2 = 2485 features per subject

# Plot components using output from the best estimator model's components_img_
# NOTE(review): `nib` is not imported in the visible import cell (presumably
# `import nibabel as nib`) - confirm before running.
# NOTE(review): the path is input_dir with '.nii.gz' appended directly, i.e. no file
# name component; this looks like a placeholder for the components image - confirm.
dictlearning_components_img = nib.load(input_dir+'.nii.gz')

# The plotting calls below are disabled. They reference plot_prob_atlas, iter_img and
# plot_stat_map, none of which are imported in the visible import cell.
# Plot atlas
# plot_prob_atlas(dictlearning_components_img,
#                 display_mode='z')
# plot_prob_atlas(dictlearning_components_img,
#                display_mode='x')
# plot_prob_atlas(dictlearning_components_img,
#                display_mode='y')

# Save images
#plot_prob_atlas(dictlearning_components_img,output_file=output_dir+'vcap_All_DictLearning_Components.png')
#plot_prob_atlas(dictlearning_components_img,
#                display_mode='x',output_file=output_dir+'vcap_All_DictLearning_Components_x.svg')

# Plot map for each component separately and save it into visualizations folder
# for i, cur_img in enumerate(iter_img(dictlearning_components_img)):
#     vis_name = os.path.join(output_dir,"Comp_y_"+str(i+1))
#     plot_stat_map(cur_img, display_mode="xz", title="Comp"+str(i+1),
#                   colorbar=True,output_file = vis_name) 
    
# Examine explained variance using best estimator model's scores
scores = np.load(os.path.join(input_dir,'scores_12comp_220511_92participants_noGM_noLP.npy'))
print('My components explain %s perct. of the variance in the dataset' % str(round(np.sum(scores)*100,2)))

# Plot the explained variances per component.
# The component numbering is derived from `scores` instead of a hard-coded 12, so
# the bar positions and tick labels always match the number of loaded scores.
numbers = np.arange(1, len(scores) + 1)
plt.figure(figsize=(10, 10))
plt.barh(numbers, scores)
plt.ylabel('Component #', size=30)
plt.xlabel('Explained Variance Ratio', size=30)
plt.yticks(numbers)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=10)
# FormatStrFormatter comes from matplotlib.ticker
plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%.3f'))
plt.tight_layout()
#plt.savefig(os.path.join(output_dir,"ExplainedVariance_80comp.svg"))

#### Option 2: Atlas output from fMRIPrep

In [33]:
# Specify atlas name in files
atlas_label = 'shen_atlas' # e.g. 'shen_atlas'
task_label = 'rest_1' # e.g. 'rest_1'
# NOTE: unpickling can execute arbitrary code; only load files you generated yourself.
with open(f'vcap_{atlas_label}_ts_{task_label}.pkl', 'rb') as handle:
    fcdat = pickle.load(handle)

# (Optional) Restrict the time series to the nodes of one network, e.g. DMN in the Shen atlas
network_use = False
labels = pd.read_csv('shen_268_parcellation_networklabels.csv')
is_dmn = labels['Network'] == 3 # DMN = 3
network_indices = labels.loc[is_dmn].index

# Participant IDs to drop: first column of char_data at the excluded row positions,
# formatted as zero-padded three-digit strings for comparison with the fcdat keys
exclude_id = char_data.iloc[exclude,0].values
exclude_id_str = {f"{n:03d}" for n in exclude_id}

# Keep the remaining participants, in sorted ID order
# NOTE(review): rows of x follow sorted(fcdat keys) while age/sex and the outcome
# columns follow char_data row order - confirm the two orders match.
ids = sorted(fcdat.keys())
include_id = [pid for pid in ids if pid not in exclude_id_str]
ts = [
    fcdat[pid][:, network_indices] if network_use else fcdat[pid]
    for pid in include_id
]

# Connectivity: vectorized partial correlations (diagonal dropped), Fisher z-transformed
conn = ConnectivityMeasure(kind='partial correlation', vectorize=True, discard_diagonal=True)
conn_vectors = conn.fit_transform(ts)
x = np.arctanh(conn_vectors)  # shape: (n_subjects, n_features)
print("X shape (subjects, features):", x.shape)

X shape (subjects, features): (92, 35778)


## Regress out covariates:
A linear regression was fit for every feature to remove the effects of the covariates. The residuals of this regression were then substituted for the feature values.

In [None]:
covar = ['Age', 'Sex'] # add any other demo or covariates as needed
# Design matrix: intercept column followed by the standardized covariates
df_scaled_const = sm.add_constant(df_scaled[covar].values)

# Residualize every feature at once with the residual-maker matrix
#     M = I - C @ pinv(C)
# M @ x gives, column by column, the residuals of an OLS fit of that feature on C
# (what sm.OLS(x[:, i], df_scaled_const).fit().resid would return, without the loop).
# NOTE(review): this uses all subjects before the train/test splits are drawn later,
# so held-out subjects contribute to the covariate fit - confirm this is intended.
n_subjects = df_scaled_const.shape[0]
covariate_projection = df_scaled_const @ np.linalg.pinv(df_scaled_const)
M = np.eye(n_subjects) - covariate_projection
x_res = M @ x

## Define model

In [None]:
def run_ML(X, Y, target_names, model, perm=1000, test_size=0.2,
           cv_splits=5, cv_repeats=5, random_seed=1,
           date_prefix=None,descriptor=None):
    """
    Evaluate a penalized linear model over repeated random train/test splits.

    For each of `perm` random splits and each column of Y, a pipeline of
    StandardScaler + RidgeCV / LassoCV / ElasticNetCV is fit on the training
    subjects (hyperparameters tuned by an inner RepeatedKFold) and scored on
    the held-out subjects.

    Parameters
    ----------
    X : array-like, shape (n_subjects, n_features)
        Feature matrix (the notebook passes the residualized connectivity).
    Y : array-like, shape (n_subjects, n_targets) or (n_subjects,)
        Outcome variables to predict; a 1-D array is treated as one target.
    target_names : list of str
        Names of the columns of Y; stored in the result, not otherwise used.
    model : str
        'ridge', 'lasso' or 'elasticnet' (case-insensitive).
    perm : int
        Number of random train/test splits.
    test_size : float
        Fraction of subjects held out per split (count is rounded up).
    cv_splits, cv_repeats : int
        Folds and repeats of the inner RepeatedKFold.
    random_seed : int
        Base seed; split p is shuffled with seed random_seed + p.
    date_prefix, descriptor : str, optional
        When both are given, every result array is written to
        '<date_prefix>_<descriptor>_<name>.csv' inside the module-level
        `output_dir`.

    Returns
    -------
    dict
        'r2', 'mae', 'rmse', 'pearson', 'spearman', 'alpha': arrays of shape
        (perm, n_targets); 'l1_ratio': same shape for elasticnet, else None;
        'targets', 'model' and 'config' echo the inputs.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names, or X and Y do not have
        the same number of rows.
    """
    model = model.lower()
    if model not in {'ridge','lasso','elasticnet'}:
        raise ValueError("check spelling of model type")

    X = np.asarray(X)
    Y = np.asarray(Y)
    if Y.ndim == 1:
        Y = Y.reshape(-1, 1)  # single target passed as a vector
    n, n_targets = Y.shape
    if X.shape[0] != n:
        # Indexing X with indices drawn from Y's length would silently misalign subjects
        raise ValueError(f"X has {X.shape[0]} rows but Y has {n}; subjects must align")

    # CV for inner tuning
    cv = RepeatedKFold(n_splits=cv_splits, n_repeats=cv_repeats, random_state=random_seed)

    # Search spaces
    alphas_ridge = np.linspace(4, 15, 50)
    alphas_lasso = np.linspace(0.1, .2, 100)
    alphas_e = np.linspace(0.2, 0.5, 20)
    l1_ratios = np.linspace(0.7, 1, 10) #  pure Ridge regression (l1_ratio=0) to pure Lasso regression (l1_ratio=1)

    # scikit-learn validates max_iter as an integer; the float 1e7 is rejected
    max_iter = 10_000_000

    # Arrays to store results (NaN marks "not computed / undefined")
    r2 = np.full((perm, n_targets), np.nan)
    mae = np.full((perm, n_targets), np.nan)
    rmse = np.full((perm, n_targets), np.nan)
    pearson = np.full((perm, n_targets), np.nan)
    spearman = np.full((perm, n_targets), np.nan)

    # Optimized hyperparameters
    opt_alpha = np.full((perm, n_targets), np.nan)
    opt_l1 = np.full((perm, n_targets), np.nan)

    n_test = int(np.ceil(n * test_size))  # same for every split

    for p in range(perm):
        # Random split: first n_test shuffled indices are held out
        rs = np.random.RandomState(random_seed + p)
        idx = np.arange(n)
        rs.shuffle(idx)
        test_idx, train_idx = idx[:n_test], idx[n_test:]
        X_train, X_test = X[train_idx], X[test_idx]

        for t in range(n_targets):
            y = Y[:, t]
            y_train, y_test = y[train_idx], y[test_idx]

            # Build estimator
            if model == 'ridge':
                est = RidgeCV(alphas=alphas_ridge, cv=cv, scoring='r2')
                est_name = 'ridgecv'
            elif model == 'lasso':
                est = LassoCV(alphas=alphas_lasso, cv=cv, n_jobs=-1,
                              random_state=random_seed + p, max_iter=max_iter)
                est_name = 'lassocv'
            else:  # 'elasticnet' (validated above)
                est = ElasticNetCV(alphas=alphas_e, l1_ratio=l1_ratios, cv=cv, n_jobs=-1,
                                   random_state=random_seed + p, max_iter=max_iter)
                est_name = 'elasticnetcv'

            # Scaling lives inside the pipeline so it is fit on training data only
            pipe = Pipeline([
                ('sc', StandardScaler()),
                (est_name, est)
            ])

            pipe.fit(X_train, y_train)
            yhat = pipe.predict(X_test)

            # Metrics
            r2[p, t]   = r2_score(y_test, yhat)
            mae[p, t]  = mean_absolute_error(y_test, yhat)
            # sqrt(MSE) instead of squared=False, which newer scikit-learn no longer accepts
            rmse[p, t] = np.sqrt(mean_squared_error(y_test, yhat))
            # Correlations are undefined when either vector is constant (e.g. an
            # all-zero sparse model predicts the intercept); leave those as NaN.
            if y_test.std() > 0 and np.std(yhat) > 0:
                pearson[p, t] = np.corrcoef(y_test, yhat)[1, 0]
                rho, _ = spearmanr(y_test, yhat)
                spearman[p, t] = rho

            # Hyperparameters selected by the inner CV
            result = pipe.named_steps[est_name]
            if hasattr(result, 'alpha_'): opt_alpha[p, t] = result.alpha_
            if hasattr(result, 'l1_ratio_'): opt_l1[p, t] = result.l1_ratio_

        # Progress message roughly five times per run
        if (p + 1) % max(1, perm // 5) == 0:
            print(f"{p+1}/{perm} permutations")

    if date_prefix and descriptor:
        to_save = {'r2': r2, 'mae': mae, 'rmse': rmse, 'pearson': pearson,
                   'spearman': spearman, 'alpha': opt_alpha}
        if model == 'elasticnet':
            to_save['l1_ratio'] = opt_l1
        for outcome, array in to_save.items():
            np.savetxt(os.path.join(output_dir, f'{date_prefix}_{descriptor}_{outcome}.csv'), array, delimiter=',')

    return {
        'r2': r2, 'mae': mae, 'rmse': rmse, 'pearson': pearson, 'spearman': spearman,
        'alpha': opt_alpha, 'l1_ratio': opt_l1 if model == 'elasticnet' else None,
        'targets': target_names, 'model': model,
        'config': {'perm': perm, 'test_size': test_size, 'cv_splits': cv_splits, 'cv_repeats': cv_repeats}
    }

## Run model

In [None]:
# Create final Y dataframe: every standardized column except the covariates
targets_df = df_scaled.drop(columns=["Age", "Sex"])
Y = targets_df.values
y_metric_names = targets_df.columns.tolist()

# Run model
# The DMN-only switch defined with the fMRI input is `network_use`;
# `use_dmn_only` is not defined anywhere and raised NameError.
descriptor = 'elasticnet_partialcorr_DMN' if network_use else 'elasticnet_partialcorr_all'
# Keep the returned dict instead of letting the notebook echo it as cell output
results_elasticnet = run_ML(x_res, Y, y_metric_names, model='elasticnet', perm=1000,
                            test_size=0.2, cv_splits=5, cv_repeats=5, random_seed=1,
                            date_prefix='230301', descriptor=descriptor)

## Visualize

## Compare the models

In [None]:
# === Model comparison ===
from collections import OrderedDict

seed = 1
perm = 500
# Run the three models with the same seed so each sees identical train/test splits
results_by_model = OrderedDict()
for m in ["ridge", "lasso", "elasticnet"]:
    results_by_model[m] = run_ML(
        # residualized features are stored in `x_res` (lower case); `X_res` is undefined
        x_res, Y, target_names=y_metric_names, model=m,
        perm=perm, test_size=0.2, cv_splits=5, cv_repeats=5,
        random_seed=seed)
    
# This following part I got ChatGPT to clean up my code to be much more efficient
def summarize_metric(metric_name: str, results=None):
    """Summarize one metric across models as a long-format table.

    Parameters
    ----------
    metric_name : str
        Key of the metric in each result dict (e.g. 'r2', 'mae', 'pearson').
    results : mapping of model name -> result dict, optional
        Each result dict needs `metric_name` (array with one column per target)
        and 'targets' (column names). Defaults to the module-level
        `results_by_model`.

    Returns
    -------
    pandas.DataFrame
        Columns 'model', 'target', 'mean', 'std' (NaN-aware mean and SD over
        rows), one row per model/target pair, models in mapping order. Models
        whose entry for the metric is None (e.g. 'l1_ratio' for ridge/lasso)
        are skipped; if none remain, an empty frame with those columns is
        returned.
    """
    if results is None:
        results = results_by_model

    rows = []
    for m, res in results.items():
        mat = res[metric_name]
        if mat is None:
            # metric not produced by this model; np.nanmean(None) would raise
            continue
        rows.append(pd.DataFrame({
            "model": m,
            "target": res["targets"],
            "mean": np.nanmean(mat, axis=0),
            "std":  np.nanstd(mat, axis=0)
        }))
    if not rows:
        return pd.DataFrame(columns=["model", "target", "mean", "std"])
    return pd.concat(rows, ignore_index=True)

# One long-format summary table per metric
summaries = {
    name: summarize_metric(name)
    for name in ("r2", "mae", "rmse", "pearson", "spearman")
}
sum_r2 = summaries["r2"]
sum_mae = summaries["mae"]
sum_rmse = summaries["rmse"]
sum_pearson = summaries["pearson"]
sum_spearman = summaries["spearman"]

metric_to_pick = "r2"  # or "pearson"
summary = summaries[metric_to_pick]

# Rank models within each target by descending mean, then keep the top row per target
ranked = summary.sort_values(["target", "mean"], ascending=[True, False])
top_rows = ranked.groupby("target", as_index=False).first()
renamed_cols = {"mean": f"{metric_to_pick}_mean", "std": f"{metric_to_pick}_std"}
best_per_target = top_rows[["target", "model", "mean", "std"]].rename(columns=renamed_cols)

print("\nBest model per target (by {}):".format(metric_to_pick))
print(best_per_target.to_string(index=False))

# 4) Quick visualization: mean ± SD of the chosen metric, one line per target
plt.figure(figsize=(10, 5))
# Models in first-seen order; shared by every line and by the x tick labels, so the
# ticks no longer depend on whatever the loop variable held after the last pass
model_order = list(dict.fromkeys(summary["model"]))
x_positions = np.arange(len(model_order))
for i, tgt in enumerate(y_metric_names):
    df_t = summary[summary["target"] == tgt].set_index("model").reindex(model_order)
    plt.errorbar(
        x=x_positions + i*(0.04),  # small offset per target to reduce overlap
        y=df_t["mean"].to_numpy(), yerr=df_t["std"].to_numpy(),
        fmt="o-", capsize=4, label=f"{tgt}"  # label every target, not only the first
    )
plt.xticks(
    ticks=x_positions,
    labels=model_order,
    rotation=0
)
plt.ylabel(f"{metric_to_pick.capitalize()} (mean ± SD)")
plt.title(f"Model comparison across {perm} permutations")
plt.legend(title="Target")  # without this the labels were never shown
plt.tight_layout()
plt.show()

