In [None]:
import os
import random
import sklearn
import numpy as np
import pandas as pd
import os.path as op
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Input directory holding the CSV files, and output directory for saved results.
data_parts = ("/path", "to", "data")
save_parts = ("paths", "to", "output")
paths_data = op.join(*data_parts)
paths_save = op.join(*save_parts)

# Create the output directory (including parents); reuse it if it already exists.
os.makedirs(name = paths_save, exist_ok = True)

In [None]:
# Load the demographics table and keep only the participant id and age.
demo_columns = ["participant", "age"]
df_demo = pd.read_csv(op.join(paths_data, "demographics.csv")).loc[:, demo_columns]
df_demo.head()

In [None]:
# Load the Fazekas scores, keep the participant id and the total score,
# and expose the total under the column name "fazekas".
df_scores = (
  pd.read_csv(op.join(paths_data, "fazekas_scores.csv"))
  .loc[:, ["participant", "total"]]
  .rename(columns = {"total": "fazekas"})
)
df_scores.head()

In [None]:
# Load the profiles and attach each participant's Fazekas score and age.
# Both merges use pandas' default join, so participants missing from either
# table are dropped.
df = (
  pd.read_csv(op.join(paths_data, "profiles.csv"))
  .merge(df_scores, on = "participant")
  .merge(df_demo, on = "participant")
)

# Rows to keep: single-shell dataset, the three AFQ methods, the two DTI
# metrics, and Fazekas scores above 1.
keep_rows = (
  (df["dataset"] == "single-shell")
  & df["method"].isin(["afq-original", "afq-fwe", "afq-msmt"])
  & df["metric"].isin(["DTI-FA", "DTI-MD"])
  & (df["fazekas"] > 1) # only two participants with fazekas 1
)

# Shorten the method labels, e.g. "afq-original" -> "ORIGINAL".
df = df[keep_rows].assign(
  method = lambda d: d["method"].str.removeprefix("afq-").str.upper())
df.head()

In [None]:
# Build one wide table per method: one row per participant (with the Fazekas
# score and age carried along) and one column per (tract, metric, node)
# combination holding the corresponding "value".
pivot_index   = ["participant", "fazekas", "age"]
pivot_columns = ["tract", "metric", "node"]

df_dict = {
  key: df_group.pivot(
    index = pivot_index, columns = pivot_columns, values = "value"
  ).reset_index()
  for key, df_group in df.groupby("method")
}

In [None]:
# x_names = predictor names, y_names = target names
# After the pivot the column labels are tuples, so the first element
# identifies the column. Everything except the participant id and the target
# is used as a predictor (note that this keeps "age" among the predictors).
x_names = [x for x in df_dict["ORIGINAL"].columns 
           if x[0] not in ["participant", "fazekas"]]
y_names = "fazekas"

# define number of cross-validation folds and repeats
n_splits  = 5
n_repeats = 1000

# define k-fold cross-validation method
random.seed() # initialize random seed
# randint requires integer bounds: a float such as 1e5 is deprecated since
# Python 3.10 and rejected with a TypeError from Python 3.12 onwards.
random_state = random.randint(0, 10**5) # random number generator
print(f"Random state: {random_state}")
kfold = RepeatedStratifiedKFold(
  n_splits = n_splits, n_repeats = n_repeats, random_state = random_state)

# create method feature and label variables
X = df_dict["ORIGINAL"][x_names].values
y = df_dict["ORIGINAL"][y_names].values

# The fold indices below are positional and are reused for every method, so
# each method's table must list the same participants (and labels) in the
# same order as the "ORIGINAL" table the splits are derived from.
ref_participants = df_dict["ORIGINAL"]["participant"].values
for method_name, df_method in df_dict.items():
  assert (
    np.array_equal(df_method["participant"].values, ref_participants)
    and np.array_equal(df_method[y_names].values, y)
  ), f"Rows of method {method_name} are not aligned with method ORIGINAL"

# one record per (repeat, fold); splits are yielded repeat by repeat, so the
# running index maps onto 1-based repeat and fold counters
df_loops = pd.DataFrame([
  {
    "repeat": (i // n_splits) + 1, "fold": (i % n_splits) + 1, 
    "train_index": train_index, 
    "test_index": test_index
  }
  for i, (train_index, test_index) in enumerate(kfold.split(X, y))
])
df_loops.head()

In [None]:
# define current method
method = "ORIGINAL"

# create estimator pipeline:
# median imputation -> z-scoring -> PCA -> logistic regression with the
# regularisation strength chosen by an internal 3-fold cross-validation
pipeline = Pipeline([
  ("imputer", SimpleImputer(strategy = "median")),
  ("z-score", StandardScaler()),
  ("PCA", PCA()), 
  ("estimator", LogisticRegressionCV(
    Cs           = 10, 
    cv           = 3, 
    penalty      = "l2",
    solver       = "lbfgs", 
    max_iter     = 1000, 
    class_weight = "balanced",
    n_jobs       = -1, 
    # NOTE(review): `multi_class` appears to be deprecated in recent
    # scikit-learn releases - confirm against the installed version
    multi_class  = "multinomial")) 
])

# create method feature and label variables
X = df_dict[method][x_names].values
y = df_dict[method][y_names].values

# sorted class labels present in the data; they define the column layout of
# y_prob instead of a hard-coded number of classes
classes = np.unique(y)

# the output directory is identical for every repeat, so create it once
curr_save = op.join(paths_save, "single-shell", method)
os.makedirs(curr_save, exist_ok = True)

for i_repeat, df_repeat in df_loops.groupby("repeat"): # for each repeat
  # initialize predictions arrays (NaN marks samples without a prediction)
  y_true = np.full(y.shape, np.nan)
  y_pred = np.full(y.shape, np.nan)
  y_prob = np.full(y.shape + (classes.size,), np.nan)

  # for each fold/row
  for train_index, test_index in zip(
      df_repeat["train_index"], df_repeat["test_index"]):
    # split data into training and testing set  
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # train a fresh, unfitted copy of the pipeline on the training split
    pipeline_clone = sklearn.base.clone(pipeline) # copy pipeline
    pipeline_clone.fit(X_train, y_train) # fit model

    # predict and store prediction on test set
    y_true[test_index] = y_test # store true values
    y_pred[test_index] = pipeline_clone.predict(X_test)

    # predict_proba has one column per class seen during fitting; place each
    # column under its class label so the layout is the same for every fold.
    # Classes absent from this training fold keep probability 0.
    fold_classes = pipeline_clone.named_steps["estimator"].classes_
    class_cols   = np.searchsorted(classes, fold_classes)
    y_prob[test_index] = 0.0
    y_prob[np.ix_(test_index, class_cols)] = pipeline_clone.predict_proba(X_test)

  base_name = f"figure11_single-shell_method-{method}"
  base_name += f"_repeat-{i_repeat:04d}"

  # save the three arrays of this repeat side by side
  for suffix, values in (("ytrue", y_true), ("ypred", y_pred), ("yprob", y_prob)):
    save_name = f"{base_name}_{suffix}.npy"
    np.save(op.join(curr_save, save_name), values)
    print(f"Saved: {save_name}")

In [None]:
# define current method
method = "FWE"

# create estimator pipeline:
# median imputation -> z-scoring -> PCA -> logistic regression with the
# regularisation strength chosen by an internal 3-fold cross-validation
pipeline = Pipeline([
  ("imputer", SimpleImputer(strategy = "median")),
  ("z-score", StandardScaler()),
  ("PCA", PCA()), 
  ("estimator", LogisticRegressionCV(
    Cs           = 10, 
    cv           = 3, 
    penalty      = "l2",
    solver       = "lbfgs", 
    max_iter     = 1000, 
    class_weight = "balanced",
    n_jobs       = -1, 
    # NOTE(review): `multi_class` appears to be deprecated in recent
    # scikit-learn releases - confirm against the installed version
    multi_class  = "multinomial")) 
])

# create method feature and label variables
X = df_dict[method][x_names].values
y = df_dict[method][y_names].values

# sorted class labels present in the data; they define the column layout of
# y_prob instead of a hard-coded number of classes
classes = np.unique(y)

# the output directory is identical for every repeat, so create it once
curr_save = op.join(paths_save, "single-shell", method)
os.makedirs(curr_save, exist_ok = True)

for i_repeat, df_repeat in df_loops.groupby("repeat"): # for each repeat
  # initialize predictions arrays (NaN marks samples without a prediction)
  y_true = np.full(y.shape, np.nan)
  y_pred = np.full(y.shape, np.nan)
  y_prob = np.full(y.shape + (classes.size,), np.nan)

  # for each fold/row
  for train_index, test_index in zip(
      df_repeat["train_index"], df_repeat["test_index"]):
    # split data into training and testing set  
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # train a fresh, unfitted copy of the pipeline on the training split
    pipeline_clone = sklearn.base.clone(pipeline) # copy pipeline
    pipeline_clone.fit(X_train, y_train) # fit model

    # predict and store prediction on test set
    y_true[test_index] = y_test # store true values
    y_pred[test_index] = pipeline_clone.predict(X_test)

    # predict_proba has one column per class seen during fitting; place each
    # column under its class label so the layout is the same for every fold.
    # Classes absent from this training fold keep probability 0.
    fold_classes = pipeline_clone.named_steps["estimator"].classes_
    class_cols   = np.searchsorted(classes, fold_classes)
    y_prob[test_index] = 0.0
    y_prob[np.ix_(test_index, class_cols)] = pipeline_clone.predict_proba(X_test)

  base_name = f"figure11_single-shell_method-{method}"
  base_name += f"_repeat-{i_repeat:04d}"

  # save the three arrays of this repeat side by side
  for suffix, values in (("ytrue", y_true), ("ypred", y_pred), ("yprob", y_prob)):
    save_name = f"{base_name}_{suffix}.npy"
    np.save(op.join(curr_save, save_name), values)
    print(f"Saved: {save_name}")

In [None]:
# define current method
method = "MSMT"

# create estimator pipeline:
# median imputation -> z-scoring -> PCA -> logistic regression with the
# regularisation strength chosen by an internal 3-fold cross-validation
pipeline = Pipeline([
  ("imputer", SimpleImputer(strategy = "median")),
  ("z-score", StandardScaler()),
  ("PCA", PCA()), 
  ("estimator", LogisticRegressionCV(
    Cs           = 10, 
    cv           = 3, 
    penalty      = "l2",
    solver       = "lbfgs", 
    max_iter     = 1000, 
    class_weight = "balanced",
    n_jobs       = -1, 
    # NOTE(review): `multi_class` appears to be deprecated in recent
    # scikit-learn releases - confirm against the installed version
    multi_class  = "multinomial")) 
])

# create method feature and label variables
X = df_dict[method][x_names].values
y = df_dict[method][y_names].values

# sorted class labels present in the data; they define the column layout of
# y_prob instead of a hard-coded number of classes
classes = np.unique(y)

# the output directory is identical for every repeat, so create it once
curr_save = op.join(paths_save, "single-shell", method)
os.makedirs(curr_save, exist_ok = True)

for i_repeat, df_repeat in df_loops.groupby("repeat"): # for each repeat
  # initialize predictions arrays (NaN marks samples without a prediction)
  y_true = np.full(y.shape, np.nan)
  y_pred = np.full(y.shape, np.nan)
  y_prob = np.full(y.shape + (classes.size,), np.nan)

  # for each fold/row
  for train_index, test_index in zip(
      df_repeat["train_index"], df_repeat["test_index"]):
    # split data into training and testing set  
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # train a fresh, unfitted copy of the pipeline on the training split
    pipeline_clone = sklearn.base.clone(pipeline) # copy pipeline
    pipeline_clone.fit(X_train, y_train) # fit model

    # predict and store prediction on test set
    y_true[test_index] = y_test # store true values
    y_pred[test_index] = pipeline_clone.predict(X_test)

    # predict_proba has one column per class seen during fitting; place each
    # column under its class label so the layout is the same for every fold.
    # Classes absent from this training fold keep probability 0.
    fold_classes = pipeline_clone.named_steps["estimator"].classes_
    class_cols   = np.searchsorted(classes, fold_classes)
    y_prob[test_index] = 0.0
    y_prob[np.ix_(test_index, class_cols)] = pipeline_clone.predict_proba(X_test)

  base_name = f"figure11_single-shell_method-{method}"
  base_name += f"_repeat-{i_repeat:04d}"

  # save the three arrays of this repeat side by side
  for suffix, values in (("ytrue", y_true), ("ypred", y_pred), ("yprob", y_prob)):
    save_name = f"{base_name}_{suffix}.npy"
    np.save(op.join(curr_save, save_name), values)
    print(f"Saved: {save_name}")