In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, RidgeCV, Lasso
from sklearn.svm import SVR
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import *
from random import seed
from scipy import stats
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import shap
%matplotlib inline

# Seed every RNG the notebook can draw from. scikit-learn falls back to NumPy's
# global RNG whenever random_state is None (e.g. train_test_split below), so
# seeding only Python's `random` module leaves those splits non-reproducible.
seed(888)
np.random.seed(888)

# 1 Load data from "1 Preprocessing"

In [None]:
# Read the four pickles written by "1 Preprocessing" and make sure each one is
# held as a DataFrame, since the cells below rely on DataFrame indexing.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced by the preprocessing notebook itself.
brain_train = pd.DataFrame(pd.read_pickle("1_brain_train.pkl"))
brain_test = pd.DataFrame(pd.read_pickle("1_brain_test.pkl"))
main_train = pd.DataFrame(pd.read_pickle("1_main_train.pkl"))
main_test = pd.DataFrame(pd.read_pickle("1_main_test.pkl"))

# 2 BrainAge with different segmentations

## 2.1 A2009s Segmentation

In [None]:
# select variables belonging to the A2009s segmentation:
# every second column of the positional range 1657..2543 is a feature,
# the last column of the frame is the regression target (presumably Age - confirm).
A2009S_COLUMNS = slice(1657, 2544, 2)
X_train_raw = brain_train.iloc[:, A2009S_COLUMNS]
Y_train = brain_train.iloc[:, -1]
X_test_raw = brain_test.iloc[:, A2009S_COLUMNS]
Y_test = brain_test.iloc[:, -1]

# standardize x-data using the TRAINING mean/std for both sets. Scaling the test
# set with its own statistics would put it on a different scale than the one the
# models were fitted on. ddof=0 matches the population std used by stats.zscore.
# Plain pandas arithmetic also guarantees the result stays a DataFrame, which the
# coefficient-name lookup in the prediction cell needs.
feature_mean = X_train_raw.mean(axis=0).to_numpy()
feature_std = X_train_raw.std(axis=0, ddof=0).to_numpy()
X_train = (X_train_raw - feature_mean) / feature_std
X_test = (X_test_raw - feature_mean) / feature_std

# set of alphas to try (=penalization): 100 log-spaced values from 1e-3 to 1e5
alpha_parameters = np.power(10,np.linspace(start=-3, stop=5, num=100))

### 2.1.1 Cross-validation

In [None]:
X_CV = X_train
Y_CV = Y_train

# randomly split the training data into an inner training and hold-out set
# (training data only; original note: "no PA"). The hold-out part is used
# further down to estimate the age-bias correction. random_state pins the split
# explicitly so the correction coefficients do not depend on global RNG state.
X_train_cv, X_test_cv, Y_train_cv, Y_test_cv = train_test_split(X_CV, Y_CV, random_state=888)

# define the models (RidgeCV/LassoCV pick their alpha from alpha_parameters)
model_ridge = RidgeCV(alphas = alpha_parameters)
model_lasso = LassoCV(alphas = alpha_parameters, max_iter=100000)
model_SVR = SVR(kernel = "rbf")

# perform 10-fold cross validation on the full training set
scores_ridge = cross_val_score(model_ridge, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')
scores_lasso = cross_val_score(model_lasso, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')
scores_SVR = cross_val_score(model_SVR, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')

# the scorer returns the NEGATIVE MAE, so flip the sign of the fold average
mae_ridge = -1 * scores_ridge.mean()
mae_lasso = -1 * scores_lasso.mean()
mae_SVR = -1 * scores_SVR.mean()
print("The MAE for cross-validation (Ridge):", mae_ridge)
print("The MAE for cross-validation (Lasso):", mae_lasso)
print("The MAE for cross-validation (SVR):", mae_SVR)

# for correction: Ridge predictions on the hold-out split
# (the correction cell regresses these on the hold-out ages)
model_ridge.fit(X_train_cv,Y_train_cv)
y_pred_cv = model_ridge.predict(X_test_cv)

### 2.1.2 Prediction

In [None]:
# models to try
model_Lasso = LassoCV(alphas = alpha_parameters, max_iter=100000)
model_Ridge = RidgeCV(alphas = alpha_parameters)
model_SVR = SVR(kernel = 'rbf')

# train each model on the full training set
model_Lasso.fit(X_train,Y_train)
model_Ridge.fit(X_train,Y_train)
model_SVR.fit(X_train,Y_train)

# get predicted values for test set
y_pred_Lasso = model_Lasso.predict(X_test)
y_pred_Ridge = model_Ridge.predict(X_test)
y_pred_SVR = model_SVR.predict(X_test)

# calculate brain age gap (predicted minus actual)
brain_age_delta_Lasso = y_pred_Lasso-Y_test
brain_age_delta_Ridge = y_pred_Ridge-Y_test
brain_age_delta_SVR = y_pred_SVR-Y_test

# get mean absolute error (MAE)
print("The MAE for testing set using Lasso:", mean_absolute_error(Y_test,y_pred_Lasso))
print("The MAE for testing set using Ridge:", mean_absolute_error(Y_test,y_pred_Ridge))
print("The MAE for testing set using SVR:", mean_absolute_error(Y_test,y_pred_SVR))

# Spearman rank order correlations between models of the same segmentation
print("The Spearman r for Ridge vs Lasso:", stats.spearmanr(brain_age_delta_Ridge, brain_age_delta_Lasso))
print("The Spearman r for Ridge vs SVR:", stats.spearmanr(brain_age_delta_Ridge, brain_age_delta_SVR))
print("The Spearman r for SVR vs Lasso:", stats.spearmanr(brain_age_delta_SVR, brain_age_delta_Lasso))

# the 20 Ridge coefficients with the largest absolute value (ascending by magnitude)
# and their column names; X_train has to be a DataFrame for the name lookup
coeff = np.asarray(model_Ridge.coef_)
coeffabs = np.abs(coeff)
top_coefficients = np.argsort(coeffabs)[-20:]
print(X_train.iloc[:,top_coefficients].columns)
print(coeff[top_coefficients])

# plot figure with x: actual age Y: predicted age, and a line with slope 1 for reference
plt.figure()
plt.scatter(Y_test, y_pred_Ridge, alpha=0.5, s=10, color = "darkcyan")
plt.axline((60,60), slope=1, color='r')
plt.xticks(range(40,90,5), fontsize = 14)
plt.yticks(range(40,90,5), fontsize = 14)
plt.ylabel('Predicted Age', fontsize = 16)
plt.xlabel('Age', fontsize = 16)
plt.show()

# plot the brainAGE against age together with its least-squares trend line.
# The slope is estimated from the plotted data instead of being hard-coded, so
# the line always matches this segmentation. The line is anchored at the mean
# age because axline's anchor point takes part in axis autoscaling.
age_trend = stats.linregress(Y_test, brain_age_delta_Ridge)
mean_age = float(np.mean(Y_test))
plt.figure()
plt.scatter(Y_test,brain_age_delta_Ridge, alpha=0.5, s=10, color = "darkcyan")
plt.axline((mean_age, age_trend.intercept + age_trend.slope * mean_age), slope=age_trend.slope, color="blue")
plt.axline((60,0),slope=0, color = "black")
plt.ylabel("BrainAGE", fontsize = 16)
plt.xlabel("Age", fontsize = 16)
plt.xticks(fontsize = 14)
plt.yticks(fontsize=14)
plt.show()

### 2.1.3 Correction

In [None]:
# reformat cross validation data for regression model
y_pred_cv = np.asarray(y_pred_cv)
y_pred_cv = np.reshape(y_pred_cv, (-1,1))
Y_test_cv = np.asarray(Y_test_cv)
Y_test_cv = np.reshape(Y_test_cv, (-1,1))

# now using correction from the R code that's online from Cole
reg = LinearRegression().fit(Y_test_cv, y_pred_cv)
coef = float(reg.coef_)
intercept = float(reg.intercept_)
print("Coefficient and Intercept:", coef, intercept)

# correction by cole
function = lambda t: (t-intercept)/coef
vfunc = np.vectorize(function)
corr_brainage_1 = vfunc(y_pred_Ridge)
corr_brainage_delta_1 = corr_brainage_1-Y_test

# plots corrected
plt.figure()
plt.scatter(Y_test, corr_brainage_1, alpha=0.5, s=10, color="darkcyan")
plt.axline((65,65), slope=coef, color="blue")
plt.ylabel("Predicted Age Corrected", fontsize = 16)
plt.axline((60,60), slope=1, color='r')
plt.xlabel("Age", fontsize = 16)
plt.xticks(range(40,90,5), fontsize = 14)
plt.yticks(range(30,115,10), fontsize = 14)
plt.show()

plt.figure()
plt.scatter(Y_test, corr_brainage_delta_1, alpha=0.5, s=10, color="darkcyan")
plt.axline((60,0),slope=0, color = "black")
plt.ylabel("BrainAGE Corrected", fontsize = 16)
plt.xlabel("Age", fontsize = 16)
plt.xticks(fontsize = 14)
plt.yticks(fontsize=14)
plt.show()

## 2.2 ASEG Segmentation

In [None]:
def _select_aseg(frame):
    """Return the Freesurfer ASEG feature columns of `frame` followed by its "Age" column.

    Features are every second column of three positional ranges (named
    whole-brain, left-hemisphere and right-hemisphere in the original cell).
    """
    whole_brain = frame.iloc[:,27:70:2]
    left_hemi = frame.iloc[:,103:134:2]
    right_hemi = frame.iloc[:,165:196:2]
    return pd.concat([whole_brain, left_hemi, right_hemi, frame["Age"]], axis = 1)

# select Freesurfer ASEG for training and testing data (same columns for both)
ASEG_train = _select_aseg(brain_train)
ASEG_test = _select_aseg(brain_test)

# features are everything but the last column; the last column is Age
X_train_raw = ASEG_train.iloc[:,:-1]
Y_train = ASEG_train.iloc[:,-1]
X_test_raw = ASEG_test.iloc[:,:-1]
Y_test = ASEG_test.iloc[:,-1]

# standardize x-data using the TRAINING mean/std for both sets. Scaling the test
# set with its own statistics would put it on a different scale than the one the
# models were fitted on. ddof=0 matches the population std used by stats.zscore.
# Plain pandas arithmetic also guarantees the result stays a DataFrame, which the
# coefficient-name lookup in the prediction cell needs.
feature_mean = X_train_raw.mean(axis=0).to_numpy()
feature_std = X_train_raw.std(axis=0, ddof=0).to_numpy()
X_train = (X_train_raw - feature_mean) / feature_std
X_test = (X_test_raw - feature_mean) / feature_std

### 2.2.1 Cross-validation

In [None]:
X_CV = X_train
Y_CV = Y_train

# randomly split the training data into an inner training and hold-out set
# (training data only; original note: "no PA"). The hold-out part is used
# further down to estimate the age-bias correction. random_state pins the split
# explicitly so the correction coefficients do not depend on global RNG state.
X_train_cv, X_test_cv, Y_train_cv, Y_test_cv = train_test_split(X_CV, Y_CV, random_state=888)

# define the models (RidgeCV/LassoCV pick their alpha from alpha_parameters)
model_ridge = RidgeCV(alphas = alpha_parameters)
model_lasso = LassoCV(alphas = alpha_parameters, max_iter=100000)
model_SVR = SVR(kernel = "rbf")

# perform 10-fold cross validation on the full training set
scores_ridge = cross_val_score(model_ridge, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')
scores_lasso = cross_val_score(model_lasso, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')
scores_SVR = cross_val_score(model_SVR, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')

# the scorer returns the NEGATIVE MAE, so flip the sign of the fold average
mae_ridge = -1 * scores_ridge.mean()
mae_lasso = -1 * scores_lasso.mean()
mae_SVR = -1 * scores_SVR.mean()
print("The MAE for cross-validation (Ridge):", mae_ridge)
print("The MAE for cross-validation (Lasso):", mae_lasso)
print("The MAE for cross-validation (SVR):", mae_SVR)

# for correction: Ridge predictions on the hold-out split. The correction cell
# applies the resulting coefficients to the Ridge test predictions, so the
# hold-out predictions must come from the same model family (the A2009s and
# Desikan sections use Ridge here as well).
model_ridge.fit(X_train_cv,Y_train_cv)
y_pred_cv = model_ridge.predict(X_test_cv)

### 2.2.2 Prediction

In [None]:
# models to try
model_Lasso = LassoCV(alphas = alpha_parameters, max_iter=100000)
model_Ridge = RidgeCV(alphas = alpha_parameters)
model_SVR = SVR(kernel = 'rbf')

# train each model on the full training set
model_Lasso.fit(X_train,Y_train)
model_Ridge.fit(X_train,Y_train)
model_SVR.fit(X_train,Y_train)

# get predicted values for test set
y_pred_Lasso = model_Lasso.predict(X_test)
y_pred_Ridge = model_Ridge.predict(X_test)
y_pred_SVR = model_SVR.predict(X_test)

# calculate brain age gap (predicted minus actual)
brain_age_delta_Lasso = y_pred_Lasso-Y_test
brain_age_delta_Ridge = y_pred_Ridge-Y_test
brain_age_delta_SVR = y_pred_SVR-Y_test

# get mean absolute error (MAE)
print("The MAE for testing set using Lasso:", mean_absolute_error(Y_test,y_pred_Lasso))
print("The MAE for testing set using Ridge:", mean_absolute_error(Y_test,y_pred_Ridge))
print("The MAE for testing set using SVR:", mean_absolute_error(Y_test,y_pred_SVR))

# Spearman rank order correlations between models of the same segmentation
print("The Spearman r for Ridge vs Lasso:", stats.spearmanr(brain_age_delta_Ridge, brain_age_delta_Lasso))
print("The Spearman r for Ridge vs SVR:", stats.spearmanr(brain_age_delta_Ridge, brain_age_delta_SVR))
print("The Spearman r for SVR vs Lasso:", stats.spearmanr(brain_age_delta_SVR, brain_age_delta_Lasso))

# the 20 Ridge coefficients with the largest absolute value (ascending by magnitude)
# and their column names; X_train has to be a DataFrame for the name lookup
coeff = np.asarray(model_Ridge.coef_)
coeffabs = np.abs(coeff)
top_coefficients = np.argsort(coeffabs)[-20:]
print(X_train.iloc[:,top_coefficients].columns)
print(coeff[top_coefficients])

# plot figure with x: actual age Y: predicted age, and a line with slope 1 for reference
plt.figure()
plt.scatter(Y_test, y_pred_Ridge, alpha=0.5, s=10, color = "darkcyan")
plt.axline((60,60), slope=1, color='r')
plt.xticks(range(40,90,5), fontsize = 14)
plt.yticks(range(40,90,5), fontsize = 14)
plt.ylabel('Predicted Age', fontsize = 16)
plt.xlabel('Age', fontsize = 16)
plt.show()

# plot the brainAGE against age together with its least-squares trend line.
# The slope is estimated from the plotted data instead of being hard-coded, so
# the line always matches this segmentation. The line is anchored at the mean
# age because axline's anchor point takes part in axis autoscaling.
age_trend = stats.linregress(Y_test, brain_age_delta_Ridge)
mean_age = float(np.mean(Y_test))
plt.figure()
plt.scatter(Y_test,brain_age_delta_Ridge, alpha=0.5, s=10, color = "darkcyan")
plt.axline((mean_age, age_trend.intercept + age_trend.slope * mean_age), slope=age_trend.slope, color="blue")
plt.axline((60,0),slope=0, color = "black")
plt.ylabel("BrainAGE", fontsize = 16)
plt.xlabel("Age", fontsize = 16)
plt.xticks(fontsize = 14)
plt.yticks(fontsize=14)
plt.show()

### 2.2.3 Correction

In [None]:
# reformat the hold-out data for the regression model:
# ages as a single-column design matrix, predictions as a flat target vector
age_cv = np.asarray(Y_test_cv).reshape(-1, 1)
pred_cv = np.asarray(y_pred_cv).ravel()

# now using correction from the R code that's online from Cole:
# regress predicted age on chronological age within the hold-out split.
# With a 1-D target, coef_ has shape (1,) and intercept_ is a scalar, so the
# conversions below never call float() on a multi-dimensional array
# (deprecated since NumPy 1.25).
reg = LinearRegression().fit(age_cv, pred_cv)
coef = float(reg.coef_[0])
intercept = float(reg.intercept_)
print("Coefficient and Intercept:", coef, intercept)

# correction by cole: invert the fitted line, corrected = (predicted - intercept) / slope
corr_brainage_2 = (np.asarray(y_pred_Ridge) - intercept) / coef
corr_brainage_delta_2 = corr_brainage_2-Y_test

# plots corrected: corrected prediction vs. age, with the fitted slope (blue)
# and the identity line (red) for reference
plt.figure()
plt.scatter(Y_test, corr_brainage_2, alpha=0.5, s=10, color="darkcyan")
plt.axline((65,65), slope=coef, color="blue")
plt.ylabel("Predicted Age Corrected", fontsize = 16)
plt.axline((60,60), slope=1, color='r')
plt.xlabel("Age", fontsize = 16)
plt.xticks(range(40,90,5), fontsize = 14)
plt.yticks(range(30,115,10), fontsize = 14)
plt.show()

# corrected brain age gap vs. age, with a zero line for reference
plt.figure()
plt.scatter(Y_test, corr_brainage_delta_2, alpha=0.5, s=10, color="darkcyan")
plt.axline((60,0),slope=0, color = "black")
plt.ylabel("BrainAGE Corrected", fontsize = 16)
plt.xlabel("Age", fontsize = 16)
plt.xticks(fontsize = 14)
plt.yticks(fontsize=14)
plt.show()

## 2.3 Desikan Segmentation

In [None]:
# select variables belonging to the Desikan segmentation:
# every second column of the positional range 845..975 is a feature,
# the last column of the frame is the regression target (presumably Age - confirm).
DESIKAN_COLUMNS = slice(845, 976, 2)
X_train_raw = brain_train.iloc[:, DESIKAN_COLUMNS]
Y_train = brain_train.iloc[:, -1]
X_test_raw = brain_test.iloc[:, DESIKAN_COLUMNS]
Y_test = brain_test.iloc[:, -1]

# standardize x-data using the TRAINING mean/std for both sets. Scaling the test
# set with its own statistics would put it on a different scale than the one the
# models were fitted on. ddof=0 matches the population std used by stats.zscore.
# Plain pandas arithmetic also guarantees the result stays a DataFrame, which the
# coefficient-name lookup in the prediction cell needs.
feature_mean = X_train_raw.mean(axis=0).to_numpy()
feature_std = X_train_raw.std(axis=0, ddof=0).to_numpy()
X_train = (X_train_raw - feature_mean) / feature_std
X_test = (X_test_raw - feature_mean) / feature_std

### 2.3.1 Cross-validation

In [None]:
X_CV = X_train
Y_CV = Y_train

# randomly split the training data into an inner training and hold-out set
# (training data only; original note: "no PA"). The hold-out part is used
# further down to estimate the age-bias correction. random_state pins the split
# explicitly so the correction coefficients do not depend on global RNG state.
X_train_cv, X_test_cv, Y_train_cv, Y_test_cv = train_test_split(X_CV, Y_CV, random_state=888)

# define the models (RidgeCV/LassoCV pick their alpha from alpha_parameters)
model_ridge = RidgeCV(alphas = alpha_parameters)
model_lasso = LassoCV(alphas = alpha_parameters, max_iter=100000)
model_SVR = SVR(kernel = "rbf")

# perform 10-fold cross validation on the full training set
scores_ridge = cross_val_score(model_ridge, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')
scores_lasso = cross_val_score(model_lasso, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')
scores_SVR = cross_val_score(model_SVR, X_CV, Y_CV, cv=10, scoring='neg_mean_absolute_error')

# the scorer returns the NEGATIVE MAE, so flip the sign of the fold average
mae_ridge = -1 * scores_ridge.mean()
mae_lasso = -1 * scores_lasso.mean()
mae_SVR = -1 * scores_SVR.mean()
print("The MAE for cross-validation (Ridge):", mae_ridge)
print("The MAE for cross-validation (Lasso):", mae_lasso)
print("The MAE for cross-validation (SVR):", mae_SVR)

# for correction: Ridge predictions on the hold-out split
# (the correction cell regresses these on the hold-out ages)
model_ridge.fit(X_train_cv,Y_train_cv)
y_pred_cv = model_ridge.predict(X_test_cv)

### 2.3.2 Prediction

In [None]:
# models to try
model_Lasso = LassoCV(alphas = alpha_parameters, max_iter=100000)
model_Ridge = RidgeCV(alphas = alpha_parameters)
model_SVR = SVR(kernel = 'rbf')

# train each model on the full training set
model_Lasso.fit(X_train,Y_train)
model_Ridge.fit(X_train,Y_train)
model_SVR.fit(X_train,Y_train)

# get predicted values for test set
y_pred_Lasso = model_Lasso.predict(X_test)
y_pred_Ridge = model_Ridge.predict(X_test)
y_pred_SVR = model_SVR.predict(X_test)

# calculate brain age gap (predicted minus actual)
brain_age_delta_Lasso = y_pred_Lasso-Y_test
brain_age_delta_Ridge = y_pred_Ridge-Y_test
brain_age_delta_SVR = y_pred_SVR-Y_test

# get mean absolute error (MAE)
print("The MAE for testing set using Lasso:", mean_absolute_error(Y_test,y_pred_Lasso))
print("The MAE for testing set using Ridge:", mean_absolute_error(Y_test,y_pred_Ridge))
print("The MAE for testing set using SVR:", mean_absolute_error(Y_test,y_pred_SVR))

# Spearman rank order correlations between models of the same segmentation
print("The Spearman r for Ridge vs Lasso:", stats.spearmanr(brain_age_delta_Ridge, brain_age_delta_Lasso))
print("The Spearman r for Ridge vs SVR:", stats.spearmanr(brain_age_delta_Ridge, brain_age_delta_SVR))
print("The Spearman r for SVR vs Lasso:", stats.spearmanr(brain_age_delta_SVR, brain_age_delta_Lasso))

# the 20 Ridge coefficients with the largest absolute value (ascending by magnitude)
# and their column names; X_train has to be a DataFrame for the name lookup
coeff = np.asarray(model_Ridge.coef_)
coeffabs = np.abs(coeff)
top_coefficients = np.argsort(coeffabs)[-20:]
print(X_train.iloc[:,top_coefficients].columns)
print(coeff[top_coefficients])

# plot figure with x: actual age Y: predicted age, and a line with slope 1 for reference
plt.figure()
plt.scatter(Y_test, y_pred_Ridge, alpha=0.5, s=10, color = "darkcyan")
plt.axline((60,60), slope=1, color='r')
plt.xticks(range(40,90,5), fontsize = 14)
plt.yticks(range(40,90,5), fontsize = 14)
plt.ylabel('Predicted Age', fontsize = 16)
plt.xlabel('Age', fontsize = 16)
plt.show()

# plot the brainAGE against age together with its least-squares trend line.
# The slope is estimated from the plotted data instead of being hard-coded, so
# the line always matches this segmentation. The line is anchored at the mean
# age because axline's anchor point takes part in axis autoscaling.
age_trend = stats.linregress(Y_test, brain_age_delta_Ridge)
mean_age = float(np.mean(Y_test))
plt.figure()
plt.scatter(Y_test,brain_age_delta_Ridge, alpha=0.5, s=10, color = "darkcyan")
plt.axline((mean_age, age_trend.intercept + age_trend.slope * mean_age), slope=age_trend.slope, color="blue")
plt.axline((60,0),slope=0, color = "black")
plt.ylabel("BrainAGE", fontsize = 16)
plt.xlabel("Age", fontsize = 16)
plt.xticks(fontsize = 14)
plt.yticks(fontsize=14)
plt.show()

### 2.3.3 Correction

In [None]:
# reformat the hold-out data for the regression model:
# ages as a single-column design matrix, predictions as a flat target vector
age_cv = np.asarray(Y_test_cv).reshape(-1, 1)
pred_cv = np.asarray(y_pred_cv).ravel()

# now using correction from the R code that's online from Cole:
# regress predicted age on chronological age within the hold-out split.
# With a 1-D target, coef_ has shape (1,) and intercept_ is a scalar, so the
# conversions below never call float() on a multi-dimensional array
# (deprecated since NumPy 1.25).
reg = LinearRegression().fit(age_cv, pred_cv)
coef = float(reg.coef_[0])
intercept = float(reg.intercept_)
print("Coefficient and Intercept:", coef, intercept)

# correction by cole: invert the fitted line, corrected = (predicted - intercept) / slope
corr_brainage_3 = (np.asarray(y_pred_Ridge) - intercept) / coef
corr_brainage_delta_3 = corr_brainage_3-Y_test

# plots corrected: corrected prediction vs. age, with the fitted slope (blue)
# and the identity line (red) for reference
plt.figure()
plt.scatter(Y_test, corr_brainage_3, alpha=0.5, s=10, color="darkcyan")
plt.axline((65,65), slope=coef, color="blue")
plt.ylabel("Predicted Age Corrected", fontsize = 16)
plt.axline((60,60), slope=1, color='r')
plt.xlabel("Age", fontsize = 16)
plt.xticks(range(40,90,5), fontsize = 14)
plt.yticks(range(30,115,10), fontsize = 14)
plt.show()

# corrected brain age gap vs. age, with a zero line for reference
plt.figure()
plt.scatter(Y_test, corr_brainage_delta_3, alpha=0.5, s=10, color="darkcyan")
plt.axline((60,0),slope=0, color = "black")
plt.ylabel("BrainAGE Corrected", fontsize = 16)
plt.xlabel("Age", fontsize = 16)
plt.xticks(fontsize = 14)
plt.yticks(fontsize=14)
plt.show()

# 3 Add BrainAge Deltas to main_test

In [None]:
# Store the corrected brain-age delta of each segmentation as a new column of
# main_test (_1 = A2009s, _2 = ASEG, _3 = Desikan).
# NOTE(review): the deltas carry the index of brain_test's target column, so this
# assumes main_test shares that index - confirm.
corrected_deltas = {
    "BADA2009": corr_brainage_delta_1,
    "BADASEG": corr_brainage_delta_2,
    "BADDesikan": corr_brainage_delta_3,
}
for column_name, delta_values in corrected_deltas.items():
    main_test[column_name] = delta_values

# 4 Correlation between deltas of different segmentations

In [None]:
# Spearman rank correlation between the corrected deltas of every pair of
# segmentations (_1 = A2009s, labelled Destrieux; _2 = ASEG; _3 = Desikan).
delta_pairs = [
    ("Correlation ASEG x Destrieux:", corr_brainage_delta_2, corr_brainage_delta_1),
    ("Correlation ASEG x Desikan:", corr_brainage_delta_2, corr_brainage_delta_3),
    ("Correlation Desikan x Destreux:", corr_brainage_delta_3, corr_brainage_delta_1),
]
for pair_label, first_delta, second_delta in delta_pairs:
    print(pair_label, stats.spearmanr(first_delta, second_delta))

# 5 Pickle data for next script

In [None]:
# pickle results
main_test.to_pickle("2_main_test.pkl")
main_train.to_pickle("2_main_train.pkl")