In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from scipy.stats import pearsonr, sem

In [2]:
def get_X_y_math_verbal(conn_mat_filename: str, general_scores_filename: str, math_scores_filename: str, verbal_scores_filename: str):
    """Load the connectivity features and the general / math / verbal score vectors.

    Parameters
    ----------
    conn_mat_filename : str
        Path to a ``.npy`` file holding a 3-D array of shape
        (n_regions, n_regions, n_subjects): one square connectivity matrix per subject,
        stacked along the last axis.
    general_scores_filename, math_scores_filename, verbal_scores_filename : str
        Paths to header-less CSV files with the scores.

    Returns
    -------
    X : pandas.DataFrame
        One row per subject; the columns are the entries strictly above the diagonal of
        that subject's matrix, in the row-major order produced by ``np.triu_indices``.
    y, y_math, y_verbal : numpy.ndarray
        The contents of the three CSV files as NumPy arrays.

    Raises
    ------
    ValueError
        If the loaded connectivity array is not 3-D with square matrices on its first two axes.
    """
    conn_mat = np.load(conn_mat_filename)
    if conn_mat.ndim != 3 or conn_mat.shape[0] != conn_mat.shape[1]:
        raise ValueError(
            f"expected an array of shape (n_regions, n_regions, n_subjects), got {conn_mat.shape}"
        )

    # Indices of the strict upper triangle (k=1 skips the main diagonal). The size is taken
    # from the data rather than hard-coded, so any parcellation size is supported.
    rows, cols = np.triu_indices(conn_mat.shape[0], k=1)

    # Fancy-indexing the first two axes yields (n_edges, n_subjects); transpose so that
    # each subject becomes one row. A DataFrame is used for the downstream pipeline.
    X = pd.DataFrame(np.ascontiguousarray(conn_mat[rows, cols, :].T))

    # Read the scores (the CSV files have no header row)
    y = pd.read_csv(general_scores_filename, header=None).to_numpy()
    y_math = pd.read_csv(math_scores_filename, header=None).to_numpy()
    y_verbal = pd.read_csv(verbal_scores_filename, header=None).to_numpy()

    return X, y, y_math, y_verbal

In [3]:
# Load the connectivity features and the three score vectors from the files under data/
X, y, y_math, y_verbal = get_X_y_math_verbal(
    conn_mat_filename="data/Glasser_conn_mat_158_subj.npy",
    general_scores_filename="data/general_scores_158_subj.csv",
    math_scores_filename="data/math_scores_158_subj.csv",
    verbal_scores_filename="data/verbal_scores_158_subj.csv",
)

In [4]:
# Preview the feature matrix: one row per subject, one column per upper-triangle connectivity value
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64610,64611,64612,64613,64614,64615,64616,64617,64618,64619
0,0.256043,0.543582,0.834341,0.737259,0.611785,0.551843,0.268657,0.506335,0.347711,0.313753,...,0.048507,-0.001441,0.037499,0.057737,0.108180,0.056225,0.042079,0.084301,0.041814,0.307135
1,0.483221,0.612067,0.912712,0.845826,0.808821,0.462722,0.192100,0.297347,0.760023,0.656678,...,0.455484,0.171413,0.093261,0.365236,0.236431,0.193829,0.247808,0.266242,0.295980,0.532271
2,0.126272,0.541725,0.894314,0.881399,0.768822,0.541432,0.282807,0.175312,0.360123,0.392476,...,0.291799,0.033146,0.026356,0.229688,0.116440,0.049035,0.071814,0.219577,0.078079,0.439387
3,0.528645,0.548732,0.897949,0.874365,0.722720,0.452277,0.589948,0.519460,0.393255,0.331269,...,0.277162,0.061410,0.055927,0.003119,0.063941,0.240621,0.062928,0.136225,0.082756,0.281293
4,0.408504,0.533769,0.918164,0.885054,0.722196,0.653343,0.433824,-0.006783,0.548261,0.341645,...,0.194007,-0.084169,-0.293965,-0.067129,0.015106,-0.066016,0.092298,0.061617,0.007905,0.311022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,0.497834,0.592031,0.945841,0.886041,0.737326,0.665670,0.324835,0.187993,0.534930,0.465906,...,0.435125,0.082796,0.125401,0.225401,0.045384,0.137809,0.239056,0.178493,0.124486,0.334965
154,0.274340,0.601339,0.899841,0.832969,0.679898,0.506748,0.209305,0.273583,0.391672,0.145301,...,0.274637,0.080526,0.021162,0.204128,0.080962,0.033422,0.177067,0.244266,0.238470,0.499744
155,0.649097,0.548860,0.792040,0.734714,0.501993,0.564024,0.376806,0.250711,0.529989,0.315082,...,0.481008,0.202375,-0.087569,-0.172030,0.112162,0.011920,-0.065025,-0.027739,0.059129,0.414311
156,0.457167,0.697228,0.949361,0.923404,0.861765,0.720488,0.334928,0.192592,0.507035,0.459623,...,0.261736,0.200122,0.139826,0.057603,0.277383,0.374788,0.009510,0.280210,0.196362,0.292409


In [5]:
# General scores as read from the CSV (2-D array with a single column)
y

array([[698.],
       [623.],
       [664.],
       [776.],
       [663.],
       [620.],
       [698.],
       [770.],
       [729.],
       [732.],
       [750.],
       [600.],
       [730.],
       [667.],
       [595.],
       [697.],
       [672.],
       [699.],
       [728.],
       [581.],
       [689.],
       [650.],
       [729.],
       [726.],
       [742.],
       [695.],
       [741.],
       [695.],
       [577.],
       [600.],
       [664.],
       [712.],
       [729.],
       [734.],
       [745.],
       [647.],
       [709.],
       [670.],
       [747.],
       [671.],
       [596.],
       [738.],
       [786.],
       [706.],
       [689.],
       [670.],
       [780.],
       [698.],
       [663.],
       [734.],
       [667.],
       [723.],
       [768.],
       [750.],
       [740.],
       [738.],
       [734.],
       [729.],
       [745.],
       [585.],
       [692.],
       [669.],
       [737.],
       [711.],
       [730.],
       [728.],
       [64

In [6]:
# Math scores as read from the CSV (2-D array with a single column)
y_math

array([[149.],
       [133.],
       [134.],
       [143.],
       [108.],
       [110.],
       [145.],
       [150.],
       [135.],
       [140.],
       [142.],
       [120.],
       [141.],
       [137.],
       [106.],
       [138.],
       [113.],
       [138.],
       [148.],
       [117.],
       [145.],
       [113.],
       [139.],
       [130.],
       [144.],
       [140.],
       [138.],
       [145.],
       [117.],
       [ 90.],
       [110.],
       [129.],
       [139.],
       [130.],
       [145.],
       [112.],
       [138.],
       [132.],
       [136.],
       [124.],
       [110.],
       [150.],
       [148.],
       [138.],
       [148.],
       [148.],
       [150.],
       [147.],
       [132.],
       [145.],
       [130.],
       [137.],
       [148.],
       [142.],
       [149.],
       [140.],
       [138.],
       [141.],
       [135.],
       [117.],
       [136.],
       [125.],
       [146.],
       [137.],
       [150.],
       [132.],
       [ 9

In [7]:
# Verbal scores as read from the CSV (2-D array with a single column)
y_verbal

array([[124.],
       [121.],
       [120.],
       [148.],
       [142.],
       [100.],
       [130.],
       [135.],
       [130.],
       [148.],
       [143.],
       [111.],
       [137.],
       [148.],
       [122.],
       [129.],
       [145.],
       [130.],
       [132.],
       [105.],
       [130.],
       [132.],
       [134.],
       [150.],
       [136.],
       [120.],
       [148.],
       [116.],
       [113.],
       [134.],
       [120.],
       [145.],
       [148.],
       [145.],
       [142.],
       [129.],
       [133.],
       [111.],
       [148.],
       [131.],
       [117.],
       [147.],
       [150.],
       [130.],
       [125.],
       [125.],
       [148.],
       [134.],
       [130.],
       [132.],
       [120.],
       [144.],
       [147.],
       [140.],
       [140.],
       [140.],
       [146.],
       [135.],
       [145.],
       [116.],
       [130.],
       [134.],
       [137.],
       [139.],
       [145.],
       [148.],
       [14

In [8]:
# Mean of each score vector, rounded to two decimals
for score_label, score_values in (("General", y), ("Math", y_math), ("Verbal", y_verbal)):
    print(f"Mean {score_label} Score: {np.round(np.mean(score_values), 2)}")

Mean General Score: 682.69
Mean Math Score: 132.18
Mean Verbal Score: 130.85


In [9]:
# Standard deviation of each score vector, rounded to two decimals
# (np.std uses its default ddof=0, i.e. the population standard deviation)
for score_label, score_values in (("General", y), ("Math", y_math), ("Verbal", y_verbal)):
    print(f"Standard deviation {score_label} Score: {np.round(np.std(score_values), 2)}")

Standard deviation General Score: 60.07
Standard deviation Math Score: 14.32
Standard deviation Verbal Score: 13.35


# Linear Regression - General Scores (for comparison)

In [14]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

lr = LinearRegression()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(lr, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
lr.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = lr.predict(X_test_pca)
test_mse_lr = mean_squared_error(y_test, y_test_predicted)
r_lr, p_val_lr = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_lr, 3)}, r: {np.round(r_lr, 3)}, p value: {np.round(p_val_lr, 3)}")

cross validation negative MSE score:  [-1.28 -0.67 -0.68 -0.76 -1.81 -1.55 -1.16 -0.62 -1.39 -0.52]
mean MSE across folds: 1.04
MSE standard error across folds: 0.14
Test MSE: 1.47, r: 0.052, p value: 0.85


# Linear Regression - Math Scores

In [15]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_math, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

lr = LinearRegression()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(lr, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
lr.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = lr.predict(X_test_pca)
test_mse_lr = mean_squared_error(y_test, y_test_predicted)
r_lr, p_val_lr = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_lr, 3)}, r: {np.round(r_lr, 3)}, p value: {np.round(p_val_lr, 3)}")

cross validation negative MSE score:  [-0.58 -0.75 -1.85 -0.69 -1.56 -1.58 -1.47 -0.67 -1.52 -0.65]
mean MSE across folds: 1.13
MSE standard error across folds: 0.16
Test MSE: 1.009, r: 0.203, p value: 0.45


# Linear Regression - Verbal Scores

In [16]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_verbal, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

lr = LinearRegression()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(lr, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
lr.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = lr.predict(X_test_pca)
test_mse_lr = mean_squared_error(y_test, y_test_predicted)
r_lr, p_val_lr = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_lr, 3)}, r: {np.round(r_lr, 3)}, p value: {np.round(p_val_lr, 3)}")

cross validation negative MSE score:  [-1.47 -1.14 -1.36 -1.   -1.63 -0.72 -1.27 -0.68 -0.86 -0.78]
mean MSE across folds: 1.09
MSE standard error across folds: 0.11
Test MSE: 1.146, r: -0.011, p value: 0.969


# Ridge - General scores for comparison

In [17]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Ridge regression with scikit-learn's default settings (the penalty strength is not tuned)
rdg = Ridge()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(rdg, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
rdg.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = rdg.predict(X_test_pca)
test_mse_rdg = mean_squared_error(y_test, y_test_predicted)
r_rdg, p_val_rdg = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_rdg, 3)}, r: {np.round(r_rdg, 3)}, p value: {np.round(p_val_rdg, 3)}")

cross validation negative MSE score:  [-1.28 -0.67 -0.68 -0.76 -1.81 -1.55 -1.16 -0.62 -1.39 -0.52]
mean MSE across folds: 1.04
MSE standard error across folds: 0.14
Test MSE: 1.47, r: 0.052, p value: 0.85


# Ridge - Math scores

In [18]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_math, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Ridge regression with scikit-learn's default settings (the penalty strength is not tuned)
rdg = Ridge()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(rdg, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
rdg.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = rdg.predict(X_test_pca)
test_mse_rdg = mean_squared_error(y_test, y_test_predicted)
r_rdg, p_val_rdg = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_rdg, 3)}, r: {np.round(r_rdg, 3)}, p value: {np.round(p_val_rdg, 3)}")

cross validation negative MSE score:  [-0.58 -0.75 -1.85 -0.69 -1.56 -1.58 -1.47 -0.67 -1.52 -0.65]
mean MSE across folds: 1.13
MSE standard error across folds: 0.16
Test MSE: 1.009, r: 0.203, p value: 0.45


# Ridge - verbal scores

In [19]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_verbal, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Ridge regression with scikit-learn's default settings (the penalty strength is not tuned)
rdg = Ridge()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(rdg, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
rdg.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = rdg.predict(X_test_pca)
test_mse_rdg = mean_squared_error(y_test, y_test_predicted)
r_rdg, p_val_rdg = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_rdg, 3)}, r: {np.round(r_rdg, 3)}, p value: {np.round(p_val_rdg, 3)}")

cross validation negative MSE score:  [-1.47 -1.14 -1.36 -1.   -1.63 -0.72 -1.27 -0.68 -0.86 -0.78]
mean MSE across folds: 1.09
MSE standard error across folds: 0.11
Test MSE: 1.146, r: -0.011, p value: 0.969


# Lasso - general scores for comparison

In [20]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Lasso with scikit-learn's default settings (the penalty strength is not tuned).
# Lasso-specific names are used so the linear-regression variables (`lr`, `test_mse_lr`,
# `r_lr`, `p_val_lr`) set by the earlier cells are not overwritten.
lasso = Lasso()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(lasso, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
lasso.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = lasso.predict(X_test_pca)
test_mse_lasso = mean_squared_error(y_test, y_test_predicted)
r_lasso, p_val_lasso = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_lasso, 3)}, r: {np.round(r_lasso, 3)}, p value: {np.round(p_val_lasso, 3)}")

cross validation negative MSE score:  [-1.2  -0.72 -0.79 -0.78 -1.63 -1.59 -1.11 -0.57 -1.35 -0.48]
mean MSE across folds: 1.02
MSE standard error across folds: 0.13
Test MSE: 1.399, r: 0.191, p value: 0.479


# Lasso - Math scores

In [21]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_math, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Lasso with scikit-learn's default settings (the penalty strength is not tuned).
# Lasso-specific names are used so the linear-regression variables (`lr`, `test_mse_lr`,
# `r_lr`, `p_val_lr`) set by the earlier cells are not overwritten.
lasso = Lasso()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(lasso, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
lasso.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = lasso.predict(X_test_pca)
test_mse_lasso = mean_squared_error(y_test, y_test_predicted)
r_lasso, p_val_lasso = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_lasso, 3)}, r: {np.round(r_lasso, 3)}, p value: {np.round(p_val_lasso, 3)}")

cross validation negative MSE score:  [-0.57 -0.78 -1.57 -0.68 -1.29 -1.35 -1.3  -0.52 -1.47 -0.6 ]
mean MSE across folds: 1.01
MSE standard error across folds: 0.13
Test MSE: 0.991, r: 0.202, p value: 0.452


# Lasso - verbal scores

In [22]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_verbal, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Lasso with scikit-learn's default settings (the penalty strength is not tuned).
# Lasso-specific names are used so the linear-regression variables (`lr`, `test_mse_lr`,
# `r_lr`, `p_val_lr`) set by the earlier cells are not overwritten.
lasso = Lasso()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(lasso, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
lasso.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = lasso.predict(X_test_pca)
test_mse_lasso = mean_squared_error(y_test, y_test_predicted)
r_lasso, p_val_lasso = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_lasso, 3)}, r: {np.round(r_lasso, 3)}, p value: {np.round(p_val_lasso, 3)}")

cross validation negative MSE score:  [-1.43 -1.12 -1.52 -0.93 -1.51 -0.7  -1.23 -0.67 -0.76 -0.68]
mean MSE across folds: 1.05
MSE standard error across folds: 0.11
Test MSE: 1.072, r: 0.072, p value: 0.79


# Elastic net - general scores for comparison

In [23]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Elastic net with scikit-learn's default settings (the penalty is not tuned).
# Model-specific names are used so the linear-regression variables (`lr`, `test_mse_lr`,
# `r_lr`, `p_val_lr`) set by the earlier cells are not overwritten.
enet = ElasticNet()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(enet, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
enet.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = enet.predict(X_test_pca)
test_mse_enet = mean_squared_error(y_test, y_test_predicted)
r_enet, p_val_enet = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_enet, 3)}, r: {np.round(r_enet, 3)}, p value: {np.round(p_val_enet, 3)}")

cross validation negative MSE score:  [-1.22 -0.7  -0.72 -0.78 -1.68 -1.56 -1.11 -0.59 -1.36 -0.5 ]
mean MSE across folds: 1.02
MSE standard error across folds: 0.13
Test MSE: 1.428, r: 0.127, p value: 0.639


# Elastic net - math scores

In [24]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_math, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Elastic net with scikit-learn's default settings (the penalty is not tuned).
# Model-specific names are used so the linear-regression variables (`lr`, `test_mse_lr`,
# `r_lr`, `p_val_lr`) set by the earlier cells are not overwritten.
enet = ElasticNet()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(enet, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
enet.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = enet.predict(X_test_pca)
test_mse_enet = mean_squared_error(y_test, y_test_predicted)
r_enet, p_val_enet = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_enet, 3)}, r: {np.round(r_enet, 3)}, p value: {np.round(p_val_enet, 3)}")

cross validation negative MSE score:  [-0.58 -0.75 -1.7  -0.69 -1.36 -1.44 -1.38 -0.55 -1.49 -0.61]
mean MSE across folds: 1.05
MSE standard error across folds: 0.14
Test MSE: 1.003, r: 0.195, p value: 0.469


# Elastic net - verbal scores

In [25]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_verbal, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 20 principal components (fit on train, applied to test)
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Elastic net with scikit-learn's default settings (the penalty is not tuned).
# Model-specific names are used so the linear-regression variables (`lr`, `test_mse_lr`,
# `r_lr`, `p_val_lr`) set by the earlier cells are not overwritten.
enet = ElasticNet()

# Report 10-fold cross-validation scores on the training split (MSE in standardized score units).
# NOTE(review): the scalers and PCA were fit on the whole training split, so each validation
# fold already influenced the preprocessing; a Pipeline inside cross_val_score would avoid this.
cross_val_score_list = cross_val_score(enet, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
enet.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_test_predicted = enet.predict(X_test_pca)
test_mse_enet = mean_squared_error(y_test, y_test_predicted)
r_enet, p_val_enet = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_enet, 3)}, r: {np.round(r_enet, 3)}, p value: {np.round(p_val_enet, 3)}")

cross validation negative MSE score:  [-1.45 -1.14 -1.46 -0.96 -1.56 -0.71 -1.27 -0.68 -0.8  -0.74]
mean MSE across folds: 1.08
MSE standard error across folds: 0.11
Test MSE: 1.11, r: 0.021, p value: 0.939


# Random Forest - general scores for comparison

In [26]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 75 principal components (fit on train, applied to test)
pca = PCA(n_components=75, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Hyperparameter grid explored by GridSearchCV (3 * 3 * 3 * 3 * 2 = 162 combinations)
param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
}

# Per-outer-fold results
r_vec = []
p_value_vec = []
fold_mse_train = []
fold_mse_val = []
fold_config = []

# Outer 10-fold split of the training data; each outer fold runs its own inner grid search.
# NOTE(review): the scalers and PCA were fit on the whole training split, so the validation
# folds below already influenced the preprocessing.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

for train_index, val_index in kf.split(X_train_pca):
    X_train_fold, X_val_fold = X_train_pca[train_index], X_train_pca[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Inner 10-fold grid search on the outer-training part only
    random_forest = RandomForestRegressor(random_state=0)
    grid_search = GridSearchCV(
        random_forest,
        param_grid=param_grid_rf,
        scoring="neg_mean_squared_error",
        cv=10,
        n_jobs=-1,
    )
    grid_search.fit(X_train_fold, y_train_fold)

    # Predict with the best configuration found by the grid search for this fold
    best_estimator = grid_search.best_estimator_
    y_train_predicted = best_estimator.predict(X_train_fold)
    y_predicted = best_estimator.predict(X_val_fold)
    fold_config.append(grid_search.best_params_)
    train_mse, val_mse = mean_squared_error(y_train_fold, y_train_predicted), mean_squared_error(y_val_fold, y_predicted)

    r, p_value = pearsonr(y_predicted, y_val_fold)
    r_vec.append(r)
    p_value_vec.append(p_value)
    fold_mse_train.append(train_mse)
    fold_mse_val.append(val_mse)

print("mean r: ", np.round(np.mean(r_vec), 2), ", mean p-value: ", np.round(np.mean(p_value_vec), 2))

print(f"min MSE: {np.round(np.min(fold_mse_val), 2)} in fold {np.argmin(fold_mse_val) + 1}")
# Take the configuration from the outer fold with the lowest validation MSE.
# NOTE(review): that minimum also depends on which subjects landed in the fold's validation
# set, not only on the configuration, so this is a noisy way to pick hyperparameters.
config = fold_config[np.argmin(fold_mse_val)]
print(config)

# Random forest regressor with the chosen configuration
chosen_random_forest = RandomForestRegressor(random_state=0, **config)

# Report 10-fold cross-validation scores of the chosen configuration on the training split
cross_val_score_list = cross_val_score(chosen_random_forest, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
chosen_random_forest.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_pred = chosen_random_forest.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
r, p_value = pearsonr(y_test, y_pred)
print(f"MSE: {np.round(mse, 3)}, r: {np.round(r, 3)}, p value: {np.round(p_value, 3)}")

mean r:  0.13 , mean p-value:  0.49
min MSE: 0.58 in fold 4
{'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
cross validation negative MSE score:  [-1.12 -0.75 -1.14 -0.75 -1.31 -1.6  -1.05 -0.77 -1.39 -0.35]
mean MSE across folds: 1.02
MSE standard error across folds: 0.12
MSE: 1.389, r: 0.235, p value: 0.381


# Random Forest - math scores

In [27]:
# Hold out 10% of the subjects as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y_math, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Standardize the scores with their own scaler, so `scaler` keeps the feature statistics
# instead of being refit on the targets
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train).ravel()
y_test = target_scaler.transform(y_test).ravel()

# Reduce the features to 75 principal components (fit on train, applied to test)
pca = PCA(n_components=75, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Hyperparameter grid explored by GridSearchCV (3 * 3 * 3 * 3 * 2 = 162 combinations)
param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
}

# Per-outer-fold results
r_vec = []
p_value_vec = []
fold_mse_train = []
fold_mse_val = []
fold_config = []

# Outer 10-fold split of the training data; each outer fold runs its own inner grid search.
# NOTE(review): the scalers and PCA were fit on the whole training split, so the validation
# folds below already influenced the preprocessing.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

for train_index, val_index in kf.split(X_train_pca):
    X_train_fold, X_val_fold = X_train_pca[train_index], X_train_pca[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Inner 10-fold grid search on the outer-training part only
    random_forest = RandomForestRegressor(random_state=0)
    grid_search = GridSearchCV(
        random_forest,
        param_grid=param_grid_rf,
        scoring="neg_mean_squared_error",
        cv=10,
        n_jobs=-1,
    )
    grid_search.fit(X_train_fold, y_train_fold)

    # Predict with the best configuration found by the grid search for this fold
    best_estimator = grid_search.best_estimator_
    y_train_predicted = best_estimator.predict(X_train_fold)
    y_predicted = best_estimator.predict(X_val_fold)
    fold_config.append(grid_search.best_params_)
    train_mse, val_mse = mean_squared_error(y_train_fold, y_train_predicted), mean_squared_error(y_val_fold, y_predicted)

    r, p_value = pearsonr(y_predicted, y_val_fold)
    r_vec.append(r)
    p_value_vec.append(p_value)
    fold_mse_train.append(train_mse)
    fold_mse_val.append(val_mse)

print("mean r: ", np.round(np.mean(r_vec), 2), ", mean p-value: ", np.round(np.mean(p_value_vec), 2))

print(f"min MSE: {np.round(np.min(fold_mse_val), 2)} in fold {np.argmin(fold_mse_val) + 1}")
# Take the configuration from the outer fold with the lowest validation MSE.
# NOTE(review): that minimum also depends on which subjects landed in the fold's validation
# set, not only on the configuration, so this is a noisy way to pick hyperparameters.
config = fold_config[np.argmin(fold_mse_val)]
print(config)

# Random forest regressor with the chosen configuration
chosen_random_forest = RandomForestRegressor(random_state=0, **config)

# Report 10-fold cross-validation scores of the chosen configuration on the training split
cross_val_score_list = cross_val_score(chosen_random_forest, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error")
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-1 * np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Refit on the full training split
chosen_random_forest.fit(X_train_pca, y_train)

# Evaluate on the held-out test set: MSE and Pearson correlation of true vs. predicted scores
y_pred = chosen_random_forest.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
r, p_value = pearsonr(y_test, y_pred)
print(f"MSE: {np.round(mse, 3)}, r: {np.round(r, 3)}, p value: {np.round(p_value, 3)}")

mean r:  0.18 , mean p-value:  0.51
min MSE: 0.38 in fold 5
{'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
cross validation negative MSE score:  [-0.63 -1.2  -1.69 -0.58 -1.07 -1.48 -0.95 -0.7  -1.47 -0.56]
mean MSE across folds: 1.03
MSE standard error across folds: 0.13
MSE: 0.853, r: 0.66, p value: 0.005


# Random Forest - verbal scores

In [28]:
# Hold out 10% of the subjects as a test set; the target in this cell is the verbal score
X_train, X_test, y_train, y_test = train_test_split(X, y_verbal, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The same scaler object is refit on the training scores, so from this point on
# `scaler` carries the target statistics rather than the feature statistics
y_train = scaler.fit_transform(y_train).ravel()
y_test = scaler.transform(y_test).ravel()

# Dimensionality reduction: fit 75 principal components on the training split,
# then project the test split onto those same components
pca = PCA(n_components=75, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# NOTE(review): the scaler and the PCA are fitted on the whole training split before the
# K-fold loops below, so the validation folds have already contributed to the
# preprocessing - confirm this is acceptable for the reported CV scores

# Hyperparameter grid explored by the inner grid search
param_grid_rf = dict(
    n_estimators=[50, 100, 200],  # number of trees in the forest
    max_depth=[None, 5, 10],  # maximum depth of each tree
    min_samples_split=[2, 5, 10],  # minimum samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],  # minimum samples required at a leaf node
    max_features=['sqrt', 'log2'],  # features considered when searching for the best split
)

# Per-fold bookkeeping for the outer loop
r_vec, p_value_vec = [], []
fold_mse_train, fold_mse_val = [], []
fold_config = []

kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Outer loop: tune on nine tenths of the training split, score on the remaining tenth
for train_index, val_index in kf.split(X_train_pca):
    X_train_fold, y_train_fold = X_train_pca[train_index], y_train[train_index]
    X_val_fold, y_val_fold = X_train_pca[val_index], y_train[val_index]

    # Inner 10-fold grid search over the random forest hyperparameters
    random_forest = RandomForestRegressor(random_state=0)
    grid_search = GridSearchCV(
        estimator=random_forest,
        param_grid=param_grid_rf,
        scoring="neg_mean_squared_error",
        cv=10,
        n_jobs=-1,
    )
    grid_search.fit(X_train_fold, y_train_fold)
    fold_config.append(grid_search.best_params_)

    # Score the refitted winner on the data it was tuned on ...
    best_estimator = grid_search.best_estimator_
    y_train_predicted = best_estimator.predict(X_train_fold)
    train_mse = mean_squared_error(y_train_fold, y_train_predicted)
    fold_mse_train.append(train_mse)

    # ... and on the validation part of this outer fold
    y_predicted = best_estimator.predict(X_val_fold)
    val_mse = mean_squared_error(y_val_fold, y_predicted)
    fold_mse_val.append(val_mse)

    # Correlation between predicted and observed validation scores
    r, p_value = pearsonr(y_predicted, y_val_fold)
    r_vec.append(r)
    p_value_vec.append(p_value)

# Averages over the outer folds
mean_r = np.round(np.mean(r_vec), 2)
mean_p_value = np.round(np.mean(p_value_vec), 2)
print("mean r: ", mean_r, ", mean p-value: ", mean_p_value)

# Outer fold with the lowest validation MSE (reported 1-based)
best_fold = np.argmin(fold_mse_val)
print(f"min MSE: {np.round(np.min(fold_mse_val), 2)} in fold {best_fold + 1}")

# Keep the hyperparameters that the grid search selected in that fold
# NOTE(review): each outer fold is validated on different samples, so the lowest fold MSE
# may reflect the fold rather than the configuration - confirm this selection rule is intended
config = fold_config[best_fold]
print(config)

# Random forest built from the selected hyperparameters
chosen_random_forest = RandomForestRegressor(random_state=0, **config)

# 10-fold cross-validation of the selected configuration on the whole training split
cross_val_score_list = cross_val_score(
    chosen_random_forest, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error"
)
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", np.round(-np.mean(cross_val_score_list), 2))
print("MSE standard error across folds:", np.round(sem(cross_val_score_list), 2))

# Train on the whole training split
chosen_random_forest.fit(X_train_pca, y_train)

# Evaluate once on the held-out test split
y_pred = chosen_random_forest.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
r, p_value = pearsonr(y_test, y_pred)
print(f"MSE: {np.round(mse, 3)}, r: {np.round(r, 3)}, p value: {np.round(p_value, 3)}")


mean r:  0.09 , mean p-value:  0.59
min MSE: 0.47 in fold 5
{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
cross validation negative MSE score:  [-1.56 -1.05 -1.64 -0.98 -1.33 -0.74 -1.25 -0.72 -0.62 -0.56]
mean MSE across folds: 1.04
MSE standard error across folds: 0.12
MSE: 1.054, r: 0.33, p value: 0.212


# Gradient boosting - general scores for comparison

In [29]:
# Hold out 10% of the subjects as a test set; the target in this cell is the general score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The same scaler object is refit on the training scores, so from this point on
# `scaler` carries the target statistics rather than the feature statistics
y_train = scaler.fit_transform(y_train).ravel()
y_test = scaler.transform(y_test).ravel()

# Dimensionality reduction: fit 20 principal components on the training split,
# then project the test split onto those same components
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# NOTE(review): the scaler and the PCA are fitted on the whole training split before the
# cross-validation below, so the validation folds have already contributed to the
# preprocessing - confirm this is acceptable for the reported CV scores

# Gradient boosting with library defaults; only the seed is fixed
gbr = GradientBoostingRegressor(random_state=0)

# 10-fold cross-validation on the training split (scores are negative MSE)
cross_val_score_list = cross_val_score(
    gbr, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error"
)
cv_mse_mean = np.round(-np.mean(cross_val_score_list), 2)
cv_mse_sem = np.round(sem(cross_val_score_list), 2)
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", cv_mse_mean)
print("MSE standard error across folds:", cv_mse_sem)

# Train on the whole training split
gbr.fit(X_train_pca, y_train)

# Evaluate once on the held-out test split
y_test_predicted = gbr.predict(X_test_pca)
test_mse_gbr = mean_squared_error(y_test, y_test_predicted)
r_gbr, p_val_gbr = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_gbr, 3)}, r: {np.round(r_gbr, 3)}, p value: {np.round(p_val_gbr, 3)}")

cross validation negative MSE score:  [-1.5  -0.6  -1.45 -0.86 -1.44 -1.64 -1.36 -0.82 -1.41 -0.73]
mean MSE across folds: 1.18
MSE standard error across folds: 0.12
Test MSE: 1.358, r: 0.299, p value: 0.26


# Gradient Boosting - math scores

In [30]:
# Hold out 10% of the subjects as a test set; the target in this cell is the math score
X_train, X_test, y_train, y_test = train_test_split(X, y_math, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The same scaler object is refit on the training scores, so from this point on
# `scaler` carries the target statistics rather than the feature statistics
y_train = scaler.fit_transform(y_train).ravel()
y_test = scaler.transform(y_test).ravel()

# Dimensionality reduction: fit 20 principal components on the training split,
# then project the test split onto those same components
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# NOTE(review): the scaler and the PCA are fitted on the whole training split before the
# cross-validation below, so the validation folds have already contributed to the
# preprocessing - confirm this is acceptable for the reported CV scores

# Gradient boosting with library defaults; only the seed is fixed
gbr = GradientBoostingRegressor(random_state=0)

# 10-fold cross-validation on the training split (scores are negative MSE)
cross_val_score_list = cross_val_score(
    gbr, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error"
)
cv_mse_mean = np.round(-np.mean(cross_val_score_list), 2)
cv_mse_sem = np.round(sem(cross_val_score_list), 2)
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", cv_mse_mean)
print("MSE standard error across folds:", cv_mse_sem)

# Train on the whole training split
gbr.fit(X_train_pca, y_train)

# Evaluate once on the held-out test split
y_test_predicted = gbr.predict(X_test_pca)
test_mse_gbr = mean_squared_error(y_test, y_test_predicted)
r_gbr, p_val_gbr = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_gbr, 3)}, r: {np.round(r_gbr, 3)}, p value: {np.round(p_val_gbr, 3)}")

cross validation negative MSE score:  [-0.32 -1.26 -2.18 -0.57 -1.44 -1.93 -1.31 -0.71 -1.5  -0.62]
mean MSE across folds: 1.18
MSE standard error across folds: 0.19
Test MSE: 0.903, r: 0.309, p value: 0.244


# Gradient boosting - verbal scores

In [31]:
# Hold out 10% of the subjects as a test set; the target in this cell is the verbal score
X_train, X_test, y_train, y_test = train_test_split(X, y_verbal, test_size=0.1, random_state=0)

# Standardize the features with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The same scaler object is refit on the training scores, so from this point on
# `scaler` carries the target statistics rather than the feature statistics
y_train = scaler.fit_transform(y_train).ravel()
y_test = scaler.transform(y_test).ravel()

# Dimensionality reduction: fit 20 principal components on the training split,
# then project the test split onto those same components
pca = PCA(n_components=20, svd_solver="full", random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# NOTE(review): the scaler and the PCA are fitted on the whole training split before the
# cross-validation below, so the validation folds have already contributed to the
# preprocessing - confirm this is acceptable for the reported CV scores

# Gradient boosting with library defaults; only the seed is fixed
gbr = GradientBoostingRegressor(random_state=0)

# 10-fold cross-validation on the training split (scores are negative MSE)
cross_val_score_list = cross_val_score(
    gbr, X_train_pca, y_train, cv=10, scoring="neg_mean_squared_error"
)
cv_mse_mean = np.round(-np.mean(cross_val_score_list), 2)
cv_mse_sem = np.round(sem(cross_val_score_list), 2)
print("cross validation negative MSE score: ", np.round(cross_val_score_list, 2))
print("mean MSE across folds:", cv_mse_mean)
print("MSE standard error across folds:", cv_mse_sem)

# Train on the whole training split
gbr.fit(X_train_pca, y_train)

# Evaluate once on the held-out test split
y_test_predicted = gbr.predict(X_test_pca)
test_mse_gbr = mean_squared_error(y_test, y_test_predicted)
r_gbr, p_val_gbr = pearsonr(y_test, y_test_predicted)
print(f"Test MSE: {np.round(test_mse_gbr, 3)}, r: {np.round(r_gbr, 3)}, p value: {np.round(p_val_gbr, 3)}")

cross validation negative MSE score:  [-1.57 -1.24 -2.09 -1.   -1.8  -1.21 -1.7  -0.76 -0.86 -1.48]
mean MSE across folds: 1.37
MSE standard error across folds: 0.14
Test MSE: 1.262, r: -0.203, p value: 0.451
