In [None]:
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
from nilearn.signal import clean
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSCanonical
from sklearn.decomposition import PCA
from sklearn.model_selection import GroupShuffleSplit, ShuffleSplit
from sklearn.utils import shuffle

In [None]:
# Initialize
# Base location that is string-concatenated in front of every data file name
# read by this notebook (it is used as `PATH + '<file>'`, so it needs to end
# with a path separator if it points at a directory).
PATH = '...'

Load data

In [None]:
# Genetic (CNV) table: indexed by sample ID, restricted to the carrier type
# column and the genetic covariates used later on.
df_cnv = (
    pd.read_csv(PATH + '....csv')
    .set_index('SampleID')
    .loc[:, ['TYPE', 'sum_loeuf_inv', 'n_genes', 'gene_id']]
)

# Pre-cleaned tables; the first CSV column of each one becomes the row index.
df_brain = pd.read_csv(PATH + '....csv', index_col=0)
df_phens = pd.read_csv(PATH + '....csv', index_col=0)
df_cov = pd.read_csv(PATH + '....csv', index_col=0)

Unite data

In [None]:
idx_str = df_brain.index
idx_phens = df_phens.index
idx_genetic = df_cnv.index
idx_cov = df_cov.index

# Subjects present in every table. The intersection is sorted because plain
# set iteration order is arbitrary (and, for string IDs, changes between
# interpreter sessions); an unstable row order would make the seeded splits
# and permutations further down select different subjects on every run.
idx_all = sorted(set(idx_str) & set(idx_phens) & set(idx_genetic) & set(idx_cov))
if not idx_all:
    raise ValueError('No subject is shared by the brain, phenotype, genetic and covariate tables.')

# Bring all four tables to the same subjects in the same row order
df_cnv = df_cnv.loc[idx_all, :]
df_brain = df_brain.loc[idx_all, :]
df_phens = df_phens.loc[idx_all, :]
df_cov = df_cov.loc[idx_all, :]

# Regress the confounds out of the brain measures (values are overwritten in place)
df_brain[:] = clean(df_brain.values, confounds=df_cov.loc[:, ['interview_age', 'sex', 'volume', 'scanner']].values,
                    detrend=False, standardize=False)

# Regress age and sex out of the phenotypes (values are overwritten in place)
df_phens[:] = clean(df_phens.values, confounds=df_cov.loc[:, ['interview_age', 'sex']].values,
                    detrend=False, standardize=False)

Final CCO permutations test

In [None]:
def covariance_is(x, y, pls_dim, pca_dim, n_splits, i_iter):
    """Mean in-sample covariance of paired PLS scores over random subsets.

    For each of ``n_splits`` shuffle splits (10% of the rows are set aside and
    not used), the remaining rows of ``x`` and ``y`` are z-scored, reduced
    with PCA and used to fit a ``PLSCanonical`` model. The covariance between
    the x-score and the y-score of each PLS component is then computed on
    those same training rows.

    Parameters
    ----------
    x, y : ndarray
        Two data blocks indexed by row; row ``k`` of ``x`` is paired with
        row ``k`` of ``y``.
    pls_dim : int
        Number of PLS components.
    pca_dim : int
        Number of principal components kept for each block.
    n_splits : int
        Number of random splits to average over.
    i_iter : int
        Seed for the splitter and for PCA.

    Returns
    -------
    ndarray of shape (pls_dim,)
        Score covariance per PLS component, averaged over the splits.
    """
    pls = PLSCanonical(n_components=pls_dim, scale=False,
                       max_iter=1000, tol=1e-5)
    # One scaler / PCA per block so no estimator is silently refitted on the
    # other block. PCA is seeded because sklearn may choose its randomized
    # solver for large inputs, which would otherwise make runs irreproducible.
    x_scaler, y_scaler = StandardScaler(), StandardScaler()
    x_pca = PCA(n_components=pca_dim, random_state=i_iter)
    y_pca = PCA(n_components=pca_dim, random_state=i_iter)
    # Split data; only the training part of each split is used here
    kf = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=i_iter)
    cov_is = np.zeros((n_splits, pls_dim))
    for i, (train_index, _) in enumerate(kf.split(x)):
        # Scale, then reduce dimensionality
        x_train_ss_pca = x_pca.fit_transform(x_scaler.fit_transform(x[train_index]))
        y_train_ss_pca = y_pca.fit_transform(y_scaler.fit_transform(y[train_index]))
        # Fit model
        pls.fit(x_train_ss_pca, y_train_ss_pca)
        # Project the training data onto the fitted rotations
        x_pred = np.dot(x_train_ss_pca, pls.x_rotations_)
        y_pred = np.dot(y_train_ss_pca, pls.y_rotations_)
        for j in range(pls_dim):
            cov_is[i, j] = np.cov(x_pred[:, j], y_pred[:, j])[0, 1]
    return np.mean(cov_is, axis=0)

In [None]:
def covariance_oos(x, y, pls_dim, pca_dim, n_splits, i_iter, groups):
    """Mean out-of-sample covariance of paired PLS scores over grouped splits.

    For each of ``n_splits`` group-aware shuffle splits (about 10% of the
    groups held out), the scaler, PCA and ``PLSCanonical`` model are fitted on
    the training rows only. The held-out rows are transformed with those
    fitted estimators and projected onto the PLS rotations, and the covariance
    between the x-score and y-score of each component is computed on the
    held-out rows.

    Parameters
    ----------
    x, y : ndarray
        Two data blocks indexed by row; row ``k`` of ``x`` is paired with
        row ``k`` of ``y``.
    pls_dim : int
        Number of PLS components.
    pca_dim : int
        Number of principal components kept for each block.
    n_splits : int
        Number of random splits to average over.
    i_iter : int
        Seed for the splitter and for PCA.
    groups : array-like
        Group label per row; rows sharing a label are never separated
        between the training and the held-out part of a split.

    Returns
    -------
    ndarray of shape (pls_dim,)
        Held-out score covariance per PLS component, averaged over the splits.
    """
    pls = PLSCanonical(n_components=pls_dim, scale=False,
                       max_iter=1000, tol=1e-5)
    # One scaler / PCA per block: with a single shared estimator the result
    # is only correct as long as every test transform happens before the
    # estimator is refitted on the other block. PCA is seeded because sklearn
    # may choose its randomized solver for large inputs.
    x_scaler, y_scaler = StandardScaler(), StandardScaler()
    x_pca = PCA(n_components=pca_dim, random_state=i_iter)
    y_pca = PCA(n_components=pca_dim, random_state=i_iter)
    # Split data into training and test sets
    gkf = GroupShuffleSplit(n_splits=n_splits, random_state=i_iter, test_size=0.1)
    cov_oos = np.zeros((n_splits, pls_dim))
    for i, (train_index, test_index) in enumerate(gkf.split(x, y, groups=groups)):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Scale data (parameters estimated on the training rows only)
        x_train_ss = x_scaler.fit_transform(x_train)
        x_test_ss = x_scaler.transform(x_test)
        y_train_ss = y_scaler.fit_transform(y_train)
        y_test_ss = y_scaler.transform(y_test)
        # Fit PCA on the training rows, apply it to the held-out rows
        x_train_ss_pca = x_pca.fit_transform(x_train_ss)
        x_test_ss_pca = x_pca.transform(x_test_ss)
        y_train_ss_pca = y_pca.fit_transform(y_train_ss)
        y_test_ss_pca = y_pca.transform(y_test_ss)
        # Fit model
        pls.fit(x_train_ss_pca, y_train_ss_pca)
        # Project the held-out data onto the rotations learned on the training data
        x_pred = np.dot(x_test_ss_pca, pls.x_rotations_)
        y_pred = np.dot(y_test_ss_pca, pls.y_rotations_)
        for j in range(pls_dim):
            cov_oos[i, j] = np.cov(x_pred[:, j], y_pred[:, j])[0, 1]
    return np.mean(cov_oos, axis=0)

In [12]:
def covariance_is_permute(x, y, pls_dim, pca_dim, n_splits, i_perm, groups):
    """Mean in-sample PLS score covariance after breaking the x-y pairing.

    For each of ``n_splits`` group-aware shuffle splits, the training rows of
    ``x`` and ``y`` are z-scored and reduced with PCA. The rows of the reduced
    ``y`` block are then shuffled (seeded with ``i_perm``) so they no longer
    correspond to the rows of ``x``, a ``PLSCanonical`` model is fitted on the
    mismatched pair, and the covariance between the x-score and y-score of
    each component is computed on those same training rows. The held-out part
    of each split is not used.

    Parameters
    ----------
    x, y : ndarray
        Two data blocks indexed by row; row ``k`` of ``x`` is paired with
        row ``k`` of ``y``.
    pls_dim : int
        Number of PLS components.
    pca_dim : int
        Number of principal components kept for each block.
    n_splits : int
        Number of random splits to average over.
    i_perm : int
        Seed for the splitter, the row shuffle and PCA.
    groups : array-like
        Group label per row, passed to the group-aware splitter.

    Returns
    -------
    ndarray of shape (pls_dim,)
        Permuted score covariance per PLS component, averaged over the splits.
    """
    pls = PLSCanonical(n_components=pls_dim, scale=False,
                       max_iter=1000, tol=1e-5)
    # One scaler / PCA per block so no estimator is silently refitted on the
    # other block. PCA is seeded because sklearn may choose its randomized
    # solver for large inputs, which would otherwise make runs irreproducible.
    x_scaler, y_scaler = StandardScaler(), StandardScaler()
    x_pca = PCA(n_components=pca_dim, random_state=i_perm)
    y_pca = PCA(n_components=pca_dim, random_state=i_perm)
    # Split data; only the training part of each split is used here
    gkf = GroupShuffleSplit(n_splits=n_splits, random_state=i_perm, test_size=0.1)
    cov_is = np.zeros((n_splits, pls_dim))
    for i, (train_index, _) in enumerate(gkf.split(x, y, groups=groups)):
        # Scale, then reduce dimensionality
        x_train_ss_pca = x_pca.fit_transform(x_scaler.fit_transform(x[train_index]))
        y_train_ss_pca = y_pca.fit_transform(y_scaler.fit_transform(y[train_index]))
        # Shuffle the rows of y to destroy the subject-wise pairing with x
        idx_rand = shuffle(np.arange(0, np.shape(y_train_ss_pca)[0]), random_state=i_perm)
        y_train_ss_pca_perm = y_train_ss_pca[idx_rand, :]
        # Fit model on the mismatched pair
        pls.fit(x_train_ss_pca, y_train_ss_pca_perm)
        # Project the (permuted) training data onto the fitted rotations
        x_pred = np.dot(x_train_ss_pca, pls.x_rotations_)
        y_pred = np.dot(y_train_ss_pca_perm, pls.y_rotations_)
        for j in range(pls_dim):
            cov_is[i, j] = np.cov(x_pred[:, j], y_pred[:, j])[0, 1]
    return np.mean(cov_is, axis=0)

In [None]:
def permute_covariance(x, y, pls_dim, pca_dim, n_splits, i_perm):
    """Compute the in-sample PLS score covariance for one row-shuffled copy of ``y``.

    The rows of ``y`` are reordered with a shuffle seeded by ``i_perm`` and the
    result is passed, together with the untouched ``x``, to ``covariance_is``
    (which also receives ``i_perm`` as its seed).

    Returns
    -------
    list
        A one-element list holding the array returned by ``covariance_is``.
    """
    n_rows = np.shape(y)[0]
    row_order = shuffle(np.arange(0, n_rows), random_state=i_perm)
    # Permuted in-sample covariance
    permuted_cov = covariance_is(x, y[row_order, :], pls_dim, pca_dim, n_splits, i_perm)
    return [permuted_cov]

In [None]:
# Generate relationship matrix
relationship_matrix = pd.read_csv(PATH + '.../king.kin0', sep='\t')

# Start with every subject in a group of its own, then merge the groups of
# each listed pair so that connected (related) subjects end up sharing one
# group label.
df_group = pd.DataFrame(data=np.arange(0, len(idx_all)), columns=['Group'], index=idx_all)

# Set lookup instead of scanning the idx_all list once per table row
ids_present = set(idx_all)

# Columns 1 and 3 are read positionally as the two subject IDs of a pair
# (presumably ID1 / ID2 of the KING .kin0 layout; confirm against the file header).
for id1, id2 in zip(relationship_matrix.iloc[:, 1], relationship_matrix.iloc[:, 3]):
    # Only pairs where both subjects are part of the analysis sample matter
    if id1 in ids_present and id2 in ids_present:
        group_1 = df_group.loc[id1, 'Group']
        group_2 = df_group.loc[id2, 'Group']
        if group_1 != group_2:
            # Move every member of id1's group into id2's group
            df_group.loc[df_group['Group'] == group_1, 'Group'] = group_2


In [None]:
## Cross validation of PLS components
# Analysis settings
n_iter, n_perm = 100, 100   # repetitions of the real / permuted analysis
n_splits = 10               # random splits per repetition
pls_dim, pca_dim = 5, 50    # PLS components / principal components kept

# Restrict both data blocks and the group labels to control subjects
is_ctrl = df_cnv['TYPE'] == 'CTRL'
x_ctrl = df_brain.values.astype('float')[is_ctrl]
y_ctrl = df_phens.values.astype('float')[is_ctrl]
id_group = df_group.loc[is_ctrl, 'Group'].values

# z-score each block on the full control sample
x_ctrl_ss = StandardScaler().fit_transform(x_ctrl)
y_ctrl_ss = StandardScaler().fit_transform(y_ctrl)

# PCA on each block; the same estimator object is fitted on x first and then
# refitted on y, so `pca` ends up holding the fit for y
pca = PCA(n_components=pca_dim)
x_ctrl_ss_pca = pca.fit_transform(x_ctrl_ss)
y_ctrl_ss_pca = pca.fit_transform(y_ctrl_ss)

In [None]:
# Out of sample covariance for CTRL:
# one task per iteration, the iteration number doubling as the random seed
delayed_calls = [
    delayed(covariance_oos)(x_ctrl, y_ctrl, pls_dim, pca_dim, n_splits, i_iter, id_group)
    for i_iter in range(n_iter)
]
# Run the tasks on 8 threads and stack the per-iteration results row-wise
res = Parallel(n_jobs=8, prefer="threads")(delayed_calls)
cov_oos = np.array(res)

In [None]:
# Permuted insample covariance for CTRL:
# one task per permutation, the permutation number doubling as the random seed
delayed_calls = [
    delayed(covariance_is_permute)(x_ctrl, y_ctrl, pls_dim, pca_dim, n_splits, i_perm, id_group)
    for i_perm in range(n_perm)
]
# Run the tasks on 8 threads, stack the results and drop length-1 axes
res = Parallel(n_jobs=8, prefer="threads")(delayed_calls)
cov_perm_is = np.array(res).squeeze()

In [None]:
# Plot of covariance and permuted covariance
n_perm_done = cov_perm_is.shape[0]
# Loop over every fitted PLS mode instead of a hard-coded count
for i in range(pls_dim):
    observed = np.mean(cov_oos[:, i])
    # Permutation p-value with the usual +1 correction: the observed value is
    # counted as one member of the null distribution, so the estimate can
    # never be exactly zero with a finite number of permutations.
    p_val = (np.sum(cov_perm_is[:, i] >= observed) + 1) / (n_perm_done + 1)
    print('p-value: ' + str(p_val))

    fig, ax = plt.subplots(1, figsize=(6, 3))
    # Draw both densities on the axes created above rather than on whatever
    # the current pyplot axes happen to be
    sns.kdeplot(cov_oos[:, i], alpha=0.5, label='Original OOS', fill=True, ax=ax)
    sns.kdeplot(cov_perm_is[:, i], alpha=0.5, label='Permuted', fill=True, ax=ax)
    ax.legend(loc='upper right')
    ax.set_title('Mode ' + str(i+1))
    ax.set_xlabel('PLS scores covariance')
    ax.set_ylabel('Density')
    ax.set_yticks([])
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures
    plt.close(fig)