# Preamble

In [1]:
# Essentials
import os, sys, glob
import pandas as pd
import numpy as np
import nibabel as nib
import scipy.io as sio

# Stats
import scipy as sp
from scipy import stats
import statsmodels.api as sm
import pingouin as pg

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# 'none' writes text in saved SVG figures as text elements rather than
# converting glyphs to paths (the viewer must have the fonts installed).
plt.rcParams['svg.fonttype'] = 'none'

In [2]:
# NOTE(review): both paths below are absolute and machine-specific; the imports
# that follow only resolve on a machine with the same directory layout.
# Each import relies on the sys.path entry appended immediately before it, so
# the statement order must be kept.

# Project helper module (presumably lives in the project's code/func/ directory)
sys.path.append('/Users/lindenmp/Dropbox/Work/ResProjects/neurodev_cs_predictive/code/func/')
from proj_environment import set_proj_env
# General-purpose helper module (presumably lives in the pyfunc repository)
sys.path.append('/Users/lindenmp/Dropbox/Work/git/pyfunc/')
from func import get_synth_cov

In [3]:
# --- Analysis configuration -------------------------------------------------
# Column holding the train/test flag (0 = training set, 1 = test set)
train_test_str = 'squeakycleanExclude'
exclude_str = 't1Exclude'
# Alternatives: '_vol_norm', '_noboxcox'
extra_str = '_consist'
# Alternatives: 'fa', 'mean_streamlineLength', 'adc'
edge_weight = 'streamlineCount'
parc_scale = 200
primary_covariate = 'ageAtScan1_Years'

# set_proj_env is project code; it returns six values that are unpacked here.
(parcel_names, parcel_loc, drop_parcels,
 num_parcels, yeo_idx, yeo_labels) = set_proj_env(
    exclude_str = exclude_str,
    train_test_str = train_test_str,
    parc_scale = parc_scale,
    primary_covariate = primary_covariate,
    extra_str = extra_str,
    edge_weight = edge_weight,
)

In [4]:
# Show the model directory read from the MODELDIR environment variable
# (presumably exported by set_proj_env -- TODO confirm).
print(os.environ['MODELDIR'])

/Users/lindenmp/Dropbox/Work/ResProjects/neurodev_cs_predictive/analysis_cubic_test/normative/t1Exclude/squeakycleanExclude/schaefer_200_streamlineCount_consist


## Load data

In [5]:
# Both tables live in the model directory and share the same two-level index.
model_dir = os.environ['MODELDIR']
index_cols = ['bblid', 'scanid']

# Phenotype table
df = pd.read_csv(os.path.join(model_dir, 'df_pheno.csv')).set_index(index_cols)

# Node-level feature table; 'df_node_base.csv' is the alternative input
# df_node = pd.read_csv(os.path.join(os.environ['MODELDIR'], 'df_node_base.csv'))
df_node = pd.read_csv(os.path.join(model_dir, 'df_node_clean.csv')).set_index(index_cols)

# Shift the sex coding down by one so that it takes the values 0 and 1
df['sex_adj'] = df['sex'] - 1

print(df.shape)
print(df_node.shape)

(1062, 47)
(1062, 801)


In [6]:
# Preview the first rows of the phenotype table (loaded from df_pheno.csv)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,squeakycleanExclude,ageAtScan1,ageAtScan1_Years,sex,race2,handednessv2,dti64MeanAbsRMS,dti64MeanRelRMS,dti64MaxAbsRMS,dti64MaxRelRMS,...,F1_Social_Cognition_Efficiency_Ar,F2_Complex_Reasoning_Efficiency_Ar,F3_Memory_Efficiency_Ar,F4_Executive_Efficiency_Ar,F1_Slow_Speed_Ar,F2_Memory_Speed_Ar,F3_Fast_Speed_Ar,streamline_count,network_density,sex_adj
bblid,scanid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
81287,2738,0,240,20.0,2,1,1,0.351665,0.154478,0.557358,0.272518,...,1.0577,1.601508,0.830148,0.514279,0.391313,-0.105869,-0.156787,235668.0,0.065427,1
80680,2739,1,253,21.1,1,1,1,0.531246,0.208078,0.754717,0.35456,...,0.595196,1.647967,0.379309,-0.389853,0.439099,0.709515,0.355577,225185.0,0.06603,0
81754,2740,1,232,19.3,2,1,1,0.310943,0.218462,0.460633,0.387235,...,0.307979,1.134216,0.677509,1.006161,1.177551,0.935206,1.335039,212752.0,0.065729,1
81903,2749,0,231,19.2,2,1,1,0.43261,0.283153,0.819576,0.509537,...,1.324297,1.886781,1.634455,0.527513,0.783375,1.27244,0.333475,179762.0,0.065628,1
81043,2750,0,249,20.8,2,2,1,0.162409,0.096761,0.429102,0.259678,...,0.759529,0.261837,0.30649,0.001343,0.748104,1.059229,0.370116,209586.0,0.06608,1


In [7]:
# Preview the first rows of the node feature table (loaded from df_node_clean.csv)
df_node.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,squeakycleanExclude,vol_0,vol_1,vol_2,vol_3,vol_4,vol_5,vol_6,vol_7,vol_8,...,mc_190,mc_191,mc_192,mc_193,mc_194,mc_195,mc_196,mc_197,mc_198,mc_199
bblid,scanid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
81287,2738,0,5189.0,2500.0,2724.0,2812.0,3148.0,3396.0,2501.0,1817.0,843.0,...,-0.035777,-0.107565,-0.060975,-0.029014,-0.005197,-0.024319,-0.02721,-0.006056,-0.014127,-0.008207
80680,2739,1,6030.0,4159.0,4376.0,3968.0,4736.0,4269.0,4102.0,3579.0,1518.0,...,-0.053202,-0.093597,-0.157219,-0.132259,-0.02856,-0.053926,-0.04251,-0.012675,-0.033888,-0.017134
81754,2740,1,5426.0,3135.0,4029.0,3247.0,4108.0,3276.0,2612.0,2754.0,741.0,...,-0.070738,-0.015497,-0.058051,-0.070581,-0.017404,-0.025727,-0.012646,-0.002153,-0.012977,-0.003792
81903,2749,0,3173.0,2491.0,2027.0,2494.0,2450.0,3164.0,2554.0,1617.0,664.0,...,-0.043862,-0.015668,-0.080619,-0.041519,-0.013985,-0.033639,-0.022386,-0.003368,-0.013461,-0.005448
81043,2750,0,3327.0,1890.0,2183.0,2525.0,3189.0,2660.0,2451.0,1758.0,871.0,...,-0.087753,-0.017894,-0.110389,-0.042485,-0.00774,-0.027156,-0.018434,-0.004059,-0.020778,-0.010947


# Prepare files for normative modelling

In [8]:
# Covariates for the normative model. Order matters: covs[0] is assumed to be
# 'ageAtScan1_Years' and covs[1] is assumed to be 'sex_adj'. If more than two
# covariates are used, append them after these two; age/sex will then be
# duplicated accordingly in the forward model.
covs = [primary_covariate, 'sex_adj']
num_covs = len(covs)

print(covs)
print(num_covs)

['ageAtScan1_Years', 'sex_adj']
2


In [9]:
# Optional suffix appended to the normative model directory name (empty = no suffix)
extra_str_2 = ''

## Primary model (train/test split)

In [10]:
# Create subdirectory for this specific normative model -- labeled according to
# parcellation/resolution choices (via MODELDIR) and covariates. The directory
# name is the '+'-joined covariate list followed by the optional extra_str_2 suffix.
normativedir = os.path.join(os.environ['MODELDIR'], '+'.join(covs) + extra_str_2 + '/')
print(normativedir)
# makedirs(exist_ok=True) is safe to re-run and replaces the check-then-create
# pattern, which can race and fails if a parent directory is missing.
os.makedirs(normativedir, exist_ok = True)

/Users/lindenmp/Dropbox/Work/ResProjects/neurodev_cs_predictive/analysis_cubic_test/normative/t1Exclude/squeakycleanExclude/schaefer_200_streamlineCount_consist/ageAtScan1_Years+sex_adj/


In [11]:
# Split the phenotype table on the train/test flag (0 = train, 1 = test)
is_train = df[train_test_str] == 0
is_test = df[train_test_str] == 1

# Training set: the full table as CSV, plus a bare space-delimited covariate
# matrix (no index, no header) containing only the columns in covs
pheno_train = df[is_train]
pheno_train.to_csv(os.path.join(normativedir, 'train.csv'))
pheno_train.to_csv(os.path.join(normativedir, 'cov_train.txt'), columns = covs, sep = ' ', index = False, header = False)
print(str(is_train.sum()) + ' individuals in the final training set')

# Test set: same two files
pheno_test = df[is_test]
pheno_test.to_csv(os.path.join(normativedir, 'test.csv'))
pheno_test.to_csv(os.path.join(normativedir, 'cov_test.txt'), columns = covs, sep = ' ', index = False, header = False)
print(str(is_test.sum()) + ' individuals in the final testing set')

304 individuals in the final training set
758 individuals in the final testing set


In [12]:
# Response (node feature) matrices for the normative model, split on the same
# train/test flag as the covariates (0 = train, 1 = test). The flag column itself
# is dropped so that only features are written out.

# Write out training
resp_train = df_node[df_node[train_test_str] == 0].drop(train_test_str, axis = 1)
# mask is True only for rows in which EVERY feature is NaN.
# NOTE(review): a row with only some NaNs is not flagged -- confirm this is intended.
mask = np.all(np.isnan(resp_train), axis = 1)
if np.any(mask): print("Warning: NaNs in response train")
resp_train.to_csv(os.path.join(normativedir, 'resp_train.csv'))
# Bare space-delimited matrix (no index, no header)
resp_train.to_csv(os.path.join(normativedir, 'resp_train.txt'), sep = ' ', index = False, header = False)

# Write out test
resp_test = df_node[df_node[train_test_str] == 1].drop(train_test_str, axis = 1)
mask = np.all(np.isnan(resp_test), axis = 1)
# Fixed: this warning previously said "train" (copy-paste from the block above)
if np.any(mask): print("Warning: NaNs in response test")
resp_test.to_csv(os.path.join(normativedir, 'resp_test.csv'))
resp_test.to_csv(os.path.join(normativedir, 'resp_test.txt'), sep = ' ', index = False, header = False)

print(str(resp_train.shape[1]) + ' features written out for normative modeling')

800 features written out for normative modeling


### Forward variants

In [13]:
# Directory for the forward-model inputs; makedirs(exist_ok=True) is safe to re-run
fwddir = os.path.join(normativedir, 'forward/')
os.makedirs(fwddir, exist_ok = True)

# Synthetic cov data for the primary covariate (project helper).
# Assumes x is a column vector of shape (n, 1) -- TODO confirm against get_synth_cov.
x = get_synth_cov(df, cov = primary_covariate, stp = 1)

if 'sex_adj' in covs:
    # Gender dummy variable for two stacked runs of the synthetic covariate:
    # 1 for the first run, 0 for the second.
    gender_synth = np.concatenate((np.ones(x.shape), np.zeros(x.shape)), axis = 0)
    # np.tile(x, (2, 1)) stacks x twice along rows. It replaces np.matlib.repmat,
    # which needs an explicit `import numpy.matlib` that this notebook never does.
    synth_cov = np.concatenate((np.tile(x, (2, 1)), gender_synth), axis = 1)
    fmt = ['%.1f', '%.d']
else:
    # Without a sex covariate there is a single run; previously this path
    # raised NameError because gender_synth was never defined.
    synth_cov = x
    fmt = '%.1f'
print(synth_cov.shape)

# write out
np.savetxt(os.path.join(fwddir, 'synth_cov_test.txt'), synth_cov, delimiter = ' ', fmt = fmt)

(30, 2)


### Cross-val variant

In [14]:
# # Create subdirectory for specific normative model -- labeled according to parcellation/resolution choices and covariates
# cvdir = os.path.join(normativedir, 'cv/')
# if not os.path.exists(cvdir): os.mkdir(cvdir)