# ABIDE I. `fmriprep` with AROMA. Connectivity analysis

In [1]:
import nilearn
print (nilearn.__version__)

0.6.2


## 1. ABIDE1: `fmriprep(AROMA)` `AAL` dataset creation

In [2]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import os

In [3]:
data_dir = "../../../datasets/abide/preproc/"

In [5]:
import glob

# Preprocessed functional images sit four directory levels below data_dir,
# inside a "func" folder. glob returns matches in arbitrary order, so
# list_files[0] is not guaranteed to be the same file on every machine.
suffix = "MNI152NLin2009cAsym_preproc.nii.gz"
func_pattern = data_dir + "*/*/*/*/func/*" + suffix
list_files = glob.glob(func_pattern)

In [6]:
len(list_files)

1139

In [7]:
# Characters 4-10 of each base name hold the subject number,
# e.g. 'sub-0050801_task-rest_...' -> '0050801'.
target_names = [func_path.split('/')[-1][4:11] for func_path in list_files]

In [8]:
# Convert the extracted ID strings to numbers (leading zeros are lost, e.g.
# '0050801' -> 50801). Entries that cannot be parsed become NaN because of
# errors='coerce' and are then dropped.
target_names = pd.to_numeric(pd.DataFrame(target_names)[0], errors ='coerce').dropna()

In [9]:
len(target_names)

1139

## 2. Connectivity matrices creation

In [47]:
import warnings
warnings.filterwarnings("ignore")

def tr_extractor(f):
    """Read the repetition time (TR) from a NIfTI file header.

    The value is taken from element 4 of the header's ``pixdim`` field.

    Parameters
    ----------
    f : path of the NIfTI image, handed straight to ``nibabel.load``.

    Returns
    -------
    The ``pixdim[4]`` entry, or ``None`` when the file does not exist
    (a message is printed in that case).
    """
    import nibabel as nib

    try:
        pixdim = nib.load(f).header["pixdim"].tolist()
    except FileNotFoundError:
        print("NIFTI file not found")
        return None
    return pixdim[4]

def make_correlation_matrix(path_to_fmriprep_data,
                            path_to_save_connectivity_matrices,
                            path_to_save_ts = False):
    """
    Turn one fmriprep-preprocessed functional image into an AAL correlation matrix.

    The image is parcellated with the AAL (SPM12) atlas resampled onto the
    image grid, the regional signals are standardized, detrended and
    band-pass filtered (0.009-0.08), one high-variance confound is regressed
    out, and the correlation between regional time series is written to CSV.

    Parameters
    ----------
    path_to_fmriprep_data : str
        Path of the preprocessed functional NIfTI file. Characters 4-10 of its
        base name are used as the subject name for the output files.
    path_to_save_connectivity_matrices : str
        Folder that receives ``<subject>.csv``; created if it does not exist.
    path_to_save_ts : str or False, optional
        Folder that receives the regional time series as ``<subject>.npy``;
        created if it does not exist. Nothing is saved when falsy (default).

    Returns
    -------
    None. Results are written to disk and a one-line summary is printed.

    Notes
    -----
    Output names depend only on the subject name, so files that share it
    (for example several runs of one subject) overwrite each other's outputs.
    """
    # Imported locally like the other dependencies so the function does not
    # rely on a module-level ``import os``.
    import os

    import pandas as pd
    import numpy as np

    import nilearn
    from nilearn import datasets
    from nilearn.image import concat_imgs
    from nilearn.input_data import NiftiLabelsMasker
    from nilearn.image import high_variance_confounds
    from nilearn.connectome import ConnectivityMeasure

    tr = tr_extractor(path_to_fmriprep_data)
    subject_name = path_to_fmriprep_data.split('/')[-1][4:11]

    dataset = datasets.fetch_atlas_aal(version='SPM12', data_dir='./datadir/', url=None, resume=True, verbose=0)
    atlas_filename = dataset.maps

    img = concat_imgs(path_to_fmriprep_data, auto_resample=True, verbose=0)
    # Nearest-neighbour interpolation keeps the atlas labels as integers.
    atlas = nilearn.image.resample_to_img(atlas_filename, img, interpolation='nearest', copy=True, order='F', clip=False)
    masker = NiftiLabelsMasker(labels_img=atlas, standardize=True,
                               detrend = True, low_pass=0.08,
                               high_pass=0.009, t_r=tr,
                               memory='nilearn_cache', memory_level=1,
                               verbose=0)

    confounds = high_variance_confounds(img, 1)
    time_series = masker.fit_transform(img, confounds)

    if path_to_save_ts:
        os.makedirs(path_to_save_ts, exist_ok=True)
        # np.save appends the '.npy' extension itself.
        np.save(os.path.join(path_to_save_ts, subject_name), time_series)

    correlation_measure = ConnectivityMeasure(kind='correlation')
    correlation_matrix = correlation_measure.fit_transform([time_series])[0]
    np.fill_diagonal(correlation_matrix, 1)

    # An empty folder string means "current directory"; makedirs('') would fail.
    if path_to_save_connectivity_matrices:
        os.makedirs(path_to_save_connectivity_matrices, exist_ok=True)
    output_path = os.path.join(path_to_save_connectivity_matrices, subject_name)
    pd.DataFrame(correlation_matrix).to_csv(output_path + '.csv', sep=',')

    print ('TR: ', tr, ' subject:', subject_name)

### 2.2. Creating the connectivity matrices

In [10]:
list_files[0]

'../../../datasets/abide/preproc/abide1/KKI/fmriprep/sub-0050801/func/sub-0050801_task-rest_run-1_bold_space-MNI152NLin2009cAsym_preproc.nii.gz'

In [10]:
# NOTE(review): this import rebinds `make_correlation_matrix`, so when the
# notebook is run top to bottom the calls from here on use the `fc_creator`
# module's version rather than the function defined in section 2 — confirm
# that the two implementations agree.
from fc_creator import make_correlation_matrix

# Process only the first file found: matrix CSV goes to the first folder,
# time series to the second.
make_correlation_matrix(list_files[0], 
                        'ABIDE_AAL_0.08_0.009',
                        'ABIDE_AAL_0.08_0.009_TS')

In [11]:
# Load one saved time-series file. Despite its name, `mydict` holds the array
# returned by np.load, not a dict.
# NOTE(review): the subject ID is hard-coded; presumably this is the file
# written by the single-file call above, which holds only if list_files[0]
# is subject 0050801 — confirm.
mydict = np.load('ABIDE_AAL_0.08_0.009_TS/0050801.npy')
print (mydict)

[[-5.5839404e-02 -9.3411461e-02 -1.1138052e-01 ... -5.6957275e-02
  -6.8469465e-02  1.3482283e-02]
 [ 4.2212582e-01 -4.8728446e-03 -4.6111622e-01 ... -4.7854397e-01
  -6.5490562e-01  9.3528263e-02]
 [ 4.6767047e-01 -2.7042191e-02 -6.3370985e-01 ... -7.9223645e-01
  -1.2005019e+00 -1.9079966e-02]
 ...
 [ 1.0825256e+00  3.1713769e+00  3.7461162e+00 ...  5.0757241e+00
   4.5964098e+00  2.3811901e+00]
 [ 5.1382966e-02  1.4786392e+00  1.7315035e+00 ...  3.2881253e+00
   3.6182051e+00  1.9925953e+00]
 [ 1.1615280e-01  1.3855229e-01  5.2401349e-02 ...  6.9049075e-02
   1.9180915e-01 -1.1515156e-01]]


In [12]:
np.shape(mydict)

(156, 116)

In [13]:
from tqdm import tqdm

# Run the pipeline over every file found, showing a progress bar.
matrices_dir = 'ABIDE_AAL_0.08_0.009'
time_series_dir = 'ABIDE_AAL_0.08_0.009_TS'
for func_path in tqdm(list_files):
    make_correlation_matrix(func_path, matrices_dir, time_series_dir)

100%|██████████| 1139/1139 [2:10:49<00:00,  6.89s/it] 


In [14]:
len(os.listdir('ABIDE_AAL_0.08_0.009'))

1089

In [15]:
len(os.listdir('ABIDE_AAL_0.08_0.009_TS'))

1089

### 2.3. Creating the dataset

In [17]:
path_to_connectivity_matrices_folder = 'ABIDE_AAL_0.08_0.009/'

# Number of entries above the diagonal of a 116 x 116 matrix (116 * 115 / 2).
n_connectivity_features = 6670

subject_id = []
subject_rows = []

for file_name in os.listdir(path_to_connectivity_matrices_folder):

    # Only saved matrices are of interest; ignore any other folder entry.
    if not file_name.endswith('.csv'):
        continue

    try:
        connectivity_matrix = pd.read_csv(
                path_to_connectivity_matrices_folder + file_name, index_col = 0
            ).values
        # Keep the strict upper triangle: the matrix is symmetric and its
        # diagonal is constant.
        connectivity_matrix_vector = connectivity_matrix[
                np.triu_indices(connectivity_matrix.shape[0], 1)
            ][:n_connectivity_features]
        has_nan = np.isnan(connectivity_matrix_vector).any()
    except Exception as err:
        # Best effort: one unreadable file must not abort the whole build,
        # but it is reported instead of being dropped silently.
        print('skipping', file_name, '-', repr(err))
        continue

    # The previous `assert (name, flag)` tested a non-empty tuple and so could
    # never fail; subjects with NaN values are now actually excluded.
    if has_nan:
        print('skipping', file_name, '- NaN values in connectivity matrix')
        continue

    print(file_name.split('/')[-1][:-4], 'full name ', file_name)
    subject_id.append(file_name.split('/')[-1][:-4])
    subject_rows.append(pd.DataFrame(connectivity_matrix_vector).T)

# Concatenate once at the end instead of growing the frame row by row.
fc_frame = pd.concat(subject_rows) if subject_rows else pd.DataFrame()

0051193 full name  0051193.csv
0051167 full name  0051167.csv
0051166 full name  0051166.csv
0051188 full name  0051188.csv
0051182 full name  0051182.csv
0051197 full name  0051197.csv
0051171 full name  0051171.csv
0051169 full name  0051169.csv
0051190 full name  0051190.csv
0051161 full name  0051161.csv
0051177 full name  0051177.csv
0051176 full name  0051176.csv
0051196 full name  0051196.csv
0051163 full name  0051163.csv
0051192 full name  0051192.csv
0051170 full name  0051170.csv
0051186 full name  0051186.csv
0051175 full name  0051175.csv
0051164 full name  0051164.csv
0051195 full name  0051195.csv
0051198 full name  0051198.csv
0051174 full name  0051174.csv
0051179 full name  0051179.csv
0051183 full name  0051183.csv
0051191 full name  0051191.csv
0051173 full name  0051173.csv
0051194 full name  0051194.csv
0051185 full name  0051185.csv
0051189 full name  0051189.csv
0051168 full name  0051168.csv
0051178 full name  0051178.csv
0051187 full name  0051187.csv
0051180 

In [18]:
# One ID per row, in the order the matrices were read. Stored as numbers so
# the column can be matched against SUB_ID in the phenotypic table.
fc_frame['SUB_ID'] = pd.to_numeric(subject_id)

In [19]:
fc_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6661,6662,6663,6664,6665,6666,6667,6668,6669,SUB_ID
0,0.832303,0.892027,0.819333,0.510881,0.466768,0.824275,0.800789,0.265858,0.218023,0.788554,...,0.426708,0.589954,0.380438,0.516206,0.252576,0.032588,0.605443,0.123506,0.40417,51193
0,0.557616,0.867488,0.575818,0.596784,0.750462,0.60507,0.556198,0.155206,0.724947,0.627863,...,0.705333,0.47044,0.384528,0.677783,0.456227,0.530975,0.75889,0.595654,0.767414,51167
0,0.521023,0.651068,0.569641,0.090522,0.138832,0.678696,0.595421,0.1484,0.160258,0.539653,...,0.406905,0.158856,-0.020114,0.44921,0.156323,-0.037217,0.490009,-0.083202,0.027777,51166
0,0.497805,0.636777,0.635174,0.259107,0.406935,0.680455,0.498526,0.308284,0.242842,0.351873,...,0.128865,0.212443,-0.171631,0.036004,-0.167815,-0.10181,0.668085,0.167622,0.128128,51188
0,0.689907,0.775888,0.763009,0.417104,0.426217,0.847102,0.832371,0.429545,0.581974,0.766802,...,0.526777,0.440572,0.094115,0.483137,0.605417,0.012031,0.454308,0.141789,-0.057951,51182


#### 2.3.1. Adding targets to `DataFrame`

In [20]:
targets = pd.read_csv('Phenotypic_V1_0b_preprocessed1.csv')
targets.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,...,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP
0,0,1,50002,1,50002,PITT,no_filename,1,1,16.77,...,,OK,,fail,ic-parietal-cerebellum,OK,,fail,ERROR #24,1
1,1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,...,,OK,,OK,,OK,,OK,,1
2,2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,...,,OK,,OK,,OK,,OK,,1
3,3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,...,,OK,,maybe,ic-parietal-cerebellum,OK,,OK,,0
4,4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,...,,OK,,maybe,ic-parietal slight,OK,,OK,,1


In [21]:
# Inner join: subjects without a row in the phenotypic table are dropped.
resulting_frame = fc_frame.merge(targets, how='inner', on='SUB_ID')

In [23]:
len(resulting_frame)

1088

In [24]:
resulting_frame['DX_GROUP'].value_counts()

2    562
1    526
Name: DX_GROUP, dtype: int64

In [25]:
resulting_frame = resulting_frame.reset_index(drop = True)
resulting_frame.to_csv('ABIDEI_AAL_0.08_0.009_29.11.2020.csv')

## 3. CPAC `CC200` dataset preparation

In [27]:
path_to_connectivity_matrices_folder = '/home/datasets/abide/abide1_2_connectivity_matrices/abide1_preprocessed/abide1_cpac_cc200/abide1_cpac_cc200_nilearn_cm/'

In [29]:
len(os.listdir(path_to_connectivity_matrices_folder))

1036

In [31]:
temp_matrix = pd.read_csv(path_to_connectivity_matrices_folder + os.listdir(path_to_connectivity_matrices_folder)[0])

In [35]:
temp_matrix.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,0,1.0,-0.30847,-0.032817,-0.135309,-0.417513,0.053874,-0.361935,0.312277,-0.147749,...,-0.314891,0.185321,-0.436074,-0.127543,0.493735,-0.486128,0.099531,0.056224,-0.013549,-0.009883
1,1,-0.30847,1.0,-0.063555,-0.014674,0.339593,-0.389461,-0.038745,-0.212948,-0.268071,...,0.374232,-0.288452,0.261243,-0.055206,-0.156907,0.407194,-0.042034,-0.125769,-0.169513,-0.029002
2,2,-0.032817,-0.063555,1.0,-0.244791,0.125359,0.062976,-0.227785,-0.068636,-0.082225,...,0.242437,-0.100932,0.075287,0.00666,0.057069,-0.140261,0.083968,-0.05975,0.091088,-0.281021
3,3,-0.135309,-0.014674,-0.244791,1.0,0.259065,0.155924,0.158733,-0.239074,0.073599,...,0.083864,-0.307369,0.106434,-0.063382,-0.272554,0.213878,0.1188,0.005151,-0.139888,0.470222
4,4,-0.417513,0.339593,0.125359,0.259065,1.0,-0.139184,0.039592,-0.307816,0.170535,...,0.620852,-0.255609,0.41562,0.049887,-0.342978,0.560448,0.161961,-0.123142,-0.1081,0.166701


In [34]:
# Number of entries above the diagonal of a 200 x 200 matrix (200 * 199 / 2).
n_connectivity_features = 19900

subject_id = []
subject_rows = []

for file_name in os.listdir(path_to_connectivity_matrices_folder):

    # Only saved matrices are of interest; ignore any other folder entry.
    if not file_name.endswith('.csv'):
        continue

    try:
        connectivity_matrix = pd.read_csv(
                path_to_connectivity_matrices_folder + file_name, index_col = 0
            ).values
        # Keep the strict upper triangle: the matrix is symmetric and its
        # diagonal is constant.
        connectivity_matrix_vector = connectivity_matrix[
                np.triu_indices(connectivity_matrix.shape[0], 1)
            ][:n_connectivity_features]
        has_nan = np.isnan(connectivity_matrix_vector).any()
    except Exception as err:
        # Best effort: one unreadable file must not abort the whole build,
        # but it is reported instead of being dropped silently.
        print('skipping', file_name, '-', repr(err))
        continue

    # The previous `assert (name, flag)` tested a non-empty tuple and so could
    # never fail; subjects with NaN values are now actually excluded.
    if has_nan:
        print('skipping', file_name, '- NaN values in connectivity matrix')
        continue

    print(file_name.split('/')[-1][:-4], 'full name ', file_name)
    subject_id.append(file_name.split('/')[-1][:-4])
    subject_rows.append(pd.DataFrame(connectivity_matrix_vector).T)

# Concatenate once at the end instead of growing the frame row by row.
fc_frame = pd.concat(subject_rows) if subject_rows else pd.DataFrame()

50657 full name  50657.csv
50654 full name  50654.csv
50386 full name  50386.csv
51489 full name  51489.csv
51308 full name  51308.csv
51269 full name  51269.csv
50405 full name  50405.csv
50133 full name  50133.csv
50520 full name  50520.csv
50171 full name  50171.csv
51361 full name  51361.csv
50488 full name  50488.csv
51066 full name  51066.csv
51085 full name  51085.csv
50532 full name  50532.csv
50416 full name  50416.csv
51240 full name  51240.csv
50042 full name  50042.csv
50413 full name  50413.csv
50046 full name  50046.csv
51477 full name  51477.csv
51075 full name  51075.csv
50499 full name  50499.csv
50268 full name  50268.csv
50370 full name  50370.csv
51051 full name  51051.csv
51358 full name  51358.csv
50482 full name  50482.csv
50723 full name  50723.csv
51462 full name  51462.csv
50735 full name  50735.csv
50143 full name  50143.csv
51131 full name  51131.csv
50626 full name  50626.csv
51202 full name  51202.csv
51267 full name  51267.csv
50135 full name  50135.csv
5

#### 3.1. Adding targets to `DataFrame`

In [36]:
# One ID per row, in the order the matrices were read. Stored as numbers so
# the column can be matched against SUB_ID in the phenotypic table.
fc_frame['SUB_ID'] = pd.to_numeric(subject_id)

In [37]:
fc_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19891,19892,19893,19894,19895,19896,19897,19898,19899,SUB_ID
0,-0.30847,-0.032817,-0.135309,-0.417513,0.053874,-0.361935,0.312277,-0.147749,-0.252594,-0.344279,...,-0.086228,-0.256857,0.165728,-0.225255,-0.056809,-0.024379,0.168549,-0.028447,-0.273653,50657
0,-0.261418,-0.079423,0.005804,0.12229,0.164643,-0.154563,0.27098,-0.040355,-0.207504,-0.459586,...,-0.034937,0.1441,0.200052,-0.261023,-0.083319,0.322562,0.144369,-0.230471,-0.073006,50654
0,-0.241987,-0.037245,-0.152939,-0.180749,-0.305586,-0.183445,0.037724,0.098372,0.146093,0.011728,...,-0.32094,-0.07365,-0.075697,0.034041,0.039844,0.018948,0.058587,0.12987,0.206737,50386
0,-0.034297,0.24805,-0.110738,-0.10358,-0.15739,-0.385977,0.029797,0.157929,0.172453,0.068071,...,0.031391,0.552107,-0.042254,-0.360964,-0.421551,0.200142,0.313593,0.33743,-0.07312,51489
0,0.162958,-0.293863,0.065329,0.229314,0.040645,0.022878,-0.17349,-0.191969,0.161584,0.240223,...,0.026002,-0.04937,-0.298941,0.09006,-0.17028,-0.109266,0.132606,-0.049203,0.137933,51308


In [38]:
targets = pd.read_csv('Phenotypic_V1_0b_preprocessed1.csv')
targets.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,...,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP
0,0,1,50002,1,50002,PITT,no_filename,1,1,16.77,...,,OK,,fail,ic-parietal-cerebellum,OK,,fail,ERROR #24,1
1,1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,...,,OK,,OK,,OK,,OK,,1
2,2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,...,,OK,,OK,,OK,,OK,,1
3,3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,...,,OK,,maybe,ic-parietal-cerebellum,OK,,OK,,0
4,4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,...,,OK,,maybe,ic-parietal slight,OK,,OK,,1


In [39]:
# Inner join: subjects without a row in the phenotypic table are dropped.
resulting_frame = fc_frame.merge(targets, how='inner', on='SUB_ID')

In [40]:
len(resulting_frame)

1035

In [41]:
resulting_frame['DX_GROUP'].value_counts()

2    530
1    505
Name: DX_GROUP, dtype: int64

In [42]:
resulting_frame = resulting_frame.reset_index(drop = True)
resulting_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP
0,-0.30847,-0.032817,-0.135309,-0.417513,0.053874,-0.361935,0.312277,-0.147749,-0.252594,-0.344279,...,,fail,skull-striping fail,fail,ic-frontal-temporal-cerebellum,OK,,OK,,0
1,-0.261418,-0.079423,0.005804,0.12229,0.164643,-0.154563,0.27098,-0.040355,-0.207504,-0.459586,...,,maybe,skull-striping fail,maybe,ic-cerebellum,OK,,OK,,0
2,-0.241987,-0.037245,-0.152939,-0.180749,-0.305586,-0.183445,0.037724,0.098372,0.146093,0.011728,...,,OK,,OK,,OK,,OK,,1
3,-0.034297,0.24805,-0.110738,-0.10358,-0.15739,-0.385977,0.029797,0.157929,0.172453,0.068071,...,,OK,,fail,Ic-parietal,OK,,OK,,0
4,0.162958,-0.293863,0.065329,0.229314,0.040645,0.022878,-0.17349,-0.191969,0.161584,0.240223,...,,maybe,skull-striping fail;,maybe,,OK,,OK,,1


In [43]:
resulting_frame.to_csv('ABIDEI_CC200_29.11.2020.csv')

### Saving test subjects

In [9]:
# Flatten one subject's saved matrix into a single feature row: take the
# entries above the diagonal and keep at most the first 6670 of them.
test_matrix = pd.read_csv('ABIDE_AAL_0.08_0.009/0051153.csv', index_col=0).values
upper_rows, upper_cols = np.triu_indices(test_matrix.shape[0], 1)
connectivity_matrix_vector = test_matrix[upper_rows, upper_cols][:6670]

fc_frame = pd.DataFrame(connectivity_matrix_vector).T

In [10]:
fc_frame.to_csv('0051153.csv')

In [8]:
ls

 0051023.csv                            [0m[01;34m__pycache__[0m/
 01042020_D_vs_C_45_64_LR_76_11.pkl     abide1_classif_test.ipynb
 ABIDEI_AAL_0.08_0.009_27.10.2020.csv  'abide1_preproc_ classification.ipynb'
 [01;34mABIDE_AAL_0.08_0.009[0m/                  [01;34mdatadir[0m/
 [01;34mGUEHT_AAL_0.08_0.009[0m/                  dataset_creator.ipynb
 GUEHT_AAL_0.08_0.009_30.10.2020.csv    [01;34mnilearn_cache[0m/
 [01;34mNYU_OLIN_ABIDEI[0m/                       utils.py
 Phenotypic_V1_0b_preprocessed1.csv
