# Make a function to extract and write csv with rois and labels

In [1]:
import os
import re
import nibabel as nib
import numpy as np
import matplotlib as plt
import pandas as pd
from itertools import cycle
from nilearn.input_data import NiftiLabelsMasker
from nilearn import datasets
from nilearn.connectome import ConnectivityMeasure
from nilearn import plotting
from joblib import Parallel, delayed
import multiprocessing
%matplotlib inline

parcellation_name='power_drysdale'
input_file_list_fname = "../post_processing/rs_files_and_exclusions.csv"
parcel_labels_fname='../power_spheres/power_drysdale_labels.csv'
label_image_fname='../power_spheres/power_drysdale_spheres.nii'
rsfc_derivs_dir="/data/mounts/scs-fs-20/kpsy/genr/users/jflournoy/rsfc_derivatives/"
num_cores=4

outpath = os.path.join(rsfc_derivs_dir, parcellation_name)

def time_course_extractor(connectivity_obj, outpath=None):
    def extract(fname, masker_fname, save_csv=True, save_img=True):
        if os.path.isfile(fname) and os.path.isfile(masker_fname):
            masker_obj = NiftiLabelsMasker(labels_img=masker_fname, 
                                           standardize=True,
                                           memory='nilearn_cache',
                                           verbose=5)
            anImg = nib.load(fname)
            time_series = masker_obj.fit_transform(anImg)
            cormat = connectivity_obj.fit_transform([time_series])[0]
            
            if save_csv and outpath:
                if not os.path.isdir(outpath):
                    try:
                        print('Output dir {} not found, attempting to create...'.format(outpath))
                        os.makedirs(outpath)
                    except:
                        print("Cannot make dir output dir {}".format(outpath))
                        raise
                save_one(fname, time_series, cormat, outpath)
                
        else:
            time_series = []
            cormat = []
        return time_series, cormat
    return extract

def save_one(f, t, c, outpath):
    sid = re.match('.*/(sub-\d+).*_bold\.nii\.gz', f).groups()[0]
    sid_outpath = os.path.join(outpath, sid)
    if not os.path.isdir(sid_outpath):
        try:
            os.makedirs(sid_outpath)
        except:
            print("Cannot make dir {}".format(sid_outpath))
            raise
    outfilename = re.match('.*/(sub.*)_bold\.nii\.gz', f).groups()[0]
    outfilename_cr = os.path.join(sid_outpath, outfilename + '_corrmat.csv')
    outfilename_ts = os.path.join(sid_outpath, outfilename + '_timeseries.csv')

    uppertri_indexes = np.triu_indices_from(c, k=1)
    uppertri_data = c[uppertri_indexes]

    c_df = pd.DataFrame({'r': uppertri_data, 'row': uppertri_indexes[0], 'col': uppertri_indexes[1]})
    t_df = pd.DataFrame(t, columns=labels).assign(tr = list(range(1,t.shape[0]+1))).melt(id_vars='tr', var_name='label')

    c_df.to_csv(outfilename_cr)
    t_df.to_csv(outfilename_ts)

def make_extract_arg_zips(input_filenames, label_def_filename, exclude):
    #if the poarcel fname is a list, then assume it has a list of subject-specific 
    #label files
    input_file_list_include = input_filenames.loc[exclude == 0,'file'].values
    
    label_image_file_is_nii = re.match(".*nii$", label_def_filename)
    label_image_file_is_csv = re.match(".*csv$", label_def_filename)
    if label_image_file_is_nii:
        print("Using {} to parcellate all images...".format(label_def_filename))
        extract_args_zip = zip(list(input_file_list_include), cycle([label_def_filename]))
    elif label_image_file_is_csv:
        print("Using list of parcellation files from ".format(label_def_filename))
        raise Exception("TEST THIS CODE FIRST")
        label_image_fnames = pd.read_csv(label_def_filename)
        label_image_fnames_include = label_image_fnames.loc[exclude == 0,'file'].values
        if len(input_file_list_include) != label_image_fnames_include:
            raise Exception("List of resting-state and label images do not match: {} and {}".format(len(input_file_list_include), label_image_fnames_include))
        extract_args_zip = zip(input_filenames, label_image_fnames_include)
    else:
        raise Exception("Label definition is neither .nii or .csv: {}".format(label_def_filename))
    
    return extract_args_zip, input_file_list_include

#get our labels -- these are just network labels
input_file_list = pd.read_csv(input_file_list_fname)
label_df = pd.read_csv(parcel_labels_fname)
labels = list(label_df['label'].values)

connectivity_obj = ConnectivityMeasure(kind='correlation')

print("Processing input file list...")
if 'exclude' in input_file_list.columns:
    exclude = input_file_list.exclude.values
    print("Found {} exclusions.".format(sum(exclude)))
else:
    print("No exclusions found in file list; assuming no files will be excluded.")
    exclude = [0]*input_file_list.file.shape[0]



Processing input file list...
Found 0 exclusions.


In [2]:
35/50*3420/60

39.9

In [2]:
def parallel_extract(input_file_list, label_def, exclude, connectivity_obj, outpath=None, save_csv=True, save_img=True, num_cores=1, verbose=0):
    print("Extracting from {} files using {} processes...".format(input_file_list.shape[0], num_cores))
    extract_corrs = time_course_extractor(connectivity_obj, outpath=outpath)
    extract_args_zip, rs_files_included = make_extract_arg_zips(input_file_list, label_def, exclude)
    results = Parallel(n_jobs=num_cores, verbose=verbose)(delayed(extract_corrs)(f, l, save_csv=save_csv, save_img=save_img) for f, l in extract_args_zip)
    timeseries = [rez[0] for rez in results]
    corrmats = [rez[1] for rez in results]
    return timeseries, corrmats, rs_files_included

import time
t0 = time.time()
timeseries, corrmats, rs_files_included  = parallel_extract(input_file_list[0:10], 
                                                            label_image_fname, 
                                                            exclude[0:10], 
                                                            connectivity_obj, 
                                                            outpath=outpath,
                                                            save_csv=True, 
                                                            save_img=True,
                                                            num_cores=4, 
                                                            verbose=10)
t1 = time.time()
total = t1-t0

Extracting from 10 files using 4 processes...
Using ../power_spheres/power_drysdale_spheres.nii to parcellate all images...


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of  10 | elapsed:  1.8min remaining:  1.8min
[Parallel(n_jobs=4)]: Done   7 out of  10 | elapsed:  1.8min remaining:   47.5s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.9min finished


In [3]:
print("3420 participants would take about {} hours to complete...".format(np.round(total*num_cores/(len(input_file_list[0:10]))*3240/60/60, 2)))

3420 participants would take about 43.21 hours to complete...


In [5]:
save_mean_cor=True
if save_mean_cor:
    somemats = connectivity_obj.fit_transform(timeseries)
    mean_mat = connectivity_obj.mean_
    mean_mat_uppertri_indexes = np.triu_indices_from(mean_mat, k=1)
    mean_mat_uppertri_data = mean_mat[mean_mat_uppertri_indexes]

    mean_mat_df = pd.DataFrame({'r': mean_mat_uppertri_data, 
                                'row': mean_mat_uppertri_indexes[0], 
                                'col': mean_mat_uppertri_indexes[1]})
    mean_mat_df.to_csv(os.path.join(outpath, 'mean_correlation_matrix.csv'))

In [6]:
mean_mat_df

Unnamed: 0,r,row,col
0,0.271039,0,1
1,-0.190732,0,2
2,-0.077421,0,3
3,-0.028892,0,4
4,0.375133,0,5
5,0.198278,0,6
6,-0.073783,0,7
7,-0.304358,0,8
8,0.009187,0,9
9,-0.097045,0,10
