This script uses the spins_concat etc. files created in the previous notebook (HCPgrad) to calculate the centroid (centre point) and the dispersion around the centroid of every network, for every subject, by task. A function is defined for each calculation and then looped over every subject, task, and network combination in the spins_concat etc. data. The resulting data tables are then merged with a table containing the diagnostic information for each subject, to facilitate statistics that use that information.

Read in files from HCPgrad_tidy_version notebook (now located in scratch)

In [3]:
import pandas as pd

# Gradient tables exported by the previous notebook. The cells below read the
# Subject, Task, Network and grad1-grad3 columns from each of them (and Site
# from all_concat).
# "full" / "shortened" and the "gsr_" prefix come from the file names; GSR is
# presumably global signal regression -- TODO confirm.
all_concat = pd.read_csv('/scratch/a/arisvoin/lbassman/spins_gradients/spin_gradients/spins_concat_full.csv')
# Network table; only its 'network1' column is used below.
# NOTE(review): unlike the other inputs this path has no .csv extension -- confirm it is correct.
merge_networks = pd.read_csv('/scratch/a/arisvoin/lbassman/spins_gradients/merge_networks')
all_concat_short = pd.read_csv('/scratch/a/arisvoin/lbassman/spins_gradients/spins_concat_shortened.csv')
gsr_all_concat = pd.read_csv('/scratch/a/arisvoin/lbassman/spins_gradients/gsr_spins_concat_full.csv')
gsr_all_concat_short = pd.read_csv('/scratch/a/arisvoin/lbassman/spins_gradients/gsr_spins_concat_shortened.csv')

Grouping the all_concat file by site, task, and subject, and the merge_networks file by network, to create lists of the subject IDs, networks, sites and tasks in the spins data.

In [4]:
# Group by a scalar column name rather than a one-element list: with a
# one-element list pandas reports 1-tuple group keys in several groupby APIs,
# and the cells below need plain scalar names (they are passed to
# get_group((subject, task, network)) and sliced as strings).
subject_group = all_concat.groupby('Subject')
network_group = merge_networks.groupby('network1')
site_group = all_concat.groupby('Site')
task_group = all_concat.groupby('Task')

# .groups maps each group name to its row labels; iterating it yields the
# names. groupby sorts group keys by default, so each list is in sorted order.
list_subjects = list(subject_group.groups)
list_networks = list(network_group.groups)
list_sites = list(site_group.groups)
list_tasks = list(task_group.groups)

Creating centroid function to determine the centroid for each network given specified subject id and task.

In [6]:
def calc_centroid(file, grouping_by, subject, task, network,
                  gradients=('grad1', 'grad2', 'grad3')):
    '''Return the centroid of one network for one subject and one task.

    The rows of ``file`` are grouped by ``grouping_by``, the group identified
    by ``(subject, task, network)`` is selected, and the mean of every
    gradient column is taken. With the default ``gradients`` the result is a
    3D coordinate point.

    Parameters
    ----------
    file: pandas.DataFrame
        Table containing the grouping columns and the gradient columns
        (an already loaded DataFrame, not a path).
    grouping_by: list of str
        The three columns to group on, in the same order as the
        ``(subject, task, network)`` key, e.g. ['Subject', 'Task', 'Network'].
    subject: str
    task: str
    network: str
        Values identifying which group is selected.
    gradients: sequence of str, optional
        Gradient columns to average. Defaults to the first three gradients.

    Returns
    -------
    tuple
        One mean per entry of ``gradients``, in the same order; by default
        (x_mean, y_mean, z_mean) for grad1, grad2 and grad3.

    Raises
    ------
    KeyError
        If no rows exist for the requested (subject, task, network).
    '''
    # NOTE: the whole table is regrouped on every call.
    selected_rows = file.groupby(grouping_by).get_group((subject, task, network))
    # One mean per gradient column, kept in the order given by `gradients`.
    return tuple(selected_rows[column].mean() for column in gradients)

                         

Creating dispersion function to determine the average dispersion around the centroid for each network given specified subject id and task.

In [7]:
import numpy as np

def calc_dispersion(file, grouping_by, subject, task, network,
                    gradients=('grad1', 'grad2', 'grad3')):
    '''Return the dispersion of one network around its centroid.

    The rows of ``file`` are grouped by ``grouping_by`` and the group
    identified by ``(subject, task, network)`` is selected. The dispersion is
    the square root of the sum of the squared standard deviations of the
    gradient columns, i.e. the per-gradient standard deviations combined with
    the Euclidean distance formula.

    Parameters
    ----------
    file: pandas.DataFrame
        Table containing the grouping columns and the gradient columns
        (an already loaded DataFrame, not a path).
    grouping_by: list of str
        The three columns to group on, in the same order as the
        ``(subject, task, network)`` key, e.g. ['Subject', 'Task', 'Network'].
    subject: str
    task: str
    network: str
        Values identifying which group is selected.
    gradients: sequence of str, optional
        Gradient columns to include. Defaults to the first three gradients.

    Returns
    -------
    dispersion
        A single value. It is NaN when the group has fewer than two rows,
        because pandas' default (sample) standard deviation is then undefined.

    Raises
    ------
    KeyError
        If no rows exist for the requested (subject, task, network).
    '''
    # NOTE: the whole table is regrouped on every call.
    selected_rows = file.groupby(grouping_by).get_group((subject, task, network))
    # Accumulate std**2 column by column, left to right, so the default call
    # yields exactly the same floating-point result as the three-term sum.
    squared_spread = 0.0
    for column in gradients:
        squared_spread = squared_spread + selected_rows[column].std() ** 2
    return np.sqrt(squared_spread)

Using itertools.product (a flattened nested for loop) to calculate the dispersion of every network for every subject and each task; each value is appended to an initially empty list.

In [8]:
import itertools

def _dispersion_for_every_combination(frame):
    '''Return calc_dispersion for every (subject, task, network) combination.

    Combinations are visited in itertools.product order: subjects vary
    slowest and networks fastest. The returned list has one value per
    combination in that order.
    '''
    group_columns = ['Subject', 'Task', 'Network']
    return [
        calc_dispersion(frame, group_columns, subject, task, network)
        for subject, task, network in itertools.product(
            list_subjects, list_tasks, list_networks)
    ]


# One list per input table, all in the same combination order.
dispersion_total = _dispersion_for_every_combination(all_concat)
dispersion_total_short = _dispersion_for_every_combination(all_concat_short)
dispersion_total_gsr = _dispersion_for_every_combination(gsr_all_concat)
dispersion_total_gsr_short = _dispersion_for_every_combination(gsr_all_concat_short)

The same method is used to calculate the centroids; each resulting list holds one tuple per combination, containing the mean value of each dimension (gradient) as a coordinate (x,y,z) --> (grad1,grad2,grad3)

In [9]:
def _centroid_for_every_combination(frame):
    '''Return calc_centroid for every (subject, task, network) combination.

    Combinations are visited in itertools.product order: subjects vary
    slowest and networks fastest. The returned list has one centroid tuple
    per combination in that order.
    '''
    group_columns = ['Subject', 'Task', 'Network']
    return [
        calc_centroid(frame, group_columns, subject, task, network)
        for subject, task, network in itertools.product(
            list_subjects, list_tasks, list_networks)
    ]


# One list per input table, all in the same combination order.
centroid_total = _centroid_for_every_combination(all_concat)
centroid_total_short = _centroid_for_every_combination(all_concat_short)
centroid_total_gsr = _centroid_for_every_combination(gsr_all_concat)
centroid_total_gsr_short = _centroid_for_every_combination(gsr_all_concat_short)

Creating the first dataframe using the full, no gsr data. A full list of subjects is created, and a task list with one label per network for each task (EA and rest), which are converted into dataframes to be concatenated into a larger table. Columns are then created for dispersion and centroid using the outputs from the function loops, and a site column is created from the subject ID.

In [13]:
# Build the Task / Subject / Network label rows that accompany the values
# computed by the product loops. The row order must match those loops:
# subjects vary slowest, then tasks, then networks. Sizes and task labels are
# derived from list_tasks and list_networks instead of being hard-coded, so
# the labels stay aligned if the number of tasks or networks changes.
rows_per_subject = len(list_tasks) * len(list_networks)

# Each task label repeated once per network, in list_tasks order.
task_list = [task for task in list_tasks for _ in list_networks]
task_list_df = pd.DataFrame(task_list,
                            columns=['Task'])

# The full network list repeated once per task.
network_list = list_networks * len(list_tasks)
networks_df = pd.DataFrame(network_list,
                           columns=['Network'])

# One single-column frame per subject, holding that subject's ID on every row.
subject_list = [[subject] * rows_per_subject for subject in list_subjects]
subject_list_df = [
    pd.DataFrame(subject_rows, columns=['Subject'])
    for subject_rows in subject_list
]

# Per-subject label tables (columns Task, Subject, Network); this list is
# reused by the later cells to build the gsr and shortened tables.
full_concatenated = [
    pd.concat([task_list_df, subject_df, networks_df], axis=1)
    for subject_df in subject_list_df
]

total_concat = pd.concat(full_concatenated)

# The value lists are assigned by position, so their length must equal the
# number of label rows.
total_concat['Dispersion'] = dispersion_total
total_concat['Centroid'] = centroid_total
# Characters 4-6 of the subject ID are used as the site code.
total_concat['Site'] = [s[4:7] for s in total_concat['Subject']]

Same dfs for subject, task, site used. Dispersion and centroid added for the full data with gsr applied.

In [17]:
# Full-length data with gsr applied: stack the shared per-subject label
# tables, then add the value columns in the order Dispersion, Centroid, Site.
gsr_total_concat = pd.concat(full_concatenated).assign(
    Dispersion=dispersion_total_gsr,
    Centroid=centroid_total_gsr,
    # Characters 4-6 of each subject ID are used as the site code.
    Site=lambda labelled: [subject_id[4:7] for subject_id in labelled['Subject']],
)

Same method for the shortened, no gsr data, and shortened gsr data.

In [18]:
# Shortened data without gsr: stack the shared per-subject label tables,
# then add the value columns in the order Dispersion, Centroid, Site.
total_concat_short = pd.concat(full_concatenated).assign(
    Dispersion=dispersion_total_short,
    Centroid=centroid_total_short,
    # Characters 4-6 of each subject ID are used as the site code.
    Site=lambda labelled: [subject_id[4:7] for subject_id in labelled['Subject']],
)

In [19]:
# Shortened data with gsr applied: stack the shared per-subject label tables,
# then add the value columns in the order Dispersion, Centroid, Site.
gsr_total_concat_short = pd.concat(full_concatenated).assign(
    Dispersion=dispersion_total_gsr_short,
    Centroid=centroid_total_gsr_short,
    # Characters 4-6 of each subject ID are used as the site code.
    Site=lambda labelled: [subject_id[4:7] for subject_id in labelled['Subject']],
)

Reading in file containing diagnosis for each subject, to be merged to the total_concat files to include diagnostic information. Merging the data with each of the files, and writing them to scratch.

In [53]:
# Diagnostic information table; the merge below matches its 'subject' column
# against the 'Subject' column of the result tables.
diagnostic_df = pd.read_csv('/scratch/a/arisvoin/lbassman/spins_gradients/spin_gradients/diagnostic_information.csv')

In [26]:
def _attach_diagnosis(results):
    '''Left-join the diagnostic table onto a results table by subject ID.

    Every row of ``results`` is kept; rows whose 'Subject' has no match in
    diagnostic_df['subject'] get missing values in the diagnostic columns.
    '''
    return results.merge(diagnostic_df, how="left", left_on="Subject", right_on="subject")


total_concat_merge = _attach_diagnosis(total_concat)
total_concat_short_merge = _attach_diagnosis(total_concat_short)
gsr_total_concat_merge = _attach_diagnosis(gsr_total_concat)
gsr_total_concat_short_merge = _attach_diagnosis(gsr_total_concat_short)

In [49]:
# Output locations on scratch for the four merged tables.
path = '/scratch/a/arisvoin/lbassman/spins_gradients/network_averages_full.csv'
path_s = '/scratch/a/arisvoin/lbassman/spins_gradients/network_averages_shortened.csv'
path_g = '/scratch/a/arisvoin/lbassman/spins_gradients/gsr_network_averages_full.csv'
path_g_s = '/scratch/a/arisvoin/lbassman/spins_gradients/gsr_network_averages_shortened.csv'

# Write each merged table to its destination with to_csv's default settings.
for merged_table, destination in (
    (total_concat_merge, path),
    (total_concat_short_merge, path_s),
    (gsr_total_concat_merge, path_g),
    (gsr_total_concat_short_merge, path_g_s),
):
    merged_table.to_csv(destination)