# Schaefer timeseries: data exploration

In [1]:
import pandas as pd

In [2]:
# Root of the UKBB BIDS tree; the per-subject timeseries paths are built from it
ukbb_dir = "/ritter/share/data/UKBB/ukb_data/bids/"
# Folder holding schaefer_exists.csv (relative to this notebook)
schaefer_data_dir = "../data/schaefer/"

In [3]:
# Per-subject availability table; later cells read its 'eid', 'schaefer_exists' and 'is_empty' columns
schaefer_exists_df = pd.read_csv(f'{schaefer_data_dir}schaefer_exists.csv')

For how many subjects have Schaefer timeseries been downloaded so far?

In [4]:
# Flag columns of the availability table
exists_flag = schaefer_exists_df['schaefer_exists']
empty_flag = schaefer_exists_df['is_empty']

# (label, boolean mask) pairs; each mask selects the subjects counted under that label
availability_summary = [
    ('Number of subjects with Schaefer ts files:', exists_flag == True),
    ('Number of subjects without Schaefer ts files:', exists_flag == False),
    ('Number of subjects with empty Schaefer ts files:', empty_flag == True),
    ('Number of subjects with usable Schaefer ts files:', (exists_flag == True) & (empty_flag == False)),
]

# The number of True entries in a mask equals the number of rows it selects
for label, mask in availability_summary:
    print(label, int(mask.sum()))

Number of subjects with Schaefer ts files: 39856
Number of subjects without Schaefer ts files: 10411
Number of subjects with empty Schaefer ts files: 60
Number of subjects with usable Schaefer ts files: 39796


How many timepoints do the timeseries have?

In [5]:
# limit to subjects whose Schaefer ts file exists and is not empty
existing = (schaefer_exists_df['schaefer_exists'] == True)
nonempty = (schaefer_exists_df['is_empty'] == False)
available_schaefer_df = schaefer_exists_df[existing & nonempty].reset_index(drop=True)

# maps number of timepoints -> number of subjects with that many timepoints
count_dict = {}

# check number of timepoints for each subject
for sub_id in available_schaefer_df['eid']:
    # NOTE: despite its name, sub_dir is the path of the subject's timeseries file
    sub_dir = f'{ukbb_dir}sub-{sub_id}/ses-2/func/sub-{sub_id}_ses-2_task-rest_Schaefer7n100p.csv.gz'
    # only the header is needed to count columns; nrows=0 avoids parsing
    # the full timeseries of every subject
    header_only = pd.read_csv(sub_dir, nrows=0)
    # subtract 1 for label column
    timepoints = len(header_only.columns) - 1
    # add to count (a first occurrence starts from 0)
    count_dict[timepoints] = count_dict.get(timepoints, 0) + 1

print('{Number of timepoints: number of subjects with these timepoints}')
print(count_dict)

{Number of timepoints: number of subjects with these timepoints}
{490: 39302, 523: 494}
