# Schaefer timeseries: data exploration

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root of the UKBB BIDS directory; per-subject Schaefer timeseries files are looked up below it.
ukbb_dir = "/ritter/share/data/UKBB/ukb_data/bids/"
# Project-relative folder that holds the Schaefer bookkeeping tables (e.g. schaefer_exists.csv).
schaefer_data_dir = "../data/schaefer/"

In [3]:
# Load the per-subject availability table; its 'schaefer_exists' column flags
# whether a Schaefer timeseries file is present for that subject.
schaefer_exists_df = pd.read_csv(os.path.join(schaefer_data_dir, 'schaefer_exists.csv'))

For how many subjects have the Schaefer timeseries been downloaded so far?

In [4]:
# Boolean masks over the availability flag; summing a mask counts its True entries.
has_schaefer = schaefer_exists_df['schaefer_exists'].eq(True)
lacks_schaefer = schaefer_exists_df['schaefer_exists'].eq(False)
print('Number of subjects with Schaefer ts:', has_schaefer.sum())
print('Number of subjects without Schaefer ts:', lacks_schaefer.sum())

Number of subjects with Schaefer ts: 39856
Number of subjects without Schaefer ts: 10411


How many timepoints do the timeseries have?

In [5]:
# Tally, for every subject flagged as having a Schaefer file, how many timepoints
# (data columns) that file contains, and collect the subjects whose file is empty.

# limit to available ts
available_schaefer_df = schaefer_exists_df[schaefer_exists_df['schaefer_exists'] == True].reset_index(drop=True)
# {number of timepoints: number of subjects with that many timepoints}
count_dict = {}
# every subject whose file was inspected (including those that turn out to be empty)
iterated_subs = []
# take note of empty files
exceptions = set()

# check number of timepoints for each subject
for sub_id in available_schaefer_df['eid']:
    # skip IDs already recorded as empty (only matters if an eid occurs more than once)
    if sub_id in exceptions:
        continue
    iterated_subs.append(sub_id)
    sub_label = 'sub-' + str(sub_id)
    # NOTE: despite its name, sub_dir is the path of the subject's timeseries *file*
    sub_dir = ukbb_dir + sub_label + '/ses-2/func/' + sub_label + '_ses-2_task-rest_Schaefer7n100p.csv.gz'
    # catch empty files:
    if os.path.getsize(sub_dir) == 0:
        exceptions.add(sub_id)
        continue
    try:
        # nrows=0 parses only the header row, which is all that is needed to
        # count columns; this avoids loading the full timeseries of every subject
        header = pd.read_csv(sub_dir, nrows=0)
    except pd.errors.EmptyDataError:
        # non-zero size on disk but nothing to parse
        # (e.g. a gzip archive wrapping an empty file)
        exceptions.add(sub_id)
        continue
    # check length of columns
    # subtract 1 for label column
    timepoints = len(header.columns) - 1
    # add to count
    count_dict[timepoints] = count_dict.get(timepoints, 0) + 1

print('{Number of timepoints: number of subjects with these timepoints}')
print(count_dict)
print('Subjects with empty Schaefer files (IDs):')
print(exceptions)

{Number of timepoints: number of subjects with these timepoints}
{490: 39302, 523: 494}
Subjects with empty Schaefer files (IDs):
{1116417, 1433347, 1122312, 1105545, 1118605, 1218963, 1118490, 1028124, 1137180, 1010334, 1256608, 1138465, 1132834, 1072932, 1109412, 1141164, 1145006, 1164723, 1202229, 1046966, 1075383, 1110970, 1163323, 1169466, 1066687, 1169855, 1034050, 1167299, 1127747, 1000901, 1001163, 1132748, 1279313, 1195090, 1008595, 1240151, 1108952, 1163479, 1005914, 1035483, 1209562, 1040480, 1232099, 1171043, 1053029, 1085029, 1005930, 1068267, 1244266, 1088493, 1118574, 1203822, 1068529, 1008242, 1138933, 1009910, 1060983, 1111418, 1160316, 1041279}
