In [None]:
#This script merges cleaned data from an old data freeze with new data pulled from AXIS.
#Inputs:
    #enrollment sheets pulled from AXIS, stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs
    #data cleaned and organized from 2020, stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/flywheel_data_uploads/data_ready_for_upload/
    #log of redcap IDs pulled from AXIS (https://axis.med.upenn.edu/redcap_v10.3.7/DataExport/index.php?pid=378&report_id=2598) and stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs
    #new data pulled from AXIS (https://axis.med.upenn.edu/redcap_v10.3.7/DataExport/index.php?pid=191&report_id=1331) and stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs
#Outputs:
    #a csv of all pre-scan scales collected for participants enrolled prior to April 1st, 2022 
    #a csv of all post-scan scales collected for participants enrolled prior to April 1st, 2022

In [1]:
import pandas as pd

In [2]:
#load the T1 enrollment sheet with every column read as text,
#then discard the scan date column
axis_t1 = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_enroll_t1.csv',
    dtype=str,
).drop(columns=['scan_1_date'])

In [3]:
#plain Python list of the T1 bblids, each coerced to str
t1_enroll = list(map(str, axis_t1['bblid'].tolist()))

In [5]:
#load the RedCAP ID log exported from the EF tracker (all columns as text)
redcap_ids = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_redcap_ids.csv',
    dtype=str,
)
#T1 pre-scan RedCAP IDs as a list of str; str() turns a missing cell into 'nan'
prescan_ids = list(map(str, redcap_ids['prescan_redcap_id'].tolist()))

In [6]:
#drop the missing IDs.
#prescan_ids was built with str(), so a missing cell shows up as the exact
#string 'nan'. Compare for equality instead of using a substring test, so a
#real ID that merely contains the letters "nan" is not thrown away.
prescan_ids1 = [scale_id for scale_id in prescan_ids if scale_id != 'nan']
print(prescan_ids1)

['345', '276', '279', '455', '287', '290', '464', '386', '611', '415', '377', '841', '475', '471', '694', '420', '769', '607', '603', '499', '599', '503', '507', '479', '429', '550', '484', '641', '517', '619', '615', '661', '629', '670', '741', '779', '761', '717', '867', '805', '1004', '857', '852', '885', '918', '990', '1025', '950', '967', '1127', '1131', '994', '1092', '1099', '1045', '1055', '1114', '1110', '1069', '1073', '1081', '1177', '1266', '1271', '1170', '1235', '1293', '1185', '1246', '1345', '1341', '1262', '1305', '1333', '1356', '1370', '1461', '1422', '1481', '1447', '2030', '1385', '1526', '2349', '2019', '1539', '2778', '1831', '2026', '3107', '2281', '2443', '2704', '2495', '3096', '3222', '2811', '3044', '3200', '3204', '2918', '3040', '2945', '2922', '3089', '3301', '3305', '1391812379', '3379', '3337', '3488', '3491', '3925', '3989', '3981', '4384', '4214', '4267', '4324', '4328', '4414', '4423', '4271', '1778']


In [7]:
#load the raw SR export covering both pre- and post-scan scales (all columns as text)
sr = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_EF_pre_post_scan_scales.csv',
    dtype=str,
)
#keep only the rows whose scales_id is one of the T1 pre-scan RedCAP IDs
pre_scan = sr.loc[sr['scales_id'].isin(prescan_ids1)]

In [8]:
#load the pre-scan SR data cleaned during the last audit/freeze (all columns as text)
pre_cleaned = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/flywheel_data_uploads/data_ready_for_upload/EF_pre_scan_scales_CLEANED.csv',
    dtype=str,
)
#scales_id values already present in the cleaned file, as a list of str
cleaned_ids = list(map(str, pre_cleaned['scales_id'].tolist()))

In [9]:
#T1 pre-scan IDs that do not appear in the previously cleaned file,
#i.e. the ones the last data freeze did not include (input order preserved)
missing = [scale_id for scale_id in prescan_ids1 if scale_id not in cleaned_ids]

print(missing)

['2030', '2349', '2019', '2778', '1831', '2026', '3107', '2281', '2443', '2704', '2495', '3096', '3222', '2811', '3044', '3200', '3204', '2918', '3040', '2945', '2922', '3089', '3301', '3305', '1391812379', '3379', '3337', '3488', '3491', '3925', '3989', '3981', '4384', '4214', '4267', '4324', '4328', '4414', '4423', '4271', '1778']


In [10]:
#restrict the T1 pre-scan rows to the IDs that were not organized in the last audit
new = pre_scan.loc[pre_scan['scales_id'].isin(missing)]

In [11]:
#drop any columns that are identifiers, or are not necessary for EF
#NOTE: both frames are reassigned here, so re-running this cell on its own
#raises KeyError because the columns are already gone
new = new.drop(columns=[
    'date',
    'admin_proband',
    'admin_proband_group',
    'admin_motive_timepoint',
])

pre_cleaned = pre_cleaned.drop(columns=['date'])

In [12]:
#stack the previously cleaned rows on top of the newly pulled rows;
#sort=False leaves the column order unsorted when the two frames differ
pre_scan_t1 = pd.concat(
    [pre_cleaned, new],
    axis=0,
    sort=False,
)

In [13]:
#join the T1 pre-scan scales to the T1 enrollment sheet on bblid (default inner
#join), expose the scan ID under the generic name "scan_id", and tag every row
#as timepoint '1' so the data is easily used w BIDS
t1_pre_scan = (
    pre_scan_t1
    .merge(axis_t1, left_on="bblid", right_on="bblid")
    .rename(columns={"scan_id_timepoint_1": "scan_id"})
    .assign(timepoint='1')
)

In [14]:
#now for T2

In [15]:
#load the T2 enrollment sheet with every column read as text,
#then discard the scan date column
axis_t2 = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_enroll_t2.csv',
    dtype=str,
).drop(columns=['scan_2_date'])

In [16]:
#plain Python list of the T2 bblids
t2_enroll = axis_t2['bblid'].tolist()

In [17]:
#T2 pre-scan RedCAP IDs as a list of str; str() turns a missing cell into 'nan'
prescan_ids_t2 = list(map(str, redcap_ids['prescan_redcap_id_t2'].tolist()))

In [18]:
#drop the missing IDs.
#prescan_ids_t2 was built with str(), so a missing cell shows up as the exact
#string 'nan'. Compare for equality instead of using a substring test, so a
#real ID that merely contains the letters "nan" is not thrown away.
prescan_ids2 = [scale_id for scale_id in prescan_ids_t2 if scale_id != 'nan']
print(prescan_ids2)

['2469', '3232', '2120', '2473', '2220', '2209', '3196', '2185', '2189', '2322', '2598', '2477', '2318', '4389', '2602', '2873', '3018', '2664', '2649', '2771', '3457', '3461', '3254', '3855', '4151', '4056', '3526', '4300', '1391812360', '3516', '4292', '3756', '4204', '4480', '4521']


In [19]:
#keep only the raw SR rows whose scales_id is one of the T2 pre-scan RedCAP IDs
pre_scan2 = sr.loc[sr['scales_id'].isin(prescan_ids2)]

In [20]:
#remove the date and admin_* columns from the T2 pre-scan rows
#NOTE: pre_scan2 is reassigned, so re-running this cell on its own raises
#KeyError because the columns are already gone
pre_scan2 = pre_scan2.drop(columns=[
    'date',
    'admin_proband',
    'admin_proband_group',
    'admin_motive_timepoint',
])

In [21]:
#join the T2 pre-scan scales to the T2 enrollment sheet on bblid (default inner
#join), expose the scan ID under the generic name "scan_id", and tag every row
#as timepoint '2' so the data is easily used w BIDS
t2_pre_scan = (
    pre_scan2
    .merge(axis_t2, left_on="bblid", right_on="bblid")
    .rename(columns={"scan_id_t2": "scan_id"})
    .assign(timepoint='2')
)

In [22]:
#stack the T1 and T2 pre-scan tables into a single frame;
#sort=False leaves the column order unsorted when the two frames differ
all_pre_scan = pd.concat(
    [t1_pre_scan, t2_pre_scan],
    axis=0,
    sort=False,
)

In [23]:
#write the combined pre-scan scales to the data freeze outputs folder,
#comma separated and without the DataFrame index
all_pre_scan.to_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/outputs/EF_all_prescan_scales.csv',
    sep=',',
    index=False,
)

In [None]:
#and for post scan 

In [24]:
#T1 post-scan RedCAP IDs as a list of str; str() turns a missing cell into 'nan'
postscan_ids = list(map(str, redcap_ids['postscan_redcap_id'].tolist()))

In [25]:
#drop the missing IDs.
#postscan_ids was built with str(), so a missing cell shows up as the exact
#string 'nan'. Compare for equality instead of using a substring test, so a
#real ID that merely contains the letters "nan" is not thrown away.
postscan_ids1 = [scale_id for scale_id in postscan_ids if scale_id != 'nan']
print(postscan_ids1)

['346', '277', '288', '291', '465', '612', '416', '378', '842', '476', '472', '695', '421', '770', '608', '604', '500', '600', '504', '508', '480', '430', '551', '485', '518', '620', '616', '671', '742', '780', '762', '868', '806', '1005', '858', '853', '886', '891', '991', '1026', '968', '1128', '1132', '995', '1093', '1100', '1046', '1056', '1115', '1111', '1070', '1074', '1078', '1178', '1267', '1272', '1171', '1236', '1294', '1186', '1247', '1346', '1263', '1306', '1334', '1357', '1371', '1462', '1423', '1482', '2031', '1448', '1386', '2350', '2020', '1527', '2779', '1540', '1832', '2027', '3108', '2282', '2444', '2705', '2496', '3097', '3223', '2812', '3045', '3201', '3205', '2919', '3041', '2946', '2923', '3090', '3302', '3306', '1391812380', '3380', '3338', '3499', '3492', '3926', '3990', '3982', '4385', '4215', '4268', '4325', '4329', '4415', '4424', '4272', '1779']


In [27]:
#keep only the raw SR rows whose scales_id is one of the T1 post-scan RedCAP IDs
post_scan = sr.loc[sr['scales_id'].isin(postscan_ids1)]

In [28]:
#load the post-scan SR data cleaned during the last freeze (all columns as text)
post_cleaned = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/flywheel_data_uploads/data_ready_for_upload/EF_post_scan_scales_CLEANED.csv',
    dtype=str,
)
#scales_id values already present in the cleaned post-scan file, as a list of str
#(this rebinds cleaned_ids, which previously held the pre-scan IDs)
cleaned_ids = list(map(str, post_cleaned['scales_id'].tolist()))

In [29]:
#T1 post-scan IDs that do not appear in the previously cleaned post-scan file
#(input order preserved; this rebinds missing, which previously held pre-scan IDs)
missing = [scale_id for scale_id in postscan_ids1 if scale_id not in cleaned_ids]

print(missing)

['277', '2031', '2350', '2020', '2779', '1832', '2027', '3108', '2282', '2444', '2705', '2496', '3097', '3223', '2812', '3045', '3201', '3205', '2919', '3041', '2946', '2923', '3090', '3302', '3306', '1391812380', '3380', '3338', '3499', '3492', '3926', '3990', '3982', '4385', '4215', '4268', '4325', '4329', '4415', '4424', '4272', '1779']


In [30]:
#restrict the T1 post-scan rows to the IDs that were not organized in the last audit
new2 = post_scan.loc[post_scan['scales_id'].isin(missing)]

In [31]:
#remove the date and admin_* columns from the new post-scan rows, and the date
#column from the previously cleaned post-scan data
#NOTE: both frames are reassigned here, so re-running this cell on its own
#raises KeyError because the columns are already gone
new2 = new2.drop(columns=[
    'date',
    'admin_proband',
    'admin_proband_group',
    'admin_motive_timepoint',
])

post_cleaned = post_cleaned.drop(columns=['date'])

In [32]:
#stack the previously cleaned post-scan rows on top of the newly pulled rows;
#sort=False leaves the column order unsorted when the two frames differ
post_scan_t1 = pd.concat(
    [post_cleaned, new2],
    axis=0,
    sort=False,
)

In [33]:
#join the T1 post-scan scales to the T1 enrollment sheet on bblid (default inner
#join), expose the scan ID under the generic name "scan_id", and tag every row
#as timepoint '1' so the data is easily used w BIDS
t1_post_scan = (
    post_scan_t1
    .merge(axis_t1, left_on="bblid", right_on="bblid")
    .rename(columns={"scan_id_timepoint_1": "scan_id"})
    .assign(timepoint='1')
)

In [34]:
#T2 post-scan RedCAP IDs as a list of str; str() turns a missing cell into 'nan'
postscan_ids_t2 = list(map(str, redcap_ids['postscan_redcap_id_t2'].tolist()))

In [35]:
#drop the missing IDs.
#postscan_ids_t2 was built with str(), so a missing cell shows up as the exact
#string 'nan'. Compare for equality instead of using a substring test, so a
#real ID that merely contains the letters "nan" is not thrown away.
postscan_ids2 = [scale_id for scale_id in postscan_ids_t2 if scale_id != 'nan']
print(postscan_ids2)

['2470', '3233', '2121', '2474', '2221', '2210', '3197', '2186', '2190', '2323', '2599', '2478', '2319', '4390', '2603', '2874', '3019', '2665', '2650', '3458', '3462', '3255', '3856', '4152', '4057', '3527', '4301', '1391812361', '3517', '4293', '3757', '4205', '4481', '4522']


In [36]:
#keep only the raw SR rows whose scales_id is one of the T2 post-scan RedCAP IDs
post_scan2 = sr.loc[sr['scales_id'].isin(postscan_ids2)]

In [37]:
#remove the date and admin_* columns from the T2 post-scan rows
#NOTE: post_scan2 is reassigned, so re-running this cell on its own raises
#KeyError because the columns are already gone
post_scan2 = post_scan2.drop(columns=[
    'date',
    'admin_proband',
    'admin_proband_group',
    'admin_motive_timepoint',
])

In [38]:
#join the T2 post-scan scales to the T2 enrollment sheet on bblid (default inner
#join), expose the scan ID under the generic name "scan_id", and tag every row
#as timepoint '2' so the data is easily used w BIDS
t2_post_scan = (
    post_scan2
    .merge(axis_t2, left_on="bblid", right_on="bblid")
    .rename(columns={"scan_id_t2": "scan_id"})
    .assign(timepoint='2')
)

In [39]:
#stack the T1 and T2 post-scan tables into a single frame;
#sort=False leaves the column order unsorted when the two frames differ
all_post_scan = pd.concat(
    [t1_post_scan, t2_post_scan],
    axis=0,
    sort=False,
)

In [43]:
#write the combined post-scan scales to the data freeze outputs folder,
#comma separated and without the DataFrame index
all_post_scan.to_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/outputs/EF_all_postscan_scales.csv',
    sep=',',
    index=False,
)