In [None]:
#This script merges cleaned data from an old data freeze with new data pulled from AXIS.
#Inputs:
    #enrollment sheets pulled from AXIS, stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs
    #data cleaned and organized from 2020, stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/flywheel_data_uploads/data_ready_for_upload/
    #log of redcap IDs pulled from AXIS (https://axis.med.upenn.edu/redcap_v10.3.7/DataExport/index.php?pid=378&report_id=2598) and stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs
    #new data pulled from AXIS (https://axis.med.upenn.edu/redcap_v10.3.7/DataExport/index.php?pid=191&report_id=1318) and stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs
#Outputs:
    #a csv of all main battery scales collected for participants enrolled prior to April 1st, 2022 

In [1]:
import pandas as pd

In [2]:
# Load the T1 enrollment sheet (every column kept as a string) and
# discard the scan date column right away.
axis_t1 = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_enroll_t1.csv',
    dtype=str,
).drop(columns=['scan_1_date'])

In [3]:
# Flatten the T1 enrollment bblid column into a plain list of strings.
t1_enroll = [str(bblid) for bblid in axis_t1['bblid'].tolist()]

In [7]:
# Load the RedCAP ID log from the EF tracker (every column kept as a string).
redcap_ids = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_redcap_ids.csv',
    dtype=str,
)
# Values of the sr_redcap_id column as a list of strings; a missing value
# is stringified to the text 'nan' and is filtered out in the next step.
scale_ids = [str(raw_id) for raw_id in redcap_ids['sr_redcap_id'].tolist()]

In [8]:
# Drop the placeholder entries: keep only IDs whose text does not contain 'nan'.
scale_ids1 = [sid for sid in scale_ids if 'nan' not in sid]
print(scale_ids1)

['344', '275', '278', '454', '286', '289', '463', '385', '610', '414', '376', '840', '474', '470', '693', '419', '768', '606', '602', '498', '681', '598', '502', '506', '478', '428', '549', '483', '516', '618', '614', '585', '660', '628', '669', '740', '778', '760', '716', '866', '804', '1003', '856', '851', '884', '889', '989', '1024', '949', '966', '1126', '1130', '993', '1091', '1098', '1044', '1036', '1064', '1109', '1068', '1072', '1080', '1076', '1176', '1265', '1270', '1169', '1234', '1292', '1184', '1245', '1344', '1261', '1304', '1332', '1355', '1369', '1460', '1421', '1480', '2029', '1446', '1384', '2348', '2018', '1525', '2777', '1538', '1830', '2025', '3106', '2280', '2442', '2703', '2494', '3095', '3221', '2810', '3043', '3199', '3203', '2917', '3039', '2944', '2921', '3088', '3300', '3304', '1391812378', '3378', '3336', '3487', '3490', '3924', '3988', '3980', '4383', '4213', '4266', '4323', '4327', '4413', '4422', '4270', '1777']


In [17]:
# Load the raw main-scales export (every column kept as a string).
sr = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_EF_main_scales.csv',
    dtype=str,
)

In [18]:
# Keep only the raw rows whose scales_id is one of the T1 scale ids.
scales = sr.loc[sr['scales_id'].isin(scale_ids1)]

In [19]:
# Load the previously cleaned self-report data (latin-1 encoded file,
# every column kept as a string).
cleaned_sr = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/flywheel_data_uploads/data_ready_for_upload/EF_ScanDay_Scales_3.csv',
    dtype=str,
    encoding='latin-1',
)

In [20]:
# The cleaned file carries no scales IDs of its own, so derive them from the
# redcapid column by dropping its first 5 characters
# (presumably a fixed-width prefix in front of the scales id -- TODO confirm).
#
# The slice is done with the vectorized .str accessor rather than a Python
# loop: a missing redcapid is a float NaN even with dtype=str, and slicing it
# directly raises TypeError. With .str the missing value is passed through as
# NaN, while rows that do have a redcapid yield exactly the same substring.
ids = cleaned_sr['redcapid']
new_ids = ids.str[5:].tolist()

In [21]:
# Attach the derived ids to the cleaned data as a scales_id column.
cleaned_sr = cleaned_sr.assign(scales_id=new_ids)

In [22]:
# T1 scale ids that do not appear among the ids derived from the cleaned data,
# listed in the same order as scale_ids1.
missing = [sid for sid in scale_ids1 if sid not in new_ids]

print(missing)

['2029', '2348', '2018', '2777', '1830', '2025', '3106', '2280', '2442', '2703', '2494', '3095', '3221', '2810', '3043', '3199', '3203', '2917', '3039', '2944', '2921', '3088', '3300', '3304', '1391812378', '3378', '3336', '3487', '3490', '3924', '3988', '3980', '4383', '4213', '4266', '4323', '4327', '4413', '4422', '4270', '1777']


In [23]:
# Restrict the T1 raw rows to the ids that the last audit did not already organize.
new = scales.loc[scales['scales_id'].isin(missing)]

In [24]:
# Remove the columns that are not carried into the output, in a single drop call.
new = new.drop(columns=['date', 'admin_proband', 'admin_proband_group'])

In [25]:
# Stack the previously cleaned rows on top of the newly pulled rows
# (row-wise concat; column order is left unsorted).
t1_all_scales = pd.concat(
    [cleaned_sr, new],
    sort=False,
)

In [26]:
# Join the T1 scales to the T1 enrollment sheet on bblid (pandas' default inner
# join), expose the T1 scan id under the generic name "scan_id" so the data is
# easy to use with BIDS, and tag every row as timepoint 1.
t1_all_scales = (
    t1_all_scales
    .merge(axis_t1, left_on="bblid", right_on="bblid")
    .rename(columns={"scan_id_timepoint_1": "scan_id"})
    .assign(timepoint='1')
)

In [23]:
#and now to add T2 

In [28]:
# Load the T2 enrollment sheet (every column kept as a string) and
# discard the scan date column right away.
axis_t2 = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_enroll_t2.csv',
    dtype=str,
).drop(columns=['scan_2_date'])

In [29]:
# Flatten the T2 enrollment bblid column into a plain list of strings.
t2_enroll = [str(bblid) for bblid in axis_t2['bblid'].tolist()]

In [30]:
# Values of the sr_redcap_id_t2 column as a list of strings; a missing value
# is stringified to the text 'nan' and is filtered out in the next step.
scale_ids2 = [str(raw_id) for raw_id in redcap_ids['sr_redcap_id_t2'].tolist()]

In [31]:
# Drop the placeholder entries: keep only T2 IDs whose text does not contain 'nan'.
scale_ids2t = [sid for sid in scale_ids2 if 'nan' not in sid]
print(scale_ids2t)

['2468', '3231', '2119', '2472', '2219', '2208', '3195', '2184', '2188', '2321', '2605', '2476', '2317', '4388', '2601', '2872', '3017', '2663', '2648', '2770', '3456', '3460', '3253', '3854', '4150', '4055', '3525', '4299', '1391812359', '3515', '4291', '3755', '4203', '4479', '4520']


In [32]:
# Keep only the raw rows whose scales_id is one of the T2 scale ids.
scales2 = sr.loc[sr['scales_id'].isin(scale_ids2t)]

In [33]:
# Remove the same three columns that were dropped from the T1 rows, in one call.
scales2 = scales2.drop(columns=['date', 'admin_proband', 'admin_proband_group'])

In [34]:
# Join the T2 scales to the T2 enrollment sheet on bblid (pandas' default inner
# join), expose the T2 scan id under the generic name "scan_id", and tag every
# row as timepoint 2.
t2_all_scales = (
    scales2
    .merge(axis_t2, left_on="bblid", right_on="bblid")
    .rename(columns={"scan_id_t2": "scan_id"})
    .assign(timepoint='2')
)

In [35]:
# Stack the T1 and T2 tables into one frame (row-wise; column order left unsorted).
all_scales = pd.concat(
    [t1_all_scales, t2_all_scales],
    sort=False,
)

In [36]:
# Write the combined T1 + T2 table to the data-freeze outputs folder as a
# comma-separated file without the index column.
all_scales.to_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/outputs/EF_all_scales.csv',
    sep=',',
    index=False,
)