In [None]:
#This script merges cleaned collateral data from an old data freeze with new data pulled from AXIS.
#Inputs:
    #enrollment sheets pulled from AXIS, stored at afp://saturn/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs
    #demographic data from AXIS, stored on saturn
    #family IDs from Oracle, stored on saturn
    #age at scan, calculated from birthday and date of scan on Flywheel
#Outputs:
    #a CSV of demographic data for participants enrolled prior to April 1st, 2022
    #binary (1/0) audit spreadsheets showing whether demos were collected for T1 and T2 participants enrolled prior to April 1st, 2022

In [2]:
import pandas as pd

In [3]:
#load the T1 enrollment sheet pulled from AXIS; dtype=str keeps every column as text
axis_t1 = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_enroll_t1.csv',
    dtype=str,
)

In [4]:
#collect the T1 enrollee IDs as a plain python list of strings
t1_enroll = [str(bblid) for bblid in axis_t1['bblid'].tolist()]

In [5]:
#load the remaining inputs; every column is read as text

#age at T1 scan, calculated from flywheel date of acq + sub birthday
t1_ages = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/flywheel_ages_t1.csv',
    dtype=str,
)

#family IDs, pulled by Lan from oracle
fam_id = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/oracle_fam_ids.csv',
    dtype=str,
)

#basic demographics, pulled from axis tracker
axis_demos = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_demos.csv',
    dtype=str,
)

#not loaded (reads disabled): inputs/oracle_demos.csv (pulled from study visit demos) and inputs/oracle_demos_LIBI.csv

In [6]:
#attach age at scan; the left join keeps every row of the T1 enrollment sheet
t1_demos = axis_t1.merge(t1_ages, how='left', on='bblid')

In [7]:
#attach family IDs; the family ID table keys on upper-case BBLID
t1_demos = t1_demos.merge(fam_id, how='left', left_on='bblid', right_on='BBLID')

In [8]:
#attach the basic demographics from the axis tracker
t1_demos = t1_demos.merge(axis_demos, how='left', on='bblid')

In [9]:
#tidy up: rename the family ID column, then drop the duplicate join key and the columns not wanted in the output
t1_demos = (
    t1_demos
    .rename(columns={"FAMID": "family_id"})
    .drop(columns=['BBLID', 'PROTOCOL', 'redcap_id', 'scanvisit_t1_status', 'scan_id_timepoint_1', 'scan_1_date'])
)

In [10]:
#collapse fully identical rows, keeping the first occurrence
t1_demos = t1_demos.drop_duplicates(keep='first')

In [11]:
#tag every row as timepoint 1
t1_demos = t1_demos.assign(timepoint=1)

In [12]:
#t1_demos.head(n=5)

In [13]:
#t1_demos.to_csv('/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/outputs/EF_T1_basic_demos.csv', sep = ',', index=False)

In [14]:
#load the T2 enrollment sheet pulled from AXIS; dtype=str keeps every column as text
axis_t2 = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/axis_enroll_t2.csv',
    dtype=str,
)

In [15]:
#collect the T2 enrollee IDs as a plain python list of strings
t2_enroll = [str(bblid) for bblid in axis_t2['bblid'].tolist()]

In [16]:
#age at T2 scan, calculated from flywheel date of acq + sub birthday (all columns read as text)
t2_ages = pd.read_csv(
    '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/inputs/flywheel_ages_t2.csv',
    dtype=str,
)

In [17]:
#attach age at scan; the left join keeps every row of the T2 enrollment sheet
t2_demos = axis_t2.merge(t2_ages, how='left', on='bblid')

In [18]:
#attach family IDs; the family ID table keys on upper-case BBLID
t2_demos = t2_demos.merge(fam_id, how='left', left_on='bblid', right_on='BBLID')

In [19]:
#attach the basic demographics from the axis tracker
t2_demos = t2_demos.merge(axis_demos, how='left', on='bblid')

In [20]:
#tidy up: rename the family ID column, then drop the duplicate join key and the columns not wanted in the output
t2_demos = (
    t2_demos
    .rename(columns={"FAMID": "family_id"})
    .drop(columns=['BBLID', 'PROTOCOL', 'redcap_id', 'scan_id_t2', 'scan_2_date', 'scanvisit_t1_status'])
)

In [21]:
#collapse fully identical rows, keeping the first occurrence
t2_demos = t2_demos.drop_duplicates(keep='first')

In [22]:
#tag every row as timepoint 2
t2_demos = t2_demos.assign(timepoint=2)

In [37]:
#t2_demos.head(n=5)

In [52]:
#t2_demos.to_csv('/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/outputs/EF_T2_basic_demos.csv', sep = ',', index=False)

In [34]:
#stack the T1 rows on top of the T2 rows; sort=False leaves the columns unsorted
all_demos = pd.concat(
    [t1_demos, t2_demos],
    axis=0,
    sort=False,
)

In [35]:
#write the combined table first, then the per-timepoint tables, without the index column
for frame, out_path in (
    (all_demos, '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/outputs/EF_basic_demos.csv'),
    (t1_demos, '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/outputs/EF_T1_basic_demos.csv'),
    (t2_demos, '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/outputs/EF_T2_basic_demos.csv'),
):
    frame.to_csv(out_path, sep=',', index=False)

In [24]:
#start the T1 audit table: one row per enrolled ID, then report how many there are
audit1 = pd.DataFrame(data=t1_enroll, columns=['BBLID'])
print(len(audit1))

173


In [27]:
#list the enrolled T1 IDs that have a row in the merged demographics table
#IDs are compared exactly: a substring test would also match partial IDs
#and raises TypeError when bblid is missing (NaN)
#NOTE(review): t1_demos is built by left-joining onto axis_t1, so every enrolled ID is
#expected to be present here; confirm whether the audit should look in axis_demos instead
t1_enrolled_ids = set(t1_enroll)
has_demos = [bblid for bblid in t1_demos['bblid'] if bblid in t1_enrolled_ids]

In [29]:
#one flag per audit row: 1 if the ID appears in has_demos, else 0
#only the BBLID column is read, so this cell still works when re-run after
#the has_demos column has been added to audit1
demos = audit1['BBLID'].isin(has_demos).astype(int).tolist()

In [30]:
#add the 1/0 flags to the T1 audit table
audit1 = audit1.assign(has_demos=demos)

In [32]:
#preview the first five audit rows
audit1.head(5)

Unnamed: 0,BBLID,has_demos
0,19861,1
1,20124,1
2,20125,1
3,20139,1
4,20141,1


In [33]:
#start the T2 audit table: one row per enrolled ID, then report how many there are
audit2 = pd.DataFrame(data=t2_enroll, columns=['BBLID'])
print(len(audit2))

37


In [36]:
#list the enrolled T2 IDs that have a row in the merged demographics table
#IDs are compared exactly: a substring test would also match partial IDs
#and raises TypeError when bblid is missing (NaN)
#NOTE(review): t2_demos is built by left-joining onto axis_t2, so every enrolled ID is
#expected to be present here; confirm whether the audit should look in axis_demos instead
t2_enrolled_ids = set(t2_enroll)
has_demos2 = [bblid for bblid in t2_demos['bblid'] if bblid in t2_enrolled_ids]

In [37]:
#one flag per T2 audit row: 1 if the ID appears in has_demos2, else 0
#both outcomes are decided against the T2 list (has_demos2); checking the T1 list
#for the 0 case could emit two flags or none for a single ID
#only the BBLID column is read, so this cell still works when re-run after
#the has_demos column has been added to audit2
demos2 = audit2['BBLID'].isin(has_demos2).astype(int).tolist()

In [38]:
#add the 1/0 flags to the T2 audit table
audit2 = audit2.assign(has_demos=demos2)

In [39]:
#preview the first five audit rows
audit2.head(5)

Unnamed: 0,BBLID,has_demos
0,20139,1
1,20149,1
2,20188,1
3,20214,1
4,20238,1


In [41]:
#write the T1 audit, then the T2 audit, without the index column
for audit_frame, audit_path in (
    (audit1, '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/audits/EF_T1_demos_audit.csv'),
    (audit2, '/Volumes/Coordinators/Protocols/TED_PROTOCOLS/EXECUTIVE_829744/2022_data_freeze/audits/EF_T2_demos_audit.csv'),
):
    audit_frame.to_csv(audit_path, sep=',', index=False)