In [None]:
import os

import numpy as np
import pandas as pd

from funcs import funcs
from Crosswalk.Transformer import Transformer
from Crosswalk.DataCache import DataCache
from Crosswalk.NDAWriter import NDAWriter
from Crosswalk.Manager import Manager
from Crosswalk.Loader import Loader, BoxLoader, BoxHcaLoader, SsagaLoader, QintHcaLoader, RedcapLoader


In [None]:
# Create the output folder for prepped structures if it doesn't already exist.
# os.makedirs also creates the intermediate "prepped" directory and, with
# exist_ok=True, does nothing when the folders are already there, so this cell
# can be re-run safely on any platform (no dependency on a shell `mkdir`).
os.makedirs(os.path.join("prepped", "hca"), exist_ok=True)

In [None]:
# Build the Manager `M` that the cells below call (`M.preload_data()`, `M.run(...)`).
#
# Note the `validator` argument of the writer below. It refers to vtcmd, the validator
# written by the NDA. If you haven't already installed it, please do so now
# (https://github.com/NDAR/nda-tools), and then place here the path that shows up when
# you type 'which vtcmd' in your terminal.
# Validation results are sent to and read from whatever default location is specified in
# the vtcmd configuration file, so keep this in mind if you also use vtcmd to validate
# any other datatypes.

M = Manager(
        data =  DataCache(
            # NOTE(review): the numeric second argument is presumably a Box folder/file ID — confirm
            BoxHcaLoader('PennCNP',592325063896),
            RedcapLoader('hcpa'),
            SsagaLoader(),
            QintHcaLoader()
        ),
        writer = NDAWriter(completed_dir="./prepped/hca/", validator="vtcmd"),
        # Alternative writer configuration without the `validator` argument:
        #writer = NDAWriter(completed_dir="./prepped/hca/"),
        transformer = Transformer(funcs = funcs, map_dir='./maps/hca/')
)

In [None]:
# This step requires a 'rosetta stone' file that has all the required NDA fields for
# all subjects you intend to submit at this time. This approach facilitates keeping track
# of subject counts across data types. For example, if your required fields are already
# stored in XNAT because you had the CCF upload your imaging data for you, you can export
# this csv from XNAT and rename it as appropriate.
# Place this file at the main level of this repository, and name it in your config file.
# NOTE(review): the original note also points to the `_post_load_hook_` method of Loader.py
# as the place where this file is referenced — confirm where the file name is configured.

M.preload_data()

In [None]:
# Ad hoc functions to clean up empty rows for particular instruments after their files are generated (an issue for REDCap data)
def redcleanup(structure="lbadl01",filePath="./prepped/hca/",extraomitcol1='NO',extraomitcol2='NO',extraomitcol3='NO',extraomitcol4='NO'):
    """Drop rows whose instrument fields are all empty from a prepped structure CSV.

    The file ``<filePath>/<structure>.csv`` is read with its second line as the
    column header. Rows in which every non-key column is null are removed, and
    the file is rewritten in place: a first line of ``<root>,<version>``
    followed by the column header and the remaining rows.

    Parameters
    ----------
    structure : str
        Structure name; its last two characters are written as the integer
        version on the first line (e.g. "lbadl01" -> "lbadl,1").
    filePath : str
        Directory holding the CSV. A trailing separator is optional.
    extraomitcol1, extraomitcol2, extraomitcol3, extraomitcol4 : str
        Extra columns to ignore, in addition to the five key columns, when
        deciding whether a row is empty. 'NO' (the default) or an empty value
        means "not used".

    Raises
    ------
    ValueError
        If a key column or a requested extra column is missing from the file.
    """
    print(structure)
    strucroot = structure[:-2]
    strucnum = structure[-2:]
    # os.path.join keeps the path valid whether or not filePath ends with a separator.
    csv_path = os.path.join(filePath, structure + ".csv")

    df = pd.read_csv(csv_path, header=1)
    print("NumRows Before: "+str(df.shape[0]))

    # Columns that never count as "instrument data" when judging emptiness.
    key_cols = ['subjectkey', 'src_subject_id', 'interview_date', 'interview_age', 'sex']
    extra_cols = [
        col
        for col in (extraomitcol1, extraomitcol2, extraomitcol3, extraomitcol4)
        if col and col != 'NO'
    ]
    omitted = key_cols + extra_cols

    # Fail loudly (same exception type as before) and say which columns are absent.
    absent = [col for col in omitted if col not in df.columns]
    if absent:
        raise ValueError("Column(s) not found in " + csv_path + ": " + ", ".join(absent))

    subfields = [col for col in df.columns if col not in omitted]
    df = df.dropna(how='all', subset=subfields)
    print("NumRows After: "+str(df.shape[0]))

    with open(csv_path, 'w') as f:
        # First line is "<structure root>,<version as integer>".
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f, index=False)
 

    


# These fields are already set to 99s in the map, so the null-based row finder above (redcleanup) won't work for them
def asrover60(structure="asr01",filePath="./prepped/hca/"):
    """Filter rows and drop the ``*_text`` / ``*_des`` columns of a prepped ASR CSV.

    Reads ``<filePath>/<structure>.csv`` (second line is the column header),
    removes every row in which both ``asr2_2`` and ``asr3_2`` equal -99, drops
    the fixed list of columns below, and rewrites the file in place with a
    first line of ``<root>,<version>``.

    Parameters
    ----------
    structure : str
        Structure name; its last two characters are written as the integer
        version on the first line.
    filePath : str
        Directory holding the CSV. A trailing separator is optional.

    Raises
    ------
    KeyError
        If any of the columns to drop is not present in the file (pandas
        default for ``DataFrame.drop``).
    """
    print(structure)
    strucroot = structure[:-2]
    strucnum = structure[-2:]
    # os.path.join keeps the path valid whether or not filePath ends with a separator.
    csv_path = os.path.join(filePath, structure + ".csv")

    df = pd.read_csv(csv_path, header=1)
    print("NumRows Before: "+str(df.shape[0]))

    # Keep a row unless BOTH marker fields carry the -99 code.
    both_coded = (df.asr2_2 == -99) & (df.asr3_2 == -99)
    df = df.loc[~both_coded]

    dropped_cols = [
        'asr2_3_text',
        'oasr_ppl9_des',
        'asr5_5_text',
        'asr7_4_text',
        'asr8_4_text',
        'asr10_6_text',
        'asr13_3_text',
        'asr14_1_text',
        'asr15_2_text',
        'asr16_3_text',
        'asr16_4_text',
        'asr17_5_text',
        'asr19_1_text',
        'cbcl56h_des',
    ]
    df = df.drop(columns=dropped_cols)
    print("NumRows After: "+str(df.shape[0]))

    with open(csv_path, 'w') as f:
        # First line is "<structure root>,<version as integer>".
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f, index=False)
        
def satisfy(structure='scan_debrief01',filePath="./prepped/hca/"):
    """Drop five satisfaction columns from a prepped structure CSV.

    Reads ``<filePath>/<structure>.csv`` (second line is the column header),
    removes the columns ``satisfaction1more``, ``satisfaction2more``,
    ``satisfaction4more``, ``satisfaction5`` and ``satisfaction6``, and
    rewrites the file in place with a first line of ``<root>,<version>``.
    No rows are removed, so the two printed row counts are always equal.

    Parameters
    ----------
    structure : str
        Structure name; its last two characters are written as the integer
        version on the first line.
    filePath : str
        Directory holding the CSV. A trailing separator is optional.

    Raises
    ------
    KeyError
        If any of the five columns is not present in the file (pandas default
        for ``DataFrame.drop``).
    """
    print(structure)
    strucroot = structure[:-2]
    strucnum = structure[-2:]
    # os.path.join keeps the path valid whether or not filePath ends with a separator.
    csv_path = os.path.join(filePath, structure + ".csv")

    df = pd.read_csv(csv_path, header=1)
    print("NumRows Before: "+str(df.shape[0]))
    df = df.drop(columns=[
        'satisfaction1more',
        'satisfaction2more',
        'satisfaction4more',
        'satisfaction5',
        'satisfaction6',
    ])
    print("NumRows After: "+str(df.shape[0]))

    with open(csv_path, 'w') as f:
        # First line is "<structure root>,<version as integer>".
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f, index=False)

        
def cleanlist(structurelist=['lbadl01','mchq01'], filePath="./prepped/hca/"):
    """Run ``redcleanup`` with no extra omitted columns on each named structure.

    Parameters
    ----------
    structurelist : iterable of str
        Structure names to clean, processed in order. The default list is only
        read, never modified.
    filePath : str
        Directory holding the CSVs, forwarded to ``redcleanup``. Defaults to the
        directory this function previously hard-coded.
    """
    for structure in structurelist:
        # redcleanup prints the structure name itself, so it is not printed again here.
        redcleanup(structure=structure, filePath=filePath)

In [None]:
def cleanzeros(structure='vitals01',filePath="./prepped/hca/"):
    """Blank out placeholder values in the ``vtl007`` and ``bp_stand`` columns.

    Reads ``<filePath>/<structure>.csv`` (second line is the column header),
    replaces ``vtl007`` values equal to 0 and ``bp_stand`` values equal to
    '11/80' or '9999' with missing values, and rewrites the file in place with
    a first line of ``<root>,<version>``.

    Parameters
    ----------
    structure : str
        Structure name; its last two characters are written as the integer
        version on the first line.
    filePath : str
        Directory holding the CSV. A trailing separator is optional.
    """
    strucroot = structure[:-2]
    strucnum = structure[-2:]
    # os.path.join keeps the path valid whether or not filePath ends with a separator.
    csv_path = os.path.join(filePath, structure + ".csv")

    df = pd.read_csv(csv_path, header=1)

    # Series.mask returns a new, correctly-typed column with NaN where the
    # condition holds. This avoids both the `np.NaN` alias (removed in NumPy 2.0)
    # and assigning NaN into an integer column through .loc, which relies on
    # silent upcasting that pandas has deprecated.
    df['vtl007'] = df['vtl007'].mask(df['vtl007'] == 0, np.nan)
    df['bp_stand'] = df['bp_stand'].mask(df['bp_stand'].isin(['11/80', '9999']), np.nan)

    with open(csv_path, 'w') as f:
        # First line is "<structure root>,<version as integer>".
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f, index=False)


In [None]:
def integercleanup(structure='asr01',filePath="./prepped/hca/",varlist=['a']):
    """Rewrite the listed columns of a prepped CSV as integers with blank missing values.

    Reads ``<filePath>/<structure>.csv`` (second line is the column header).
    For each column in ``varlist`` the non-missing values are cast with
    ``astype(int)`` and stored as strings (so 3.0 is written as "3"), while
    missing values are written as empty fields. The file is rewritten in place
    with a first line of ``<root>,<version>``.

    Parameters
    ----------
    structure : str
        Structure name; its last two characters are written as the integer
        version on the first line.
    filePath : str
        Directory holding the CSV. A trailing separator is optional.
    varlist : list of str
        Columns to convert. The list is only read, never modified.

    Raises
    ------
    KeyError
        If a column in ``varlist`` is not present in the file.
    """
    strucroot = structure[:-2]
    strucnum = structure[-2:]
    # os.path.join keeps the path valid whether or not filePath ends with a separator.
    csv_path = os.path.join(filePath, structure + ".csv")

    df = pd.read_csv(csv_path, header=1)
    for v in varlist:
        column = df[v]
        # Remember which cells are missing instead of encoding them as a -9999
        # sentinel: the sentinel plus substring replacement blanked genuine
        # -9999 values and mangled any value containing "-9999".
        missing = column.isna()
        as_text = column.fillna(0).astype(int).astype(str)
        df[v] = as_text.mask(missing, '')

    with open(csv_path, 'w') as f:
        # First line is "<structure root>,<version as integer>".
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f, index=False)



In [None]:
# Run the Manager for psqi01, then use cleanlist (which calls redcleanup) to drop rows
# whose non-key columns are all empty from ./prepped/hca/psqi01.csv.
# NOTE(review): assumes M.run writes ./prepped/hca/psqi01.csv before cleanlist reads it — confirm.
M.run('psqi01')
cleanlist(structurelist=['psqi01'])

In [None]:
# Run the Manager for trail_ca01, then drop rows that are empty in every column other than
# the key columns, additionally ignoring 'seizures' and 'versionchildadult' when judging emptiness.
M.run('trail_ca01')
redcleanup(structure="trail_ca01",filePath="./prepped/hca/",extraomitcol1='seizures',extraomitcol2='versionchildadult')


In [None]:
# Run the Manager for vitals01, then call cleanzeros() with its defaults
# (structure='vitals01', filePath="./prepped/hca/") to blank out the placeholder values
# it targets in the vtl007 and bp_stand columns.
M.run('vitals01')
cleanzeros()

In [None]:
# Run the Manager for ssaga_cover_demo01; no cleanup helper is applied to it in this cell.
M.run('ssaga_cover_demo01')

In [None]:
# Structures processed as one batch. 'psqi01' is commented out of the list; it is run
# in its own cell instead.
structs = [
    'lbadl01',
    'mchq01',
    'er4001',
    'deldisk01',
    'asr01',
    'batbil01',
    'bsc01',
    'drugscr01',
    'gales01',
    'ipaq01',
    'leap01',
    'medh01',
    'mendt01',
    'moca01',
    'nffi01',
    #'psqi01',
    'ravlt01',
    'scan_debrief01',
  ]

# Run the Manager for every structure in the batch, printing each name once it has finished.
for s in structs:
    M.run(s)
    print(s)   
# First pass: drop rows whose non-key columns are all empty, for every structure in the batch.
cleanlist(structurelist=structs)
# asr01 gets its own filter (rows where asr2_2 and asr3_2 are both -99) and column drops.
asrover60(structure="asr01",filePath="./prepped/hca/")
# Second pass for structures with extra columns that must be ignored when judging whether a
# row is empty. Each call re-reads and rewrites the file produced by the first pass, so the
# order of these calls matters.
redcleanup(structure="deldisk01",filePath="./prepped/hca/",extraomitcol1='version_form',extraomitcol2='comqother')
redcleanup(structure="medh01",filePath="./prepped/hca/",extraomitcol1='comqother')
redcleanup(structure="bsc01",filePath="./prepped/hca/",extraomitcol1='comqother')
redcleanup(structure="ravlt01",filePath="./prepped/hca/",extraomitcol1='ravlt_delt',extraomitcol2='ravlt_disct',extraomitcol3='ravlt_tott')
redcleanup(structure="scan_debrief01",filePath="./prepped/hca/",extraomitcol1='comqother')
# Finally drop the five satisfaction columns that satisfy() removes from scan_debrief01.
satisfy(structure='scan_debrief01',filePath="./prepped/hca/")

In [None]:
# The rest of the SSAGA structures were still not mapped as of 11/11/2020.
# Every entry below is commented out, so structs2 is an empty list and the loop that
# follows currently does nothing; uncomment entries as their maps become available.
structs2 = [
#     'diagpsx01',
#     'eatdisdemo01',
#     'phenx_sib01',
#     'scidv_pscyh01',
#     'socdem01'
  ]

for s in structs2:
    M.run(s)

In [None]:
# Fix non-integer types: for each structure, pass the listed columns to integercleanup(),
# which rewrites ./prepped/hca/<structure>.csv in place with those columns cast to integers
# and missing values left blank.
# NOTE: the short names below (a, b, ..., s) are module-level variables; `s` is also used as
# the loop variable in the batch cells above, so it is rebound here.

# asr01
a=['asr1_1','asr1_3','asr1_5','asr4_3','asr4_4','asr4_5','asr4_6','asr5_1','asr5_2','asr5_4','asr6_6','asr7_1','asr7_3','asr7_6','asr8_1','asr8_4','asr8_5','asr11_6','asr12_2','asr12_4','asr12_6','asr13_3','asr13_4','asr14_2','asr14_3','asr14_6','asr15_2','asr17_2','asr19_2','asr19_6','asr20_3','asr21_3','asr22_1','asr22_2','asr22_3','asr22_5','asr3_1','asr6_4','oasr_ppl35','oasr_ppl3','oasr_ppl39','asr8_6','oasr_ppl16','oasr_ppl19','oasr_ppl20','oasr_ppl47','oasr_ppl49_10','oasr_ppl49_11','oasr_ppl54','oasr_ppl79','oasr_ppl101','oasr_ppl102','oasr_ppl109','oasr_ppl110','oasr_ppl114','oasr_ppl121','oasr_ppl33']
integercleanup(structure='asr01',filePath="./prepped/hca/",varlist=a)
# batbil01
b=['batlq4','batlq8','batlq_score1','batlq_score10','batlq_score11','batlq_score13','batlq_score14','batlq_score16','batlq_score17','batlq_score18','batlq_score19','batlq_score2','batlq_score20','batlq_score23','batlq_score24','batlq_score25','batlq_score26','batlq_score27','batlq_score28','batlq_score29','batlq_score3','batlq_score30','batlq_score31','batlq_score33','batlq_score39','batlq_score4','batlq_score5','batlq_score6','batlq_score7','batlq_score8','batlq_score9','batlq1_lc','batlq1_dz','batlq16_lc','batlq16_dz','batlq23_lc','batlq23_dz','batlq14_lc','batlq14_dz','batlq18_lc','batlq18_dz','batlq20_lc','batlq20_dz','batlq39_lc','batlq39_dz','batlq9_lc','batlq9_dz','batlq24_lc','batlq24_dz','batlq10_lc','batlq10_dz','batlq5_lc','batlq5_dz','batlq27_lc','batlq27_dz','batlq11_lc','batlq11_dz','batlq7_lc','batlq7_dz','batlq19_lc','batlq19_dz','batlq25_lc','batlq25_dz','batlq4_lc','batlq4_dz','batlq6_lc','batlq6_dz','batlq17_lc','batlq17_dz','batlq35_lc','batlq35_dz','batlq38_lc','batlq38_dz','batlq13_lc','batlq13_dz','batlq33_lc','batlq33_dz','batlq8_lc','batlq8_dz','batlq2_lc','batlq2_dz','batlq26_lc','batlq26_dz','batlq36_lc','batlq36_dz','batlq34_lc','batlq34_dz','batlq12_lc','batlq12_dz','batlq31_lc','batlq31_dz','batlq21_lc','batlq21_dz','batlq29_lc','batlq29_dz','batlq3_lc','batlq3_dz','batlq28_lc','batlq28_dz','batlq30_lc','batlq30_dz']
integercleanup(structure='batbil01',filePath="./prepped/hca/",varlist=b)
# bsc01
b2=['bld_core','bld_core_grn','biospc_8','fasting','bld_core_snack','bld_core_spin','bld_rucdr','biospc_6','vitdlev','ls_alt','ls_ast','chloride','glucose','sodium','ed1_saliva','bld_core_d2ph','bld_core_d2pm','bld_core_p2fh','bld_core_p2fm','laba6']
integercleanup(structure='bsc01',filePath="./prepped/hca/",varlist=b2)
# deldisk01
d=['ddisc_valid']
integercleanup(structure='deldisk01',filePath="./prepped/hca/",varlist=d)
# drugscr01
dd=['caffeine_s1yn','nicotine_s1yn','drug1_1','drug1_2','drug1_3','drug1_4','drug1_5','drug1_6','caffeine_s1preday','nicotine_s1preday']
integercleanup(structure='drugscr01',filePath="./prepped/hca/",varlist=dd)
# gales01
g=['gales1','gales2','gales3','gales4','gales5','gales6','gales6b','gales6c','gales7','gales8','gales9','gales10','gales11','gales12','gales13','gales14','gales15','gales16','gales17','gales18','gales19','gales20','gales21','gales21b','gales21c','gales22','gales23','gales24','gales25','gales26','gales_worst','gales1b','gales1c','gales3b','gales3c','gales10b','gales10c','gales13b','gales13c','gales16b','gales16c','gales18b','gales18c','gales19b','gales19c','gales20b','gales20c','gales2b','gales2c','gales7b','gales7c','gales9b','gales9c','gales25b','gales25c','gales22b','gales22c','gales23b','gales23c','gales24b','gales24c','gales26b','gales26c','gales8b','gales8c','gales11b','gales11c','gales12b','gales12c','gales4b','gales4c','gales17b','gales17c','gales5b','gales5c','gales14b','gales14c','gales15b','gales15c']
integercleanup(structure='gales01',filePath="./prepped/hca/",varlist=g)
# ipaq01
i=['ipaq2','ipaq5a','ipaq5b','ipaq7a','ipaq7b','ipaq26a','ipaq26n','ipaq3a','ipaq3b']
integercleanup(structure='ipaq01',filePath="./prepped/hca/",varlist=i)
# lbadl01
l=['adlnow8','adlnow12','adlnow4','adlnow1','iadl05','iadl6','adlnow14','adlnow2','lbadl_currtot']
integercleanup(structure='lbadl01',filePath="./prepped/hca/",varlist=l)
# leap01
la=['lan_know','lan611','lan612','lan613','lan621','lan622','lan623','lan71','lan72','lan81','lan82','lan631','lan632','lan633','lan73','lan83','lan641','lan642','lan643','lan74','lan84']
integercleanup(structure='leap01',filePath="./prepped/hca/",varlist=la)
# mchq01
m=['scq_24_a','mchq_17','mchq_21','straw1','straw1a','straw1b','straw2','straw3','straw4a','straw4b','straw3a','straw2a','straw2b','straw3b']
integercleanup(structure='mchq01',filePath="./prepped/hca/",varlist=m)
# medh01
m2=['dental1','dental2','dental3','dental4','dental5','dental6']
integercleanup(structure='medh01',filePath="./prepped/hca/",varlist=m2)
# mendt01
m3=['othmed0a','num_meds']
integercleanup(structure='mendt01',filePath="./prepped/hca/",varlist=m3)
# moca01
m4=['moca_edu']
integercleanup(structure='moca01',filePath="./prepped/hca/",varlist=m4)
# nffi01
n=['nffi_2','nffi_5','nffi_6','nffi_7','nffi_8','nffi_10','nffi_14','nffi_19','nffi_20','nffi_22','nffi_23','nffi_28','nffi_29','nffi_32','nffi_33','nffi_34','nffi_35','nffi_36','nffi_37','nffi_38','nffi_39','nffi_40','nffi_41','nffi_42','nffi_43','nffi_44','nffi_45','nffi_46','nffi_47','nffi_48','nffi_49','nffi_50','nffi_51','nffi_52','nffi_53','nffi_54','nffi_55','nffi_56','nffi_57','nffi_58','nffi_59','nffi_60']
integercleanup(structure='nffi01',filePath="./prepped/hca/",varlist=n)
# psqi01
p=['parent_sleep20','csq_psqi_4_score','insomnia8','psqip6b_5']
integercleanup(structure='psqi01',filePath="./prepped/hca/",varlist=p)
# ravlt01
r=['pea_ravlt_sd_tc','pea_ravlt_sd_trial_i_tc','pea_ravlt_sd_trial_ii_tc','pea_ravlt_sd_trial_iii_tc','pea_ravlt_sd_trial_iv_tc','pea_ravlt_sd_trial_v_tc','pea_ravlt_sd_listb_tc','pea_ravlt_sd_trial_vi_tc','delay_recall_intrusion','delay_total_intrusion','delay_total_repetitions']
integercleanup(structure='ravlt01',filePath="./prepped/hca/",varlist=r)
# scan_debrief01
s=['satisfaction1','satisfaction2','satisfaction3','satisfaction4','satisfaction7','satisfaction8','satisfaction9','satisfaction10','satisfaction11']
integercleanup(structure='scan_debrief01',filePath="./prepped/hca/",varlist=s)

