In [None]:
from funcs import funcs
from Crosswalk.Transformer import Transformer
from Crosswalk.DataCache import DataCache
from Crosswalk.NDAWriter import NDAWriter
from Crosswalk.Manager import Manager

from Crosswalk.Loader import BoxLoader, BoxHcdLoader, QintHcdLoader, RedcapLoader, ParentLoader
import pandas as pd
import numpy as np
from ccf.easy_yaml import EasyYaml
from ccf.redcap import RedcapTable

In [None]:
#create folders for prepped structures, if they don't exist
# Pure Python instead of `!!mkdir`: exist_ok=True makes re-running this cell a silent no-op,
# the nested path is created in one call, and no POSIX shell is needed.
import os
os.makedirs(os.path.join("prepped", "hcd"), exist_ok=True)

In [None]:
# Build the Manager (M) used by every later cell, from three keyword components:
#   data        - a DataCache wrapping the five loaders listed below
#   writer      - an NDAWriter whose completed_dir is ./prepped/hcd/, the same folder the
#                 clean-up functions further down read the generated csvs from
#   transformer - a Transformer given the imported `funcs` and the ./maps/hcd/ map directory
# NOTE(review): `validator` is an absolute path inside one user's home directory; it has to be
# edited to run this notebook on another machine.
# NOTE(review): 592325063896 is presumably the Box id of the PennCNP file - confirm against BoxHcdLoader.
M = Manager(
        data =  DataCache(
            ParentLoader(),
            RedcapLoader('child'),
            RedcapLoader('teen'),
            BoxHcdLoader('PennCNP', 592325063896),
            QintHcdLoader()
        ),
        writer = NDAWriter(completed_dir="./prepped/hcd/", validator="/home/petra/.local/bin/vtcmd"),
        transformer = Transformer(funcs = funcs, map_dir='./maps/hcd/')
)

In [None]:
# This step requires a 'rosetta stone' file containing all of the required NDA fields for
# every subject you intend to submit at this time.  Keeping those fields in a single file makes it easier
# to keep subject counts consistent across data types.  For example, if your required fields are already
# stored in XNAT because the CCF uploaded your imaging data for you, you can export this csv from XNAT
# and rename it as appropriate.
# Place the file at the top level of this repository and name it under the 'rosetta' attribute of your config file.
# The _post_load_hook_ method in Loader.py is currently hardcoded to read this csv and rename its
# columns to the NDA-required ['subject', 'subjectkey', 'gender', 'interview_date', 'interview_age'],
# as follows:
        #rosetta = pd.read_csv('UnrelatedHCAHCD_w_STG_Image_and_pseudo_GUID05_27_2020.csv')
        #rosetta = rosetta[['subjectped', 'nda_guid', 'nda_gender', 'nda_interview_date', 'nda_interview_age']]
        #rosetta.columns = ['subject', 'subjectkey', 'gender', 'interview_date', 'interview_age']
# Future versions of this code may move this file name into config.py, or somewhere better, if demand warrants.
# For now, either tweak that method to read your own rosetta file (making sure the result has the required
# columns), or fill out the template file and save it as 'UnrelatedHCAHCD_w_STG_Image_and_pseudo_GUID05_27_2020.csv'
# (or whatever name you have set under the 'rosetta' attribute in the config file).

M.preload_data()

In [None]:
#Ad hoc functions to clean up empty rows for particular instruments after they are generated (issue for redcap data)
def redcleanup(structure="lbadl01",filePath="./prepped/hcd/",extraomitcol1='NO',extraomitcol2='NO',extraomitcol3='NO',extraomitcol4='NO'):
    """Drop rows that carry no instrument data from a prepped structure csv, rewriting it in place.

    The file ``filePath + structure + ".csv"`` is read using its second line as the column
    header.  A row is removed when every column is missing, not counting subjectkey,
    src_subject_id, interview_date, interview_age, sex, comqother and any extraomitcol*
    columns.  The file is then rewritten with a regenerated first line:
    ``structure`` minus its last two characters, a comma, and the last two characters
    converted to an integer (e.g. "lbadl01" -> "lbadl,1").

    Parameters
    ----------
    structure : str
        File stem of the csv; its last two characters must be digits.
    filePath : str
        Prefix concatenated directly in front of the file name (include the trailing slash).
    extraomitcol1 .. extraomitcol4 : str
        Additional column names to leave out of the emptiness check.  The default 'NO'
        (or any falsy value) means "not used".

    Raises
    ------
    ValueError
        If one of the six identifier columns, or a requested extra column, is not in the file.
    """
    print(structure)
    strucroot=structure[:-2]
    strucnum=structure[-2:]

    # dtype=str keeps every cell exactly as it appears in the file.  With pandas' default type
    # inference an integer column containing blanks is read as float, so this clean-up would
    # silently rewrite values such as 1 as 1.0.  Blank cells are still read as NaN.
    df=pd.read_csv(filePath+structure+".csv",header=1,dtype=str)

    print("NumRows Before: "+str(df.shape[0]))
    subfields=df.columns.to_list()
    # identifier / bookkeeping columns are always filled, so they must not count as data
    for idcol in ('subjectkey','src_subject_id','interview_date','interview_age','sex','comqother'):
        subfields.remove(idcol)
    for extracol in (extraomitcol1,extraomitcol2,extraomitcol3,extraomitcol4):
        if extracol and extracol !='NO':
            subfields.remove(extracol)
    df=df.dropna(how='all',subset=subfields)
    print("NumRows After: "+str(df.shape[0]))

    with open(filePath+structure+".csv",'w') as f:
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f,index=False)
   
#these values are already set to -99 in the map, so the empty-row finder above won't catch them
def asr(structure="asr01",filePath="./prepped/hcd/"):
    """Remove unanswered rows and free-text columns from a prepped asr csv, rewriting it in place.

    Reads ``filePath + structure + ".csv"`` using its second line as the column header,
    drops every row in which asr2_2 and asr3_2 are both -99, drops the fifteen ``*_text``
    columns listed below, and rewrites the file with a regenerated first line
    (``structure`` minus its last two characters, a comma, the last two characters as an integer).

    Raises KeyError (from ``DataFrame.drop``) if any listed text column is not in the file.
    """
    print(structure)
    strucroot=structure[:-2]
    strucnum=structure[-2:]
    df=pd.read_csv(filePath+structure+".csv",header=1)
    print("NumRows Before: "+str(df.shape[0]))
    print("NumColumns Before: "+str(df.shape[1]))
    #df=df.loc[df.interview_age>719].copy()
    df=df.loc[~((df.asr2_2==-99)&(df.asr3_2==-99))]
    df=df.drop(columns=['asr1_6_text',
        'asr2_3_text',
        'asr5_5_text',
        'asr7_4_text',
        'asr8_4_text',
        'asr10_6_text',
        'asr12_1_text',
        'asr13_3_text',
        'asr14_1_text',
        'asr15_2_text',
        'asr15_4_text',
        'asr16_3_text',
        'asr16_4_text',
        'asr17_5_text',
        'asr19_1_text',
        ])
    print("NumRows After: "+str(df.shape[0]))
    # was misspelled "NumColunms", which made the log hard to grep alongside "NumColumns Before"
    print("NumColumns After: "+str(df.shape[1]))
    with open(filePath+structure+".csv",'w') as f:
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f,index=False)

def dropcols(structure="bsc01",filePath="./prepped/hcd/",dropcols=[]):
    """Remove the named columns from a prepped structure csv and rewrite it in place.

    The csv at ``filePath + structure + ".csv"`` is read with its second line as the
    column header.  The columns in ``dropcols`` are dropped and the file is written back
    under a regenerated first line: ``structure`` without its last two characters,
    a comma, and those two characters as an integer.
    """
    print(structure)
    csv_path = filePath + structure + ".csv"
    root, version = structure[:-2], structure[-2:]

    table = pd.read_csv(csv_path, header=1)
    print("NumColumns Before: " + str(len(table.columns)))
    trimmed = table.drop(columns=dropcols)
    print("NumColumns After: " + str(len(trimmed.columns)))

    with open(csv_path, 'w') as out:
        out.write(",".join([root, str(int(version))]) + "\n")
        trimmed.to_csv(out, index=False)

        
#bissc_total is already set to 999 in the map for these rows, so the empty-row finder above won't catch them
def bisbasparent999(structure="bisbas01",filePath="./prepped/hcd/"):
    """Drop rows whose bissc_total equals 999 from a prepped csv and rewrite it in place.

    The file ``filePath + structure + ".csv"`` is read with its second line as the
    column header and written back under a regenerated first line (``structure`` minus
    its last two characters, a comma, the last two characters as an integer).
    """
    print(structure)
    target = filePath + structure + ".csv"
    frame = pd.read_csv(target, header=1)
    print("NumRows Before: " + str(len(frame.index)))

    # keep everything that is not the 999 placeholder
    keep = frame.bissc_total != 999
    frame = frame.loc[keep].copy()
    print("NumRows After: " + str(len(frame.index)))

    with open(target, 'w') as out:
        out.write(structure[:-2] + "," + str(int(structure[-2:])) + "\n")
        frame.to_csv(out, index=False)

#the neo scores are already set to 999 in the map for these rows, so the empty-row finder above won't catch them
def neo999(structure="neo_ffi_form_s_adult_200301",filePath="./prepped/hcd/"):
    """Drop rows where neo_n, neo_e and neo_a are all 999, rewriting the prepped csv in place.

    The file ``filePath + structure + ".csv"`` is read with its second line as the
    column header and written back under a regenerated first line (``structure`` minus
    its last two characters, a comma, the last two characters as an integer).
    """
    print(structure)
    csvfile = filePath + structure + ".csv"
    scores = pd.read_csv(csvfile, header=1)
    print("NumRows Before: " + str(scores.shape[0]))

    # a row survives as soon as any one of the three scores is not the 999 placeholder
    has_score = (scores.neo_n != 999) | (scores.neo_e != 999) | (scores.neo_a != 999)
    scores = scores.loc[has_score].copy()
    print("NumRows After: " + str(scores.shape[0]))

    with open(csvfile, 'w') as out:
        out.write(structure[:-2] + "," + str(int(structure[-2:])) + "\n")
        scores.to_csv(out, index=False)

def cbcl999(structure="cbcl01",filePath="./prepped/hcd/"):
    """Drop rows where cbcl1, cbcl2, cbcl3 and cbcl4 are all 999, rewriting the prepped csv in place.

    The file ``filePath + structure + ".csv"`` is read with its second line as the
    column header and written back under a regenerated first line (``structure`` minus
    its last two characters, a comma, the last two characters as an integer).
    """
    print(structure)
    location = filePath + structure + ".csv"
    short_name = structure[:-2]
    version_digits = structure[-2:]

    data = pd.read_csv(location, header=1)
    print("NumRows Before: " + str(data.shape[0]))

    # flag rows in which the first four items all carry the 999 placeholder
    all_placeholder = (data.cbcl1 == 999) & (data.cbcl2 == 999) & (data.cbcl3 == 999) & (data.cbcl4 == 999)
    data = data.loc[~all_placeholder].copy()
    print("NumRows After: " + str(data.shape[0]))

    with open(location, 'w') as handle:
        handle.write(short_name + "," + str(int(version_digits)) + "\n")
        data.to_csv(handle, index=False)

def cbcl1_5_999(structure="cbcl1_501",filePath="./prepped/hcd/"):
    """Drop rows where cbcl1, cbcl56a, cbcl_nt and cbcl_eye are all 999, rewriting the prepped csv in place.

    The file ``filePath + structure + ".csv"`` is read with its second line as the
    column header and written back under a regenerated first line (``structure`` minus
    its last two characters, a comma, the last two characters as an integer).
    """
    print(structure)
    source = filePath + structure + ".csv"
    answers = pd.read_csv(source, header=1)
    print("NumRows Before: " + str(answers.shape[0]))

    # De Morgan form of "not (all four are 999)": keep a row if any of the four differs from 999
    answered = (
        (answers.cbcl1 != 999)
        | (answers.cbcl56a != 999)
        | (answers.cbcl_nt != 999)
        | (answers.cbcl_eye != 999)
    )
    answers = answers.loc[answered].copy()
    print("NumRows After: " + str(answers.shape[0]))

    with open(source, 'w') as sink:
        sink.write(structure[:-2] + "," + str(int(structure[-2:])) + "\n")
        answers.to_csv(sink, index=False)


        
#ale_total_number_nm is already filled in by the map for these rows, so the empty-row finder above won't catch them
def phenx25(structure="phenx_su01",filePath="./prepped/hcd/"):
    """Drop rows whose ale_total_number_nm equals 25 from a prepped csv and rewrite it in place.

    The file ``filePath + structure + ".csv"`` is read with its second line as the
    column header and written back under a regenerated first line (``structure`` minus
    its last two characters, a comma, the last two characters as an integer).
    """
    print(structure)
    path = filePath + structure + ".csv"
    prefix = structure[:-2]
    suffix = structure[-2:]

    records = pd.read_csv(path, header=1)
    print("NumRows Before: " + str(records.shape[0]))
    records = records.loc[records.ale_total_number_nm != 25].copy()
    print("NumRows After: " + str(records.shape[0]))

    with open(path, 'w') as dest:
        dest.write(prefix + "," + str(int(suffix)) + "\n")
        records.to_csv(dest, index=False)

def cleanlist(structurelist=['lbadl01','mchq01']):
    """Run redcleanup, with no extra omit columns, on each structure name in structurelist.

    Every file is looked up under "./prepped/hcd/".
    """
    for structure_name in structurelist:
        redcleanup(structure=structure_name, filePath="./prepped/hcd/")
        
def cleanzeros(structure='vitals01',filePath="./prepped/hcd/"):
    """Blank out zero values of vtl007 in a prepped structure csv, rewriting it in place.

    The file ``filePath + structure + ".csv"`` is read with its second line as the
    column header.  Every vtl007 value equal to 0 is replaced by a missing value, and the
    file is written back under a regenerated first line (``structure`` minus its last
    two characters, a comma, the last two characters as an integer).
    """
    #print(structure)
    strucroot=structure[:-2]
    strucnum=structure[-2:]
    df=pd.read_csv(filePath+structure+".csv",header=1)
    # Series.mask replaces the matching cells with NaN and returns a new, suitably typed column.
    # The previous `df.loc[..., 'vtl007']=np.NaN` relied on the `np.NaN` alias, which NumPy 2.0
    # removed, and on assigning NaN into an integer column in place, which recent pandas deprecates.
    df['vtl007']=df.vtl007.mask(df.vtl007==0)
    #df.loc[df.bp_stand=='11/80','bp_stand']=np.nan
    #df.loc[df.bp_stand=='9999','bp_stand']=np.nan
    with open(filePath+structure+".csv",'w') as f:
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f,index=False)
        

In [None]:
# vitals01 is run on its own (it is commented out of the batch list further down), then
# cleanzeros() - with its defaults - blanks out zero vtl007 values in ./prepped/hcd/vitals01.csv
M.run('vitals01')
cleanzeros()

In [None]:
#test one structure to see if it's working
# socdem01 is also the structure that the following cells read back and patch by hand
M.run('socdem01')


In [None]:
#pull in race and ethnicity from the code that also derived ndar_subjects, as well as the HCD id of parents
#for the comqother variable
#special clean up of the annual_fam_inc values for this structure
singlepath="~/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/prepped_singleton_structures/"
racethnic='HCPD_racethnic_for_socdem01_07_01_2020.csv'
re=pd.read_csv(singlepath+racethnic)

#the generated socdem01 file: first line is the structure line, second line the column header
filePath="./prepped/hcd/"
structure='socdem01'
strucroot='socdem'
strucnum='01'
df=pd.read_csv(filePath+structure+".csv",header=1)

#keep only subjects present in both tables
redf=re.merge(df,on='src_subject_id',how='inner')

#collapse the four all-nines placeholder values of annual_fam_inc onto the single code -999999
redf.loc[redf.annual_fam_inc.isin([9999999,99999999,9999999999,9999999999999]),'annual_fam_inc']=-999999


In [None]:
# Rebuild comqother: rows mentioning 'caregiver' get 'caregiver ' + parent_id + the original text
# with the word 'caregiver' removed; rows mentioning 'adult' keep their original text; every other
# row ends up with an empty string.  The helper columns are dropped afterwards.
# NOTE(review): the string concatenation assumes parent_id holds strings - confirm against the racethnic file.
redf['newcomq']=''
# na=False: a missing comqother counts as "no match".  Without it str.contains returns NaN for
# those rows and .loc refuses a boolean mask that contains NaN.
# regex=False: 'caregiver' is a plain literal, so the result no longer depends on the pandas default.
redf.loc[redf.comqother.str.contains('caregiver',na=False),'newcomq']='caregiver '+redf.parent_id+redf.comqother.str.replace('caregiver','',regex=False)
redf.loc[redf.comqother.str.contains('adult',na=False),'newcomq']=redf.comqother
#redf[['comqother','parent_id','newcomq']]
redf=redf.drop(columns=['comqother','parent_id'])
redf=redf.rename(columns={'newcomq':'comqother'})


In [None]:
#reorder and rewrite
# Selecting with an explicit list both fixes the column order of the output file and drops any
# merged-in column that is not named here; a name missing from redf raises KeyError.
redf=redf[['src_subject_id','subjectkey', 'interview_date', 'interview_age', 'sex',
       'comqother', 'race', 'ethnicity', 'cg1_race', 'cg1_ethnicity',
       'bio_mother', 'bio_father', 'cust_parent', 'ptner_yn','p_partnerbio',
       'dembadpt', 'sub_adopt_1', 'country_origin', 'cg1_country_origin','ustime','cg1_ustime',
       'nimh_rv_dem_03', 'fspgod',  'cg1_nimh_rv_dem_03', 'cg1_fspgod',
       'das1ms',  'cg1_das1ms',
       'area4_explain', 'employcur', 'paofwork5', 'cg1_area4_explain', 'cg1_employcur','cg1_paofwork5', 
       'ind_type', 'jobh','calm_inc1', 'sub_income','cg1_ind_type', 'dem_industry_mom_12', 'cg1_sub_income',
        'annual_fam_inc', 'sub_income_famcode',
       'household_number_in_house', 'preg_age_mom', 'preg_age_dad',
       'birthcountry_dad', 'birthcountry_mom', 
        'bkgrnd_education',  'mother_edu_cat','father_edu_cat','cg1_bkgrnd_education', 'ptner_grade',
       'family_income_dfct1', 'family_income_dfct2',
       'family_income_dfct3', 'family_income_dfct4', 'family_income_dfct5',
       'family_income_dfct6', 'family_income_dfct7', 
       'ptner_job', 'ptner_job1_1', 'ptner_job7_1',
       'ptner_job8_1', 'ptner_business', 'ptner_work', 'ptner_income']]


In [None]:
# Overwrite ./prepped/hcd/socdem01.csv with the patched table: first the structure line
# ("socdem,1" - the version digits are written as an integer), then the csv with its header row.
with open(filePath+structure+".csv",'w') as f:
    f.write(strucroot+","+str(int(strucnum))+"\n")
    redf.to_csv(f,index=False)


In [None]:
# don't rerun socdem01 here: the special fix applied in the cells above would be overwritten
structs = [
    'asr01',
    'bsc01',
    'cbcl01',
    'cbcl1_501',
    'cbq01',
    'deldisk01',
    'er4001',
    'leap01',
    'mab01',
    'mctq01',
    'medh01',
    'mendt01',
    'mmse01',
    'neo_ffi_form_s_adult_200301',
    'phenx_su01',
    'psqi01',
    'saiq01',
    'sdq01',
    'sleepdis01',
#    'socdem01', #specialty structure
    'vision_tests01',
#    'vitals01',
    'wais_iv_part101',
    'wisc_v01',
    'wppsiiv01',
    'ysr01'
]

# these have a parent report as well as a self report in some cases, so there can be multiple rows per person
structs2=[
    'drugscr01',
    'bisbas01',
    'eatq01',
    'fenvs01',
    'gbi01',
    'pds01',
    'scan_debrief01',
    'srs02',
    'upps01',
    'screentime01'
]

# generate every structure, first list then second list, in the order given
for s in structs + structs2:
    M.run(s)


In [None]:
#now clean up empty rows
# First pass: redcleanup with its default omit columns on every structure generated above.
cleanlist(structurelist=structs)
cleanlist(structurelist=structs2)
# Second pass: per-structure fixes.  Every call rewrites its csv in place, so calls on the same
# structure depend on the order they appear in here.
redcleanup(structure="asr01",filePath="./prepped/hcd/",extraomitcol1='somatic_complaints_total',extraomitcol2='missingsum_obvq',extraomitcol3='kksomp') #asr01 already went through cleanlist above; this rerun additionally leaves three columns out of the empty-row check
asr(structure="asr01",filePath="./prepped/hcd/")
dropcols(structure="bsc01",filePath="./prepped/hcd/",dropcols=['comments'])
bisbasparent999(structure="bisbas01",filePath="./prepped/hcd/")
redcleanup(structure='cbcl01',filePath="./prepped/hcd/",extraomitcol1='version_form')
cbcl999(structure="cbcl01",filePath="./prepped/hcd/")
cbcl1_5_999(structure="cbcl1_501",filePath="./prepped/hcd/")
redcleanup(structure='deldisk01',filePath="./prepped/hcd/",extraomitcol1='version_form')
redcleanup(structure="eatq01",filePath="./prepped/hcd/",extraomitcol1='respond')
redcleanup(structure='gbi01',filePath="./prepped/hcd/",extraomitcol1='version_form',extraomitcol2='sup_y_ss_sum_nm')
redcleanup(structure='mctq01',filePath="./prepped/hcd/",extraomitcol1='version_form',extraomitcol2='frprnts')
neo999(structure="neo_ffi_form_s_adult_200301",filePath="./prepped/hcd/")
redcleanup(structure="pds01",filePath="./prepped/hcd/",extraomitcol1='respond')
phenx25(structure="phenx_su01",filePath="./prepped/hcd/")
redcleanup(structure='srs02',filePath="./prepped/hcd/",extraomitcol1='respond',extraomitcol2='respond_detail',extraomitcol3='phenotype')
redcleanup(structure='upps01',filePath="./prepped/hcd/",extraomitcol1='version_form')
# ysr01: the empty-row check runs first (it names missingsum_obvq, so that column must still exist),
# then the free-text columns are removed
redcleanup(structure='ysr01',filePath="./prepped/hcd/",extraomitcol1='version_form',extraomitcol2='missingsum_obvq')
ysrcols=['cbcl1_2_text','cbcl2_3_text','cbcl5_2_text','cbcl6_6_text','cbcl7_3_text','cbcl9_2_text','cbcl10_1_text',
    'cbcl11_2_text','cbcl11_6_text','cbcl12_6_text','cbcl13_1_text','cbcl13_5_text','cbcl13_6_text','cbcl13_7_text',
    'cbcl16_1_text','cbcl16_6_text','cbcl56h_des']
dropcols(structure="ysr01",filePath="./prepped/hcd/",dropcols=ysrcols)
