In [1]:
from funcs import funcs
from Crosswalk.Transformer import Transformer
from Crosswalk.DataCache import DataCache
from Crosswalk.NDAWriter import NDAWriter
from Crosswalk.Manager import Manager

from Crosswalk.Loader import Loader, BoxLoader, BoxHcaLoader, SsagaLoader, QintHcaLoader, RedcapLoader
import pandas as pd
import numpy as np


In [2]:
#create folder for prepped structures, if it doesn't exist
!!mkdir prepped
!!mkdir prepped/hca

['mkdir: cannot create directory ‘prepped/hca’: File exists']

In [3]:
# note the path to the validator below.  This is the vtcmd.py validator written by the NDA.  
# If you haven't already installed this, please do so now: https://github.com/NDAR/nda-tools
# and the place the path that shows up when you type 'which vtcmd' from your terminal
# validation results will be sent to and read from whatever default is specified in the vtcmd configuration file,
# so if you're using vtcmd to validate any other datatypes, keep this in mind.

M = Manager(
        data =  DataCache(
            BoxHcaLoader('PennCNP',592325063896),
            RedcapLoader('hcpa'),
            SsagaLoader(),
            QintHcaLoader()
        ),
        writer = NDAWriter(completed_dir="./prepped/hca/", validator=".venv/bin/vtcmd"),
        #writer = NDAWriter(completed_dir="./prepped/hca/"),
        transformer = Transformer(funcs = funcs, map_dir='./maps/hca/')
)

Loading   ./maps/hca/asr01.yaml
Loading   ./maps/hca/batbil01.yaml
Loading   ./maps/hca/bsc01.yaml
Loading   ./maps/hca/deldisk01.yaml
Loading   ./maps/hca/drugscr01.yaml
Loading   ./maps/hca/er4001.yaml
Loading   ./maps/hca/gales01.yaml
Loading   ./maps/hca/ipaq01.yaml
Loading   ./maps/hca/lbadl01.yaml
Loading   ./maps/hca/leap01.yaml
Loading   ./maps/hca/mchq01.yaml
Loading   ./maps/hca/medh01.yaml
Loading   ./maps/hca/mendt01.yaml
Loading   ./maps/hca/moca01.yaml
Loading   ./maps/hca/nffi01.yaml
Loading   ./maps/hca/psqi01.yaml
Loading   ./maps/hca/ravlt01.yaml
Loading   ./maps/hca/scan_debrief01.yaml
Loading   ./maps/hca/ssaga_cover_demo01.yaml
Loading   ./maps/hca/trail_ca01.yaml
Loading   ./maps/hca/vitals01.yaml


In [4]:
# This step requires that you have a 'rosetta stone' file that has all the required NDA fields for 
# all subjects you intend to submit at this time.  This approach facilitates keeping track of subject counts
# across data types.  For example, if your required fields are already stored in XNAT because you had the CCF
# upload your imaging data for you, you can export this csv from XNAT and rename as appropriate.  
# Place this file at the main level of this repository, and name it in your config file
# Loader.py program's _post_load_hook_ method referenced below.  

M.preload_data()

Timing:  hcpa 9.698323249816895
Timing:  ssaga 0.3825404644012451
Timing:  PennCNP 0.3003568649291992


AttributeError: 'DataFrame' object has no attribute 'subjectid'

In [None]:
#Ad hoc functions to clean up empty rows for particular instruments after generated (issue for redcap data)
def redcleanup(structure="lbadl01",filePath="./prepped/hca/",extraomitcol1='NO',extraomitcol2='NO',extraomitcol3='NO',extraomitcol4='NO'):
    print(structure)
    strucroot=structure[:-2]
    strucnum=structure[-2:]

    df=pd.read_csv(filePath+structure+".csv",header=1)
    df.head()

    print("NumRows Before: "+str(df.shape[0]))
    subfields=df.columns.to_list()
    subfields.remove('subjectkey')
    subfields.remove('src_subject_id')
    subfields.remove('interview_date')
    subfields.remove('interview_age')
    subfields.remove('sex')
    if extraomitcol1 and extraomitcol1 !='NO':
        subfields.remove(extraomitcol1)
    if extraomitcol2 and extraomitcol2 !='NO':
        subfields.remove(extraomitcol2)
    if extraomitcol3 and extraomitcol3 !='NO':
        subfields.remove(extraomitcol3)
    if extraomitcol4 and extraomitcol4 !='NO':
        subfields.remove(extraomitcol4)
    df=df.dropna(how='all',subset=subfields)
    print("NumRows After: "+str(df.shape[0]))

    with open(filePath+structure+".csv",'w') as f:
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f,index=False)
 

    


#these guys already set to 99s in map, so null finder wont work above
def asrover60(structure="asr01",filePath="./prepped/hca/"):
    print(structure)
    strucroot=structure[:-2]
    strucnum=structure[-2:]
    df=pd.read_csv(filePath+structure+".csv",header=1)
    print("NumRows Before: "+str(df.shape[0]))
    #df=df.loc[df.interview_age>719].copy()
    df=df.loc[~((df.asr2_2==-99)&(df.asr3_2==-99))]
    df=df.drop(columns=['asr2_3_text',
        'oasr_ppl9_des',
        'asr5_5_text',
        'asr7_4_text',
        'asr8_4_text',
        'asr10_6_text',
        'asr13_3_text',
        'asr14_1_text',
        'asr15_2_text',
        'asr16_3_text',
        'asr16_4_text',
        'asr17_5_text',
        'asr19_1_text',
        'cbcl56h_des'
        ])
    print("NumRows After: "+str(df.shape[0]))
    with open(filePath+structure+".csv",'w') as f:
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f,index=False)
    #print(subset)
        
def satisfy(structure='scan_debrief01',filePath="./prepped/hca/"):  
    print(structure)
    strucroot=structure[:-2]
    strucnum=structure[-2:]
    df=pd.read_csv(filePath+structure+".csv",header=1)
    print("NumRows Before: "+str(df.shape[0]))
    df=df.drop(columns=['satisfaction1more','satisfaction2more','satisfaction4more','satisfaction5','satisfaction6'])
    print("NumRows After: "+str(df.shape[0]))
    with open(filePath+structure+".csv",'w') as f:
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f,index=False)

        
def cleanlist(structurelist=['lbadl01','mchq01']):
    for i in structurelist:
        print(i)
        redcleanup(structure=i,filePath="./prepped/hca/")

In [None]:
def cleanzeros(structure='vitals01',filePath="./prepped/hca/"):
    #print(structure)
    strucroot=structure[:-2]
    strucnum=structure[-2:]
    df=pd.read_csv(filePath+structure+".csv",header=1)
    df.loc[df.vtl007==0,'vtl007']=np.NaN
    df.loc[df.bp_stand=='11/80','bp_stand']=np.NaN
    df.loc[df.bp_stand=='9999','bp_stand']=np.NaN
    with open(filePath+structure+".csv",'w') as f:
        f.write(strucroot+","+str(int(strucnum))+"\n")
        df.to_csv(f,index=False)


In [None]:
M.run('psqi01')
cleanlist(structurelist=['psqi01'])

In [None]:
M.run('trail_ca01')
redcleanup(structure="trail_ca01",filePath="./prepped/hca/",extraomitcol1='seizures',extraomitcol2='versionchildadult')


In [None]:
M.run('vitals01')
cleanzeros()

In [None]:
M.run('ssaga_cover_demo01')

In [None]:
# The commented out structs dont seem to be laid out as requested...need to investigate further...perhaps turn into 
#singleton structure, since doesnt follow typical one row per subject format
structs = [
    'lbadl01',
    'mchq01',
    'er4001',
    'deldisk01',
    'asr01',
    'batbil01',
    'bsc01',
    'drugscr01',
    'gales01',
    'ipaq01',
    'leap01',
    'medh01',
    'mendt01',
    'moca01',
    'nffi01',
    #'psqi01',
    'ravlt01',
    'scan_debrief01',
  ]

for s in structs:
    M.run(s)
    print(s)   
cleanlist(structurelist=structs)
asrover60(structure="asr01",filePath="./prepped/hca/")
redcleanup(structure="deldisk01",filePath="./prepped/hca/",extraomitcol1='version_form',extraomitcol2='comqother')
redcleanup(structure="medh01",filePath="./prepped/hca/",extraomitcol1='comqother')
redcleanup(structure="bsc01",filePath="./prepped/hca/",extraomitcol1='comqother')
redcleanup(structure="ravlt01",filePath="./prepped/hca/",extraomitcol1='ravlt_delt',extraomitcol2='ravlt_disct',extraomitcol3='ravlt_tott')
redcleanup(structure="scan_debrief01",filePath="./prepped/hca/",extraomitcol1='comqother')
satisfy(structure='scan_debrief01',filePath="./prepped/hca/")

In [None]:
# SSAGA and not mapped at this time because NDA wanted to reconsider how they organize this info per 5/1/20 email.  
# update:  Leo sent demographics for SSAGA: ssaga_cover_demo01 (see above)
structs2 = [
#     'diagpsx01',
#     'eatdisdemo01',
#     'phenx_sib01',
#     'scidv_pscyh01',
#     'socdem01'
  ]

for s in structs2:
    M.run(s)