This notebook will pull all the maps from various sources into one common annotation file
Big picture: you get a folder of HCP behavioral data structures from the NDA and want to know what the elements within them mean. This notebook takes the list of elements within those structures and maps them to the NDA documentation about them (e.g. via the data dictionary API) and to the local Lifespan documentation about them (to the extent that it is available in REDCap data dictionaries and other human- and machine-readable maps).


In [None]:
from ccf.easy_yaml import EasyYaml
from ccf.redcap import RedcapTable

In [None]:
import yaml
import pandas as pd
import os, datetime
import xlrd
# Date stamp (MM_DD_YYYY) for the day the notebook is run.
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')
# Load the 'rosetta' table; presumably the subject / pseudo-GUID lookup, judging by the file name -- confirm.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this share is mounted.
rosetta=pd.read_csv('/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/UnrelatedHCAHCD_w_STG_Image_and_pseudo_GUID12_11_2020.csv')
# Preview the first rows.
rosetta.head()

In [None]:
# YAML loader instance; the cells below call Y(path) to read each map / definition file.
Y = EasyYaml()

In [None]:
#get the NDA maps: one YAML file per NDA structure in ./nda/, keyed by structure name (e.g. nda['asr01'])
nda = {}
for filename in sorted(os.listdir('./nda/')):  #sorted -> same load order on every machine
    struct, ext = os.path.splitext(filename)
    #only load YAML files; blindly slicing off the last 5 characters produced garbage keys (or a loader
    #crash) for anything else living in the folder, e.g. .ipynb_checkpoints or editor backup files
    if ext.lower() != '.yaml':
        continue
    elements = Y('./nda/'+filename)
    nda[struct] = elements

In [None]:
#nda.keys()
#nda.values()
#nda['asr01'].keys()
#nda['asr01']['subjectkey'].keys()
#nda['asr01']['subjectkey'].values()

In [None]:
#get the redcap data dictionary annotation: one YAML file per local database in ./definitions/,
#keyed by database name (the file name without its .yaml extension)
redcap = {}
for filename in sorted(os.listdir('./definitions/')):  #sorted -> same load order on every machine
    struct, ext = os.path.splitext(filename)
    #only load YAML files; blindly slicing off the last 5 characters produced garbage keys (or a loader
    #crash) for anything else living in the folder, e.g. .ipynb_checkpoints or editor backup files
    if ext.lower() != '.yaml':
        continue
    elements = Y('./definitions/'+filename)
    redcap[struct] = elements

In [None]:
#Non-Toolbox prepared directories (HCA and HCD), relative to the notebook's working directory.
#The trailing '/' matters: later cells build file paths with plain string concatenation (directory + filename).
nontlbx_hca="./Behavioral_HCA726_NonTLBX_20201212 datasetid_ 33780/"
nontlbx_hcd="./Behavioral_HCD652_NonTLBX_20210111 dataset id_ 34536/"

In [None]:
#Toolbox prepared directories (HCA and HCD).
#NOTE(review): absolute, machine-specific paths; not referenced by any of the cells visible in this part of the notebook.
hcapreppedpath='/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/NDA_submissions/NDA_submissions/Behavioral_HCA635_TLBX_20210111 dataset id 34518/'
hcdpreppedpath='/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/NDA_submissions/NDA_submissions/Behavioral_HCD488_TLBX_20210111 dataset id 34520/'

In [None]:
#Gather every variable found in the prepared structures so that REDCap and NDA annotation can be
#collected in one place for a master crosswalk.
#Three kinds of files w.r.t. annotation: toolbox, Moises pipeline, and singletons (for each of HCA and HCD).
#  - singletons break all rules wrt annotation conventions and are handled in their own cells
#  - Moises pipeline files can pull annotation from REDCap or from other data dictionaries (PennCNP)
#Point the directories at the latest box download to prevent versioning issues.

def _prepped_columns(directory, singleton_prefix):
    """Return two parallel lists (elements, structures) for the prepared CSVs in `directory`.

    Files whose first four characters equal `singleton_prefix` are skipped.  For every other file
    the structure name is the file name minus its last four characters, and the elements are the
    column names read with the file's second line as header (header=1).
    """
    elems, structs = [], []
    for fname in os.listdir(directory):
        if fname[0:4] == singleton_prefix:  #singletons require special consideration
            continue
        columns = pd.read_csv(directory+fname,header=1).columns.to_list()
        elems.extend(columns)
        structs.extend([fname[:-4]]*len(columns))
    return elems, structs


#Moises pipeline HCA
prepped_hca_elems, prepped_hca_structs = _prepped_columns(nontlbx_hca, 'HCPA')
prepped_hca=[1]*len(prepped_hca_structs)
print(len(prepped_hca))

adict = {'nda_element':prepped_hca_elems,'nda_structure':prepped_hca_structs,'C-2847 (HCP-A)':prepped_hca}
a=pd.DataFrame(adict)
a['collection']='hca'
print(a.shape)


#Moises pipeline HCD
prepped_hcd_elems, prepped_hcd_structs = _prepped_columns(nontlbx_hcd, 'HCPD')
prepped_hcd=[1]*len(prepped_hcd_structs)
print(len(prepped_hcd))

ddict = {'nda_element':prepped_hcd_elems,'nda_structure':prepped_hcd_structs,'C-2846 (HCP-D)':prepped_hcd}
d=pd.DataFrame(ddict)
d['collection']='hcd'
print(d.shape)

#preview the HCD frame
d.head()

In [None]:
#show which REDCap databases were loaded (one key per file found in ./definitions/)
redcap.keys()

In [None]:
#strip html from redcap annotation
import re
def html_stripper(text):
    """Return `text` with every '<...>' tag removed; None is passed through unchanged."""
    if text is not None:
        # non-greedy match from a '<' up to the next '>', refusing to cross another '<'
        return re.sub('<[^<]+?>', '', text)
    return None
    


In [None]:
#Give every REDCap variable the nearest preceding section header of its own form.
#A section header is carried forward until a new header appears or the form changes.
for db_name, db_elements in redcap.items():
    form = None
    section = None
    for name, element in db_elements.items():
        this_form = element.get('form')
        this_section = html_stripper(element.get('section'))
        #a new form always resets the running section (possibly to None);
        #within the same form only an explicit header replaces it
        if this_form != form or this_section is not None:
            section = this_section
        form = this_form
        if section is not None:
            element['section'] = section

In [None]:
#now get the maps (from local sources to nda), laid out as ./maps/<collection>/<structure>.yaml
#every element of every map is tagged with its structure and collection and pooled into 'db'
db = []
for collection in sorted(os.listdir('./maps/')):  #sorted -> same row order on every machine
    directory = './maps/' + collection
    #skip stray files and hidden folders (.ipynb_checkpoints, .git, ...): listing a plain file as a
    #directory raised NotADirectoryError, and hidden folders are not collections
    if collection.startswith('.') or not os.path.isdir(directory):
        continue
    for filename in sorted(os.listdir(directory)):
        struct, ext = os.path.splitext(filename)
        #only YAML maps; the old filename[:-5] slice assumed every entry ended in '.yaml'
        if ext.lower() != '.yaml':
            continue
        elements = Y(os.path.join(directory, filename))['elements']
        for e in elements:
            e['struct'] = struct
            e['collection'] = collection
        db.extend(elements)

In [None]:
from copy import deepcopy #copy function for objects that contain sub-objects

def flattened_str(f, x):
    """Look up key `f` in dict `x`; a list value is joined into one '/'-separated string.

    Any other value (including a missing key, which yields None) is returned as is.
    """
    value = x.get(f)
    return '/'.join(value) if type(value) is list else value

#Elongate 'db' (the pool of all entries from all yaml maps): where multiple sources are merged into the
#same element, emit one row per source so that the correct REDCap annotation can be pulled from that source.
#name and rename (or input and output) should only contain multiple variables if they are coming from the same source
#source can be hcpa, child, teen, ssaga, ksads, PennCNP, eprime, parent, qint...
#due to the nested nature of sources in the yaml, though, you have to do a little iterating in cases where source is not a single string.
#open the bisbas01.yaml map if you need an example.  
elongated = []
for original in db:
    #work on a copy so the entries in 'db' keep their 'source' key
    modified = deepcopy(original)    
    #NOTE(review): raises KeyError for a map entry without a 'source' key
    sources = modified.pop('source')
    
    if type(sources) is str:
        #single source: flag it in a column named after the source, e.g. modified['hcpa'] = 'hcpa'
        modified[sources] = sources
        elongated.append(modified)
        
    elif type(sources) is list:        
        item = deepcopy(modified)
        for source in sources:
            if type(source) is str:
                # e.g., item['parent'] = "parent"
                #NOTE(review): 'item' is created once per entry, so every plain-string source adds its flag to
                #the SAME dict, and that same dict object is appended once per source.  The result is identical
                #rows carrying all string-source flags; the later de-duplication cell collapses them.  Confirm this
                #is intended before changing it -- a fresh copy per source would change the crosswalk rows.
                item[source] = source
                elongated.append(item)  
                                
            elif type(source) is dict:
                #popitem() takes one (sourcename, overrides) pair; presumably these dicts have a single key -- confirm.
                #This source gets its own row: a fresh copy of the entry with the override fields applied on top.
                sourcename, overrides = source.popitem()
                newitem = deepcopy(modified)                
                newitem[sourcename] = sourcename
                newitem.update(overrides)                
                elongated.append(newitem)
    
            elif type(source) is list:
                #unexpected extra nesting: mark the row so it is easy to spot in the output
                item['collection'] = "uh-oh"
                elongated.append(item)  
                
#the flattened ('/'-joined) versions of name/rename are needed for the annotation step; they are redundant afterwards
for original in elongated:
    original['output'] = flattened_str('rename', original)
    original['input'] = flattened_str('name', original)

In [None]:
#Sanity check of the columns produced so far.
#Note that 'nda_name' is a field unique to a buggy drugscr01 structure.  If you see it below,
#then this issue is still in the process of being addressed.  If you don't see it, then don't worry about it.
dfdb = pd.DataFrame(elongated)
dfdb.columns
#dfdb.to_csv('moisesxwalk0.csv')

In [None]:
#now we'll add the NDA and REDCap info, using the 'input' and 'output' variables
#remember that not all the annotation is actually coming from REDCap, so we'll still have holes to fill.
#Rows whose structure / source / variable cannot be found are simply left unannotated.
for i in elongated:
    # add nda info
    out_name = i.get('output')
    if out_name is not None:
        #only use the first variable (vars are joined with /)
        nameo = out_name.split('/')[0]
        #a map whose structure has no NDA definition file used to raise KeyError here
        nda_elements = nda.get(i['struct']) or {}
        if nameo in nda_elements:
            x = nda_elements[nameo]
            i['type'] = x['type']
            i['description'] = x['description'].replace("\r"," ")
            i['notes'] = x.get('notes')
            i['alias'] = x.get('alias')
            if 'range' in x:
                i['range'] = '; '.join(list(map(str, x['range'])))

    # add redcap info
    in_name = i.get('input')
    if in_name is None:
        continue
    #only use the first variable (vars are joined with /)
    name = in_name.split('/')[0]
    #first source flag found, in this fixed order, decides which REDCap dictionary is consulted
    struct = i.get('parent') or i.get('teen') or i.get('child') or i.get('hcpa') or i.get('qint')  or i.get('ssaga') # or i.get('ksads')
    #no source flag, or a source without a loaded data dictionary (used to raise KeyError) -> nothing to add
    redcap_elements = redcap.get(struct) or {}
    if name not in redcap_elements:
        continue
    x = redcap_elements[name]
    i['r_form'] = x.get('form')
    i['r_section'] = x.get('section')
    i['r_type'] = x.get('type')
    #a variable without a label used to crash on None.replace(...); keep it as None instead
    label = x.get('label')
    if label is not None:
        label = html_stripper(label.replace("\n"," ").replace("\r"," "))
    i['r_label'] = label
    i['r_choices'] = x.get('choices')

In [None]:
#turn the annotated rows into a dataframe and check the columns again
#(the NDA fields and the r_* REDCap fields added by the previous cell should now be present)
df = pd.DataFrame(elongated)
df.columns

In [None]:
#Drop duplicate rows, keeping the first occurrence.  Duplicates appear when two separate sources share
#the same transformations (source given as a list vs. a sub-nested dictionary in the yaml maps).
#Plain df.drop_duplicates() cannot be used because some cells hold (unhashable) lists, so rows are
#compared on their string representation instead.
as_text = df.astype(str)
df = df[~as_text.duplicated(keep='first')]



In [None]:
#The required NDA variables were incorrectly pulling labels from REDCap; they should all point to rosetta.
#Blank out their source flags and local annotation here and mark them as 'rosetta'.  The rosetta annotation
#itself is filled in later, after the singletons and toolbox data annotation are added.
rosetta_list=['gender','sex','interview_age','interview_date','src_subject_id','subjectkey','family_user_def_id']
cols=['PennCNP', 'hcpa', 'qint', 'ssaga', 'child', 'teen', 'parent','name', 'func', 'r_form', 'r_section', 'r_type',
       'r_label', 'r_choices', 'request', 'code', 'recode', 'old_code',   
       'specialty_code']

df['rosetta']=''
#'rename' is not among the columns being blanked, so the row selection can be computed once up front
is_rosetta = df['rename'].astype(str).isin(rosetta_list)
for i in cols:
    df.loc[is_rosetta,i]=''
df.loc[is_rosetta,'rosetta']='rosetta'


In [None]:
print(df.shape)

#remove rows corresponding to ksads data and/or buggy drugscreening for the 2.0 behavioral data release
if 'ksads' in df.columns:  #no ksads column at all simply means there are no ksads rows to remove
    df=df.loc[~(df.ksads=='ksads')]
df=df.loc[~(df.struct=='drugscr01')]
#'nda_name' only exists while the drugscr01 map is still buggy, so its absence must not be an error
df=df.drop(columns=['ksads','nda_name'],errors='ignore')
print(df.shape)
df = df.rename(columns={"PennCNP": "penn_cnp"})


#drop the helper columns that were only created to make the REDCap annotation lookup easier
df=df.drop(columns=['input','output'])

#reorder and sort so that you can look at the file
df=df[['collection','rosetta','penn_cnp', 'hcpa', 'qint', 'ssaga', 'child', 'teen', 'parent','name', 'rename', 'struct',  'type', 'description',
       'notes', 'alias', 'range', 'func', 'r_form', 'r_section', 'r_type',
       'r_label', 'r_choices', 'request', 'code', 'recode', 'old_code',   
       'specialty_code']]

#df.sort_values(by=['collection','struct']).to_csv('moisesxwalk.csv',index=False)
#just checking that all five rosetta fields are in every structure so far
#(the family id will be in ndar, but that annotation isn't present yet)
df.groupby(['collection','struct','rosetta']).count()

In [None]:
#specialty import of PennCNP annotation, which doesnt exist in a redcap label
import numpy as np
from openpyxl import load_workbook

#rows sourced from PennCNP are the ones that still need local annotation
needsanno=df.loc[(df.penn_cnp=="PennCNP")]

wb = load_workbook(filename = './DataDictionaries/UPennCNP_Emotion_and_Delayed_Discounting_and_Eprime_mapped_definitions_withEprime_11March2020.xlsx')
#NOTE(review): 'Delated' looks like a typo, but this string has to match the worksheet name stored in the
#workbook (it is exactly 31 characters, Excel's sheet-name limit) -- confirm against the file before changing it
ws=wb['UPennCNP_Emotion_and_Delated_Di']
annot=pd.DataFrame(ws.values)
#the first worksheet row holds the column headers
columns=list(annot.iloc[0])#annot.head()
annot.columns=columns
#drop the header row itself, which is still present as a data row
annot=annot.loc[~(annot.hcp_variable=='hcp_variable')]
#rename the workbook columns to the crosswalk's column names
penn=annot[['hcp_variable','Element Description','test Name','nda_structure']].rename(columns={'hcp_variable':'name','Element Description':'r_label','test Name':'r_form'})
#keep only the two structures annotated here
penn=penn.loc[penn.nda_structure.isin(['deldisk01','er4001'])]
penn=penn[['name','r_label','r_form']]

#left-merge on the local variable name; where the workbook supplies a value it replaces the existing
#r_label / r_form, otherwise the existing value is kept
updated = needsanno.merge(penn, how='left', on=['name'], suffixes=('', '_new'))
updated['r_label'] = np.where(pd.notnull(updated['r_label_new']), updated['r_label_new'], updated['r_label'])
updated['r_form'] = np.where(pd.notnull(updated['r_form_new']), updated['r_form_new'], updated['r_form'])
updated.drop(columns=['r_label_new','r_form_new'], axis=1, inplace=True)

#put it back together: non-PennCNP rows followed by the freshly annotated PennCNP rows
dfA=df.loc[~(df.penn_cnp=="PennCNP")]
dfnew=pd.concat([dfA,updated],axis=0)
#hand-written annotation for a few variables
dfnew.loc[(dfnew['rename']=='version_form') & (dfnew.struct=='deldisk01'),'name']="version_form"
dfnew.loc[(dfnew['rename']=='version_form') & (dfnew.struct=='deldisk01'),'r_label']="DELAY_3.5 or PennCNP"
dfnew.loc[dfnew['rename']=='ddisc_valid','r_label']='Current Programming Version of the DDISC_VALID Scoring Code at penncnp.med.upenn.edu'
dfnew.loc[dfnew.name=='K_ER40D.valid_code','r_label']='Current Programming Version of the ER40D VALID Scoring Code at penncnp.med.upenn.edu'
#dfnew.sort_values(by=['collection','struct']).to_csv('moisesxwalk2.csv',index=False)


In [None]:
#Look for merge issues, i.e. a left and a right version of the same column (the merge above would
#suffix such leftovers with '_new').
#This can happen, especially if you don't start at the top of the notebook and proceed all the way to the bottom.
dfnew.columns
#dfnew.head()

In [None]:
#NOW FOR THE SINGLETONS
#specialty code to add in ndar_subjects, edinburgh handedness, eprime, facename, singleton structure annotations
#FIRST DO HCA
singletons=nontlbx_hca

#filename keyword -> (NDA structure, REDCap form, REDCap section).
#When several keywords occur in one filename the LAST matching entry wins.
hca_singleton_info=[
    ('facename',('facename01','Face Name','')),
    ('edinburgh',('edinburgh_hand01','Intake Interview 2','Handedness')),
    ('ndar',('ndar_subject01','','')),
]

#get the set of prepared structures: four parallel lists, one entry per column of every singleton file
prepped_hca_elems= []
prepped_hca_structs =[]
prepped_hca_rforms =[]
prepped_hca_rsection =[]

for filename in os.listdir(singletons):
    if 'HCPA' not in filename:
        continue
    print(filename)
    matches=[info for keyword,info in hca_singleton_info if keyword in filename]
    if not matches:
        #an unrecognised singleton used to silently inherit the structure/form of whichever file was
        #processed before it (or hit a NameError); singletons need hand-written handling, so stop loudly
        raise ValueError('unrecognised HCA singleton file %r: add it to hca_singleton_info' % filename)
    struc,rform,rsection=matches[-1]
    #element names are the column names, read with the file's second line as header
    els = pd.read_csv(singletons+'/'+filename,header=1).columns.to_list()

    prepped_hca_elems.extend(els)
    prepped_hca_structs.extend([struc]*len(els))
    prepped_hca_rforms.extend([rform]*len(els))
    prepped_hca_rsection.extend([rsection]*len(els))

#one row per element; the four (still unnamed) columns are element, structure, form, section, in that order
singlehca=pd.DataFrame(pd.concat([pd.Series(prepped_hca_elems),pd.Series(prepped_hca_structs)
                                  ,pd.Series(prepped_hca_rforms),pd.Series(prepped_hca_rsection)],axis=1))

#The element column is initialized as 'rename', but the name/rename designation actually needs to be finessed
#by hand, thanks to the help-desk nightmare which initiated 'data harmonization' for the HCP collection.
#For what it's worth:  when study data are downloaded, they come in terms of NDA ELEMENTS, NOT as the ALIAS you used for upload.
#ALWAYS KNOW THE NDA ELEMENT to which you are sending the data, because NDA does not keep track of study aliases, which
#can be local or global in scope (also not tracked).

singlehca.columns=['rename','struct','r_form','r_section'] #element column starts out as 'rename'
#default source for every HCA singleton row
singlehca['hcpa']='hcpa'

#The facename vars are coming from both INTRADB and REDCAP.  Our annotation got sent to NDA but not to me (lol), so
#the missing local annotation is filled in later by copying it from the NDA.
#The statements below are order dependent: first every facename row is switched from hcpa to intradb, then the
#f*_recall / f*_other rows are switched back to hcpa, and finally intradb is cleared for those rows and for the
#required (rosetta) variables.  rosetta_list is the list defined in an earlier cell.
singlehca['intradb']=''
singlehca.loc[singlehca.struct=='facename01','hcpa']=''
singlehca.loc[singlehca.struct=='facename01','intradb']='intradb'
listr=['f1_recall','f1_other','f2_recall','f2_other','f3_recall','f3_other','f4_recall','f4_other','f5_recall','f5_other','f6_recall','f6_other','f7_recall','f7_other','f8_recall','f8_other','f9_recall','f9_other','f10_recall','f10_other']
singlehca.loc[singlehca['rename'].isin(listr),'hcpa']='hcpa'
singlehca.loc[singlehca['rename'].isin(listr+rosetta_list),'intradb']=''



In [None]:
#check columns: expect rename, struct, r_form, r_section, hcpa and intradb at this point
singlehca.columns

In [None]:
#Start with name == rename for every row, then fix by hand so that 'rename' only holds nda_elements
#and 'name' only holds hcp_variables.
singlehca['name']=singlehca['rename']

singlehca.loc[(singlehca['name']=='version') & (singlehca['struct']=='facename01'),'rename']='version_form'
singlehca['collection']='hca'

#nda_element -> local (hcp) variable name
hca_demog_names={'race':'racial','ethnic_group':'ethnic'}
for nda_element,local_name in hca_demog_names.items():
    singlehca.loc[singlehca['rename']==nda_element,'name']=local_name

#local handedness variable -> nda_element
hca_hand_elements={
    'iihandwr':'writing',
    'iihandth':'throwing',
    'iihandsc':'scissors',
    'iihandto':'toothbrush',
    'iihandkn':'knife_no_fork',
    'iihandsp':'spoon',
    'iihandbr':'broom',
    'iihandma':'match',
    'iihandbo':'box',
    'iihandfk':'foot',
    'iihandey':'eye',
}
for local_name,nda_element in hca_hand_elements.items():
    singlehca.loc[singlehca['name']==local_name,'rename']=nda_element


#fill out requests as much as possible.  reset name to be missing for vars that don't exist locally but were created
#to fill a requirement at the NDA
#the structure is called 'ndar_subject01' in this notebook; the previous 'ndar_subjects' never matched any row,
#so the request was silently left empty for that structure
singlehca.loc[singlehca.struct.isin(['ndar_subject01','edinburgh_hand01']),'request']='structure created by HCA_ndar_edinburgh_*.ipynb notebook'
singlehca.loc[singlehca.struct.isin(['facename01']),'request']='structure created by Stats2Structures.sh '

#required NDA variables that have no local counterpart and are hardcoded at submission time
hardcoded_hca=singlehca['rename'].isin(['phenotype', 'phenotype_description', 'twins_study',
       'sibling_study', 'family_study', 'sample_taken'])
singlehca.loc[hardcoded_hca,'name']=''
singlehca.loc[hardcoded_hca,'request']='hardcode required variables'

singlehca.loc[singlehca['rename']=='family_user_def_id','name']='final_pedid'
#singlehca.to_csv('singletonsHCA.csv')

#note will circle back to standardize the rosetta vars after getting HCA singletons together with the HCD singletons

In [None]:
#now do HCD
singletons=nontlbx_hcd

#filename keyword -> (NDA structure, REDCap form, REDCap section).
#When several keywords occur in one filename the LAST matching entry wins.
hcd_singleton_info=[
    ('edinburgh',('edinburgh_hand01','Intake Interview 2','Handedness')),
    ('ndar',('ndar_subject01','','')),
    ('eprime',('deldisk01','Eprime Delay Discounting','')),
]

#four parallel lists, one entry per column of every singleton file
prepped_hcd_elems= []
prepped_hcd_structs =[]
prepped_hcd_rforms =[]
prepped_hcd_rsection =[]

for filename in os.listdir(singletons):
    if 'HCPD' not in filename:
        continue
    print(filename)
    if 'racethnic' in filename:  #deliberately left out of the crosswalk
        continue
    matches=[info for keyword,info in hcd_singleton_info if keyword in filename]
    if not matches:
        #an unrecognised singleton used to silently inherit the structure/form of whichever file was
        #processed before it (or hit a NameError); singletons need hand-written handling, so stop loudly
        raise ValueError('unrecognised HCD singleton file %r: add it to hcd_singleton_info' % filename)
    struc,rform,rsection=matches[-1]
    #element names are the column names, read with the file's second line as header
    els = pd.read_csv(singletons+'/'+filename,header=1).columns.to_list()

    prepped_hcd_elems.extend(els)
    prepped_hcd_structs.extend([struc]*len(els))
    prepped_hcd_rforms.extend([rform]*len(els))
    prepped_hcd_rsection.extend([rsection]*len(els))

#one row per element: element, structure, form, section
singlehcd=pd.DataFrame(pd.concat([pd.Series(prepped_hcd_elems),pd.Series(prepped_hcd_structs)
                                  ,pd.Series(prepped_hcd_rforms),pd.Series(prepped_hcd_rsection)],axis=1))
singlehcd.columns=['rename','struct','r_form','r_section']
singlehcd['collection']='hcd'

#default source is the teen database; the child / parent / eprime exceptions are set in the next cell
singlehcd['teen']='teen'
singlehcd['child']=''
singlehcd['parent']=''
singlehcd['eprime']=''
#singlehcd.to_csv('singletonsHCD.csv')

In [None]:
#start with name == rename for every row, then apply the hand-curated exceptions
singlehcd['name']=singlehcd['rename']

#elements whose source is the child database rather than the teen one
child_only=['hand_total','hammer','hand5','hand_15_drink']
is_child_only=singlehcd['rename'].isin(child_only)
singlehcd.loc[is_child_only,'child']='child'
singlehcd.loc[is_child_only,'teen']=''

#nda_element -> local (hcp) variable name
hcd_local_names={
    'race':'sub_race',
    'ethnic_group':'sub_latino',
    'hammer':'hand2',
    'hand_15_drink':'hand8',
    'writing':'iihandwr',
    'throwing':'iihandth',
    'scissors':'iihandsc',
    'toothbrush':'iihandto',
    'knife_no_fork':'iihandkn',
    'spoon':'iihandsp',
    'broom':'iihandbr',
    'match':'iihandma',
    'box':'iihandbo',
    'foot':'iihandfk',
    'eye':'iihandey',
}
for nda_element,local_name in hcd_local_names.items():
    singlehcd.loc[singlehcd['rename']==nda_element,'name']=local_name


#These elements are fed by a second local source as well, so each gets an additional row:
#handedness items also come from the child database, race / ethnicity also from the parent database.
dual_source=['writing','throwing','toothbrush','spoon','scissors','race','ethnic_group']
secondrow=singlehcd.loc[singlehcd['rename'].isin(dual_source)].copy()
secondrow['r_form']='Intake Interview'
secondrow['r_section']='Handedness Assessment'

#nda_element -> variable name in the child database
child_hand_names={'writing':'hand1','throwing':'hand3','toothbrush':'hand4','spoon':'hand6','scissors':'hand7'}
for nda_element,child_name in child_hand_names.items():
    rows=secondrow['rename']==nda_element
    secondrow.loc[rows,'name']=child_name
    secondrow.loc[rows,'child']='child'
    secondrow.loc[rows,'teen']=''

#nda_element -> variable name in the parent database (no section for these)
parent_demog_names={'race':'p_c_race','ethnic_group':'p_c_latino'}
for nda_element,parent_name in parent_demog_names.items():
    rows=secondrow['rename']==nda_element
    secondrow.loc[rows,'name']=parent_name
    secondrow.loc[rows,'child']=''
    secondrow.loc[rows,'teen']=''
    secondrow.loc[rows,'parent']='parent'
    secondrow.loc[rows,'r_form']='Intake Interview'
    secondrow.loc[rows,'r_section']=''

#append the extra rows to the HCD singletons
singlehcd=pd.concat([singlehcd,secondrow],axis=0)

#fill out requests as much as possible.  reset name to be missing for vars that don't exist locally but were created
#to fill a requirement at the NDA
#the structure is called 'ndar_subject01' in this notebook; the previous 'ndar_subjects' never matched any row,
#so the request was silently left empty for that structure
singlehcd.loc[singlehcd.struct.isin(['ndar_subject01','edinburgh_hand01']),'request']='structure created by HCD_ndar_edinburgh_*.ipynb notebook'
singlehcd.loc[singlehcd.struct.isin(['deldisk01']),'request']='structure created by HCD_Eprime_deldisk01_*.ipynb notebook'
singlehcd.loc[singlehcd['rename']=='family_user_def_id','name']='final_pedid'
singlehcd.loc[singlehcd['rename']=='hand_total','rename']='handedness_score'

#required NDA variables that have no local counterpart and are hardcoded at submission time
hardcoded_hcd=singlehcd['rename'].isin(['phenotype', 'phenotype_description', 'twins_study',
       'sibling_study', 'family_study', 'sample_taken'])
singlehcd.loc[hardcoded_hcd,'name']=''
singlehcd.loc[hardcoded_hcd,'request']='hardcode required variables'

#delay discounting comes from eprime, not from the teen REDCap database
singlehcd.loc[singlehcd.struct=='deldisk01','teen']=''
singlehcd.loc[singlehcd.struct=='deldisk01', 'eprime']='eprime'

#...except for the required (rosetta) variables, which get no eprime flag
rosetta_list=['gender','sex','interview_age','interview_date','src_subject_id','subjectkey','family_user_def_id']
singlehcd.loc[singlehcd['rename'].astype(str).isin(rosetta_list),'eprime']=''
#singlehcd.to_csv('singletonsHCD.csv')




In [None]:
#stack the HCA and HCD singletons for the redcap annotation grab
singletonsHCP=pd.concat([singlehca,singlehcd],axis=0)

#'gender' used to be the nda_element name for sex.  NDA has since changed this, though gender and sex can
#both be used.  HCP used 'gender' to refer to sex at birth, too.  Standardize on 'sex'.
singletonsHCP.loc[singletonsHCP['rename']=='gender','rename']='sex'

#Streamline the rosetta vars and make sure there are 5 for each structure (except ndar, which will have the
#family var, too).  As for the pipeline structures: blank out the source flags and local annotation of the
#required variables (they incorrectly pull labels from redcap) and point them to rosetta instead; the rosetta
#annotation itself is added later, after the singletons and toolbox data annotation are in place.
rosetta_list=['gender','sex','interview_age','interview_date','src_subject_id','subjectkey','family_user_def_id']
cols=['penn_cnp', 'hcpa', 'qint', 'ssaga', 'child', 'teen', 'parent','name', 'func', 'r_form', 'r_section', 'r_type',
       'r_label', 'r_choices', 'request', 'code', 'recode', 'old_code',   
       'specialty_code']

singletonsHCP['rosetta']=''
#'rename' is not among the columns being blanked, so the row selection can be computed once up front
is_rosetta=singletonsHCP['rename'].astype(str).isin(rosetta_list)
for i in cols:
    singletonsHCP.loc[is_rosetta,i]=''
singletonsHCP.loc[is_rosetta,'rosetta']='rosetta'

#singletonsHCP.to_csv('tttt.csv')
#check columns
singletonsHCP.columns

In [None]:
#now pull in any available redcap annotation if source is hcpa, child, teen or parent

def _redcap_lookup(source):
    """Return the REDCap dictionary of `source` as a frame: name, r_type, r_label, r_choices, <source> flag."""
    lookup=pd.DataFrame(redcap[source]).transpose().reset_index()
    lookup=lookup.rename(columns={'index':'name','label':'r_label','type':'r_type','choices':'r_choices'})
    lookup=lookup[['name','r_type','r_label','r_choices']]
    lookup[source]=source
    return lookup


def _annotate_from_redcap(source, lookup):
    """Left-merge the singleton rows flagged with `source` onto that source's REDCap annotation."""
    flagged=singletonsHCP.loc[singletonsHCP[source]==source].drop(columns=['r_type','r_label','r_choices'])
    return pd.merge(flagged,lookup,how='left',on=[source,'name'])


redcaphcpa=_redcap_lookup('hcpa')
singletonsHCPa=_annotate_from_redcap('hcpa',redcaphcpa)

redcapchild=_redcap_lookup('child')
singletonsHCPb=_annotate_from_redcap('child',redcapchild)

redcapteen=_redcap_lookup('teen')
singletonsHCPc=_annotate_from_redcap('teen',redcapteen)

redcapparent=_redcap_lookup('parent')
singletonsHCPd=_annotate_from_redcap('parent',redcapparent)

#rows that do not get REDCap annotation: eprime, intradb and rosetta
singletonsHCPe=singletonsHCP.loc[(singletonsHCP.eprime=='eprime') | (singletonsHCP.intradb=='intradb')  | (singletonsHCP.rosetta=='rosetta')]

#the two printed shapes let you compare row counts before and after the split-and-merge
print(singletonsHCP.shape)
singletonsHCPtest=pd.concat([singletonsHCPa,singletonsHCPb,singletonsHCPc,singletonsHCPd,singletonsHCPe],axis=0)
print(singletonsHCPtest.shape)

singletonsHCPtest.columns
#singletonsHCPtest.to_csv('ssss.csv')


In [None]:
def _nda_frame(structure):
    """Return the NDA definition of `structure` as a frame with one row per element ('rename')."""
    frame=pd.DataFrame(nda[structure]).transpose().reset_index().rename(columns={'index':'rename'})
    frame=frame[['type','rename','description','notes','range','alias']]
    frame['struct']=structure
    return frame


#NDA annotation for the four singleton structures, stacked into one lookup table
nda1,nda2,nda3,nda4=[_nda_frame(s) for s in ('edinburgh_hand01','facename01','ndar_subject01','deldisk01')]
nda_anno=pd.concat([nda1,nda2,nda3,nda4],axis=0)
nda_anno

In [None]:
singletonsHCPtest.columns
#attach the NDA annotation (type, description, notes, range, alias) to every singleton row
singletonsHCPtest2=pd.merge(singletonsHCPtest,nda_anno,how='left',on=['struct','rename'])
singletonsHCPtest2.columns
#NOTE(review): writes a scratch file 'uuuu.csv' into the working directory on every run
singletonsHCPtest2.to_csv('uuuu.csv')


#copy the NDA facename var descriptions over to the local annotation 
#r_form is missing for all the rosetta vars, so you can use it as a pull indicator
#(the right-hand side is the whole description column; .loc only writes the rows selected on the left)
singletonsHCPtest2.loc[singletonsHCPtest2.r_form=='Face Name','r_label']=singletonsHCPtest2.description

#copy the NDA var descriptions for a few other special cases;
singletonsHCPtest2.loc[singletonsHCPtest2.name=='hcp_handedness_score','r_label']=singletonsHCPtest2.description

#now get the eprime special annotation
#NOTE(review): str.contains yields NaN for missing names, which cannot be used as a mask -- assumes 'name' has no NaN here
singletonsHCPtest2.loc[singletonsHCPtest2.name.str.contains('ddisc'),'r_label']=singletonsHCPtest2.description

#hand-written annotation for the delay discounting version variable
singletonsHCPtest2.loc[(singletonsHCPtest2.name=='version_form') & (singletonsHCPtest2.struct=='deldisk01'),'r_label']="DELAY_3.5 or PennCNP"
singletonsHCPtest2.loc[(singletonsHCPtest2.name=='version_form') & (singletonsHCPtest2.struct=='deldisk01'),'code']="return 'DELAY_3.5'"

#comqother has no local variable; its value is hardcoded
singletonsHCPtest2.loc[singletonsHCPtest2['rename']=='comqother','name']=''
singletonsHCPtest2.loc[singletonsHCPtest2['rename']=='comqother','code']="return 'subject about self'"
#singletonsHCPtest2.to_csv('uuuu.csv')



In [None]:
#now concatenate these two sources of annotation (moises and singletons) and do a little cleanup
#columns present in only one of the two frames are filled with NaN for the rows of the other
#print(singletonsHCPtest2.columns)
#print(dfnew.columns)

NonTLBX=pd.concat([singletonsHCPtest2,dfnew],axis=0)
#dfnew.sort_values(by=['collection','struct']).to_csv('moisesxwalk2.csv',index=False)
NonTLBX.columns


In [None]:
#beautify column names: nda_* for NDA-side fields, hcp_* for local fields.
#The source flag columns (hcpa, teen, ...) keep their names; the commented-out entries below are
#possible display names for them that are not applied.
NonTLBX = NonTLBX.rename(columns={
        'name':'hcp_variable_name', 
        'rename': 'nda_element',
        'struct':'nda_structure',
        'request':'nda_request',
         #func,code,recode
        #'hcpa':'REDCap7 HCPA',
        'type':'nda_type',
        'description':'nda_description', 
        'notes':'nda_notes',
        'alias':'nda_aliases',
        'range':'nda_range',
        'r_label':'hcp_label',
        'r_choices':'choices_calcs'
         #'penn_cnp': 'Box Curated PennCNP',
        #'qint': 'REDCap9 Qinteractive', 
        #'ksads': 'REDCap9 KSADs',
        #'teen':'REDCap7 HCPD-18', 
        #'child':'REDCap7 HCPD-child',
        #'parent':'REDCap7 HCPD-Parent', 
        #'ssaga':'REDCap7 HCPA-SSAGA'
   })
#check the result
NonTLBX.columns

In [None]:
# Make the crosswalk readable: build one 'source' string, tidy the REDCap
# form/section text, and fill in local names/labels for NDA elements that have
# no REDCap counterpart (rosetta identifiers and required "dummy" elements).

# 'source' is the concatenation of the per-source flag columns (blank when unset)
source_flag_columns = ['rosetta', 'hcpa', 'intradb', 'ssaga', 'penn_cnp', 'qint',
                       'teen', 'child', 'parent', 'eprime']
source_label = NonTLBX[source_flag_columns[0]].fillna('')
for flag_column in source_flag_columns[1:]:
    source_label = source_label + NonTLBX[flag_column].fillna('')
NonTLBX['source'] = source_label

# strip HTML tags from the section text; regex=True must be explicit because
# pandas >= 2.0 treats the pattern as a literal string by default
NonTLBX['r_section'] = NonTLBX['r_section'].str.replace('<[^<]+?>', '', regex=True)
# instrument = "form:section", without a dangling ':' when either part is blank
NonTLBX['hcp_instrument'] = (NonTLBX['r_form'].fillna('') + ":" + NonTLBX['r_section'].fillna('')).str.strip(":")


#create hcp_variable_names and labels and Instruments for Rosetta vars
rosetta_list=['sex','interview_age','interview_date','src_subject_id','subjectkey','family_user_def_id']
rosetta_local=['nda_gender','nda_interview_age','nda_interview_date','subjectped','nda_guid','final_pedid']
for nda_name, local_name in zip(rosetta_list, rosetta_local):
    NonTLBX.loc[NonTLBX['nda_element'] == nda_name, 'hcp_variable_name'] = local_name

rosetta_labels = {
    'sex': 'sex at birth',
    'interview_age': 'age in months',
    'interview_date': 'RedCap event registration date when copied to IntraDB (rounded down to nearest Quarter)',
    'src_subject_id': 'HCA or HCD subject id',
    'subjectkey': 'Pseudo-Guid',
    'family_user_def_id': 'family identifier for related subjects within and across HCA and HCD studies',
}
for nda_name, label in rosetta_labels.items():
    NonTLBX.loc[NonTLBX['nda_element'] == nda_name, 'hcp_label'] = label

# rosetta rows come from the rosetta csv rather than a REDCap instrument
NonTLBX.loc[NonTLBX['rosetta'] == 'rosetta', 'hcp_instrument'] = 'UnrelatedHCAHCD_w_STG_Image_and_pseudo_GUID12_11_2020.csv'

# NDA elements with no local source variable are flagged with the name 'dummy'
NonTLBX.loc[NonTLBX['nda_element'] == 'comqother', 'hcp_variable_name'] = 'dummy'
NonTLBX.loc[NonTLBX['nda_element'] == 'comqother', 'hcp_label'] = 'Respondent (subject AND object) for stacked vs wide structure schema clarification'

NonTLBX.loc[(NonTLBX['nda_element'] == 'version_form') & (NonTLBX['nda_structure'] == 'bisbas01'), 'hcp_label'] = 'dummy variable: there is no version'

# element -> structures in which that element is a dummy
dummy_in_structures = {
    'respond': ['eatq01', 'pds01', 'srs02'],
    'version_form': ['bisbas01', 'cbcl01', 'gbi01', 'mctq01', 'medh01',
                     'phenx_su01', 'srs02', 'ysr01', 'upps01'],
    'versionchildadult': ['trail_ca01'],
}
for element, structures in dummy_in_structures.items():
    is_dummy_here = (NonTLBX['nda_element'] == element) & NonTLBX['nda_structure'].isin(structures)
    NonTLBX.loc[is_dummy_here, 'hcp_variable_name'] = 'dummy'

# elements that are dummies in whichever structure they appear
moredummies=['cbcl_activities','cbcl_activities_raw','cbcl_adhd_raw','cbcl_affective','cbcl_affective_raw','cbcl_affective_raw','cbcl_anxiety_raw','cbcl_depresspr','cbcl_depresspr_raw','cbcl_emotional','cbcl_emotional_raw','cbcl_emotional_raw','cbcl_ocd','cbcl_ocd_raw','cbcl_oppositional_raw','cbcl_pervasive','cbcl_pervasive_raw','cbcl_pervasive_raw','cbcl_ptsd','cbcl_ptsd_raw','cbcl_school','cbcl_school_raw','cbcl_sct','cbcl_sct_raw','cbcl_sleep','cbcl_sleep_raw','cbcl_social_c','cbcl_social_c_raw','cbcl_total_c','cbcl_total_c_raw','cbcl_withdrawn_raw','pds_boy_rs','pds_girl_rs','phenotype','ravlt_delt','ravlt_disct','ravlt_tott','respond_detail','seizures','sports_time1','sports_time2','sports_time3','sports_well1','sports_well2','sports_well3','phenotype','phenotype_description','family_study','twins_study','sibling_study','sample_taken']
NonTLBX.loc[NonTLBX['nda_element'].isin(moredummies), 'hcp_variable_name'] = 'dummy'


NonTLBX['nda_structure_link'] = "https://nda.nih.gov/data_structure.html?short_name=" + NonTLBX['nda_structure']

# keep only the columns that go into the crosswalk; .copy() so the .loc
# assignments below act on an independent frame rather than a slice
NonTLBX = NonTLBX[['collection', 'source',
                   'nda_element', 'nda_structure', 'nda_description', 'nda_notes',
                   'nda_range', 'nda_structure_link', 'nda_request', 'hcp_variable_name', 'hcp_label',
                   'hcp_instrument', 'choices_calcs', 'func', 'code', 'recode']].copy()

# concatenated source flags read better with a separator
NonTLBX['source'] = NonTLBX['source'].replace({
    'teenparent': 'teen/parent',
    'teenchild': 'teen/child',
    'childparent': 'child/parent',
    'teenchildparent': 'teen/child/parent',
})


#remove any remaining html from the redcap fields
NonTLBX['hcp_label'] = NonTLBX['hcp_label'].str.replace('<[^<]+?>', '', regex=True)

#for the dummy vars in ndar_subjects, give hcp_label the ndar_label
ndar_subject_labels = {
    'phenotype': 'Phenotype/diagnosis for the subject',
    'phenotype_description': 'Description of the phenotype for the subject',
    'family_study': 'Was it family study? Study of biological mother, biological father and/or sibling of proband.',
    'twins_study': 'Is this study of twins?',
    'sibling_study': 'Was it sibling study? Study of sibling(s) of autistic child.',
    'sample_taken': 'Was a sample taken at this interview/during this project time',
}
for element, label in ndar_subject_labels.items():
    NonTLBX.loc[(NonTLBX['nda_structure'] == 'ndar_subject01') & (NonTLBX['nda_element'] == element), 'hcp_label'] = label

In [None]:
# Give every dummy variable that still has no label a generic one, based on
# the transform code it returns.
is_unlabeled_dummy = (NonTLBX['hcp_variable_name'] == 'dummy') & NonTLBX['hcp_label'].isnull()
is_placeholder_code = NonTLBX['code'].isin(['return 999', 'return -98'])
# na=False: rows with no transform code are simply "no match", which keeps the
# mask a plain boolean Series
has_return_code = NonTLBX['code'].str.contains('return', na=False)

NonTLBX.loc[is_unlabeled_dummy & is_placeholder_code, 'hcp_label'] = 'dummy var required but na'
# the masks were computed before the assignment above, so the placeholder rows
# are excluded explicitly rather than relying on statement order
NonTLBX.loc[is_unlabeled_dummy & ~is_placeholder_code & has_return_code, 'hcp_label'] = 'dummy var - see return in trans_code or trans_nda_request'


In [None]:

# rows that still have no hcp_label after the fills above
NonTLBX.loc[NonTLBX['hcp_label'].isnull()]
#NonTLBX.loc[(NonTLBX.nda_structure=='cbcl01') & (NonTLBX.hcp_variable_name=='dummy') & (NonTLBX.code=='return 999'),'hcp_label']#='dummy var required but missing'
#NonTLBX.loc[(NonTLBX.nda_structure=='cbcl1_501') & (NonTLBX.hcp_variable_name=='dummy') & (NonTLBX.code=='return 999'),'hcp_label']#='dummy var required but missing'
#NonTLBX.loc[(NonTLBX.nda_structure=='ds01

In [None]:
# write the non-Toolbox crosswalk, ordered by collection then structure, with
# the snapshot date in the file name
nontlbx_sorted = NonTLBX.sort_values(by=['collection', 'nda_structure'])
nontlbx_sorted.to_csv('Crosswalk_HCP_NonTLBX_' + snapshotdate + '.csv', index=False)

In [None]:
# Load the Toolbox crosswalk and reshape it so it can be concatenated with the
# non-Toolbox annotation. Collection, source and nda_elements come from the
# prepared structures later on; the NDA annotation stored in this file is
# dropped here and pulled in fresh further down.
crosswalkpath="/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/Ipad2NDA_withCrosswalk/NIHToolbox2NDA/"
cfile="Crosswalk_NIH_Toolbox_2_NDA.csv"
crosswalk = pd.read_csv(crosswalkpath + cfile, header=0, low_memory=False, encoding="ISO-8859-1")

unused_columns = ['Measurement System', 'validated', 'DataType', 'specialty_code', 'template',
                  'inst_short', 'Source', 'description', 'valueRange', 'notes']
crosswalk = crosswalk.drop(columns=unused_columns)

# description / valueRange / notes were just dropped, so only the remaining
# columns need their crosswalk-wide names
crosswalk = crosswalk.rename(columns={
    'Responses': 'choices_calcs',
    'Inst': 'hcp_instrument',
    'hcp_variable': 'hcp_variable_name',
    'action_requested': 'nda_request',
    'requested_python': 'func',
})

# label = "Domain:Item ID:Stem:Context" with leading/trailing ':' removed
label_parts = [crosswalk[part].fillna('') for part in ['Domain', 'Item ID', 'Stem', 'Context']]
joined_label = label_parts[0]
for part in label_parts[1:]:
    joined_label = joined_label + ":" + part
crosswalk['hcp_label'] = joined_label.str.strip(":")
# fall back to the variable name when all four parts are blank
has_no_label = crosswalk['hcp_label'].fillna('') == ''
crosswalk.loc[has_no_label, 'hcp_label'] = crosswalk['hcp_variable_name']
crosswalk = crosswalk.drop(columns=['Domain', 'Item ID', 'Stem', 'Context'])

crosswalk['source'] = 'NIH Toolbox'
# keep only rows that map to an NDA element; shapes printed before and after
print(crosswalk.shape)
crosswalk = crosswalk.loc[crosswalk['nda_element'].notnull()]
print(crosswalk.shape)
crosswalk.columns

#crosswalk.to_csv('test.csv',index=False)

In [None]:
# distinct instruments in the crosswalk: will need to merge into output
instruments = crosswalk['hcp_instrument'].unique().tolist()
#filename='HCPA_Anger-Affect_prang01_12_12_2020.csv'
#row = pd.read_csv(hcapreppedpath+filename,header=1, low_memory=False,nrows=1)
#inst=row.version_form[0]
#print(row.version_form)

In [None]:
# Inventory of the variables the TLBX pipeline uploaded: one row per column of
# every prepared file, for HCA and HCD. (The uploaded names should have been
# nda_elements; they were not, and were fixed -- worth re-checking here.)

def _prepped_structure_elements(prepped_path, collection):
    """List every column of every prepared upload file found in a folder.

    Parameters
    ----------
    prepped_path : str
        Folder holding the prepared csv files. The first line of each file is
        read as (structure short name, version number) and the second line as
        the column header.
    collection : str
        Value written to the 'collection' column of the result (e.g. 'hca').

    Returns
    -------
    pandas.DataFrame
        Columns hcp_variable_upload, nda_structure, hcp_instrument, collection;
        one row per column of each file.
    """
    elems = []
    structures = []
    instruments_ = []
    # sorted so the row order does not depend on the filesystem's listing order
    for filename in sorted(os.listdir(prepped_path)):
        filepath = os.path.join(prepped_path, filename)
        first_line = pd.read_csv(filepath, header=None, low_memory=False, nrows=1)
        # e.g. 'cogcomp' + '0' + '1' -> 'cogcomp01'
        structure = str((first_line[0] + '0' + first_line[1].astype(str))[0])
        # a single data row gives both the column names and version_form, so the
        # file does not need to be parsed again in full
        first_row = pd.read_csv(filepath, header=1, low_memory=False, nrows=1)
        if structure == 'cogcomp01':
            # composite scores are labelled by hand instead of via version_form
            inst = 'Cognition Composite Scores'
        else:
            inst = first_row.version_form[0]
        file_elems = first_row.columns.to_list()
        elems.extend(file_elems)
        structures.extend([structure] * len(file_elems))
        instruments_.extend([inst] * len(file_elems))

    inventory = pd.DataFrame({'hcp_variable_upload': elems,
                              'nda_structure': structures,
                              'hcp_instrument': instruments_})
    inventory['collection'] = collection
    return inventory


a = _prepped_structure_elements(hcapreppedpath, 'hca')
print(a.shape)

d = _prepped_structure_elements(hcdpreppedpath, 'hcd')
print(d.shape)

tlbxstructs = pd.concat([a, d], axis=0)
tlbxstructs.head()

In [None]:
# Attach the Toolbox crosswalk to every uploaded column, matching on instrument
# and uploaded variable name. The crosswalk's own nda_structure is dropped, so
# the nda_structure kept is the one read from the prepared files.
crosswalk_without_structure = crosswalk.drop(columns='nda_structure')
uploadwcross = tlbxstructs.merge(crosswalk_without_structure, how='left',
                                 on=['hcp_instrument', 'hcp_variable_upload'])
#uploadwcross.to_csv('uuuu.csv')  #whew - only the rosetta elements are missing plus 3 from the tlbx_empbeh01

# fill in missing nda_element for next merge: identifier columns map to
# themselves, except the uploaded 'gender' which maps to 'sex'
upload_to_nda_element = {
    'subjectkey': 'subjectkey',
    'src_subject_id': 'src_subject_id',
    'interview_age': 'interview_age',
    'interview_date': 'interview_date',
    'gender': 'sex',
}
for upload_name, nda_name in upload_to_nda_element.items():
    uploadwcross.loc[uploadwcross['hcp_variable_upload'] == upload_name, 'nda_element'] = nda_name

#fill in the blanks
uploadwcross['nda_structure_link'] = 'https://nda.nih.gov/data_structure.html?short_name=' + uploadwcross['nda_structure']
uploadwcross['source'] = 'NIH Toolbox Ipad App'
#uploadwcross.to_csv('uuuu.csv')
uploadwcross.columns
 

In [None]:
# (rows, columns) of the merged upload-by-crosswalk table
uploadwcross.shape

In [None]:
#now get the updated annotation

import requests
import json
# distinct NDA structures among the uploaded files; annotation is fetched per structure
structs = tlbxstructs['nda_structure'].unique().tolist()
#for a given structure (shortname), grab all the metadata for a list of elements as a dataframe
def getNDAdetails(structure_name='ndar_subject01',crosswalk=tlbxstructs):
    """Fetch NDA data-dictionary metadata for the elements of one structure.

    Parameters
    ----------
    structure_name : str
        NDA structure short name, inserted into the data-dictionary API URL.
    crosswalk : pandas.DataFrame
        Must have 'nda_structure' and 'nda_element' columns; only elements
        listed for ``structure_name`` are returned. NOTE(review): tlbxstructs
        as built in this notebook has no 'nda_element' column, so the default
        looks unusable -- pass a frame that has it (e.g. uploadwcross).

    Returns
    -------
    pandas.DataFrame
        One row per matching element, with columns nda_element,
        nda_description, nda_range, nda_notes, aliases, nda_type and
        nda_structure.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the API does not answer within the timeout.
    """
    varlist = list(crosswalk.loc[crosswalk.nda_structure == structure_name, 'nda_element'].unique())
    # timeout + raise_for_status: fail with a clear network error rather than
    # hanging, or tripping over an error page while parsing JSON below
    r = requests.get('https://ndar.nih.gov/api/datadictionary/datastructure/{}'
                     .format(structure_name),
                     headers={'Accept': 'application/json'},
                     timeout=60)
    r.raise_for_status()
    structure = r.json()
    df = pd.DataFrame(structure['dataElements'])
    df2 = df[['name', 'description', 'valueRange', 'notes', 'aliases', 'type']].copy()
    dfxwalk = df2.loc[df2.name.isin(varlist)].copy()
    dfxwalk['nda_structure'] = structure_name
    # 'aliases' is intentionally left under its API name
    dfxwalk = dfxwalk.rename(columns={'name': 'nda_element', 'description': 'nda_description', 'notes': 'nda_notes', 'type': 'nda_type', 'valueRange': 'nda_range'})
    return dfxwalk

# Fetch the annotation one structure at a time and stack the results once at
# the end, instead of re-concatenating a growing frame on every iteration.
structure_annotations = [getNDAdetails(structure_name=struct_name, crosswalk=uploadwcross)
                         for struct_name in structs]
# pd.concat rejects an empty list, so keep the empty-frame result when there are no structures
nda_anno = pd.concat(structure_annotations, axis=0) if structure_annotations else pd.DataFrame()

print(nda_anno.shape)
# spot-check a single element
nda_anno.loc[nda_anno.nda_element=='dccsmixed_shape_repeat14']
#now merge this annotation with the tlbxstructs (lots of repeats due to stacking of vars via 'inst' : i.e. more structs uploaded than will be downloaded)

In [None]:
# Left-join the fresh NDA annotation onto the uploaded-variable table; the
# shape is printed before and after so any row duplication from the join is visible.
print(uploadwcross.shape)
TLBX = uploadwcross.merge(nda_anno, how='left', on=['nda_element', 'nda_structure'])
print(TLBX.shape)
#TLBX.to_csv('test3.csv',index=False)
TLBX['nda_structure'].unique().tolist()

In [None]:
#fix rosetta var annotation
rosetta_list=['gender','sex','interview_age','interview_date','src_subject_id','subjectkey','family_user_def_id']
cols=['source', 'func', 'hcp_label', 'hcp_instrument','choices_calcs', 'nda_request']

# blank the Toolbox-derived annotation on the identifier rows, then tag them as rosetta
is_rosetta = TLBX['nda_element'].astype(str).isin(rosetta_list)
for column in cols:
    TLBX.loc[is_rosetta, column] = ''
TLBX.loc[is_rosetta, 'source'] = 'rosetta'

TLBX.loc[TLBX['nda_element'] == 'gender', 'nda_element'] = 'sex'


# these identifier rows get their annotation replaced from rosetta_anno.csv
reannotated_elements = ['sex', 'interview_age', 'interview_date', 'src_subject_id', 'subjectkey']
gets_new_annotation = TLBX['nda_element'].isin(reannotated_elements)

drops=['nda_description','nda_notes','nda_range','nda_structure_link','nda_request','hcp_variable_name','hcp_label','hcp_instrument']
newanno = pd.read_csv('./DataDictionaries/rosetta_anno.csv')

# drop the old annotation columns first so the merge brings in newanno's versions
needsanno = TLBX.loc[gets_new_annotation].drop(columns=drops)
needsanno = needsanno.merge(newanno, how='left', on=['nda_element'])
needsanno.to_csv('testit.csv')
hasanno = TLBX.loc[~gets_new_annotation]

TLBXbetter = pd.concat([needsanno, hasanno], axis=0)
TLBXbetter.to_csv('test.csv')

tlbx_columns = ['collection', 'source',
                'nda_element', 'nda_structure', 'nda_description', 'nda_notes',
                'nda_range', 'nda_structure_link', 'nda_request', 'hcp_variable_name', 'hcp_label',
                'hcp_instrument', 'choices_calcs', 'func']
TLBXbetter2 = TLBXbetter[tlbx_columns]

# Toolbox rows first, then the non-Toolbox rows, in the final column order
crosswalk_columns = ['collection',
                     'nda_element', 'nda_structure', 'nda_description', 'nda_notes',
                     'nda_range', 'hcp_variable_name', 'hcp_label',
                     'hcp_instrument', 'source', 'choices_calcs', 'nda_request', 'func', 'code', 'recode', 'nda_structure_link']
Crosswalk = pd.concat([TLBXbetter2, NonTLBX], axis=0)[crosswalk_columns]

In [None]:
# swap the short collection codes for the collection titles
collection_titles = {'hca': 'C-2847 (HCP-A)', 'hcd': 'C-2846 (HCP-D)'}
for short_code, title in collection_titles.items():
    Crosswalk.loc[Crosswalk['collection'] == short_code, 'collection'] = title

# the transform columns are exported with a trans_ prefix
Crosswalk = Crosswalk.rename(columns={'func': 'trans_func', 'code': 'trans_code', 'recode': 'trans_recode'})

# height and weight labels carry their units
unit_labels = {'height': "Height: ft'in'", 'weight': "Weight: LBS"}
for variable_name, label in unit_labels.items():
    Crosswalk.loc[Crosswalk['hcp_variable_name'] == variable_name, 'hcp_label'] = label

crosswalk_file = 'Crosswalk_Lifespan_Behavioral_2.0_' + snapshotdate + '.csv'
Crosswalk.sort_values(by=['collection', 'nda_structure']).to_csv(crosswalk_file, index=False)

In [None]:
#structure_var_stats=df.groupby(['C-2847 (HCP-A)', 'C-2846 (HCP-D)','nda_structure']).count()[['hcp_variable_name']]

In [None]:
# NOTE(review): structure_var_stats is not assigned anywhere in the visible part
# of this notebook; the only candidate definition is the commented-out groupby
# in the previous cell. Confirm where it comes from -- on a fresh kernel this
# cell presumably fails with NameError.
stats_file = 'Collection_by_Structure_' + snapshotdate + '.csv'
structure_var_stats = structure_var_stats.rename(columns={'hcp_variable_name': 'number of HCP variables'})
# index=True keeps the index in the csv
structure_var_stats.to_csv(stats_file, index=True)