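This notebook pulls all the maps from the various sources (NDA definitions, REDCap data dictionaries, per-collection mapping files, and the prepared upload structures) into one common annotation file.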

In [None]:
from ccf.easy_yaml import EasyYaml
from ccf.redcap import RedcapTable

In [None]:
import yaml
import pandas as pd
import os, datetime
# Date stamp (MM_DD_YYYY) appended to the names of the CSV files written at the end of the notebook.
snapshotdate = datetime.date.today().strftime('%m_%d_%Y')


In [None]:
# Loader instance from ccf.easy_yaml; the cells below call it as Y(path) to read each YAML file.
Y = EasyYaml()

In [None]:
# Load the NDA maps: one entry per file in ./nda/, keyed by the file name with
# its last five characters removed (presumably the ".yaml" suffix - TODO confirm).
# NOTE(review): os.listdir() also returns hidden entries (e.g. .ipynb_checkpoints);
# every entry is handed to Y() unfiltered.
nda = {
    entry[:-5]: Y('./nda/' + entry)
    for entry in os.listdir('./nda/')
}

In [None]:
# Load the REDCap data dictionary annotation: one entry per file in
# ./definitions/, keyed by the file name with its last five characters removed
# (presumably the ".yaml" suffix - TODO confirm).
redcap = {
    entry[:-5]: Y('./definitions/' + entry)
    for entry in os.listdir('./definitions/')
}

In [None]:
#get the set of prepared structures
def _prepped_structure_index(folder, flag_column, collection):
    """List every (element, structure) pair found in the prepared CSVs of ``folder``.

    Each file in ``folder`` is read with ``header=1`` (the second line supplies
    the column names). The structure name is the file name minus its last four
    characters. Every listed pair gets a 1 in ``flag_column`` and the given
    ``collection`` label. The number of pairs and the frame shape are printed.
    """
    elements = []
    structures = []
    for entry in os.listdir(folder):
        header_names = pd.read_csv(folder + entry, header=1).columns.to_list()
        elements.extend(header_names)
        structures.extend([entry[:-4]] * len(header_names))

    flags = [1] * len(structures)
    print(len(flags))

    table = pd.DataFrame({'nda_element': elements,
                          'nda_structure': structures,
                          flag_column: flags})
    table['collection'] = collection
    print(table.shape)
    return table


# a / d are merged onto the crosswalk later as "was this element uploaded" indicators.
a = _prepped_structure_index('./prepped/hca/', 'C-2847 (HCP-A)', 'hca')
d = _prepped_structure_index('./prepped/hcd/', 'C-2846 (HCP-D)', 'hcd')

#a.head()
d.head()

In [None]:
#strip html from redcap annotation
import re
def html_stripper(text):
    """Return ``text`` with every ``<...>`` tag removed; ``None`` is passed through as ``None``."""
    if text is not None:
        return re.sub('<[^<]+?>', '', text)
    return None
    
# Carry each (HTML-stripped) section header forward to the fields that follow
# it within the same form. The carried value is replaced whenever the form
# changes (even by None) or a field supplies its own section.
for fields in redcap.values():
    active_form = None
    active_section = None
    for field in fields.values():
        field_form = field.get('form')
        field_section = html_stripper(field.get('section'))
        if field_form != active_form or field_section is not None:
            active_section = field_section
        active_form = field_form
        if active_section is not None:
            field['section'] = active_section

In [None]:
#db = []
#directory = './maps/' + 'hcd'
#for filename in ['bisbas01.yaml']: #os.listdir(directory):
#    struct = filename[:-5]
#    elements = Y(os.path.join(directory, filename))['elements']
#    for e in elements:
#        e['struct'] = struct
#        e['collection'] = collection
#    db.extend(elements)

In [None]:
#put them together
# Every sub-directory of ./maps/ is a collection; every file inside it holds
# the 'elements' list of one structure. Each element is tagged in place with
# the structure (file name minus its last five characters) and the collection.
db = []
for collection in os.listdir('./maps/'):
    collection_dir = os.path.join('./maps/', collection)
    for map_file in os.listdir(collection_dir):
        mapped = Y(os.path.join(collection_dir, map_file))['elements']
        for entry in mapped:
            entry['struct'] = map_file[:-5]
            entry['collection'] = collection
        db += mapped

In [None]:
from copy import deepcopy

def flattened_str(f, x):
    """Look up key ``f`` in mapping ``x``; a list value is joined with '/'.

    Returns ``None`` when the key is absent; any non-list value is returned unchanged.
    """
    value = x.get(f)
    return '/'.join(value) if type(value) is list else value


# Expand every mapping entry into one row per source configuration.
#   * 'source' given as a string -> one row, with a column named after the source.
#   * 'source' given as a list   -> one shared row carrying all plain-string
#     sources, followed by one extra row per {source: overrides} dict.
# Any other 'source' type produces no row.
# NOTE(review): 'input'/'output' are computed before the per-source overrides
# are applied, so an override of 'name' or 'rename' does not change them -
# confirm this is intended.
elongated = []
for original in db:
    base = deepcopy(original)
    base['output'] = flattened_str('rename', base)
    base['input'] = flattened_str('name', base)

    sources = base.pop('source')

    if type(sources) is str:
        base[sources] = sources
        elongated.append(base)
        continue
    if type(sources) is not list:
        continue

    shared_row = deepcopy(base)
    override_rows = []
    for source in sources:
        if type(source) is str:
            # e.g., shared_row['parent'] = "parent"
            shared_row[source] = source
        elif type(source) is dict:
            sourcename, overrides = source.popitem()
            row = deepcopy(base)
            row[sourcename] = sourcename
            row.update(overrides)
            override_rows.append(row)

    # The shared row always comes first, then the override rows in source order.
    elongated.append(shared_row)
    elongated.extend(override_rows)
                

In [None]:
#elongated
#test=RedcapTable.get_table_by_name(parent)#.download_datadictionary('definitions')

In [None]:
# Annotate every elongated row with the NDA definition of its output element
# and the REDCap dictionary entry of its (first) input variable.
for i in elongated:
    # add nda info
    # A structure with no NDA definition file is skipped, exactly like an
    # element missing from an existing definition (previously a KeyError).
    nda_elements = nda.get(i['struct']) or {}
    output_name = i.get('output')
    if output_name is not None and output_name in nda_elements:
        x = nda_elements[output_name]
        i['type'] = x['type']
        i['description'] = x['description']
        i['notes'] = x.get('notes')
        i['alias'] = x.get('alias')
        if 'range' in x:
            i['range'] = '; '.join(list(map(str, x['range'])))

    # add redcap info
    input_name = i.get('input')
    if input_name is None:
        continue
    #only use the first variable (vars are joined with /)
    name = input_name.split('/')[0]
    # First source column set on the row, in this fixed priority order.
    struct = i.get('parent') or i.get('teen') or i.get('child') or i.get('hcpa') or i.get('qint')
    # A source with no loaded REDCap dictionary is skipped (previously a KeyError).
    redcap_fields = redcap.get(struct) or {}
    if name not in redcap_fields:
        continue
    x = redcap_fields[name]
    i['r_form'] = x.get('form')
    i['r_section'] = x.get('section')
    i['r_type'] = x.get('type')
    i['r_label'] = x.get('label')
    i['r_choices'] = x.get('choices')

In [None]:
#special code for annotating 'stacked' data 

In [None]:
# Base names of the "stacked" fields.
renamelist = ['caffeine_s1yn', 'caffeine_s1', 'caffeine_s1pretype', 'caffeine_s1pretime', 'caffeine_s1preday', 'caffeine_s1vsttype1', 'caffeine_s1vsttime1', 'caffeine_s1vsttype2', 'caffeine_s1vsttime2', 'nicotine_s1yn', 'nicotine_s1', 'nicotine_s1pretype', 'nicotine_s1pretime', 'nicotine_s1preday', 'nicotine_s1vsttype1', 'nicotine_s1vsttime1', 'nicotine_s1vsttype2', 'nicotine_s1vsttime2', 'drug1_1', 'drug1_2', 'drug1_3', 'drug1_4', 'drug1_5', 'drug1_6', 'alc_breath1']

# Map every numbered variant (first '1' in the name swapped for 1..6) back to
# its base name; for 1 the name simply maps to itself.
replacements_dict = {
    base_name.replace('1', str(number), 1): base_name
    for number in range(1, 7)
    for base_name in renamelist
}

replacements_dict

In [None]:
# Rename every output listed in replacements_dict to its base name and flag the row.
for row in elongated:
    base_name = replacements_dict.get(row['output'])
    if base_name is None:
        continue
    # set new name
    row['output'] = base_name

    # Indicate that this field is weird
    row['is_stacked'] = True
    ## Alternatives that were considered: write the original name into
    ## row['notes'], or record it in row['code'].

In [None]:
# One DataFrame row per elongated mapping entry; dict keys become the columns.
df = pd.DataFrame(elongated)
# Cell output: the resulting column labels.
df.columns

In [None]:
#subset to nonempty sources (lots of skip logic for other visits...not being sent )
print(df.shape)
# Drop the 'ssaga' collection entirely.
df = df.loc[df.collection != 'ssaga']
print(df.shape)
df = df.rename(columns={"PennCNP": "penn_cnp"})
# Keep only rows where at least one source column is filled in.
source_columns = ['parent', 'child', 'teen', 'penn_cnp', 'qint', 'hcpa']
df = df[df[source_columns].notna().any(axis=1)]
#df.columns


In [None]:
#beautify column names
# (func, code, recode keep their original names)
pretty_names = {
    # crosswalk identity
    'name': 'hcp_variable_name',
    'output': 'nda_element',
    'struct': 'nda_structure',
    # NDA definition fields
    'type': 'nda_type',
    'description': 'nda_description',
    'notes': 'nda_notes',
    'alias': 'nda_aliases',
    'range': 'nda_range',
    # REDCap dictionary fields
    'r_form': 'REDCap Form Name',
    'r_section': 'REDCap Section Header',
    'r_type': 'REDCap type',
    'r_label': 'REDCap Label',
    'r_choices': 'REDCap Choices',
    # source indicator columns
    'hcpa': 'REDCap7 HCPA',
    'child': 'REDCap7 HCPD-child',
    'teen': 'REDCap7 HCPD-18',
    'parent': 'REDCap7 HCPD-Parent',
    'qint': 'REDCap8 Qinteractive',
    'penn_cnp': 'Box Curated PennCNP',
}
df = df.rename(columns=pretty_names)
df.columns

In [None]:
#now pull in indicator for whether or not nda variable is in the output structure uploaded to nda
# Left joins: rows without a match in a / d get NaN in the corresponding indicator column.
merge_keys = ['collection', 'nda_structure', 'nda_element']
for uploaded in (a, d):
    df = df.merge(uploaded, how='left', on=merge_keys)

#fix a hcp_variable_name assignment issue whereby missings got dragged from rows above
#df.loc[df.nda_structure=='bisbas01','hcp_variable_name']=df.name
#df.loc[df.nda_structure=='bsc01','hcp_variable_name']=df.name
#df.loc[df.nda_structure=='eatq01','hcp_variable_name']=df.name
#df.loc[df.nda_structure=='gbi01','hcp_variable_name']=df.name
#df.loc[df.nda_structure=='pds01','hcp_variable_name']=df.name

In [None]:
#specialty code to add in ndar_subjects, edinburgh handedness, and eprime singleton structure annotations
# Directory holding the prepared singleton structures. The default is a
# machine-specific absolute path; set CCF_SINGLETONS_DIR to point elsewhere.
singletons = os.environ.get(
    'CCF_SINGLETONS_DIR',
    "/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/prepped_singleton_structures")

# keyword in the file name -> (nda_structure, REDCap form, REDCap section).
# All keywords are tested in this order and the LAST match wins, which is what
# the original sequence of independent `if` tests did.
hca_singleton_kinds = [
    ('edinburgh', ('edinburgh_hand01', 'Intake Interview 2', 'Handedness')),
    ('ndar', ('ndar_subject01', 'Register Subject', '')),
]

#get the set of prepared structures
prepped_hca_elems = []
prepped_hca_structs = []
prepped_hca_rforms = []
prepped_hca_rsection = []

for filename in os.listdir(singletons):
    if 'HCPA' not in filename:
        continue
    print(filename)

    labels = None
    for keyword, candidate in hca_singleton_kinds:
        if keyword in filename:
            labels = candidate
    if labels is None:
        # Previously such a file silently inherited the labels of the file
        # processed before it (or raised NameError if it came first).
        print('skipping unrecognized singleton file: ' + filename)
        continue
    struc, rform, rsection = labels

    # header=1: the second line of the file supplies the column names.
    els = pd.read_csv(os.path.join(singletons, filename), header=1).columns.to_list()
    prepped_hca_elems.extend(els)
    prepped_hca_structs.extend([struc] * len(els))
    prepped_hca_rforms.extend([rform] * len(els))
    prepped_hca_rsection.extend([rsection] * len(els))

singlehca = pd.DataFrame({
    'nda_element': prepped_hca_elems,
    'nda_structure': prepped_hca_structs,
    'REDCap Form Name': prepped_hca_rforms,
    'REDCap Section Header': prepped_hca_rsection,
})
singlehca['REDCap7 HCPA'] = 'hcpa'
singlehca['hcp_variable_name'] = singlehca['nda_element']
singlehca['C-2847 (HCP-A)'] = 1
singlehca['collection'] = 'hca'

# Elements whose HCP variable name differs from the NDA element name.
for nda_name, hcp_name in {'race': 'racial', 'ethnic_group': 'ethnic'}.items():
    singlehca.loc[singlehca.nda_element == nda_name, 'hcp_variable_name'] = hcp_name

# Rows still carrying an 'iihand*' name get the corresponding NDA element name.
handedness_elements = {
    'iihandwr': 'writing',
    'iihandth': 'throwing',
    'iihandsc': 'scissors',
    'iihandto': 'toothbrush',
    'iihandkn': 'knife_no_fork',
    'iihandsp': 'spoon',
    'iihandbr': 'broom',
    'iihandma': 'match',
    'iihandbo': 'box',
    'iihandfk': 'foot',
    'iihandey': 'eye',
}
for hcp_name, nda_name in handedness_elements.items():
    singlehca.loc[singlehca.hcp_variable_name == hcp_name, 'nda_element'] = nda_name

singlehca['request'] = 'structure created in separate HCA_ndar_edinburgh_*.ipynb notebook'

# These elements get a blank HCP variable name and a blank REDCap form.
no_hcp_source = singlehca.nda_element.isin(['phenotype', 'phenotype_description', 'twins_study',
                                            'sibling_study', 'family_study', 'sample_taken'])
singlehca.loc[no_hcp_source, 'hcp_variable_name'] = ''
singlehca.loc[no_hcp_source, 'REDCap Form Name'] = ''
singlehca.loc[singlehca.nda_element == 'family_user_def_id', 'hcp_variable_name'] = 'final_pedid'
#singlehca

In [None]:
# Singleton structure annotations for HCP-D (files in `singletons` whose name contains 'HCPD').

# keyword in the file name -> (nda_structure, REDCap form, REDCap section).
# All keywords are tested in this order and the LAST match wins, which is what
# the original sequence of independent `if` tests did.
hcd_singleton_kinds = [
    ('edinburgh', ('edinburgh_hand01', 'Intake Interview 2', 'Handedness')),
    ('ndar', ('ndar_subject01', 'Register Subject', '')),
    ('eprime', ('deldisk01', '', '')),
]

prepped_hcd_elems = []
prepped_hcd_structs = []
prepped_hcd_rforms = []
prepped_hcd_rsection = []

for filename in os.listdir(singletons):
    if 'HCPD' not in filename:
        continue
    print(filename)

    labels = None
    for keyword, candidate in hcd_singleton_kinds:
        if keyword in filename:
            labels = candidate
    if labels is None:
        # Previously such a file silently inherited the labels of the file
        # processed before it (or raised NameError if it came first).
        print('skipping unrecognized singleton file: ' + filename)
        continue
    struc, rform, rsection = labels

    # header=1: the second line of the file supplies the column names.
    els = pd.read_csv(singletons + '/' + filename, header=1).columns.to_list()
    prepped_hcd_elems.extend(els)
    prepped_hcd_structs.extend([struc] * len(els))
    prepped_hcd_rforms.extend([rform] * len(els))
    prepped_hcd_rsection.extend([rsection] * len(els))

singlehcd = pd.DataFrame({
    'nda_element': prepped_hcd_elems,
    'nda_structure': prepped_hcd_structs,
    'REDCap Form Name': prepped_hcd_rforms,
    'REDCap Section Header': prepped_hcd_rsection,
})
singlehcd['collection'] = 'hcd'
# Every row starts out attributed to the teen ('REDCap7 HCPD-18') source.
singlehcd['REDCap7 HCPD-18'] = 'teen'
singlehcd['REDCap7 HCPD-child'] = ''
# Spelled 'HCPD-Parent' to match the column used by the main crosswalk frame
# and the final export; the former 'HCPD-parent' spelling created a separate
# column that never reached the output.
singlehcd['REDCap7 HCPD-Parent'] = ''
singlehcd['Box Curated Eprime'] = ''
singlehcd['C-2846 (HCP-D)'] = 1
singlehcd['hcp_variable_name'] = singlehcd.nda_element

# nda_element -> hcp_variable_name where the two differ.
hcd_variable_names = {
    'race': 'sub_race',
    'ethnic_group': 'sub_latino',
    'hammer': 'hand2',
    'hand_15_drink': 'hand8',
    'writing': 'iihandwr',
    'throwing': 'iihandth',
    'scissors': 'iihandsc',
    'toothbrush': 'iihandto',
    'knife_no_fork': 'iihandkn',
    'spoon': 'iihandsp',
    'broom': 'iihandbr',
    'match': 'iihandma',
    'box': 'iihandbo',
    'foot': 'iihandfk',
    'eye': 'iihandey',
}
for nda_name, hcp_name in hcd_variable_names.items():
    singlehcd.loc[singlehcd.nda_element == nda_name, 'hcp_variable_name'] = hcp_name

# These elements are attributed to the child source instead of the teen source.
child_only = singlehcd.nda_element.isin(['hand_total', 'hammer', 'hand5', 'hand_15_drink'])
singlehcd.loc[child_only, 'REDCap7 HCPD-child'] = 'child'
singlehcd.loc[child_only, 'REDCap7 HCPD-18'] = ''


# Elements that have a second source get a duplicate row describing it.
secondrow = singlehcd.loc[singlehcd.nda_element.isin(['writing', 'throwing', 'toothbrush', 'spoon', 'scissors', 'race', 'ethnic_group'])].copy()
secondrow['REDCap Form Name'] = 'Intake Interview'
secondrow['REDCap Section Header'] = 'Handedness Assessment'

# Handedness items: second row is the child-source variable.
child_hand_names = {
    'writing': 'hand1',
    'throwing': 'hand3',
    'toothbrush': 'hand4',
    'spoon': 'hand6',
    'scissors': 'hand7',
}
for nda_name, hcp_name in child_hand_names.items():
    rows = secondrow.nda_element == nda_name
    secondrow.loc[rows, 'hcp_variable_name'] = hcp_name
    secondrow.loc[rows, 'REDCap7 HCPD-child'] = 'child'
    secondrow.loc[rows, 'REDCap7 HCPD-18'] = ''

# Race / ethnicity: second row is the parent-source variable.
for nda_name, hcp_name in {'race': 'p_c_race', 'ethnic_group': 'p_c_latino'}.items():
    rows = secondrow.nda_element == nda_name
    secondrow.loc[rows, 'hcp_variable_name'] = hcp_name
    secondrow.loc[rows, 'REDCap7 HCPD-child'] = ''
    secondrow.loc[rows, 'REDCap7 HCPD-18'] = ''
    secondrow.loc[rows, 'REDCap7 HCPD-Parent'] = 'parent'
    secondrow.loc[rows, 'REDCap Form Name'] = 'Intake Interview'
    secondrow.loc[rows, 'REDCap Section Header'] = ''

singlehcd = pd.concat([singlehcd, secondrow], axis=0)

singlehcd['request'] = 'structure created in separate HCD_*_*.ipynb notebook'
singlehcd.loc[singlehcd.nda_element.isin(['phenotype', 'phenotype_description', 'twins_study',
       'sibling_study', 'family_study', 'sample_taken']), 'hcp_variable_name'] = ''
singlehcd.loc[singlehcd.nda_element == 'family_user_def_id', 'hcp_variable_name'] = 'final_pedid'
# deldisk01 rows are attributed to the Eprime source rather than to the teen source.
singlehcd.loc[singlehcd.nda_structure == 'deldisk01', 'REDCap7 HCPD-18'] = ''
singlehcd.loc[singlehcd.nda_structure == 'deldisk01', 'Box Curated Eprime'] = 'eprime'


In [None]:
def _nda_definition_table(structure):
    """Return the NDA definition of ``structure`` as one row per element.

    The element name ends up in the 'index' column; the definition fields are
    renamed to their 'nda_*' column names and the structure name is added.
    """
    renamed = {'type': 'nda_type', 'description': 'nda_description',
               'notes': 'nda_notes', 'range': 'nda_range',
               'alias': 'nda_aliases'}
    wanted = ['index', 'nda_type', 'nda_description',
              'nda_notes', 'nda_range', 'nda_aliases']
    table = (pd.DataFrame(nda[structure])
             .transpose()
             .reset_index()
             .rename(columns=renamed)[wanted])
    table['nda_structure'] = structure
    return table


ndardata = pd.concat(
    [_nda_definition_table(structure)
     for structure in ('ndar_subject01', 'edinburgh_hand01', 'deldisk01')],
    axis=0)
#ndardata

# NOTE(review): from here on `singletons` is a DataFrame; above it held the
# directory path, so the cells that list that directory cannot be re-run
# after this one without redefining the path first.
singletons = pd.concat([singlehca, singlehcd], axis=0)
singletons = pd.merge(singletons, ndardata, how='left',
                      left_on=['nda_structure', 'nda_element'],
                      right_on=['nda_structure', 'index'])
#singletons

In [None]:
#singletons.to_csv('singletons'+snapshotdate+'.csv', index=False)
# Stack the crosswalk rows and the singleton-structure rows.
df_all = pd.concat([df, singletons], axis=0)

# Flag the elements listed below as 'rosetta' in the 'All Sources Rosetta' column.
df_all['All Sources Rosetta'] = ''
rosetta_elements = ['gender', 'sex', 'interview_age', 'interview_date',
                    'src_subject_id', 'subjectkey', 'family_user_def_id']
df_all.loc[df_all.nda_element.isin(rosetta_elements), 'All Sources Rosetta'] = 'rosetta'

# Blank the form/section and per-source columns on the rosetta rows.
# 'REDCap7 HCPD-Parent' is spelled with a capital P to match the column used
# everywhere else; the former 'HCPD-parent' spelling left the real parent
# column untouched.
is_rosetta = df_all['All Sources Rosetta'] == 'rosetta'
for column in ['REDCap Form Name',
               'REDCap Section Header',
               'REDCap7 HCPD-18',
               'REDCap7 HCPD-child',
               'REDCap7 HCPD-Parent',
               'REDCap7 HCPA',
               'Box Curated Eprime']:
    df_all.loc[is_rosetta, column] = ''
#df_all.to_csv('df_all'+snapshotdate+'.csv', index=False)

In [None]:
# Cell output: (rows, columns) of df_all.
df_all.shape

In [None]:
#still a bunch of missing REDCap labels because of the one-to-many, many-to-one possibilities in child,teen, and parent
#specifically.  Pull these last labels in.  E.G. REDCap type, REDCap Label, REDCap Choices, REDCap Form Name
is_child = df_all['REDCap7 HCPD-child'] == 'child'
is_teen = df_all['REDCap7 HCPD-18'] == 'teen'
is_parent = df_all['REDCap7 HCPD-Parent'] == 'parent'

# 0/1 indicator per HCP-D source, and how many of them each row has.
df_all['ci'] = 0
df_all['ti'] = 0
df_all['pi'] = 0
df_all.loc[is_child, 'ci'] = 1
df_all.loc[is_teen, 'ti'] = 1
df_all.loc[is_parent, 'pi'] = 1
df_all['sumi'] = df_all.ci + df_all.ti + df_all.pi

# Rows whose REDCap label still has to be looked up. All conditions must hold:
#   * an HCP variable name is present,
#   * the row is flagged for the child, parent or teen source,
#   * the REDCap label is missing,
#   * fewer than two of those sources are flagged,
#   * the structure is not socdem01 and the element name contains no '/'.
# The three source tests are grouped explicitly: `&` binds tighter than `|`,
# so without the grouping the name-present test applied to the child clause only.
needs_label = (
    df_all['hcp_variable_name'].notnull()
    & (is_child | is_parent | is_teen)
    & df_all['REDCap Label'].isnull()
    & (df_all['sumi'] < 2)
    & (df_all['nda_structure'] != 'socdem01')
    & (df_all['nda_element'].str.contains('/') == False)
)

# The two frames are exact complements of each other.
df_miss = df_all.loc[needs_label]
df_nomiss = df_all.loc[~needs_label]

#df_miss=df_miss.loc[df_miss.sumi<2]

print(df_nomiss.shape)
print(df_miss.shape)
# Written to the working directory so the selected rows can be inspected.
df_miss.to_csv('temp.csv')

In [None]:
# Diagnostic: how many of the rows with a missing label belong to each HCP-D source.
df_childmiss = df_miss[df_miss['REDCap7 HCPD-child'].eq('child')]
df_parentmiss = df_miss[df_miss['REDCap7 HCPD-Parent'].eq('parent')]
df_teenmiss = df_miss[df_miss['REDCap7 HCPD-18'].eq('teen')]
for subset in (df_childmiss, df_parentmiss, df_teenmiss):
    print(subset.shape)

In [None]:
#still a bunch of missing REDCap labels because of the one-to-many, many-to-one possibilities in child,teen, and parent
#specifically.  Pull these last labels in.  E.G. REDCap type, REDCap Label, REDCap Choices, REDCap Form Name
def _redcap_dictionary_table(project):
    """Return the REDCap dictionary of ``project`` as one row per variable.

    The variable name goes into 'hcp_variable_name'; only the type, label,
    choices and form columns are kept, under their crosswalk column names.
    """
    renamed = {'index': 'hcp_variable_name', 'form': 'REDCap Form Name', 'label': 'REDCap Label', 'choices': 'REDCap Choices', 'type': 'REDCap type'}
    kept = ['hcp_variable_name', 'REDCap type', 'REDCap Label', 'REDCap Choices', 'REDCap Form Name']
    return (pd.DataFrame(redcap[project])
            .transpose()
            .reset_index()
            .rename(columns=renamed)[kept])


childt = _redcap_dictionary_table('hcpdchild')
parentt = _redcap_dictionary_table('hcpdparent')
teent = _redcap_dictionary_table('hcpd18')

# Child rows: discard the empty REDCap columns and re-attach them from the
# child dictionary by variable name.
df_childmiss = (
    df_miss.loc[df_miss['REDCap7 HCPD-child'] == 'child']
    .drop(columns=['REDCap type', 'REDCap Label', 'REDCap Choices', 'REDCap Form Name'])
    .merge(childt, on='hcp_variable_name', how='left')
)
print(df_childmiss.shape)


In [None]:
# Parent and teen rows: discard the empty REDCap columns and re-attach them
# from the matching dictionary table by variable name.
redcap_fill_columns = ['REDCap type', 'REDCap Label', 'REDCap Choices', 'REDCap Form Name']

df_parentmiss = (
    df_miss.loc[df_miss['REDCap7 HCPD-Parent'] == 'parent']
    .drop(columns=redcap_fill_columns)
    .merge(parentt, on='hcp_variable_name', how='left')
)
print(df_parentmiss.shape)

df_teenmiss = (
    df_miss.loc[df_miss['REDCap7 HCPD-18'] == 'teen']
    .drop(columns=redcap_fill_columns)
    .merge(teent, on='hcp_variable_name', how='left')
)
print(df_teenmiss.shape)

In [None]:
# Re-assemble the crosswalk: untouched rows first, then the rows whose REDCap
# columns were just re-attached (child, teen, parent).
newdf_miss = pd.concat([df_childmiss, df_teenmiss, df_parentmiss])
df = pd.concat([df_nomiss, newdf_miss])
# Cell output: size of the re-labelled subset.
newdf_miss.shape

In [None]:
# NOTE: only the last expression of a cell is displayed, so df.head() shows nothing here.
df.head()
df.shape

In [None]:
# A missing upload indicator means "not uploaded": store it as 0.0.
for flag in ('C-2847 (HCP-A)', 'C-2846 (HCP-D)'):
    df.loc[df[flag].isna(), flag] = 0.0

# Rows uploaded to neither collection get a warning label.
in_no_collection = (df['C-2847 (HCP-A)'] + df['C-2846 (HCP-D)']) == 0.0
df['Warning'] = ''
df.loc[in_no_collection, 'Warning'] = 'No Collection'

# Link built from the structure short name.
df['nda_structure_link'] = "https://nda.nih.gov/data_structure.html?short_name=" + df['nda_structure']
#df.head(10)



In [None]:
# List the available columns ahead of the final column selection.
print(df.columns)
#df.head()


In [None]:
# Columns of the exported crosswalk, in output order.
# NOTE(review): the 'Warning' column is not part of this list, so it is not
# exported - confirm that this is intended.
crosswalk_columns = [
    # upload indicators and identity
    'C-2847 (HCP-A)', 'C-2846 (HCP-D)', 'hcp_variable_name', 'nda_structure', 'nda_element',
    # NDA definition
    'nda_type', 'nda_description', 'nda_notes', 'nda_aliases', 'nda_range',
    # REDCap dictionary
    'REDCap Form Name', 'REDCap Section Header', 'REDCap type', 'REDCap Label', 'REDCap Choices',
    # mapping details
    'request', 'func', 'code', 'recode', 'old_code', 'specialty_code', 'is_stacked',
    # source indicators
    'Box Curated PennCNP', 'REDCap8 Qinteractive',
    'REDCap7 HCPA', 'REDCap7 HCPD-child', 'REDCap7 HCPD-18', 'REDCap7 HCPD-Parent', 'Box Curated Eprime',
    'All Sources Rosetta',
    'nda_structure_link',
]
df = df.loc[:, crosswalk_columns]

crosswalk_file = 'Crosswalk_HCP_NonTLBX_' + snapshotdate + '.csv'
df.to_csv(crosswalk_file, index=False)

In [None]:
# Count the non-null HCP variable names per (HCP-A flag, HCP-D flag, structure) group.
stats_keys = ['C-2847 (HCP-A)', 'C-2846 (HCP-D)', 'nda_structure']
structure_var_stats = df.groupby(stats_keys)[['hcp_variable_name']].count()

In [None]:
# Give the count column a readable name and write the summary, keeping the group keys as index columns.
structure_var_stats = structure_var_stats.rename(
    columns={'hcp_variable_name': 'number of HCP variables'})
stats_file = 'Collection_by_Structure_' + snapshotdate + '.csv'
structure_var_stats.to_csv(stats_file, index=True)