## Notebook that takes the HCP-YA behavioral data dictionary and its corresponding data, prepares the request for a new NDA structure, and reshapes the data for NDA

In [None]:
import pandas as pd
import re, datetime
import numpy as np

In [None]:
# Load the canonical HCP-YA data dictionary and keep one row per column
# header; the shape is printed before and after de-duplication.
raw_dictionary=pd.read_csv('CanonicalDataDictionaryCSV.csv')
print(raw_dictionary.shape)
dictionya=raw_dictionary.drop_duplicates(subset=['columnHeader'])
print(dictionya.shape)


In [None]:
# Derive the NDA counterpart columns from the canonical dictionary columns.
# Keys are the NDA column names; values are either a source column or a
# constant applied to every row. Insertion order fixes the column order.
nda_columns={
    'Element':dictionya['columnHeader'],
    'Required':'Recommended',
    'Data Type':dictionya['dictType'],
    'Description':dictionya['description'],
    'Notes':dictionya['values'],
    'Size':'',
    'Value Range':dictionya['values'],
}
for nda_name,nda_value in nda_columns.items():
    dictionya[nda_name]=nda_value
#dictionya['Description']=dictionya['Description'].str.replace(r'<[^<>]*>', '', regex=True)

# Where the canonical description is missing, use the display name instead.
no_description=dictionya['description'].isna()
dictionya.loc[no_description,'Description']=dictionya['fullDisplayName']

# A dictType of '$' is recorded as Float.
dollar_typed=dictionya['dictType']=='$'
dictionya.loc[dollar_typed,'Data Type']='Float'

In [None]:
# Read only the first five rows of each export: the field names are what we
# need, so we can subset to fields that are available for download on IntraDB.
header_sources=(
    'data/RESTRICTED_plenzini_3_22_2022_11_34_54.csv',
    'data/unrestricted_plenzini_3_22_2022_11_34_38.csv',
    'data/unrestricted_plenzini_3_22_2022_11_35_0.csv',
)
d1,d2,d3=[pd.read_csv(csv_path,nrows=5) for csv_path in header_sources]

# Transposing gives one row per field; reset_index then moves the field name
# into an 'index' column beside the sampled values.
d=pd.concat([sample.transpose() for sample in (d1,d2,d3)],axis=0).reset_index()
print(d.shape)
# Drop fully identical rows (same field name and same sampled values),
# e.g. the three Age bucketing variables from different sources.
d=d.drop_duplicates()
print(d.shape)
d=d.rename(columns={'index':'Element'})
d.head()

In [None]:
# Attach the dictionary metadata to the fields found in the data. The right
# join keeps every data field, including any without a dictionary entry.
a=dictionya.merge(d,how='right',on='Element')
print(a.shape)
a.head()
#a.columns
a.loc[a['Element']=='Age_in_Yrs']
# Keep a single row per element name.
a=a.drop_duplicates(subset=['Element'])
print(a.shape)


In [None]:
# Patch notes and value ranges from a hand-maintained file, since this is
# faster than parsing all of the exceptions to the general trends.
patch=pd.read_csv('ValuePatch.csv')
# Patch columns that clash with existing ones arrive with a '_new' suffix.
updated=a.merge(patch,how='left',on=['Element'],suffixes=('','_new'))
# Where the patch supplies a value it overrides the dictionary-derived one.
updated['Value Range']=np.where(pd.notnull(updated['Value Range_new']),updated['Value Range_new'],updated['Value Range'])
updated['Notes']=np.where(pd.notnull(updated['Notes_new']),updated['Notes_new'],updated['Notes'])
# .copy() makes the filtered frame independent, so the assignment below
# edits it directly instead of a slice of the merged frame.
updated=updated.loc[~(updated.Element=='Age')].copy()
# Blank out '_' notes (presumably a placeholder for "no notes" -- confirm
# against ValuePatch.csv). This line previously used '==', which only
# compared the values and discarded the result, so nothing was changed.
updated.loc[updated.Notes=='_','Notes']=''
updated[['Value Range','Notes']]

In [None]:
# These fields are converted to NDA variables during data manipulation. Right
# now we're just preparing the data dictionary, so we don't need them here
# (they'll get added in the mandatory variables part next).
nda_replaced_fields=['Age_in_Yrs','Gender','Subject','subjectkey']
# .copy() gives an independent frame, so the .loc assignment below does not
# write into a filtered slice (which can raise SettingWithCopyWarning).
updated=updated.loc[~updated.Element.isin(nda_replaced_fields)].copy()

# More fixes: Boolean fields are declared as String.
updated.loc[updated['Data Type']=='Boolean','Data Type']='String'
#dictionya['Description']=dictionya['Description'].str.replace(r'<[^<>]*>', '', regex=True)
#updated.loc[updated.Description.str.contains('Neurolex')]#,'Description']#=updated.fullDisplayName+':'+updated['Description'].str.replace(r'<[^<>]*>', '', regex=True)

In [None]:
# Add the NDA fields: one record per mandatory element, with values listed in
# the order given by mandatory_columns.
mandatory_columns=['Element','Required','Data Type','Size','Description','Value Range','Notes']
mandatory_rows=[
    ('subjectkey','Required','GUID','','The NDAR Global Unique Identifier (GUID) for research subject','NDAR*',''),
    ('src_subject_id','Required','String','20',"Subject ID how it's defined in lab/project",'',''),
    ('interview_date','Required','Date','','Date on which the interview/genetic test/sampling/imaging/biospecimen was completed. MM/DD/YYYY','',''),
    ('interview_age','Required','Integer','','Age in months at the time of the interview/test/sampling/imaging.','0 :: 1260',''),
    ('sex','Required','String','20','Sex of subject at birth','M;F; O; NR',''),
]
structuremandatory=pd.DataFrame(mandatory_rows,columns=mandatory_columns)

In [None]:

# Stack the mandatory NDA rows on top of the patched dictionary rows. The
# patch columns (*_new) and the sampled data columns (0-4) are carried along.
dictionary_columns=['Required','Description','Element','Data Type','Size','Notes','Value Range','Value Range_new','Notes_new',0,1,2,3,4]
final=pd.concat([structuremandatory,updated[dictionary_columns]],axis=0)
# Race and Ethnicity move to ndar_subject01, so they are left out here.
moved_elsewhere=final.Element.isin(['Race','Ethnicity'])
final=final.loc[~moved_elsewhere]

# The CSV header uses 'Element Name'; `final` itself keeps the 'Element' column.
export_frame=final.rename(columns={'Element':'Element Name'})
export_frame.to_csv("HCP_YA_CanonicalDataDictionary.csv",index=False)

print(final.shape)

In [None]:
# Now prepare the data to match the dictionary: load, in full, the exports
# whose fields are available for download on IntraDB.
data_sources=(
    'data/RESTRICTED_plenzini_3_22_2022_11_34_54.csv',
    'data/unrestricted_plenzini_3_22_2022_11_34_38.csv',
    'data/unrestricted_plenzini_3_22_2022_11_35_0.csv',
)
d1,d2,d3=[pd.read_csv(csv_path) for csv_path in data_sources]
# Only d2 has its Age column removed; the d1/d3 drops are left disabled.
d2=d2.drop(columns=['Age'])
#d1=d1.drop(columns=['Age'])
#d3=d3.drop(columns=['Age'])

#list(d1.columns)
# List the d3 columns whose name contains 'Age'.
for column_name in d3.columns:
    if 'Age' in column_name:
        print(column_name)

In [None]:
# Inner-join the three exports on Subject, so only subjects present in all
# three are kept.
dataraw=d1.merge(d2,on='Subject',how='inner').merge(d3,on='Subject',how='inner')
dataraw.shape
# Switch the identifying columns to their NDA names.
dataraw=dataraw.rename(columns={'Subject':'src_subject_id','Gender':'sex'})
# interview_age is Age_in_Yrs multiplied by 12; the source column is then dropped.
dataraw['interview_age']=dataraw['Age_in_Yrs']*12
dataraw=dataraw.drop(columns=['Age_in_Yrs'])
#print(dataraw.Acquisition.value_counts())

# One fixed interview date (MM/DD/YYYY) per acquisition quarter, applied in
# the order listed.
quarter_dates={
    'Q01':'08/01/2012',
    'Q02':'11/01/2012',
    'Q03':'02/01/2013',
    'Q04':'05/01/2013',
    'Q05':'08/01/2013',
    'Q06':'11/01/2013',
    'Q07':'02/01/2014',
    'Q08':'05/01/2014',
    'Q09':'08/01/2014',
    'Q10':'11/01/2014',
    'Q11':'02/01/2015',
    'Q12':'05/01/2015',
    'Q13':'08/01/2015',
}
for quarter,quarter_date in quarter_dates.items():
    dataraw.loc[dataraw.Acquisition==quarter,'interview_date']=quarter_date

# Round-trip through datetime so the column is formatted as MM/DD/YYYY.
parsed_dates=pd.to_datetime(dataraw['interview_date'])
dataraw['interview_date']=parsed_dates.dt.strftime('%m/%d/%Y')

#dataraw.to_csv('test.csv',index=False)
#len(d1.Subject.unique())
dataraw.sex.head()
dataraw.shape

In [None]:
# Cross-check the dictionary elements against the data columns. The only
# variable left should be subjectkey, which is missing from the data because
# at the moment we don't know the location of the pseudoguids.
# Build each lookup set once, rather than rebuilding a list (and scanning it
# linearly) on every loop iteration.
data_columns=set(dataraw.columns)
dictionary_elements=set(final['Element'])

for i in list(final['Element']):
    if i not in data_columns:
        print('in annotation only:',i)

for i in list(dataraw.columns):
    if i not in dictionary_elements:
        print('in data only:',i)

In [None]:
#add pseudoguids (subjectkey) to data

In [None]:
#move Race and Ethnicity to ndar_subject01

