In [1]:
import pandas as pd
# Turn on the pandas copy_on_write option for the whole notebook session.
pd.options.mode.copy_on_write = True
import warnings
# Hide pandas PerformanceWarning messages in the cell outputs below.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import os
import re

# Project-local helper package and its shared config holder (read as cfg.config below).
import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 00-0. Variables

In [2]:
# Study key and data-dictionary sheet used by this notebook; both are constants
# supplied by COMBINE_harmonizer.
study_name = COMBINE_harmonizer.STUDY_LH
sheet_name = COMBINE_harmonizer.SHEET_FOLLOW_UP

# Base directory for config.yaml and the data-dictionary file (relative to the notebook).
root_dir = '..'

In [3]:
# Initialize the harmonizer from the YAML config under root_dir.
# NOTE(review): presumably this is what populates cfg.config, which later cells read — confirm.
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

In [4]:
# Every follow-up form file that gets loaded into df_dict.
_FILENAMES = [
    'lf01.csv',
    'lf03.csv',
    'lf04.csv',
    'lf04a.csv',
    'lf04ar.csv',
    'lf05.csv',
    'lf09a.csv',
    'lf10.csv',
    'lf11.csv',
    'lf12.csv',
]

# The readmission form is loaded with the others but is not part of the wide
# merge; the readmission section reads it from df_dict on its own.
_READMISSION_FILENAME = 'lf04ar.csv'

_ANALYSIS_FILENAME = 'analysis.csv'

# Built from the single name above so the scalar and the list cannot disagree.
_ANALYSIS_FILENAMES = [_ANALYSIS_FILENAME]

# Files merged into the single follow-up frame: all forms except readmission.
# Derived from _FILENAMES (order preserved) instead of a second hand-kept list.
_FILENAMES_MERGE = [
    each_form
    for each_form in _FILENAMES
    if each_form != _READMISSION_FILENAME
]

In [5]:
# Input directories come from the harmonizer config, keyed by the study name.
input_dir = cfg.config[f'{study_name}_follow_up_dir']
input_analysis_dir = cfg.config[f'{study_name}_analysis_dir']

# The data dictionary sits under root_dir; outputs go to a per-study folder under the configured out_dir.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"
out_dir = f"{cfg.config['out_dir']}/out-{study_name}"

# Create the output folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(out_dir, exist_ok=True)

## 00-1. Column Map

In [6]:
# Load the data-dictionary sheet selected by sheet_name.
df_data_dict = COMBINE_harmonizer.load_data_dict(
    data_dict_filename,
    sheet_name=sheet_name,
)

# Every variable name listed in the data dictionary.
all_valid_columns = [*df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME]]

# Study-specific column name -> data-dictionary variable name, one entry per
# dictionary row; used below to rename the raw CSV columns.
column_map = dict(
    (row[study_name], row[COMBINE_harmonizer.DATA_DICT_VAR_NAME])
    for _, row in df_data_dict.iterrows()
)

## 00-2. df-dict from _FILENAMES and _ANALYSIS_FILENAMES

In [7]:
# Read each follow-up form with object dtype (no type inference) and rename its
# columns through column_map.
df_dict = dict(
    (
        filename,
        pd.read_csv(os.sep.join((input_dir, filename)), dtype='O').rename(columns=column_map),
    )
    for filename in _FILENAMES
)

# Same treatment for the analysis file(s), which live in their own directory.
df_analysis_dict = dict(
    (
        filename,
        pd.read_csv(os.sep.join((input_analysis_dir, filename)), dtype='O').rename(columns=column_map),
    )
    for filename in _ANALYSIS_FILENAMES
)

# Single lookup keyed by file name for both kinds of input.
df_dict.update(df_analysis_dict)

### 00-4. identifier-column

In [8]:
# Ask the data dictionary for the follow-up identity columns; later cells put
# this list in front of every section's own column list.
id_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_IDENTITY)

# Show the resulting list as the cell output.
id_columns

['followupCenter', 'siteID', 'center', 'subjectID', 'followupID']

## 20-01. follow-up: merge all follow-up forms into one frame

In [9]:
# Outer-join every form listed in _FILENAMES_MERGE into one wide frame keyed by
# (followupCenter, followupID).
df_all = None
for each_filename in _FILENAMES_MERGE:
    each_df = df_dict[each_filename]
    each_filename_prefix = re.sub(r'\.csv$', '', each_filename)

    if df_all is None:
        df_all = each_df
    else:
        # A column name that already exists on the left keeps its name there;
        # the incoming copy gets a ':<form>' suffix (e.g. 'REC_CMP:lf03').
        df_all = df_all.merge(each_df, on=['followupCenter', 'followupID'], how='outer', suffixes=['', ':' + each_filename_prefix])

# follow-up
print('to set follow-up')
df_follow_up = df_all

df_follow_up = COMBINE_harmonizer.valid_columns(df_follow_up, all_valid_columns, debug_df=True, debug_columns=False)
df_follow_up = COMBINE_harmonizer.postprocess(df_follow_up, subject_id_idx='followupID', center_id_idx='followupCenter')

# Keep the raw center value, then replace 'center' with its to_center() conversion.
df_follow_up.loc[:, 'center_orig'] = df_follow_up['center'].copy()
df_follow_up.loc[:, 'center'] = df_follow_up['center'].apply(COMBINE_harmonizer.to_center)

# Where the converted center is not a non-empty string, fall back to
# followupCenter. Done column-wise with where() instead of a per-row apply.
has_center = df_follow_up['center'].map(lambda value: isinstance(value, str) and value != '').astype(bool)
df_follow_up.loc[:, 'center'] = df_follow_up['center'].where(has_center, df_follow_up['followupCenter'])


to set follow-up
(2/413) LAST not in columns
(3/413) FIRST not in columns
(5/413) PROTID02 not in columns
(6/413) PROTID03 not in columns
(7/413) PROTID04 not in columns
(8/413) PROTID05 not in columns
(9/413) PROTID06 not in columns
(10/413) PROTID07 not in columns
(11/413) PROTID08 not in columns
(12/413) PROTID09 not in columns
(13/413) PROTID10 not in columns
(14/413) PROTID11 not in columns
(15/413) PROTID12 not in columns
(16/413) PROTID13 not in columns
(17/413) PROTID14 not in columns
(18/413) PROTID15 not in columns
(19/413) PROTID16 not in columns
(20/413) PROTID17 not in columns
(21/413) PROTID18 not in columns
(22/413) PROTID19 not in columns
(23/413) PROTID20 not in columns
(24/413) REC_CMP not in columns
(30/413) CMP_DATE not in columns
(31/413) CRT_DATE not in columns
(64/413) CFINITS not in columns
(65/413) REC_CMP:lf03 not in columns
(105/413) CFPRRESP not in columns
(108/413) CMP_DATE:lf03 not in columns
(109/413) CRT_DATE:lf03 not in columns
(146/413) NF4LOCSP not in

### 20-01-1. XXX hack: drop rows flagged "follow-up visit completed in another NRN center" (childFinalStatus 5)

In [10]:
# Show the distinct childFinalStatus codes and the row count before the filter in the next cell.
df_follow_up['childFinalStatus'].unique(), len(df_follow_up)

(array(['1', '4', '5', '3'], dtype=object), 155)

In [11]:
# Keep every row whose childFinalStatus is not 5; both '5' and '5.0' are matched
# because the CSVs are read as strings. `~` negates the boolean mask (the
# idiomatic form of comparing it with False).
is_valid = ~df_follow_up['childFinalStatus'].isin(['5', '5.0'])
df_follow_up = df_follow_up[is_valid]

# Row count after the filter, shown as the cell output.
len(df_follow_up)

152

## 20-00. Follow-up

In [12]:
# Columns the data dictionary files under the follow-up subcategory.
follow_up_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_FOLLOWUP
)
# Identity columns first, then the section's own columns.
all_follow_up_columns = [*id_columns, *follow_up_columns]

follow_up_columns

['birthDate', 'visitDate', 'birthNumber', 'center_orig']

In [13]:
# Destination of the core follow-up export.
out_filename = os.sep.join((out_dir, '20-00-follow-up.csv'))

# Subset a copy of the merged frame with the follow-up column list, postprocess
# it, and write it out without the index.
df_follow_up_follow_up = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_follow_up.copy(),
        all_follow_up_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_follow_up_follow_up.to_csv(out_filename, index=False)

### 20-00-1. check empty

In [14]:
# Sanity check on the follow-up export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_follow_up_follow_up)

(0/10) column: center (152 / 0)
(1/10) column: subjectID (152 / 0)
(2/10) column: uniqueID (152 / 0)
(3/10) column: followupCenter (152 / 0)
(4/10) column: followupID (152 / 0)
(5/10) column: siteID (152 / 0)
(6/10) column: birthDate (152 / 0)
(7/10) column: visitDate (145 / 7)
(8/10) column: birthNumber (144 / 8)
(9/10) column: center_orig (3 / 149)


In [15]:
# Per-column summary of the follow-up export; the saved output shows the same
# count pairs as check_empty.
COMBINE_harmonizer.column_info(df_follow_up_follow_up)

(0/10) center: (152/0)
(1/10) subjectID: (152/0)
(2/10) uniqueID: (152/0)
(3/10) followupCenter: (152/0)
(4/10) followupID: (152/0)
(5/10) siteID: (152/0)
(6/10) birthDate: (152/0)
(7/10) visitDate: (145/7)
(8/10) birthNumber: (144/8)
(9/10) center_orig: (3/149)


## 20-01. SES

In [16]:
# Columns the data dictionary files under the socioeconomic-status subcategory.
ses_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_SOCIOECONOMIC_STATUS
)
# Identity columns first, then the section's own columns.
all_ses_columns = [*id_columns, *ses_columns]

all_ses_columns

['followupCenter',
 'siteID',
 'center',
 'subjectID',
 'followupID',
 'SESVisitDate',
 'SESBirthDate',
 'chronologicalAge_mo',
 'correctedAge_mo',
 'underStateSupervision',
 'primaryCaretaker',
 'otherCaretaker',
 'maritalStatusPrimaryCaretaker',
 'livingArrangementChild',
 'numberPeopleInChildHousehold',
 'otherContributeMoneyToChildHousehold',
 'educationPrimaryCaretaker',
 'educationOtherCaretaker',
 'workPrimaryCaretaker',
 'workOtherCaretaker',
 'inSchoolPrimaryCaretaker',
 'inSchoolOtherCaretaker',
 'totalIncomeChildHousehold',
 'medicalInsuranceChild',
 'primaryLanguageChild',
 'primaryLanguageChildOtherText',
 'isSecondaryLanguageChild',
 'secondaryLanguageChild',
 'secondaryLanguageChildOtherText',
 'numberPlaceChildLive',
 'zipcode',
 'visitingNurseReceive',
 'visitingNurseNeed',
 'homeNurseReceive',
 'homeNurseNeed',
 'otPtReceive',
 'otPtNeed',
 'speechTherapyReceive',
 'speechTherapyNeed',
 'earlyInterventionReceive',
 'earlyInterventionNeed',
 'socialWorkForChildReceive'

In [17]:
# Destination of the SES export.
out_filename = os.sep.join((out_dir, '20-01-ses.csv'))

# Subset a copy of the merged frame with the SES column list, postprocess it,
# and write it out without the index.
df_ses = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_follow_up.copy(),
        all_ses_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_ses.to_csv(out_filename, index=False)

### 20-01-1. check empty

In [18]:
# Sanity check on the SES export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_ses)

(0/79) column: center (152 / 0)
(1/79) column: subjectID (152 / 0)
(2/79) column: uniqueID (152 / 0)
(3/79) column: followupCenter (152 / 0)
(4/79) column: followupID (152 / 0)
(5/79) column: siteID (152 / 0)
(6/79) column: underStateSupervision (142 / 10)
(7/79) column: otherContributeMoneyToChildHousehold (142 / 10)
(8/79) column: workPrimaryCaretaker (142 / 10)
(9/79) column: workOtherCaretaker (127 / 25)
(10/79) column: inSchoolPrimaryCaretaker (142 / 10)
(11/79) column: inSchoolOtherCaretaker (127 / 25)
(12/79) column: primaryLanguageChildOtherText (7 / 145)
(13/79) column: isSecondaryLanguageChild (142 / 10)
(14/79) column: secondaryLanguageChildOtherText (5 / 147)
(15/79) column: visitingNurseNeed (142 / 10)
(16/79) column: homeNurseNeed (142 / 10)
(17/79) column: otPtNeed (142 / 10)
(18/79) column: speechTherapyNeed (142 / 10)
(19/79) column: earlyInterventionNeed (142 / 10)
(20/79) column: socialWorkForChildNeed (142 / 10)
(21/79) column: specialClinicNeed (142 / 10)
(22/79) c

In [19]:
# Per-column summary of the SES export; the saved output shows the same count
# pairs as check_empty.
COMBINE_harmonizer.column_info(df_ses)

(0/79) center: (152/0)
(1/79) subjectID: (152/0)
(2/79) uniqueID: (152/0)
(3/79) followupCenter: (152/0)
(4/79) followupID: (152/0)
(5/79) siteID: (152/0)
(6/79) underStateSupervision: (142/10)
(7/79) otherContributeMoneyToChildHousehold: (142/10)
(8/79) workPrimaryCaretaker: (142/10)
(9/79) workOtherCaretaker: (127/25)
(10/79) inSchoolPrimaryCaretaker: (142/10)
(11/79) inSchoolOtherCaretaker: (127/25)
(12/79) primaryLanguageChildOtherText: (7/145)
(13/79) isSecondaryLanguageChild: (142/10)
(14/79) secondaryLanguageChildOtherText: (5/147)
(15/79) visitingNurseNeed: (142/10)
(16/79) homeNurseNeed: (142/10)
(17/79) otPtNeed: (142/10)
(18/79) speechTherapyNeed: (142/10)
(19/79) earlyInterventionNeed: (142/10)
(20/79) socialWorkForChildNeed: (142/10)
(21/79) specialClinicNeed: (142/10)
(22/79) pulmonaryNeed: (142/10)
(23/79) ophthalmologicNeed: (142/10)
(24/79) gastrointestinalNeed: (142/10)
(25/79) audiologicNeed: (142/10)
(26/79) neurologicNeed: (142/10)
(27/79) otherNeed: (142/10)
(28/7

## 20-02. Medical History

In [20]:
# Columns the data dictionary files under the medical-history subcategory.
medical_history_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_MEDICAL_HISTORY
)
# Identity columns first, then the section's own columns.
all_medical_history_columns = [*id_columns, *medical_history_columns]

medical_history_columns

['rehospitalize',
 'numberRehospitalize',
 'operation',
 'operationTypanostomyTube',
 'operationTracheostomy',
 'operationEyeSurgery',
 'operationEyeSurgeryReason',
 'operationHerniaSurgery',
 'operationGastrostomyTube',
 'operationFundoplication',
 'operationShuntForHydrocephalus',
 'operationReanastomosisOfLargeOrSmallIntenstine',
 'operationPDALigation',
 'operationBrochoscopy',
 'operationHypospadiusRepair',
 'operationOther',
 'operationOtherText',
 'medication',
 'vitaminMineralSupplement',
 'highCaloricFormula',
 'diuretics',
 'antiRefluxMedication',
 'bronchodilator',
 'inhaledSteroid',
 'oralIvSteroid',
 'otherAsthmaMedication',
 'decongestantColdAllergyMedication',
 'anticonvulsantMedication',
 'prophylaticAntibiotics',
 'antibiotics',
 'constipationMedication',
 'bloodPressureMedication',
 'thyroidMedication',
 'muscleRelaxants',
 'botox',
 'otherMedication',
 'otherMedicationText',
 'seizure',
 'medicalEquipmentHomeUse',
 'apneaMonitor',
 'oxygen',
 'ventilatorCPAP',
 'gast

In [21]:
# Destination of the medical-history export.
out_filename = os.sep.join((out_dir, '20-02-medical-history.csv'))

# Subset a copy of the merged frame with the medical-history column list,
# postprocess it, and write it out without the index.
df_medical_history = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_follow_up.copy(),
        all_medical_history_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_medical_history.to_csv(out_filename, index=False)

(61/68) subcutaneousFatNecrosis not in df


### 20-02-1. check empty

In [22]:
# Sanity check on the medical-history export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_medical_history)

(0/68) column: center (152 / 0)
(1/68) column: subjectID (152 / 0)
(2/68) column: uniqueID (152 / 0)
(3/68) column: followupCenter (152 / 0)
(4/68) column: followupID (152 / 0)
(5/68) column: siteID (152 / 0)
(6/68) column: rehospitalize (143 / 9)
(7/68) column: operation (143 / 9)
(8/68) column: operationTypanostomyTube (25 / 127)
(9/68) column: operationTracheostomy (25 / 127)
(10/68) column: operationEyeSurgery (25 / 127)
(11/68) column: operationHerniaSurgery (25 / 127)
(12/68) column: operationGastrostomyTube (25 / 127)
(13/68) column: operationFundoplication (25 / 127)
(14/68) column: operationShuntForHydrocephalus (25 / 127)
(15/68) column: operationReanastomosisOfLargeOrSmallIntenstine (25 / 127)
(16/68) column: operationPDALigation (25 / 127)
(17/68) column: operationBrochoscopy (25 / 127)
(18/68) column: operationHypospadiusRepair (25 / 127)
(19/68) column: operationOther (25 / 127)
(20/68) column: operationOtherText (12 / 140)
(21/68) column: medication (143 / 9)
(22/68) col

In [23]:
# Per-column summary of the medical-history export, as printed by column_info.
COMBINE_harmonizer.column_info(df_medical_history)

(0/68) center: (152/0)
(1/68) subjectID: (152/0)
(2/68) uniqueID: (152/0)
(3/68) followupCenter: (152/0)
(4/68) followupID: (152/0)
(5/68) siteID: (152/0)
(6/68) rehospitalize: (143/9)
(7/68) operation: (143/9)
(8/68) operationTypanostomyTube: (25/127)
(9/68) operationTracheostomy: (25/127)
(10/68) operationEyeSurgery: (25/127)
(11/68) operationHerniaSurgery: (25/127)
(12/68) operationGastrostomyTube: (25/127)
(13/68) operationFundoplication: (25/127)
(14/68) operationShuntForHydrocephalus: (25/127)
(15/68) operationReanastomosisOfLargeOrSmallIntenstine: (25/127)
(16/68) operationPDALigation: (25/127)
(17/68) operationBrochoscopy: (25/127)
(18/68) operationHypospadiusRepair: (25/127)
(19/68) operationOther: (25/127)
(20/68) operationOtherText: (12/140)
(21/68) medication: (143/9)
(22/68) otherMedicationText: (7/145)
(23/68) seizure: (143/9)
(24/68) medicalEquipmentHomeUse: (143/9)
(25/68) fluShot: (143/9)
(26/68) RSVProphylaxis: (143/9)
(27/68) independentFeedSelf: (143/9)
(28/68) assi

## 20-03. Medical Exam

In [24]:
# Columns the data dictionary files under the medical-exam subcategory.
medical_exam_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_MEDICAL_EXAM
)
# Identity columns first, then the section's own columns.
all_medical_exam_columns = [*id_columns, *medical_exam_columns]

medical_exam_columns

['weight_kg',
 'length_cm',
 'headCircumference_cm',
 'strabismusRight',
 'strabismusLeft',
 'nystagmusRight',
 'nystagmusLeft',
 'rovingEyeMovementRight',
 'rovingEyeMovementLeft',
 'eyeTrackRight',
 'eyeTrackLeft',
 'visionRight',
 'visionLeft',
 'audiologicAssessment',
 'audiologicPendingForAssessment',
 'visualReinforcementAudiometry',
 'VRARight',
 'VRALeft',
 'VRASoundField',
 'ABR',
 'ABRRight',
 'ABRLeft',
 'hearingTestUnknown',
 'hearingTestUnknownRight',
 'hearingTestUnknownLeft',
 'hearingImpaired',
 'hearingAidRequirement',
 'hearingImplant',
 'swallowing',
 'dysphagia',
 'aspiration',
 'abnormalVoice',
 'drooling',
 'nothingByMouth',
 'observedAbnormalMovement',
 'observedAbnormalMovementShortJerky',
 'observedAbnormalMovementSlowWrithing',
 'observedAbnormalMovementTremor',
 'passiveMuscleToneNeckTrunk',
 'upperExtremityMuscleToneRight',
 'upperExtremityMuscleToneLeft',
 'lowerExtremityMuscleToneHipKneeRight',
 'lowerExtremityMuscleToneHipKneeLeft',
 'lowerExtremityMuscle

In [25]:
# Destination of the medical-exam export.
out_filename = os.sep.join((out_dir, '20-03-medical-exam.csv'))

# Subset a copy of the merged frame with the medical-exam column list,
# postprocess it, and write it out without the index.
df_medical_exam = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_follow_up.copy(),
        all_medical_exam_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_medical_exam.to_csv(out_filename, index=False)

(32/99) hearingImplant not in df
(81/99) dystonia not in df
(82/99) athetosis not in df


### 20-03-1. check empty

In [26]:
# Sanity check on the medical-exam export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_medical_exam)

(0/97) column: center (152 / 0)


(1/97) column: subjectID (152 / 0)
(2/97) column: uniqueID (152 / 0)
(3/97) column: followupCenter (152 / 0)
(4/97) column: followupID (152 / 0)
(5/97) column: siteID (152 / 0)
(6/97) column: audiologicAssessment (141 / 11)
(7/97) column: audiologicPendingForAssessment (90 / 62)
(8/97) column: visualReinforcementAudiometry (36 / 116)
(9/97) column: ABR (36 / 116)
(10/97) column: hearingTestUnknown (51 / 101)
(11/97) column: dysphagia (141 / 11)
(12/97) column: aspiration (141 / 11)
(13/97) column: abnormalVoice (141 / 11)
(14/97) column: drooling (141 / 11)
(15/97) column: nothingByMouth (141 / 11)
(16/97) column: observedAbnormalMovement (141 / 11)
(17/97) column: observedAbnormalMovementShortJerky (6 / 146)
(18/97) column: observedAbnormalMovementSlowWrithing (6 / 146)
(19/97) column: observedAbnormalMovementTremor (6 / 146)
(20/97) column: scissoringLegs (141 / 11)
(21/97) column: neuralNormal (141 / 11)
(22/97) column: generalizedHypotonia (31 / 121)
(23/97) column: hypertonia (31

In [27]:
# Per-column summary of the medical-exam export; the saved output shows the
# same count pairs as check_empty.
COMBINE_harmonizer.column_info(df_medical_exam)

(0/97) center: (152/0)
(1/97) subjectID: (152/0)
(2/97) uniqueID: (152/0)
(3/97) followupCenter: (152/0)
(4/97) followupID: (152/0)
(5/97) siteID: (152/0)
(6/97) audiologicAssessment: (141/11)
(7/97) audiologicPendingForAssessment: (90/62)
(8/97) visualReinforcementAudiometry: (36/116)
(9/97) ABR: (36/116)
(10/97) hearingTestUnknown: (51/101)
(11/97) dysphagia: (141/11)
(12/97) aspiration: (141/11)
(13/97) abnormalVoice: (141/11)
(14/97) drooling: (141/11)
(15/97) nothingByMouth: (141/11)
(16/97) observedAbnormalMovement: (141/11)
(17/97) observedAbnormalMovementShortJerky: (6/146)
(18/97) observedAbnormalMovementSlowWrithing: (6/146)
(19/97) observedAbnormalMovementTremor: (6/146)
(20/97) scissoringLegs: (141/11)
(21/97) neuralNormal: (141/11)
(22/97) generalizedHypotonia: (31/121)
(23/97) hypertonia: (31/121)
(24/97) neuralOther: (31/121)
(25/97) neuralOtherText: (13/139)
(26/97) spasticDiplegia: (32/120)
(27/97) spasticHemiplegiaRight: (32/120)
(28/97) spasticHemiplegiaLeft: (0/152)

## 20-04. Bayley-III

In [28]:
# Columns the data dictionary files under the Bayley-III subcategory.
bayley_iii_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_BAYLEY_III
)
# Identity columns first, then the section's own columns.
all_bayley_iii_columns = [*id_columns, *bayley_iii_columns]

bayley_iii_columns

['BayleyIIICognitiveSubtest',
 'BayleyIIIReasonNoSuccessCognitiveSubtest',
 'BayleyIIIReasonNoSuccessCognitiveSubtestText',
 'BayleyIIILanguageReceptiveSubtest',
 'BayleyIIIReasonNoSuccessLanguageReceptiveSubtest',
 'BayleyIIIReasonNoSuccessLanguageReceptiveSubtestText',
 'BayleyIIILanguageExpressiveSubtest',
 'BayleyIIIReasonNoSuccessLanguageExpressiveSubtest',
 'BayleyIIIReasonNoSuccessLanguageExpressiveSubtestText',
 'BayleyIIIMotorFineSubtest',
 'BayleyIIIReasonNoSuccessMotorFineSubtest',
 'BayleyIIIReasonNoSuccessMotorFineSubtestText',
 'BayleyIIIMotorGrossSubtest',
 'BayleyIIIReasonNoSuccessMotorGrossSubtest',
 'BayleyIIIReasonNoSuccessMotorGrossSubtestText',
 'BayleyIIIAdjustedAgeForCognitiveTest',
 'BayleyIIIAdjustedAgeForReceptiveCommunication',
 'BayleyIIIAdjustedAgeForExpressiveCommunication',
 'BayleyIIIAdjustedAgeForMotorFineSubtest',
 'BayleyIIIAdjustedAgeForMotorGrossSubtest',
 'BayleyIIICognitiveRaw',
 'BayleyIIICognitiveScale',
 'BayleyIIICognitiveComposite',
 'BayleyI

In [29]:
# Destination of the Bayley-III export.
out_filename = os.sep.join((out_dir, '20-04-bayley-iii.csv'))

# Subset a copy of the merged frame with the Bayley-III column list,
# postprocess it, and write it out without the index.
df_bayley_iii = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_follow_up.copy(),
        all_bayley_iii_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_bayley_iii.to_csv(out_filename, index=False)

### 20-04-1. check empty

In [30]:
# Sanity check on the Bayley-III export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_bayley_iii)

(0/46) column: center (152 / 0)
(1/46) column: subjectID (152 / 0)
(2/46) column: uniqueID (152 / 0)
(3/46) column: followupCenter (152 / 0)
(4/46) column: followupID (152 / 0)
(5/46) column: siteID (152 / 0)
(6/46) column: BayleyIIICognitiveSubtest (141 / 11)
(7/46) column: BayleyIIIReasonNoSuccessCognitiveSubtestText (0 / 152)
(8/46) column: BayleyIIILanguageReceptiveSubtest (141 / 11)
(9/46) column: BayleyIIIReasonNoSuccessLanguageReceptiveSubtestText (0 / 152)
(10/46) column: BayleyIIILanguageExpressiveSubtest (141 / 11)
(11/46) column: BayleyIIIReasonNoSuccessLanguageExpressiveSubtestText (0 / 152)
(12/46) column: BayleyIIIMotorFineSubtest (141 / 11)
(13/46) column: BayleyIIIReasonNoSuccessMotorFineSubtestText (0 / 152)
(14/46) column: BayleyIIIMotorGrossSubtest (141 / 11)
(15/46) column: BayleyIIIReasonNoSuccessMotorGrossSubtestText (0 / 152)
(16/46) column: BayleyIIIInEnglish (140 / 12)
(17/46) column: BayleyIIIRequireInterpreter (17 / 135)
(18/46) column: BayleyIIIAdministrator

In [31]:
# Per-column summary of the Bayley-III export; the saved output shows the same
# count pairs as check_empty.
COMBINE_harmonizer.column_info(df_bayley_iii)

(0/46) center: (152/0)
(1/46) subjectID: (152/0)
(2/46) uniqueID: (152/0)
(3/46) followupCenter: (152/0)
(4/46) followupID: (152/0)
(5/46) siteID: (152/0)
(6/46) BayleyIIICognitiveSubtest: (141/11)
(7/46) BayleyIIIReasonNoSuccessCognitiveSubtestText: (0/152)
(8/46) BayleyIIILanguageReceptiveSubtest: (141/11)
(9/46) BayleyIIIReasonNoSuccessLanguageReceptiveSubtestText: (0/152)
(10/46) BayleyIIILanguageExpressiveSubtest: (141/11)
(11/46) BayleyIIIReasonNoSuccessLanguageExpressiveSubtestText: (0/152)
(12/46) BayleyIIIMotorFineSubtest: (141/11)
(13/46) BayleyIIIReasonNoSuccessMotorFineSubtestText: (0/152)
(14/46) BayleyIIIMotorGrossSubtest: (141/11)
(15/46) BayleyIIIReasonNoSuccessMotorGrossSubtestText: (0/152)
(16/46) BayleyIIIInEnglish: (140/12)
(17/46) BayleyIIIRequireInterpreter: (17/135)
(18/46) BayleyIIIAdministratorMaskedToChildHistory: (140/12)
(19/46) BayleyIIIReasonNoSuccessCognitiveSubtest: (9/143)
(20/46) BayleyIIIReasonNoSuccessLanguageReceptiveSubtest: (9/143)
(21/46) BayleyI

## 20-05. GMFCS

In [32]:
# Columns the data dictionary files under the GMFCS subcategory.
gmfcs_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_GMFCS
)
# Identity columns first, then the section's own columns.
all_gmfcs_columns = [*id_columns, *gmfcs_columns]

gmfcs_columns

['grossMotorFunctionLevel']

In [33]:
# Destination of the GMFCS export.
out_filename = os.sep.join((out_dir, '20-05-gmfcs.csv'))

# Subset a copy of the merged frame with the GMFCS column list, postprocess it,
# and write it out without the index.
df_gmfcs = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_follow_up.copy(),
        all_gmfcs_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_gmfcs.to_csv(out_filename, index=False)

### 20-05-1. check empty

In [34]:
# Sanity check on the GMFCS export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_gmfcs)

(0/7) column: center (152 / 0)
(1/7) column: subjectID (152 / 0)
(2/7) column: uniqueID (152 / 0)
(3/7) column: followupCenter (152 / 0)
(4/7) column: followupID (152 / 0)
(5/7) column: siteID (152 / 0)
(6/7) column: grossMotorFunctionLevel (141 / 11)


In [35]:
# Per-column summary of the GMFCS export; the saved output shows the same count
# pairs as check_empty.
COMBINE_harmonizer.column_info(df_gmfcs)

(0/7) center: (152/0)
(1/7) subjectID: (152/0)
(2/7) uniqueID: (152/0)
(3/7) followupCenter: (152/0)
(4/7) followupID: (152/0)
(5/7) siteID: (152/0)
(6/7) grossMotorFunctionLevel: (141/11)


## 20-06. Status

In [36]:
# Columns the data dictionary files under the status subcategory.
status_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_STATUS
)
# Identity columns first, then the section's own columns.
all_status_columns = [*id_columns, *status_columns]

status_columns

['statusVisitDate',
 'statusBirthDate',
 'childFinalStatus',
 'deathDate',
 'deathCause',
 'reasonLossFollowUp',
 'firstVisitDate',
 'finalVisitDate']

In [37]:
# Destination of the status export.
out_filename = os.sep.join((out_dir, '20-06-status.csv'))

# Subset a copy of the merged frame with the status column list, postprocess
# it, and write it out without the index.
df_status = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_follow_up.copy(),
        all_status_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_status.to_csv(out_filename, index=False)

(9/13) deathCause not in df


### 20-06-1. check empty

In [38]:
# Sanity check on the status export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_status)

(0/13) column: center (152 / 0)
(1/13) column: subjectID (152 / 0)
(2/13) column: uniqueID (152 / 0)
(3/13) column: followupCenter (152 / 0)
(4/13) column: followupID (152 / 0)
(5/13) column: siteID (152 / 0)
(6/13) column: statusBirthDate (152 / 0)
(7/13) column: statusVisitDate (0 / 152)
(8/13) column: childFinalStatus (152 / 0)
(9/13) column: deathDate (2 / 150)
(10/13) column: reasonLossFollowUp (9 / 143)
(11/13) column: firstVisitDate (140 / 12)
(12/13) column: finalVisitDate (140 / 12)


In [39]:
# Per-column summary of the status export; the saved output shows the same
# count pairs as check_empty.
COMBINE_harmonizer.column_info(df_status)

(0/13) center: (152/0)
(1/13) subjectID: (152/0)
(2/13) uniqueID: (152/0)
(3/13) followupCenter: (152/0)
(4/13) followupID: (152/0)
(5/13) siteID: (152/0)
(6/13) statusBirthDate: (152/0)
(7/13) statusVisitDate: (0/152)
(8/13) childFinalStatus: (152/0)
(9/13) deathDate: (2/150)
(10/13) reasonLossFollowUp: (9/143)
(11/13) firstVisitDate: (140/12)
(12/13) finalVisitDate: (140/12)


## 20-08. Lost Follow-up

In [40]:
# Columns the data dictionary files under the lost-to-follow-up subcategory.
lost_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_LOST_FOLLOW_UP
)
# Identity columns first, then the section's own columns.
all_lost_columns = [*id_columns, *lost_columns]

lost_columns

['lostFollowUpInformationAvailableIndirectSrc',
 'lostFollowUpLastContactDate',
 'lostFollowUpFormCompleteDate',
 'lostFollowUpChildAlive',
 'lostFollowUpLastKnownAliveCorrectedAge_mo',
 'lostFollowUpDeathDate',
 'lostFollowUpInterview',
 'lostFollowUpInterviewDate',
 'lostFollowUpInterviewCorrectedAge_mo',
 'lostFollowUpAnyQuestionCompleteChartReview',
 'lostFollowUpChartReviewDate',
 'lostFollowUpChartReviewCorrectedAge_mo',
 'interviewChildHealth',
 'interviewWalkAlone',
 'interviewWalkAloneAge_mo',
 'interviewSittingAlong',
 'interviewHeadControl',
 'interviewSee',
 'interviewEyeExam',
 'interviewNeedWearGlasses',
 'interviewHear',
 'interviewHearExam',
 'interviewNeedWearHearingAid',
 'interviewNumberWordVocabulary',
 'interviewCombine2Words',
 'interviewCombine3Words',
 'interviewHydrocephalusShunt',
 'interviewCerebralPalsy',
 'interviewDevelopmentalDelay',
 'interviewLanguageDelay',
 'interviewPoorWeightGain',
 'interviewSeizure',
 'interviewBlindness',
 'interviewOtherBehavior

In [41]:
# Destination of the lost-to-follow-up export.
out_filename = os.sep.join((out_dir, '20-08-lost.csv'))

# Subset a copy of the merged frame with the lost-to-follow-up column list,
# postprocess it, and write it out without the index.
df_lost = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_follow_up.copy(),
        all_lost_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_lost.to_csv(out_filename, index=False)

(44/62) interviewMotorGrossFunctionLevel not in df
(61/62) chartReviewMotorGrossFunctionLevel not in df


### 20-08-1. check empty

In [42]:
# Sanity check on the lost-to-follow-up export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_lost)

(0/61) column: center (152 / 0)
(1/61) column: subjectID (152 / 0)
(2/61) column: uniqueID (152 / 0)
(3/61) column: followupCenter (152 / 0)
(4/61) column: followupID (152 / 0)
(5/61) column: siteID (152 / 0)
(6/61) column: lostFollowUpInformationAvailableIndirectSrc (10 / 142)
(7/61) column: lostFollowUpChildAlive (7 / 145)
(8/61) column: lostFollowUpInterview (6 / 146)
(9/61) column: lostFollowUpAnyQuestionCompleteChartReview (6 / 146)
(10/61) column: interviewWalkAlone (2 / 150)
(11/61) column: interviewSittingAlong (0 / 152)
(12/61) column: interviewHeadControl (0 / 152)
(13/61) column: interviewSee (2 / 150)
(14/61) column: interviewEyeExam (2 / 150)
(15/61) column: interviewNeedWearGlasses (2 / 150)
(16/61) column: interviewHear (2 / 150)
(17/61) column: interviewHearExam (2 / 150)
(18/61) column: interviewNeedWearHearingAid (0 / 152)
(19/61) column: interviewCombine2Words (2 / 150)
(20/61) column: interviewCombine3Words (2 / 150)
(21/61) column: interviewHydrocephalusShunt (2 / 

In [43]:
# Per-column summary of the lost-to-follow-up export; the saved output shows the
# same count pairs as check_empty.
COMBINE_harmonizer.column_info(df_lost)

(0/61) center: (152/0)
(1/61) subjectID: (152/0)
(2/61) uniqueID: (152/0)
(3/61) followupCenter: (152/0)
(4/61) followupID: (152/0)
(5/61) siteID: (152/0)
(6/61) lostFollowUpInformationAvailableIndirectSrc: (10/142)
(7/61) lostFollowUpChildAlive: (7/145)
(8/61) lostFollowUpInterview: (6/146)
(9/61) lostFollowUpAnyQuestionCompleteChartReview: (6/146)
(10/61) interviewWalkAlone: (2/150)
(11/61) interviewSittingAlong: (0/152)
(12/61) interviewHeadControl: (0/152)
(13/61) interviewSee: (2/150)
(14/61) interviewEyeExam: (2/150)
(15/61) interviewNeedWearGlasses: (2/150)
(16/61) interviewHear: (2/150)
(17/61) interviewHearExam: (2/150)
(18/61) interviewNeedWearHearingAid: (0/152)
(19/61) interviewCombine2Words: (2/150)
(20/61) interviewCombine3Words: (2/150)
(21/61) interviewHydrocephalusShunt: (2/150)
(22/61) interviewCerebralPalsy: (2/150)
(23/61) interviewDevelopmentalDelay: (2/150)
(24/61) interviewLanguageDelay: (2/150)
(25/61) interviewPoorWeightGain: (2/150)
(26/61) interviewSeizure: (

## 20-09. Secondary Analysis

In [44]:
# Columns the data dictionary files under the secondary-analysis subcategory.
secondary_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_SECONDARY_ANALYSIS
)
# Identity columns first, then the section's own columns.
all_secondary_columns = [*id_columns, *secondary_columns]

secondary_columns

['blindness',
 'moderateSevereCerebralPalsy',
 'cerebralPalsyMerge',
 'gastrostomyTube_b',
 'grossMotorFunctionLevelSeverity',
 'hearingImpairedWithAid',
 'hearingImpairedLevel',
 'multipleImpairment',
 'afterDischargeSeizure']

In [45]:
# Analysis frame with its two center columns relabelled in one rename: the
# column currently called 'followupCenter' becomes 'center', and 'fcenter'
# becomes 'followupCenter'.
df_analysis = df_dict[_ANALYSIS_FILENAME].rename(
    columns={'followupCenter': 'center', 'fcenter': 'followupCenter'},
)

# Destination of the secondary-analysis export.
out_filename = os.sep.join((out_dir, '20-09-secondary.csv'))

# Subset a copy of the analysis frame with the secondary-analysis column list,
# postprocess it, and write it out without the index.
df_secondary = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_analysis.copy(),
        all_secondary_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_secondary.to_csv(out_filename, index=False)

(1/14) siteID not in df
(8/14) gastrostomyTube_b not in df
(12/14) multipleImpairment not in df


### 20-09-1. check empty

In [46]:
# Sanity check on the secondary-analysis export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_secondary)

(0/12) column: center (168 / 0)
(1/12) column: subjectID (168 / 0)
(2/12) column: uniqueID (168 / 0)
(3/12) column: followupCenter (151 / 17)
(4/12) column: blindness (141 / 27)
(5/12) column: hearingImpairedLevel (141 / 27)
(6/12) column: hearingImpairedWithAid (141 / 27)
(7/12) column: grossMotorFunctionLevelSeverity (141 / 27)
(8/12) column: cerebralPalsyMerge (141 / 27)
(9/12) column: moderateSevereCerebralPalsy (141 / 27)
(10/12) column: followupID (133 / 35)
(11/12) column: afterDischargeSeizure (143 / 25)


In [47]:
# Per-column summary of the secondary-analysis export; the saved output shows
# the same count pairs as check_empty.
COMBINE_harmonizer.column_info(df_secondary)

(0/12) center: (168/0)
(1/12) subjectID: (168/0)
(2/12) uniqueID: (168/0)
(3/12) followupCenter: (151/17)
(4/12) blindness: (141/27)
(5/12) hearingImpairedLevel: (141/27)
(6/12) hearingImpairedWithAid: (141/27)
(7/12) grossMotorFunctionLevelSeverity: (141/27)
(8/12) cerebralPalsyMerge: (141/27)
(9/12) moderateSevereCerebralPalsy: (141/27)
(10/12) followupID: (133/35)
(11/12) afterDischargeSeizure: (143/25)


## 20-10. Outcome

In [48]:
# Columns the data dictionary files under the outcome subcategory.
outcome_columns = COMBINE_harmonizer.get_columns(
    df_data_dict, COMBINE_harmonizer.CATEGORY_FOLLOW_UP, COMBINE_harmonizer.SUBCATEGORY_OUTCOME
)
# Identity columns first, then the section's own columns.
all_outcome_columns = [*id_columns, *outcome_columns]

outcome_columns

['flagAdjudicatedOutcome',
 'normalPrimaryOutcome',
 'BayleyIIILanguage',
 'BayleyIIIMotor',
 'BayleyIIICognitive',
 'deathBeforeFollowup',
 'deathBeforeDischarge',
 'disabilityLevelSurvivor',
 'disabilityLevelDeath4Category',
 'moderateSevereDisabilityOrDeath',
 'moderateSevereDisabilitySurvivor',
 'disabilityLevelDeath',
 'outcomeGroup']

In [49]:
# Destination of the outcome export.
out_filename = os.sep.join((out_dir, '20-10-outcome.csv'))

# Subset a copy of the analysis frame with the outcome column list, postprocess
# it, and write it out without the index.
df_outcome = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_analysis.copy(),
        all_outcome_columns,
        debug_df=False,
        debug_columns=True,
    )
)
df_outcome.to_csv(out_filename, index=False)

(1/18) siteID not in df
(5/18) flagAdjudicatedOutcome not in df
(7/18) BayleyIIILanguage not in df
(8/18) BayleyIIIMotor not in df
(16/18) disabilityLevelDeath not in df
(17/18) outcomeGroup not in df


### 20-10-1. check empty

In [50]:
# Sanity check on the outcome export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_outcome)

(0/13) column: center (168 / 0)
(1/13) column: subjectID (168 / 0)
(2/13) column: uniqueID (168 / 0)
(3/13) column: followupCenter (151 / 17)
(4/13) column: BayleyIIICognitive (138 / 30)
(5/13) column: deathBeforeFollowup (165 / 3)
(6/13) column: normalPrimaryOutcome (139 / 29)
(7/13) column: deathBeforeDischarge (167 / 1)
(8/13) column: followupID (133 / 35)
(9/13) column: moderateSevereDisabilitySurvivor (139 / 29)
(10/13) column: disabilityLevelSurvivor (139 / 29)
(11/13) column: moderateSevereDisabilityOrDeath (157 / 11)
(12/13) column: disabilityLevelDeath4Category (157 / 11)


In [51]:
# Per-column summary of the outcome export; the saved output shows the same
# count pairs as check_empty.
COMBINE_harmonizer.column_info(df_outcome)

(0/13) center: (168/0)
(1/13) subjectID: (168/0)
(2/13) uniqueID: (168/0)
(3/13) followupCenter: (151/17)
(4/13) BayleyIIICognitive: (138/30)
(5/13) deathBeforeFollowup: (165/3)
(6/13) normalPrimaryOutcome: (139/29)
(7/13) deathBeforeDischarge: (167/1)
(8/13) followupID: (133/35)
(9/13) moderateSevereDisabilitySurvivor: (139/29)
(10/13) disabilityLevelSurvivor: (139/29)
(11/13) moderateSevereDisabilityOrDeath: (157/11)
(12/13) disabilityLevelDeath4Category: (157/11)


## 20-07. Readmission

In [52]:
# Lookup table from the follow-up keys (followupCenter, followupID) to
# center / subjectID, taken from the filtered follow-up frame.
columns_subject_id_map = ['center', 'followupID', 'followupCenter', 'subjectID']
df_subject_id_map = df_follow_up.loc[:, columns_subject_id_map]

In [53]:
# Display the ID lookup table for a visual check.
df_subject_id_map

Unnamed: 0,center,followupID,followupCenter,subjectID
0,11,LHF01,11,LH006
1,11,LHF02,11,LH011
2,11,LHF03,11,LH023
3,11,LHF04,11,LH030
4,11,LHF05,11,LH033
...,...,...,...,...
150,09,LHF06,09,LH112
151,09,LHF07,09,LH146
152,09,LHF08,09,LH135
153,09,LHF09,09,LH204


In [54]:
# Destination of the readmission export.
out_filename = os.sep.join((out_dir, '20-07-readmission.csv'))

# Readmission form: subset with the full data-dictionary column list, then
# postprocess using the follow-up keys as subject and center identifiers.
df_readmission = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_dict['lf04ar.csv'],
        all_valid_columns,
        debug_df=True,
        debug_columns=False,
    ),
    subject_id_idx='followupID',
    center_id_idx='followupCenter',
)

# Attach center / subjectID from the lookup table (left join keeps every
# readmission row), then postprocess again with the default identifiers.
df_readmission = COMBINE_harmonizer.postprocess(
    df_readmission.merge(
        df_subject_id_map,
        how='left',
        on=['followupCenter', 'followupID'],
    )
)
df_readmission.to_csv(out_filename, index=False)

(2/9) REC_CMP not in columns
(7/9) CMP_DATE not in columns
(8/9) CRT_DATE not in columns


### 20-07-1. check readmission: no duplicate (uniqueID, readmissionNumber) pairs

In [55]:
# Count the rows for each (uniqueID, readmissionNumber) pair.
df_readmission_groupby = (
    df_readmission
    .groupby(['uniqueID', 'readmissionNumber'])
    .agg(_count=('uniqueID', 'count'))
)

# Any pair that occurs more than once is a duplicate; an empty result means
# there are none.
is_invalid = df_readmission_groupby['_count'] > 1
df_readmission_groupby.loc[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,readmissionNumber,Unnamed: 2_level_1


### 20-07-2. check empty cells

In [56]:
# Sanity check on the readmission export: prints one count pair per column
# (presumably non-empty / empty cells — confirm in COMBINE_harmonizer).
COMBINE_harmonizer.check_empty(df_readmission)

(0/9) column: center (65 / 0)
(1/9) column: subjectID (65 / 0)
(2/9) column: uniqueID (65 / 0)
(3/9) column: followupCenter (65 / 0)
(4/9) column: followupID (65 / 0)
(5/9) column: readmissionPrimaryCauseOtherText (7 / 58)
(6/9) column: readmissionNumber (65 / 0)
(7/9) column: readmissionTimePeriod (65 / 0)
(8/9) column: readmissionPrimaryCause (65 / 0)


In [57]:
# Per-column summary of the readmission export; the saved output shows the same
# count pairs as check_empty.
COMBINE_harmonizer.column_info(df_readmission)

(0/9) center: (65/0)
(1/9) subjectID: (65/0)
(2/9) uniqueID: (65/0)
(3/9) followupCenter: (65/0)
(4/9) followupID: (65/0)
(5/9) readmissionPrimaryCauseOtherText: (7/58)
(6/9) readmissionNumber: (65/0)
(7/9) readmissionTimePeriod: (65/0)
(8/9) readmissionPrimaryCause: (65/0)
