In [1]:
import pandas as pd
# Opt in to pandas copy-on-write mode for the whole notebook.
pd.options.mode.copy_on_write = True
import warnings
# Silence pandas PerformanceWarning messages.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import os
import re
import pydoc

# Project-local harmonization helpers and their shared configuration object.
import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 00-0. Variables

In [2]:
# Study and data-dictionary sheet handled by this notebook.
study_name = COMBINE_harmonizer.STUDY_OC
sheet_name = COMBINE_harmonizer.SHEET_FOLLOW_UP

# Base directory; config.yaml and the data dictionary are resolved relative to it.
root_dir = '..'

In [3]:
# Initialize COMBINE_harmonizer from the project config file; cfg.config is read in later cells.
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')


In [4]:
# Readmission form export. It is loaded with the other forms but is not part
# of the row-wise merge; the readmission section reads it on its own.
_READMISSION_FILENAME = 'of04ar.csv'

# Every follow-up form export that is loaded into df_dict.
_FILENAMES = [
    'of01.csv',
    'of03.csv',
    'of04.csv',
    'of04a.csv',
    _READMISSION_FILENAME,
    'of05.csv',
    'of09a.csv',
    'of10.csv',
    'of11.csv',
    'of12.csv',
]

# Analysis export, read from the analysis directory.
_ANALYSIS_FILENAME = 'analysis.csv'

# Derived from the single name above so the two can never disagree.
_ANALYSIS_FILENAMES = [_ANALYSIS_FILENAME]

# Forms merged into one wide follow-up frame: all of _FILENAMES, in the same
# order, except the readmission form. Derived rather than retyped so that a
# form added to _FILENAMES is merged automatically.
_FILENAMES_MERGE = [each for each in _FILENAMES if each != _READMISSION_FILENAME]


In [5]:
# Per-study input directories, looked up in the loaded configuration.
input_dir = cfg.config[f'{study_name}_follow_up_dir']
input_analysis_dir = cfg.config[f'{study_name}_analysis_dir']

# Data dictionary workbook and the per-study output directory.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"
out_dir = f"{cfg.config['out_dir']}/out-{study_name}"

# Create the output directory up front; a directory that already exists is not an error.
os.makedirs(out_dir, exist_ok=True)

## 00-1. Column Map

In [6]:
# Load the data-dictionary sheet for this notebook.
df_data_dict = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=sheet_name)

# Harmonized variable names, in data-dictionary order.
all_valid_columns = df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME].tolist()

# Study-specific column name -> harmonized variable name.
# A later row wins when the same study column appears more than once.
column_map = dict(zip(
    df_data_dict[study_name],
    df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME],
))

## 00-2. Load `df_dict` from `_FILENAMES` and `_ANALYSIS_FILENAMES`

In [7]:
def _load_renamed_csv(directory, filename):
    """Read one CSV with every column as object dtype and apply column_map to its headers."""
    csv_path = os.sep.join([directory, filename])
    return pd.read_csv(csv_path, dtype='O').rename(columns=column_map)


# Follow-up form exports, keyed by file name.
df_dict = {name: _load_renamed_csv(input_dir, name) for name in _FILENAMES}

# Analysis exports come from their own directory but share the same dict.
df_analysis_dict = {name: _load_renamed_csv(input_analysis_dir, name) for name in _ANALYSIS_FILENAMES}

df_dict.update(df_analysis_dict)

### 00-4. Identifier columns

In [8]:
# Identifier columns of the follow-up category, as listed in the data dictionary.
# They are prepended to every per-section column list below.
id_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_IDENTITY)

id_columns

['followupCenter', 'siteID', 'center', 'subjectID', 'followupID']

## 20-01. follow-up

In [9]:
# Outer-merge every follow-up form into one wide frame on the follow-up keys.
# Columns that collide with an earlier form keep their name on the left and get
# a ':<form>' suffix on the right (e.g. 'REC_CMP:of03').
merge_keys = ['followupCenter', 'followupID']

df_all = None
for each_filename in _FILENAMES_MERGE:
    each_df = df_dict[each_filename]
    each_filename_prefix = re.sub(r'\.csv$', '', each_filename)

    if df_all is None:
        df_all = each_df
    else:
        df_all = df_all.merge(
            each_df,
            on=merge_keys,
            how='outer',
            suffixes=('', ':' + each_filename_prefix),
        )


# follow-up
print('to set follow-up')
df_follow_up = df_all

df_follow_up = COMBINE_harmonizer.valid_columns(df_follow_up, all_valid_columns, debug_df=True, debug_columns=False)
df_follow_up = COMBINE_harmonizer.postprocess(df_follow_up, subject_id_idx='followupID', center_id_idx='followupCenter')

# Keep the raw center value, then normalize it with to_center.
df_follow_up['center_orig'] = df_follow_up['center'].copy()
df_follow_up['center'] = df_follow_up['center'].apply(COMBINE_harmonizer.to_center)

# Fall back to the follow-up center wherever the normalized center is not a
# non-empty string. Done column-wise so it also works on an empty frame.
has_center = df_follow_up['center'].map(lambda value: isinstance(value, str) and value != '').astype(bool)
df_follow_up['center'] = df_follow_up['center'].where(has_center, df_follow_up['followupCenter'])


to set follow-up
(2/410) LAST not in columns
(3/410) FIRST not in columns
(5/410) PROTID02 not in columns
(6/410) PROTID03 not in columns
(7/410) PROTID04 not in columns
(8/410) PROTID05 not in columns
(9/410) PROTID06 not in columns
(10/410) PROTID07 not in columns
(11/410) PROTID08 not in columns
(12/410) PROTID09 not in columns
(13/410) PROTID10 not in columns
(14/410) PROTID11 not in columns
(15/410) PROTID12 not in columns
(16/410) PROTID13 not in columns
(17/410) PROTID14 not in columns
(18/410) PROTID15 not in columns
(19/410) PROTID16 not in columns
(20/410) PROTID17 not in columns
(21/410) PROTID18 not in columns
(22/410) PROTID19 not in columns
(23/410) PROTID20 not in columns
(24/410) REC_CMP not in columns
(30/410) CMP_DATE not in columns
(31/410) CRT_DATE not in columns
(62/410) CFINITS not in columns
(63/410) REC_CMP:of03 not in columns
(99/410) CFPRRESP not in columns
(102/410) CMP_DATE:of03 not in columns
(103/410) CRT_DATE:of03 not in columns
(140/410) NF4LOCSP not in 

### 20-01-1. XXX hack: drop rows with childFinalStatus 5 ("follow-up visit completed in another NRN center")

In [10]:
# Show the distinct childFinalStatus codes and the row count before filtering.
df_follow_up['childFinalStatus'].unique(), len(df_follow_up)

(array(['1', '5', '4', '3', '6'], dtype=object), 320)

In [11]:
# Keep every row whose childFinalStatus is not 5. Both '5' and '5.0' are
# matched because the CSVs are read as strings.
# NOTE(review): per the section heading, status 5 is presumably "follow-up
# visit completed in another NRN center" - confirm against the data dictionary.
is_valid = ~df_follow_up['childFinalStatus'].isin(['5', '5.0'])
df_follow_up = df_follow_up[is_valid]
len(df_follow_up)

315

## 20-00. Follow-up

In [12]:
# Data-dictionary columns of the follow-up sub-category.
follow_up_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_FOLLOWUP)

# Identifier columns first, then the section's own columns.
all_follow_up_columns = id_columns + follow_up_columns
follow_up_columns

['birthDate', 'visitDate', 'birthNumber', 'center_orig']

In [13]:
# Build the follow-up section from a copy of the merged frame:
# select the section columns, then run the standard post-processing.
df_follow_up_follow_up = (
    df_follow_up.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_follow_up_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section next to the other per-study outputs, without the index.
out_filename = os.sep.join([out_dir, '20-00-follow-up.csv'])
df_follow_up_follow_up.to_csv(out_filename, index=False)

### 20-00-1. check empty

In [14]:
# Per-column report; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
# NOTE(review): the recorded output shows birthNumber empty in all 315 rows - confirm the column mapping.
COMBINE_harmonizer.check_empty(df_follow_up_follow_up)

(0/10) column: center (315 / 0)
(1/10) column: subjectID (315 / 0)
(2/10) column: uniqueID (315 / 0)
(3/10) column: followupCenter (315 / 0)
(4/10) column: followupID (315 / 0)
(5/10) column: siteID (315 / 0)
(6/10) column: birthDate (315 / 0)
(7/10) column: visitDate (291 / 24)
(8/10) column: birthNumber (0 / 315)
(9/10) column: center_orig (6 / 309)


In [15]:
# Per-column summary of the follow-up section (the recorded output lists a count pair per column).
COMBINE_harmonizer.column_info(df_follow_up_follow_up)

(0/10) center: (315/0)
(1/10) subjectID: (315/0)
(2/10) uniqueID: (315/0)
(3/10) followupCenter: (315/0)
(4/10) followupID: (315/0)
(5/10) siteID: (315/0)
(6/10) birthDate: (315/0)
(7/10) visitDate: (291/24)
(8/10) birthNumber: (0/315)
(9/10) center_orig: (6/309)


## 20-01. SES

In [16]:
# Data-dictionary columns of the socioeconomic-status sub-category.
ses_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_SOCIOECONOMIC_STATUS)

# Identifier columns first, then the section's own columns.
# Unlike the other sections, this cell displays the combined list.
all_ses_columns = id_columns + ses_columns
all_ses_columns

['followupCenter',
 'siteID',
 'center',
 'subjectID',
 'followupID',
 'SESVisitDate',
 'SESBirthDate',
 'chronologicalAge_mo',
 'correctedAge_mo',
 'underStateSupervision',
 'primaryCaretaker',
 'otherCaretaker',
 'maritalStatusPrimaryCaretaker',
 'livingArrangementChild',
 'numberPeopleInChildHousehold',
 'otherContributeMoneyToChildHousehold',
 'educationPrimaryCaretaker',
 'educationOtherCaretaker',
 'workPrimaryCaretaker',
 'workOtherCaretaker',
 'inSchoolPrimaryCaretaker',
 'inSchoolOtherCaretaker',
 'totalIncomeChildHousehold',
 'medicalInsuranceChild',
 'primaryLanguageChild',
 'primaryLanguageChildOtherText',
 'isSecondaryLanguageChild',
 'secondaryLanguageChild',
 'secondaryLanguageChildOtherText',
 'numberPlaceChildLive',
 'zipcode',
 'visitingNurseReceive',
 'visitingNurseNeed',
 'homeNurseReceive',
 'homeNurseNeed',
 'otPtReceive',
 'otPtNeed',
 'speechTherapyReceive',
 'speechTherapyNeed',
 'earlyInterventionReceive',
 'earlyInterventionNeed',
 'socialWorkForChildReceive'

In [17]:
# Build the SES section from a copy of the merged frame.
# With debug_columns=True, requested columns missing from the frame are printed.
df_ses = (
    df_follow_up.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_ses_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-01-ses.csv'])
df_ses.to_csv(out_filename, index=False)

(22/78) totalIncomeChildHousehold not in df
(30/78) zipcode not in df
(43/78) specialClinicReceive not in df
(44/78) specialClinicNeed not in df
(60/78) prematureFollowupClinicReceive not in df
(61/78) prematureFollowupClinicNeed not in df


### 20-01-1. check empty

In [18]:
# Per-column report for the SES section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
COMBINE_harmonizer.check_empty(df_ses)

(0/73) column: center (315 / 0)
(1/73) column: subjectID (315 / 0)
(2/73) column: uniqueID (315 / 0)
(3/73) column: followupCenter (315 / 0)
(4/73) column: followupID (315 / 0)
(5/73) column: siteID (315 / 0)
(6/73) column: underStateSupervision (288 / 27)
(7/73) column: otherContributeMoneyToChildHousehold (288 / 27)
(8/73) column: workPrimaryCaretaker (288 / 27)
(9/73) column: workOtherCaretaker (254 / 61)
(10/73) column: inSchoolPrimaryCaretaker (288 / 27)
(11/73) column: inSchoolOtherCaretaker (254 / 61)
(12/73) column: primaryLanguageChildOtherText (15 / 300)
(13/73) column: isSecondaryLanguageChild (288 / 27)
(14/73) column: secondaryLanguageChildOtherText (22 / 293)
(15/73) column: visitingNurseNeed (288 / 27)
(16/73) column: homeNurseNeed (288 / 27)
(17/73) column: otPtNeed (288 / 27)
(18/73) column: speechTherapyNeed (288 / 27)
(19/73) column: earlyInterventionNeed (288 / 27)
(20/73) column: socialWorkForChildNeed (288 / 27)
(21/73) column: pulmonaryNeed (288 / 27)
(22/73) col

In [19]:
# Per-column summary of the SES section.
COMBINE_harmonizer.column_info(df_ses)

(0/73) center: (315/0)
(1/73) subjectID: (315/0)
(2/73) uniqueID: (315/0)
(3/73) followupCenter: (315/0)
(4/73) followupID: (315/0)
(5/73) siteID: (315/0)
(6/73) underStateSupervision: (288/27)
(7/73) otherContributeMoneyToChildHousehold: (288/27)
(8/73) workPrimaryCaretaker: (288/27)
(9/73) workOtherCaretaker: (254/61)
(10/73) inSchoolPrimaryCaretaker: (288/27)
(11/73) inSchoolOtherCaretaker: (254/61)
(12/73) primaryLanguageChildOtherText: (15/300)
(13/73) isSecondaryLanguageChild: (288/27)
(14/73) secondaryLanguageChildOtherText: (22/293)
(15/73) visitingNurseNeed: (288/27)
(16/73) homeNurseNeed: (288/27)
(17/73) otPtNeed: (288/27)
(18/73) speechTherapyNeed: (288/27)
(19/73) earlyInterventionNeed: (288/27)
(20/73) socialWorkForChildNeed: (288/27)
(21/73) pulmonaryNeed: (288/27)
(22/73) ophthalmologicNeed: (288/27)
(23/73) gastrointestinalNeed: (288/27)
(24/73) audiologicNeed: (288/27)
(25/73) neurologicNeed: (288/27)
(26/73) otherNeed: (288/27)
(27/73) otherNeedText: (93/222)
(28/73)

## 20-02. Medical History

In [20]:
# Data-dictionary columns of the medical-history sub-category.
medical_history_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_MEDICAL_HISTORY)

# Identifier columns first, then the section's own columns.
all_medical_history_columns = id_columns + medical_history_columns
medical_history_columns

['rehospitalize',
 'numberRehospitalize',
 'operation',
 'operationTypanostomyTube',
 'operationTracheostomy',
 'operationEyeSurgery',
 'operationEyeSurgeryReason',
 'operationHerniaSurgery',
 'operationGastrostomyTube',
 'operationFundoplication',
 'operationShuntForHydrocephalus',
 'operationReanastomosisOfLargeOrSmallIntenstine',
 'operationPDALigation',
 'operationBrochoscopy',
 'operationHypospadiusRepair',
 'operationOther',
 'operationOtherText',
 'medication',
 'vitaminMineralSupplement',
 'highCaloricFormula',
 'diuretics',
 'antiRefluxMedication',
 'bronchodilator',
 'inhaledSteroid',
 'oralIvSteroid',
 'otherAsthmaMedication',
 'decongestantColdAllergyMedication',
 'anticonvulsantMedication',
 'prophylaticAntibiotics',
 'antibiotics',
 'constipationMedication',
 'bloodPressureMedication',
 'thyroidMedication',
 'muscleRelaxants',
 'botox',
 'otherMedication',
 'otherMedicationText',
 'seizure',
 'medicalEquipmentHomeUse',
 'apneaMonitor',
 'oxygen',
 'ventilatorCPAP',
 'gast

In [21]:
# Build the medical-history section from a copy of the merged frame.
df_medical_history = (
    df_follow_up.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_medical_history_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-02-medical-history.csv'])
df_medical_history.to_csv(out_filename, index=False)

### 20-02-1. check empty

In [22]:
# Per-column report for the medical-history section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
COMBINE_harmonizer.check_empty(df_medical_history)

(0/69) column: center (315 / 0)
(1/69) column: subjectID (315 / 0)
(2/69) column: uniqueID (315 / 0)
(3/69) column: followupCenter (315 / 0)
(4/69) column: followupID (315 / 0)
(5/69) column: siteID (315 / 0)
(6/69) column: rehospitalize (287 / 28)
(7/69) column: operation (287 / 28)
(8/69) column: operationTypanostomyTube (62 / 253)
(9/69) column: operationTracheostomy (62 / 253)
(10/69) column: operationEyeSurgery (62 / 253)
(11/69) column: operationHerniaSurgery (62 / 253)
(12/69) column: operationGastrostomyTube (62 / 253)
(13/69) column: operationFundoplication (62 / 253)
(14/69) column: operationShuntForHydrocephalus (62 / 253)
(15/69) column: operationReanastomosisOfLargeOrSmallIntenstine (62 / 253)
(16/69) column: operationPDALigation (62 / 253)
(17/69) column: operationBrochoscopy (62 / 253)
(18/69) column: operationHypospadiusRepair (62 / 253)
(19/69) column: operationOther (62 / 253)
(20/69) column: operationOtherText (29 / 286)
(21/69) column: medication (287 / 28)
(22/69) 

In [23]:
# Per-column summary of the medical-history section.
COMBINE_harmonizer.column_info(df_medical_history)

(0/69) center: (315/0)
(1/69) subjectID: (315/0)
(2/69) uniqueID: (315/0)
(3/69) followupCenter: (315/0)
(4/69) followupID: (315/0)
(5/69) siteID: (315/0)
(6/69) rehospitalize: (287/28)
(7/69) operation: (287/28)
(8/69) operationTypanostomyTube: (62/253)
(9/69) operationTracheostomy: (62/253)
(10/69) operationEyeSurgery: (62/253)
(11/69) operationHerniaSurgery: (62/253)
(12/69) operationGastrostomyTube: (62/253)
(13/69) operationFundoplication: (62/253)
(14/69) operationShuntForHydrocephalus: (62/253)
(15/69) operationReanastomosisOfLargeOrSmallIntenstine: (62/253)
(16/69) operationPDALigation: (62/253)
(17/69) operationBrochoscopy: (62/253)
(18/69) operationHypospadiusRepair: (62/253)
(19/69) operationOther: (62/253)
(20/69) operationOtherText: (29/286)
(21/69) medication: (287/28)
(22/69) otherMedicationText: (19/296)
(23/69) seizure: (287/28)
(24/69) medicalEquipmentHomeUse: (287/28)
(25/69) fluShot: (287/28)
(26/69) RSVProphylaxis: (287/28)
(27/69) independentFeedSelf: (287/28)
(28

## 20-03. Medical Exam

In [24]:
# Data-dictionary columns of the medical-exam sub-category.
medical_exam_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_MEDICAL_EXAM)

# Identifier columns first, then the section's own columns.
all_medical_exam_columns = id_columns + medical_exam_columns
medical_exam_columns

['weight_cm',
 'length_cm',
 'headCircumference_cm',
 'strabismusRight',
 'strabismusLeft',
 'nystagmusRight',
 'nystagmusLeft',
 'rovingEyeMovementRight',
 'rovingEyeMovementLeft',
 'eyeTrackRight',
 'eyeTrackLeft',
 'visionRight',
 'visionLeft',
 'audiologicAssessment',
 'audiologicPendingForAssessment',
 'visualReinforcementAudiometry',
 'VRARight',
 'VRALeft',
 'VRASoundField',
 'ABR',
 'ABRRight',
 'ABRLeft',
 'hearingTestUnknown',
 'hearingTestUnknownRight',
 'hearingTestUnknownLeft',
 'hearingImpaired',
 'hearingAidRequirement',
 'hearingImplant',
 'swallowing',
 'dysphagia',
 'aspiration',
 'abnormalVoice',
 'drooling',
 'nothingByMouth',
 'observedAbnormalMovement',
 'observedAbnormalMovementShortJerky',
 'observedAbnormalMovementSlowWrithing',
 'observedAbnormalMovementTremor',
 'passiveMuscleToneNeckTrunk',
 'upperExtremityMuscleToneRight',
 'upperExtremityMuscleToneLeft',
 'lowerExtremityMuscleToneHipKneeRight',
 'lowerExtremityMuscleToneHipKneeLeft',
 'lowerExtremityMuscle

In [25]:
# Build the medical-exam section from a copy of the merged frame.
# With debug_columns=True, requested columns missing from the frame are printed.
df_medical_exam = (
    df_follow_up.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_medical_exam_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-03-medical-exam.csv'])
df_medical_exam.to_csv(out_filename, index=False)

(80/99) spasticTriplegia not in df
(83/99) athetosisDystonia not in df
(85/99) spasticMonoplegia not in df
(86/99) mixedCerebralPalsy not in df


### 20-03-1. check empty

In [26]:
# Per-column report for the medical-exam section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
COMBINE_harmonizer.check_empty(df_medical_exam)

(0/96) column: center (315 / 0)
(1/96) column: subjectID (315 / 0)
(2/96) column: uniqueID (315 / 0)
(3/96) column: followupCenter (315 / 0)
(4/96) column: followupID (315 / 0)
(5/96) column: siteID (315 / 0)
(6/96) column: audiologicAssessment (285 / 30)
(7/96) column: audiologicPendingForAssessment (152 / 163)
(8/96) column: visualReinforcementAudiometry (90 / 225)
(9/96) column: ABR (90 / 225)
(10/96) column: hearingTestUnknown (133 / 182)
(11/96) column: dysphagia (285 / 30)
(12/96) column: aspiration (285 / 30)
(13/96) column: abnormalVoice (285 / 30)
(14/96) column: drooling (285 / 30)
(15/96) column: nothingByMouth (285 / 30)
(16/96) column: observedAbnormalMovement (285 / 30)
(17/96) column: observedAbnormalMovementShortJerky (19 / 296)
(18/96) column: observedAbnormalMovementSlowWrithing (19 / 296)
(19/96) column: observedAbnormalMovementTremor (19 / 296)
(20/96) column: scissoringLegs (285 / 30)
(21/96) column: neuralNormal (285 / 30)
(22/96) column: generalizedHypotonia (54 

In [27]:
# Per-column summary of the medical-exam section.
COMBINE_harmonizer.column_info(df_medical_exam)

(0/96) center: (315/0)
(1/96) subjectID: (315/0)
(2/96) uniqueID: (315/0)
(3/96) followupCenter: (315/0)
(4/96) followupID: (315/0)
(5/96) siteID: (315/0)
(6/96) audiologicAssessment: (285/30)
(7/96) audiologicPendingForAssessment: (152/163)
(8/96) visualReinforcementAudiometry: (90/225)
(9/96) ABR: (90/225)
(10/96) hearingTestUnknown: (133/182)
(11/96) dysphagia: (285/30)
(12/96) aspiration: (285/30)
(13/96) abnormalVoice: (285/30)
(14/96) drooling: (285/30)
(15/96) nothingByMouth: (285/30)
(16/96) observedAbnormalMovement: (285/30)
(17/96) observedAbnormalMovementShortJerky: (19/296)
(18/96) observedAbnormalMovementSlowWrithing: (19/296)
(19/96) observedAbnormalMovementTremor: (19/296)
(20/96) scissoringLegs: (285/30)
(21/96) neuralNormal: (285/30)
(22/96) generalizedHypotonia: (54/261)
(23/96) hypertonia: (54/261)
(24/96) neuralOther: (54/261)
(25/96) neuralOtherText: (17/298)
(26/96) spasticQuadriplegia: (55/260)
(27/96) spasticDiplegia: (55/260)
(28/96) spasticHemiplegiaRight: (55

## 20-04. Bayley-III

In [28]:
# Data-dictionary columns of the Bayley-III sub-category.
bayley_iii_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_BAYLEY_III)

# Identifier columns first, then the section's own columns.
all_bayley_iii_columns = id_columns + bayley_iii_columns
bayley_iii_columns

['BayleyIIICognitiveSubtest',
 'BayleyIIIReasonNoSuccessCognitiveSubtest',
 'BayleyIIIReasonNoSuccessCognitiveSubtestText',
 'BayleyIIILanguageReceptiveSubtest',
 'BayleyIIIReasonNoSuccessLanguageReceptiveSubtest',
 'BayleyIIIReasonNoSuccessLanguageReceptiveSubtestText',
 'BayleyIIILanguageExpressiveSubtest',
 'BayleyIIIReasonNoSuccessLanguageExpressiveSubtest',
 'BayleyIIIReasonNoSuccessLanguageExpressiveSubtestText',
 'BayleyIIIMotorFineSubtest',
 'BayleyIIIReasonNoSuccessMotorFineSubtest',
 'BayleyIIIReasonNoSuccessMotorFineSubtestText',
 'BayleyIIIMotorGrossSubtest',
 'BayleyIIIReasonNoSuccessMotorGrossSubtest',
 'BayleyIIIReasonNoSuccessMotorGrossSubtestText',
 'BayleyIIIAdjustedAgeForCognitiveTest',
 'BayleyIIIAdjustedAgeForReceptiveCommunication',
 'BayleyIIIAdjustedAgeForExpressiveCommunication',
 'BayleyIIIAdjustedAgeForMotorFineSubtest',
 'BayleyIIIAdjustedAgeForMotorGrossSubtest',
 'BayleyIIICognitiveRaw',
 'BayleyIIICognitiveScale',
 'BayleyIIICognitiveComposite',
 'BayleyI

In [29]:
# Build the Bayley-III section from a copy of the merged frame.
df_bayley_iii = (
    df_follow_up.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_bayley_iii_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-04-bayley-iii.csv'])
df_bayley_iii.to_csv(out_filename, index=False)

### 20-04-1. check empty

In [30]:
# Per-column report for the Bayley-III section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
# NOTE(review): the recorded output shows every BayleyIIIReasonNoSuccess...Text column
# empty in all 315 rows - confirm the column mapping.
COMBINE_harmonizer.check_empty(df_bayley_iii)

(0/46) column: center (315 / 0)
(1/46) column: subjectID (315 / 0)
(2/46) column: uniqueID (315 / 0)
(3/46) column: followupCenter (315 / 0)
(4/46) column: followupID (315 / 0)
(5/46) column: siteID (315 / 0)
(6/46) column: BayleyIIICognitiveSubtest (287 / 28)
(7/46) column: BayleyIIIReasonNoSuccessCognitiveSubtestText (0 / 315)
(8/46) column: BayleyIIILanguageReceptiveSubtest (287 / 28)
(9/46) column: BayleyIIIReasonNoSuccessLanguageReceptiveSubtestText (0 / 315)
(10/46) column: BayleyIIILanguageExpressiveSubtest (287 / 28)
(11/46) column: BayleyIIIReasonNoSuccessLanguageExpressiveSubtestText (0 / 315)
(12/46) column: BayleyIIIMotorFineSubtest (287 / 28)
(13/46) column: BayleyIIIReasonNoSuccessMotorFineSubtestText (0 / 315)
(14/46) column: BayleyIIIMotorGrossSubtest (287 / 28)
(15/46) column: BayleyIIIReasonNoSuccessMotorGrossSubtestText (0 / 315)
(16/46) column: BayleyIIIInEnglish (283 / 32)
(17/46) column: BayleyIIIRequireInterpreter (27 / 288)
(18/46) column: BayleyIIIAdministrator

In [31]:
# Per-column summary of the Bayley-III section.
COMBINE_harmonizer.column_info(df_bayley_iii)

(0/46) center: (315/0)
(1/46) subjectID: (315/0)
(2/46) uniqueID: (315/0)
(3/46) followupCenter: (315/0)
(4/46) followupID: (315/0)
(5/46) siteID: (315/0)
(6/46) BayleyIIICognitiveSubtest: (287/28)
(7/46) BayleyIIIReasonNoSuccessCognitiveSubtestText: (0/315)
(8/46) BayleyIIILanguageReceptiveSubtest: (287/28)
(9/46) BayleyIIIReasonNoSuccessLanguageReceptiveSubtestText: (0/315)
(10/46) BayleyIIILanguageExpressiveSubtest: (287/28)
(11/46) BayleyIIIReasonNoSuccessLanguageExpressiveSubtestText: (0/315)
(12/46) BayleyIIIMotorFineSubtest: (287/28)
(13/46) BayleyIIIReasonNoSuccessMotorFineSubtestText: (0/315)
(14/46) BayleyIIIMotorGrossSubtest: (287/28)
(15/46) BayleyIIIReasonNoSuccessMotorGrossSubtestText: (0/315)
(16/46) BayleyIIIInEnglish: (283/32)
(17/46) BayleyIIIRequireInterpreter: (27/288)
(18/46) BayleyIIIAdministratorMaskedToChildHistory: (283/32)
(19/46) BayleyIIIReasonNoSuccessCognitiveSubtest: (17/298)
(20/46) BayleyIIIReasonNoSuccessLanguageReceptiveSubtest: (22/293)
(21/46) Bayle

## 20-05. GMFCS

In [32]:
# Data-dictionary columns of the GMFCS sub-category.
gmfcs_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_GMFCS)

# Identifier columns first, then the section's own columns.
all_gmfcs_columns = id_columns + gmfcs_columns
gmfcs_columns

['grossMotorFunctionLevel']

In [33]:
# Build the GMFCS section from a copy of the merged frame.
df_gmfcs = (
    df_follow_up.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_gmfcs_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-05-gmfcs.csv'])
df_gmfcs.to_csv(out_filename, index=False)

### 20-05-1. check empty

In [34]:
# Per-column report for the GMFCS section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
COMBINE_harmonizer.check_empty(df_gmfcs)

(0/7) column: center (315 / 0)
(1/7) column: subjectID (315 / 0)
(2/7) column: uniqueID (315 / 0)
(3/7) column: followupCenter (315 / 0)
(4/7) column: followupID (315 / 0)
(5/7) column: siteID (315 / 0)
(6/7) column: grossMotorFunctionLevel (285 / 30)


In [35]:
# Per-column summary of the GMFCS section.
COMBINE_harmonizer.column_info(df_gmfcs)

(0/7) center: (315/0)
(1/7) subjectID: (315/0)
(2/7) uniqueID: (315/0)
(3/7) followupCenter: (315/0)
(4/7) followupID: (315/0)
(5/7) siteID: (315/0)
(6/7) grossMotorFunctionLevel: (285/30)


## 20-06. Status

In [36]:
# Data-dictionary columns of the status sub-category.
status_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_STATUS)

# Identifier columns first, then the section's own columns.
all_status_columns = id_columns + status_columns
status_columns

['statusVisitDate',
 'statusBirthDate',
 'childFinalStatus',
 'deathDate',
 'deathCause',
 'reasonLossFollowUp',
 'firstVisitDate',
 'finalVisitDate']

In [37]:
# Build the status section from a copy of the merged frame.
df_status = (
    df_follow_up.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_status_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-06-status.csv'])
df_status.to_csv(out_filename, index=False)

### 20-06-1. check empty

In [38]:
# Per-column report for the status section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
# NOTE(review): the recorded output shows statusVisitDate empty in all 315 rows - confirm the column mapping.
COMBINE_harmonizer.check_empty(df_status)

(0/14) column: center (315 / 0)
(1/14) column: subjectID (315 / 0)
(2/14) column: uniqueID (315 / 0)
(3/14) column: followupCenter (315 / 0)
(4/14) column: followupID (315 / 0)
(5/14) column: siteID (315 / 0)
(6/14) column: statusBirthDate (315 / 0)
(7/14) column: statusVisitDate (0 / 315)
(8/14) column: childFinalStatus (315 / 0)
(9/14) column: deathDate (7 / 308)
(10/14) column: deathCause (7 / 308)
(11/14) column: reasonLossFollowUp (21 / 294)
(12/14) column: firstVisitDate (291 / 24)
(13/14) column: finalVisitDate (292 / 23)


In [39]:
# Per-column summary of the status section.
COMBINE_harmonizer.column_info(df_status)

(0/14) center: (315/0)
(1/14) subjectID: (315/0)
(2/14) uniqueID: (315/0)
(3/14) followupCenter: (315/0)
(4/14) followupID: (315/0)
(5/14) siteID: (315/0)
(6/14) statusBirthDate: (315/0)
(7/14) statusVisitDate: (0/315)
(8/14) childFinalStatus: (315/0)
(9/14) deathDate: (7/308)
(10/14) deathCause: (7/308)
(11/14) reasonLossFollowUp: (21/294)
(12/14) firstVisitDate: (291/24)
(13/14) finalVisitDate: (292/23)


## 20-08. Lost Follow-up

In [40]:
# Data-dictionary columns of the lost-to-follow-up sub-category.
lost_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_LOST_FOLLOW_UP)

# Identifier columns first, then the section's own columns.
all_lost_columns = id_columns + lost_columns
lost_columns

['lostFollowUpInformationAvailableIndirectSrc',
 'lostFollowUpLastContactDate',
 'lostFollowUpFormCompleteDate',
 'lostFollowUpChildAlive',
 'lostFollowUpLastKnownAliveCorrectedAge_mo',
 'lostFollowUpDeathDate',
 'lostFollowUpInterview',
 'lostFollowUpInterviewDate',
 'lostFollowUpInterviewCorrectedAge_mo',
 'lostFollowUpAnyQuestionCompleteChartReview',
 'lostFollowUpChartReviewDate',
 'lostFollowUpChartReviewCorrectedAge_mo',
 'interviewChildHealth',
 'interviewWalkAlone',
 'interviewWalkAloneAge_mo',
 'interviewSittingAlong',
 'interviewHeadControl',
 'interviewSee',
 'interviewEyeExam',
 'interviewNeedWearGlasses',
 'interviewHear',
 'interviewHearExam',
 'interviewNeedWearHearingAid',
 'interviewNumberWordVocabulary',
 'interviewCombine2Words',
 'interviewCombine3Words',
 'interviewHydrocephalusShunt',
 'interviewCerebralPalsy',
 'interviewDevelopmentalDelay',
 'interviewLanguageDelay',
 'interviewPoorWeightGain',
 'interviewSeizure',
 'interviewBlindness',
 'interviewOtherBehavior

In [41]:
# Build the lost-to-follow-up section from a copy of the merged frame.
df_lost = (
    df_follow_up.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_lost_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-08-lost.csv'])
df_lost.to_csv(out_filename, index=False)

### 20-08-1. check empty

In [42]:
# Per-column report for the lost-to-follow-up section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
COMBINE_harmonizer.check_empty(df_lost)

(0/63) column: center (315 / 0)
(1/63) column: subjectID (315 / 0)
(2/63) column: uniqueID (315 / 0)
(3/63) column: followupCenter (315 / 0)
(4/63) column: followupID (315 / 0)
(5/63) column: siteID (315 / 0)
(6/63) column: lostFollowUpInformationAvailableIndirectSrc (24 / 291)
(7/63) column: lostFollowUpChildAlive (14 / 301)
(8/63) column: lostFollowUpInterview (13 / 302)
(9/63) column: lostFollowUpAnyQuestionCompleteChartReview (13 / 302)
(10/63) column: interviewWalkAlone (5 / 310)
(11/63) column: interviewSittingAlong (2 / 313)
(12/63) column: interviewHeadControl (2 / 313)
(13/63) column: interviewSee (5 / 310)
(14/63) column: interviewEyeExam (5 / 310)
(15/63) column: interviewNeedWearGlasses (5 / 310)
(16/63) column: interviewHear (5 / 310)
(17/63) column: interviewHearExam (5 / 310)
(18/63) column: interviewNeedWearHearingAid (5 / 310)
(19/63) column: interviewCombine2Words (5 / 310)
(20/63) column: interviewCombine3Words (5 / 310)
(21/63) column: interviewHydrocephalusShunt (5

In [43]:
# Per-column summary of the lost-to-follow-up section.
COMBINE_harmonizer.column_info(df_lost)

(0/63) center: (315/0)
(1/63) subjectID: (315/0)
(2/63) uniqueID: (315/0)
(3/63) followupCenter: (315/0)
(4/63) followupID: (315/0)
(5/63) siteID: (315/0)
(6/63) lostFollowUpInformationAvailableIndirectSrc: (24/291)
(7/63) lostFollowUpChildAlive: (14/301)
(8/63) lostFollowUpInterview: (13/302)
(9/63) lostFollowUpAnyQuestionCompleteChartReview: (13/302)
(10/63) interviewWalkAlone: (5/310)
(11/63) interviewSittingAlong: (2/313)
(12/63) interviewHeadControl: (2/313)
(13/63) interviewSee: (5/310)
(14/63) interviewEyeExam: (5/310)
(15/63) interviewNeedWearGlasses: (5/310)
(16/63) interviewHear: (5/310)
(17/63) interviewHearExam: (5/310)
(18/63) interviewNeedWearHearingAid: (5/310)
(19/63) interviewCombine2Words: (5/310)
(20/63) interviewCombine3Words: (5/310)
(21/63) interviewHydrocephalusShunt: (5/310)
(22/63) interviewCerebralPalsy: (5/310)
(23/63) interviewDevelopmentalDelay: (5/310)
(24/63) interviewLanguageDelay: (5/310)
(25/63) interviewPoorWeightGain: (5/310)
(26/63) interviewSeizure

## 20-09. Secondary Analysis

In [44]:
# Data-dictionary columns of the secondary-analysis sub-category.
secondary_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_SECONDARY_ANALYSIS)

# Identifier columns first, then the section's own columns.
all_secondary_columns = id_columns + secondary_columns
secondary_columns

['blindness',
 'moderateSevereCerebralPalsy',
 'cerebralPalsyMerge',
 'gastrostomyTube_b',
 'grossMotorFunctionLevelSeverity',
 'hearingImpairedWithAid',
 'hearingImpairedLevel',
 'multipleImpairment',
 'afterDischargeSeizure']

In [45]:
# The analysis export is the source for this section and for the outcome section.
# Its columns were already renamed through column_map at load time; here the
# center and follow-up key columns are remapped again.
# NOTE(review): 'followupCenter' in this file is treated as the subject's center - confirm.
df_analysis = df_dict[_ANALYSIS_FILENAME].rename(
    columns={
        'followupCenter': 'center',
        'fcenter': 'followupCenter',
        'ocfolnum': 'followupID',
    }
)

# Build the secondary-analysis section from a copy of the analysis frame.
# With debug_columns=True, requested columns missing from the frame are printed.
df_secondary = (
    df_analysis.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_secondary_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-09-secondary.csv'])
df_secondary.to_csv(out_filename, index=False)

(1/14) siteID not in df


### 20-09-1. check empty

In [46]:
# Per-column report for the secondary-analysis section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
# NOTE(review): the recorded output has 364 rows here versus 315 in the follow-up sections,
# and followupID / followupCenter are empty in 49 of them - confirm this is expected.
COMBINE_harmonizer.check_empty(df_secondary)

(0/14) column: center (364 / 0)
(1/14) column: subjectID (364 / 0)
(2/14) column: uniqueID (364 / 0)
(3/14) column: followupID (315 / 49)
(4/14) column: followupCenter (315 / 49)
(5/14) column: blindness (294 / 70)
(6/14) column: hearingImpairedLevel (285 / 79)
(7/14) column: hearingImpairedWithAid (294 / 70)
(8/14) column: grossMotorFunctionLevelSeverity (285 / 79)
(9/14) column: cerebralPalsyMerge (294 / 70)
(10/14) column: moderateSevereCerebralPalsy (294 / 70)
(11/14) column: multipleImpairment (294 / 70)
(12/14) column: gastrostomyTube_b (287 / 77)
(13/14) column: afterDischargeSeizure (293 / 71)


In [47]:
# Per-column summary of the secondary-analysis section.
COMBINE_harmonizer.column_info(df_secondary)

(0/14) center: (364/0)
(1/14) subjectID: (364/0)
(2/14) uniqueID: (364/0)
(3/14) followupID: (315/49)
(4/14) followupCenter: (315/49)
(5/14) blindness: (294/70)
(6/14) hearingImpairedLevel: (285/79)
(7/14) hearingImpairedWithAid: (294/70)
(8/14) grossMotorFunctionLevelSeverity: (285/79)
(9/14) cerebralPalsyMerge: (294/70)
(10/14) moderateSevereCerebralPalsy: (294/70)
(11/14) multipleImpairment: (294/70)
(12/14) gastrostomyTube_b: (287/77)
(13/14) afterDischargeSeizure: (293/71)


## 20-10. Outcome

In [48]:
# Data-dictionary columns of the outcome sub-category.
outcome_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    COMBINE_harmonizer.CATEGORY_FOLLOW_UP,
    COMBINE_harmonizer.SUBCATEGORY_OUTCOME)

# Identifier columns first, then the section's own columns.
all_outcome_columns = id_columns + outcome_columns
outcome_columns

['flagAdjudicatedOutcome',
 'normalPrimaryOutcome',
 'BayleyIIILanguage',
 'BayleyIIIMotor',
 'BayleyIIICognitive',
 'deathBeforeFollowup',
 'deathBeforeDischarge',
 'disabilityLevelSurvivor',
 'disabilityLevelDeath4Category',
 'moderateSevereDisabilityOrDeath',
 'moderateSevereDisabilitySurvivor',
 'disabilityLevelDeath',
 'outcomeGroup']

In [49]:
# Build the outcome section from a copy of the analysis frame prepared in the
# secondary-analysis section (df_analysis must already exist).
# With debug_columns=True, requested columns missing from the frame are printed.
df_outcome = (
    df_analysis.copy()
    .pipe(COMBINE_harmonizer.valid_columns, all_outcome_columns, debug_df=False, debug_columns=True)
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-10-outcome.csv'])
df_outcome.to_csv(out_filename, index=False)

(1/18) siteID not in df
(16/18) disabilityLevelDeath not in df


### 20-10-1. check empty

In [50]:
# Per-column report for the outcome section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
COMBINE_harmonizer.check_empty(df_outcome)

(0/17) column: center (364 / 0)
(1/17) column: subjectID (364 / 0)
(2/17) column: uniqueID (364 / 0)
(3/17) column: followupID (315 / 49)
(4/17) column: followupCenter (315 / 49)
(5/17) column: BayleyIIICognitive (283 / 81)
(6/17) column: BayleyIIILanguage (275 / 89)
(7/17) column: BayleyIIIMotor (277 / 87)
(8/17) column: deathBeforeFollowup (354 / 10)
(9/17) column: normalPrimaryOutcome (285 / 79)
(10/17) column: flagAdjudicatedOutcome (9 / 355)
(11/17) column: deathBeforeDischarge (364 / 0)
(12/17) column: moderateSevereDisabilitySurvivor (291 / 73)
(13/17) column: disabilityLevelSurvivor (285 / 79)
(14/17) column: moderateSevereDisabilityOrDeath (347 / 17)
(15/17) column: disabilityLevelDeath4Category (341 / 23)
(16/17) column: outcomeGroup (347 / 17)


In [51]:
# Per-column summary of the outcome section.
COMBINE_harmonizer.column_info(df_outcome)

(0/17) center: (364/0)
(1/17) subjectID: (364/0)
(2/17) uniqueID: (364/0)
(3/17) followupID: (315/49)
(4/17) followupCenter: (315/49)
(5/17) BayleyIIICognitive: (283/81)
(6/17) BayleyIIILanguage: (275/89)
(7/17) BayleyIIIMotor: (277/87)
(8/17) deathBeforeFollowup: (354/10)
(9/17) normalPrimaryOutcome: (285/79)
(10/17) flagAdjudicatedOutcome: (9/355)
(11/17) deathBeforeDischarge: (364/0)
(12/17) moderateSevereDisabilitySurvivor: (291/73)
(13/17) disabilityLevelSurvivor: (285/79)
(14/17) moderateSevereDisabilityOrDeath: (347/17)
(15/17) disabilityLevelDeath4Category: (341/23)
(16/17) outcomeGroup: (347/17)


## 20-07. Readmission

In [52]:
# Lookup from the follow-up keys to the subject's center and subjectID,
# taken from the filtered follow-up frame.
# NOTE(review): a repeated (followupCenter, followupID) pair here would duplicate
# readmission rows in the left merge that uses this map - confirm the keys are unique.
columns_subject_id_map = ['center', 'followupID', 'followupCenter', 'subjectID']
df_subject_id_map = df_follow_up[columns_subject_id_map]

In [53]:
# Readmission form pipeline:
#   1. select the harmonized columns and post-process with the follow-up keys
#      as the subject / center identifiers;
#   2. left-join the subject-id map on the follow-up keys;
#   3. post-process again with the default identifiers.
df_readmission = (
    df_dict['of04ar.csv']
    .pipe(COMBINE_harmonizer.valid_columns, all_valid_columns, debug_df=True, debug_columns=False)
    .pipe(COMBINE_harmonizer.postprocess, subject_id_idx='followupID', center_id_idx='followupCenter')
    .merge(df_subject_id_map, on=['followupCenter', 'followupID'], how='left')
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the section without the index.
out_filename = os.sep.join([out_dir, '20-07-readmission.csv'])
df_readmission.to_csv(out_filename, index=False)

(3/11) REC_CMP not in columns
(9/11) CMP_DATE not in columns
(10/11) CRT_DATE not in columns


### 20-07-1. check duplicate readmission records (same uniqueID and readmissionNumber)

In [54]:
# Count rows per (uniqueID, readmissionNumber) pair.
df_readmission_groupby = (
    df_readmission
    .groupby(['uniqueID', 'readmissionNumber'])
    .agg(_count=('uniqueID', 'count'))
)

# A pair that occurs more than once is a duplicate; an empty result means none were found.
is_invalid = df_readmission_groupby['_count'].gt(1)
df_readmission_groupby.loc[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,readmissionNumber,Unnamed: 2_level_1


### 20-07-2. check empty cells

In [55]:
# Per-column report for the readmission section; each pair of counts sums to the row count
# (presumably non-empty / empty - confirm in COMBINE_harmonizer.check_empty).
COMBINE_harmonizer.check_empty(df_readmission)

(0/11) column: center (147 / 0)
(1/11) column: subjectID (147 / 0)
(2/11) column: uniqueID (147 / 0)
(3/11) column: followupCenter (147 / 0)
(4/11) column: followupID (147 / 0)
(5/11) column: readmissionPrimaryCauseOtherText (11 / 136)
(6/11) column: readmissionICU (147 / 0)
(7/11) column: readmissionNumber (147 / 0)
(8/11) column: readmissionTimePeriod (147 / 0)
(9/11) column: readmissionPrimaryCause (147 / 0)
(10/11) column: readmissionLengthOfStay (147 / 0)


In [56]:
# Per-column summary of the readmission section.
COMBINE_harmonizer.column_info(df_readmission)

(0/11) center: (147/0)
(1/11) subjectID: (147/0)
(2/11) uniqueID: (147/0)
(3/11) followupCenter: (147/0)
(4/11) followupID: (147/0)
(5/11) readmissionPrimaryCauseOtherText: (11/136)
(6/11) readmissionICU: (147/0)
(7/11) readmissionNumber: (147/0)
(8/11) readmissionTimePeriod: (147/0)
(9/11) readmissionPrimaryCause: (147/0)
(10/11) readmissionLengthOfStay: (147/0)
