In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import os
import re
import pydoc

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 00-0. Variables

In [2]:
# Study and data-dictionary sheet handled by this notebook; both are used below
# to build the input/output directories and the value/order maps.
study_name = COMBINE_harmonizer.STUDY_OC
sheet_name = COMBINE_harmonizer.SHEET_FOLLOW_UP

# Relative directory that holds config.yaml and the data-dictionary workbook.
root_dir = '..'

In [3]:
# Initialize the harmonizer from the project config file
# (presumably populates cfg.config, which the next cell reads — confirm in COMBINE_harmonizer.init).
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

In [4]:
# Data-dictionary workbook, located under root_dir.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"

# Inputs are read from the per-study directory under the configured out_dir;
# normalized files are written to a sibling directory with a "-normalized" suffix.
input_dir = f"{cfg.config['out_dir']}/out-{study_name}"
out_dir = f"{input_dir}-normalized"

# Create the output directory up front; an existing directory is not an error.
os.makedirs(out_dir, exist_ok=True)

In [5]:
# Initialize the harmonizer's mapping for this study from the data dictionary
# (logs one "[INFO] init_mapping" line per variable).
COMBINE_harmonizer.init_mapping(data_dict_filename, study_name)
# Value map for the follow-up sheet; passed to normalize_value() in every section below.
_VALUE_MAP = COMBINE_harmonizer.build_value_map(data_dict_filename, sheet_name)

[INFO] init_mapping (0/125): signOfHIETone
[INFO] init_mapping (1/125): signOfHIELvlOfCons
[INFO] init_mapping (2/125): signOfHIEPosture
[INFO] init_mapping (3/125): signOfHIEMoro
[INFO] init_mapping (4/125): signOfHIESuck
[INFO] init_mapping (5/125): signOfHIERespiratory
[INFO] init_mapping (6/125): signOfHIEHeartRate
[INFO] init_mapping (7/125): signOfHIEPupils
[INFO] init_mapping (8/125): signOfHIESpontaneousActivity
[INFO] init_mapping (9/125): noNeuroExamReason
[INFO] init_mapping (10/125): consentStatus
[INFO] init_mapping (11/125): treatmentAssign
[INFO] init_mapping (12/125): targetTreatmentTemperature
[INFO] init_mapping (13/125): blanketType
[INFO] init_mapping (14/125): encephalopathyLevel
[INFO] init_mapping (15/125): infantAge
[INFO] init_mapping (16/125): infantSex
[INFO] init_mapping (17/125): ethnicity
[INFO] init_mapping (18/125): education
[INFO] init_mapping (19/125): insurance
[INFO] init_mapping (20/125): race
[INFO] init_mapping (21/125): maritalStatus
[INFO] init

build_value_map: (1108/377) variable: followupCenter type: center
build_value_map: (1109/377) variable: siteID type: text
build_value_map: (1110/377) variable: birthDate type: date
build_value_map: (1111/377) variable: visitDate type: date
build_value_map: (1112/377) variable: birthNumber type: int
build_value_map: (1113/377) variable: center type: center
build_value_map: (1114/377) variable: subjectID type: text
build_value_map: (1115/377) variable: followupID type: text
build_value_map: (1116/377) variable: center_orig type: text
build_value_map: (1117/377) variable: SESVisitDate type: date
build_value_map: (1118/377) variable: SESBirthDate type: date
build_value_map: (1119/377) variable: chronologicalAge_mo type: int
build_value_map: (1120/377) variable: correctedAge_mo type: int
build_value_map: (1121/377) variable: underStateSupervision type: bool
build_value_map: (1122/377) variable: primaryCaretaker type: relationship
build_value_map: (1123/377) variable: otherCaretaker type: re

In [6]:
# Variable-order map for the follow-up sheet; passed as order_map= to normalize_value() below.
_ORDER_MAP = COMBINE_harmonizer.build_variable_order_map(data_dict_filename, sheet_name)

## 20-00. Follow-up

In [7]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-00-follow-up.csv'

In [8]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/10) normalize_value: column: center
(1/10) normalize_value: column: subjectID
(2/10) normalize_value: column: uniqueID
(3/10) normalize_value: column: followupCenter
(4/10) normalize_value: column: followupID
(5/10) normalize_value: column: siteID
(6/10) normalize_value: column: birthDate
(7/10) normalize_value: column: visitDate
(8/10) normalize_value: column: birthNumber
(9/10) normalize_value: column: center_orig


In [9]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/21) center: (315/0)
(1/21) subjectID: (315/0)
(2/21) uniqueID: (315/0)
(3/21) followupCenter: (315/0)
(4/21) followupID: (315/0)
(5/21) uniqueFollowupID: (315/0)
(6/21) center.orig: (315/0)
(7/21) subjectID.orig: (315/0)
(8/21) uniqueID.orig: (315/0)
(9/21) followupCenter.orig: (315/0)
(10/21) followupID.orig: (315/0)
(11/21) siteID: (315/0)
(12/21) siteID.orig: (315/0)
(13/21) birthDate: (315/0)
(14/21) birthDate.orig: (315/0)
(15/21) visitDate: (315/0)
(16/21) visitDate.orig: (291/24)
(17/21) birthNumber: (315/0)
(18/21) birthNumber.orig: (0/315)
(19/21) center_orig: (315/0)
(20/21) center_orig.orig: (6/309)


## 20-01. SES

In [10]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-01-ses.csv'

In [11]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/73) normalize_value: column: center
(1/73) normalize_value: column: subjectID
(2/73) normalize_value: column: uniqueID
(3/73) normalize_value: column: followupCenter
(4/73) normalize_value: column: followupID
(5/73) normalize_value: column: siteID
(6/73) normalize_value: column: underStateSupervision
(7/73) normalize_value: column: otherContributeMoneyToChildHousehold
(8/73) normalize_value: column: workPrimaryCaretaker
(9/73) normalize_value: column: workOtherCaretaker
[WARN] unable to bool: val: (**/<class 'str'>)
(10/73) normalize_value: column: inSchoolPrimaryCaretaker
(11/73) normalize_value: column: inSchoolOtherCaretaker
[WARN] unable to bool: val: (**/<class 'str'>)
(12/73) normalize_value: column: primaryLanguageChildOtherText
(13/73) normalize_value: column: isSecondaryLanguageChild
(14/73) normalize_value: column: secondaryLanguageChildOtherText
(15/73) normalize_value: column: visitingNurseNeed
(16/73) normalize_value: column: homeNurseNeed
(17/73) normalize_value: colum

In [12]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/147) center: (315/0)
(1/147) subjectID: (315/0)
(2/147) uniqueID: (315/0)
(3/147) followupCenter: (315/0)
(4/147) followupID: (315/0)
(5/147) uniqueFollowupID: (315/0)
(6/147) center.orig: (315/0)
(7/147) subjectID.orig: (315/0)
(8/147) uniqueID.orig: (315/0)
(9/147) followupCenter.orig: (315/0)
(10/147) followupID.orig: (315/0)
(11/147) siteID: (315/0)
(12/147) siteID.orig: (315/0)
(13/147) SESVisitDate: (315/0)
(14/147) SESVisitDate.orig: (287/28)
(15/147) SESBirthDate: (315/0)
(16/147) SESBirthDate.orig: (288/27)
(17/147) chronologicalAge_mo: (315/0)
(18/147) chronologicalAge_mo.orig: (288/27)
(19/147) correctedAge_mo: (315/0)
(20/147) correctedAge_mo.orig: (285/30)
(21/147) underStateSupervision: (315/0)
(22/147) underStateSupervision.orig: (288/27)
(23/147) primaryCaretaker: (315/0)
(24/147) primaryCaretaker.orig: (288/27)
(25/147) otherCaretaker: (315/0)
(26/147) otherCaretaker.orig: (255/60)
(27/147) maritalStatusPrimaryCaretaker: (315/0)
(28/147) maritalStatusPrimaryCaretake

## 20-02. Medical History

In [13]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-02-medical-history.csv'

In [14]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/69) normalize_value: column: center
(1/69) normalize_value: column: subjectID
(2/69) normalize_value: column: uniqueID
(3/69) normalize_value: column: followupCenter
(4/69) normalize_value: column: followupID
(5/69) normalize_value: column: siteID
(6/69) normalize_value: column: rehospitalize
(7/69) normalize_value: column: operation
(8/69) normalize_value: column: operationTypanostomyTube
(9/69) normalize_value: column: operationTracheostomy
(10/69) normalize_value: column: operationEyeSurgery
(11/69) normalize_value: column: operationHerniaSurgery
(12/69) normalize_value: column: operationGastrostomyTube
(13/69) normalize_value: column: operationFundoplication
(14/69) normalize_value: column: operationShuntForHydrocephalus
(15/69) normalize_value: column: operationReanastomosisOfLargeOrSmallIntenstine
(16/69) normalize_value: column: operationPDALigation
(17/69) normalize_value: column: operationBrochoscopy
(18/69) normalize_value: column: operationHypospadiusRepair
(19/69) normal

In [15]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/139) center: (315/0)
(1/139) subjectID: (315/0)
(2/139) uniqueID: (315/0)
(3/139) followupCenter: (315/0)
(4/139) followupID: (315/0)
(5/139) uniqueFollowupID: (315/0)
(6/139) center.orig: (315/0)
(7/139) subjectID.orig: (315/0)
(8/139) uniqueID.orig: (315/0)
(9/139) followupCenter.orig: (315/0)
(10/139) followupID.orig: (315/0)
(11/139) siteID: (315/0)
(12/139) siteID.orig: (315/0)
(13/139) rehospitalize: (315/0)
(14/139) rehospitalize.orig: (287/28)
(15/139) numberRehospitalize: (315/0)
(16/139) numberRehospitalize.orig: (65/250)
(17/139) operation: (315/0)
(18/139) operation.orig: (287/28)
(19/139) operationTypanostomyTube: (315/0)
(20/139) operationTypanostomyTube.orig: (62/253)
(21/139) operationTracheostomy: (315/0)
(22/139) operationTracheostomy.orig: (62/253)
(23/139) operationEyeSurgery: (315/0)
(24/139) operationEyeSurgery.orig: (62/253)
(25/139) operationEyeSurgeryReason: (315/0)
(26/139) operationEyeSurgeryReason.orig: (2/313)
(27/139) operationHerniaSurgery: (315/0)
(28

## 20-03. Medical Exam

In [16]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-03-medical-exam.csv'

In [17]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/96) normalize_value: column: center
(1/96) normalize_value: column: subjectID
(2/96) normalize_value: column: uniqueID
(3/96) normalize_value: column: followupCenter
(4/96) normalize_value: column: followupID
(5/96) normalize_value: column: siteID
(6/96) normalize_value: column: audiologicAssessment
(7/96) normalize_value: column: audiologicPendingForAssessment
(8/96) normalize_value: column: visualReinforcementAudiometry
(9/96) normalize_value: column: ABR
(10/96) normalize_value: column: hearingTestUnknown
(11/96) normalize_value: column: dysphagia
(12/96) normalize_value: column: aspiration
(13/96) normalize_value: column: abnormalVoice
(14/96) normalize_value: column: drooling
(15/96) normalize_value: column: nothingByMouth
(16/96) normalize_value: column: observedAbnormalMovement
(17/96) normalize_value: column: observedAbnormalMovementShortJerky
(18/96) normalize_value: column: observedAbnormalMovementSlowWrithing
(19/96) normalize_value: column: observedAbnormalMovementTremor

In [18]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/193) center: (315/0)
(1/193) subjectID: (315/0)
(2/193) uniqueID: (315/0)
(3/193) followupCenter: (315/0)
(4/193) followupID: (315/0)
(5/193) uniqueFollowupID: (315/0)
(6/193) center.orig: (315/0)
(7/193) subjectID.orig: (315/0)
(8/193) uniqueID.orig: (315/0)
(9/193) followupCenter.orig: (315/0)
(10/193) followupID.orig: (315/0)
(11/193) siteID: (315/0)
(12/193) siteID.orig: (315/0)
(13/193) weight_cm: (315/0)
(14/193) weight_cm.orig: (284/31)
(15/193) length_cm: (315/0)
(16/193) length_cm.orig: (280/35)
(17/193) headCircumference_cm: (315/0)
(18/193) headCircumference_cm.orig: (280/35)
(19/193) strabismusRight: (315/0)
(20/193) strabismusRight.orig: (283/32)
(21/193) strabismusLeft: (315/0)
(22/193) strabismusLeft.orig: (284/31)
(23/193) nystagmusRight: (315/0)
(24/193) nystagmusRight.orig: (284/31)
(25/193) nystagmusLeft: (315/0)
(26/193) nystagmusLeft.orig: (284/31)
(27/193) rovingEyeMovementRight: (315/0)
(28/193) rovingEyeMovementRight.orig: (284/31)
(29/193) rovingEyeMovementL

## 20-04. Bayley-III

In [19]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-04-bayley-iii.csv'

In [20]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/46) normalize_value: column: center
(1/46) normalize_value: column: subjectID
(2/46) normalize_value: column: uniqueID
(3/46) normalize_value: column: followupCenter
(4/46) normalize_value: column: followupID
(5/46) normalize_value: column: siteID
(6/46) normalize_value: column: BayleyIIICognitiveSubtest
(7/46) normalize_value: column: BayleyIIIReasonNoSuccessCognitiveSubtestText
(8/46) normalize_value: column: BayleyIIILanguageReceptiveSubtest
(9/46) normalize_value: column: BayleyIIIReasonNoSuccessLanguageReceptiveSubtestText
(10/46) normalize_value: column: BayleyIIILanguageExpressiveSubtest
(11/46) normalize_value: column: BayleyIIIReasonNoSuccessLanguageExpressiveSubtestText
(12/46) normalize_value: column: BayleyIIIMotorFineSubtest
(13/46) normalize_value: column: BayleyIIIReasonNoSuccessMotorFineSubtestText
(14/46) normalize_value: column: BayleyIIIMotorGrossSubtest
(15/46) normalize_value: column: BayleyIIIReasonNoSuccessMotorGrossSubtestText
(16/46) normalize_value: column:

In [21]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/93) center: (315/0)
(1/93) subjectID: (315/0)
(2/93) uniqueID: (315/0)
(3/93) followupCenter: (315/0)
(4/93) followupID: (315/0)
(5/93) uniqueFollowupID: (315/0)
(6/93) center.orig: (315/0)
(7/93) subjectID.orig: (315/0)
(8/93) uniqueID.orig: (315/0)
(9/93) followupCenter.orig: (315/0)
(10/93) followupID.orig: (315/0)
(11/93) siteID: (315/0)
(12/93) siteID.orig: (315/0)
(13/93) BayleyIIICognitiveSubtest: (315/0)
(14/93) BayleyIIICognitiveSubtest.orig: (287/28)
(15/93) BayleyIIIReasonNoSuccessCognitiveSubtest: (315/0)
(16/93) BayleyIIIReasonNoSuccessCognitiveSubtest.orig: (17/298)
(17/93) BayleyIIIReasonNoSuccessCognitiveSubtestText: (315/0)
(18/93) BayleyIIIReasonNoSuccessCognitiveSubtestText.orig: (0/315)
(19/93) BayleyIIILanguageReceptiveSubtest: (315/0)
(20/93) BayleyIIILanguageReceptiveSubtest.orig: (287/28)
(21/93) BayleyIIIReasonNoSuccessLanguageReceptiveSubtest: (315/0)
(22/93) BayleyIIIReasonNoSuccessLanguageReceptiveSubtest.orig: (22/293)
(23/93) BayleyIIIReasonNoSuccessLan

## 20-05. GMFCS

In [22]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-05-gmfcs.csv'

In [23]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/7) normalize_value: column: center
(1/7) normalize_value: column: subjectID
(2/7) normalize_value: column: uniqueID
(3/7) normalize_value: column: followupCenter
(4/7) normalize_value: column: followupID
(5/7) normalize_value: column: siteID
(6/7) normalize_value: column: grossMotorFunctionLevel


In [24]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/15) center: (315/0)
(1/15) subjectID: (315/0)
(2/15) uniqueID: (315/0)
(3/15) followupCenter: (315/0)
(4/15) followupID: (315/0)
(5/15) uniqueFollowupID: (315/0)
(6/15) center.orig: (315/0)
(7/15) subjectID.orig: (315/0)
(8/15) uniqueID.orig: (315/0)
(9/15) followupCenter.orig: (315/0)
(10/15) followupID.orig: (315/0)
(11/15) siteID: (315/0)
(12/15) siteID.orig: (315/0)
(13/15) grossMotorFunctionLevel: (315/0)
(14/15) grossMotorFunctionLevel.orig: (285/30)


## 20-06. Status

In [25]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-06-status.csv'

In [26]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/14) normalize_value: column: center
(1/14) normalize_value: column: subjectID
(2/14) normalize_value: column: uniqueID
(3/14) normalize_value: column: followupCenter
(4/14) normalize_value: column: followupID
(5/14) normalize_value: column: siteID
(6/14) normalize_value: column: statusBirthDate
(7/14) normalize_value: column: statusVisitDate
(8/14) normalize_value: column: childFinalStatus
(9/14) normalize_value: column: deathDate
(10/14) normalize_value: column: deathCause
(11/14) normalize_value: column: reasonLossFollowUp
(12/14) normalize_value: column: firstVisitDate
(13/14) normalize_value: column: finalVisitDate


In [27]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/29) center: (315/0)
(1/29) subjectID: (315/0)
(2/29) uniqueID: (315/0)
(3/29) followupCenter: (315/0)
(4/29) followupID: (315/0)
(5/29) uniqueFollowupID: (315/0)
(6/29) center.orig: (315/0)
(7/29) subjectID.orig: (315/0)
(8/29) uniqueID.orig: (315/0)
(9/29) followupCenter.orig: (315/0)
(10/29) followupID.orig: (315/0)
(11/29) siteID: (315/0)
(12/29) siteID.orig: (315/0)
(13/29) statusVisitDate: (315/0)
(14/29) statusVisitDate.orig: (0/315)
(15/29) statusBirthDate: (315/0)
(16/29) statusBirthDate.orig: (315/0)
(17/29) childFinalStatus: (315/0)
(18/29) childFinalStatus.orig: (315/0)
(19/29) deathDate: (315/0)
(20/29) deathDate.orig: (7/308)
(21/29) deathCause: (315/0)
(22/29) deathCause.orig: (7/308)
(23/29) reasonLossFollowUp: (315/0)
(24/29) reasonLossFollowUp.orig: (21/294)
(25/29) firstVisitDate: (315/0)
(26/29) firstVisitDate.orig: (291/24)
(27/29) finalVisitDate: (315/0)
(28/29) finalVisitDate.orig: (292/23)


## 20-08. Lost Follow-up

In [28]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-08-lost.csv'

In [29]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/63) normalize_value: column: center
(1/63) normalize_value: column: subjectID
(2/63) normalize_value: column: uniqueID
(3/63) normalize_value: column: followupCenter
(4/63) normalize_value: column: followupID
(5/63) normalize_value: column: siteID
(6/63) normalize_value: column: lostFollowUpInformationAvailableIndirectSrc
(7/63) normalize_value: column: lostFollowUpChildAlive
(8/63) normalize_value: column: lostFollowUpInterview
(9/63) normalize_value: column: lostFollowUpAnyQuestionCompleteChartReview
(10/63) normalize_value: column: interviewWalkAlone
(11/63) normalize_value: column: interviewSittingAlong
(12/63) normalize_value: column: interviewHeadControl
(13/63) normalize_value: column: interviewSee
(14/63) normalize_value: column: interviewEyeExam
(15/63) normalize_value: column: interviewNeedWearGlasses
(16/63) normalize_value: column: interviewHear
(17/63) normalize_value: column: interviewHearExam
(18/63) normalize_value: column: interviewNeedWearHearingAid
(19/63) normali

In [30]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/127) center: (315/0)
(1/127) subjectID: (315/0)
(2/127) uniqueID: (315/0)
(3/127) followupCenter: (315/0)
(4/127) followupID: (315/0)
(5/127) uniqueFollowupID: (315/0)
(6/127) center.orig: (315/0)
(7/127) subjectID.orig: (315/0)
(8/127) uniqueID.orig: (315/0)
(9/127) followupCenter.orig: (315/0)
(10/127) followupID.orig: (315/0)
(11/127) siteID: (315/0)
(12/127) siteID.orig: (315/0)
(13/127) lostFollowUpInformationAvailableIndirectSrc: (315/0)
(14/127) lostFollowUpInformationAvailableIndirectSrc.orig: (24/291)
(15/127) lostFollowUpLastContactDate: (315/0)
(16/127) lostFollowUpLastContactDate.orig: (9/306)
(17/127) lostFollowUpFormCompleteDate: (315/0)
(18/127) lostFollowUpFormCompleteDate.orig: (10/305)
(19/127) lostFollowUpChildAlive: (315/0)
(20/127) lostFollowUpChildAlive.orig: (14/301)
(21/127) lostFollowUpLastKnownAliveCorrectedAge_mo: (315/0)
(22/127) lostFollowUpLastKnownAliveCorrectedAge_mo.orig: (13/302)
(23/127) lostFollowUpDeathDate: (315/0)
(24/127) lostFollowUpDeathDate

## 20-09. Secondary Analysis

In [31]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-09-secondary.csv'

In [32]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/14) normalize_value: column: center
(1/14) normalize_value: column: subjectID
(2/14) normalize_value: column: uniqueID
(3/14) normalize_value: column: followupID
(4/14) normalize_value: column: followupCenter
(5/14) normalize_value: column: blindness
(6/14) normalize_value: column: hearingImpairedLevel
(7/14) normalize_value: column: hearingImpairedWithAid
(8/14) normalize_value: column: grossMotorFunctionLevelSeverity
(9/14) normalize_value: column: cerebralPalsyMerge
(10/14) normalize_value: column: moderateSevereCerebralPalsy
(11/14) normalize_value: column: multipleImpairment
(12/14) normalize_value: column: gastrostomyTube_b
(13/14) normalize_value: column: afterDischargeSeizure


In [33]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/29) center: (364/0)
(1/29) subjectID: (364/0)
(2/29) uniqueID: (364/0)
(3/29) followupCenter: (364/0)
(4/29) followupID: (364/0)
(5/29) uniqueFollowupID: (364/0)
(6/29) center.orig: (364/0)
(7/29) subjectID.orig: (364/0)
(8/29) uniqueID.orig: (364/0)
(9/29) followupCenter.orig: (315/49)
(10/29) followupID.orig: (315/49)
(11/29) blindness: (364/0)
(12/29) blindness.orig: (294/70)
(13/29) moderateSevereCerebralPalsy: (364/0)
(14/29) moderateSevereCerebralPalsy.orig: (294/70)
(15/29) cerebralPalsyMerge: (364/0)
(16/29) cerebralPalsyMerge.orig: (294/70)
(17/29) gastrostomyTube_b: (364/0)
(18/29) gastrostomyTube_b.orig: (287/77)
(19/29) grossMotorFunctionLevelSeverity: (364/0)
(20/29) grossMotorFunctionLevelSeverity.orig: (285/79)
(21/29) hearingImpairedWithAid: (364/0)
(22/29) hearingImpairedWithAid.orig: (294/70)
(23/29) hearingImpairedLevel: (364/0)
(24/29) hearingImpairedLevel.orig: (285/79)
(25/29) multipleImpairment: (364/0)
(26/29) multipleImpairment.orig: (294/70)
(27/29) afterDi

## 20-10. Outcome

In [34]:
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-10-outcome.csv'

In [35]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize
# the values with the follow-up value map and variable order.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    order_map=_ORDER_MAP,
)

# uniqueFollowupID is a plain copy of uniqueID.
df['uniqueFollowupID'] = df['uniqueID'].copy()

# Hand the identifier columns to postprocess() as first_columns.
df = COMBINE_harmonizer.postprocess(
    df,
    first_columns=[
        'center',
        'subjectID',
        'uniqueID',
        'followupCenter',
        'followupID',
        'uniqueFollowupID',
    ],
)

df.to_csv(out_filename, index=False)

(0/17) normalize_value: column: center
(1/17) normalize_value: column: subjectID
(2/17) normalize_value: column: uniqueID
(3/17) normalize_value: column: followupID
(4/17) normalize_value: column: followupCenter
(5/17) normalize_value: column: BayleyIIICognitive
(6/17) normalize_value: column: BayleyIIILanguage
(7/17) normalize_value: column: BayleyIIIMotor
(8/17) normalize_value: column: deathBeforeFollowup
(9/17) normalize_value: column: normalPrimaryOutcome
(10/17) normalize_value: column: flagAdjudicatedOutcome
(11/17) normalize_value: column: deathBeforeDischarge
(12/17) normalize_value: column: moderateSevereDisabilitySurvivor
(13/17) normalize_value: column: disabilityLevelSurvivor
(14/17) normalize_value: column: moderateSevereDisabilityOrDeath
(15/17) normalize_value: column: disabilityLevelDeath4Category
(16/17) normalize_value: column: outcomeGroup


In [36]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/35) center: (364/0)
(1/35) subjectID: (364/0)
(2/35) uniqueID: (364/0)
(3/35) followupCenter: (364/0)
(4/35) followupID: (364/0)
(5/35) uniqueFollowupID: (364/0)
(6/35) center.orig: (364/0)
(7/35) subjectID.orig: (364/0)
(8/35) uniqueID.orig: (364/0)
(9/35) followupCenter.orig: (315/49)
(10/35) followupID.orig: (315/49)
(11/35) flagAdjudicatedOutcome: (364/0)
(12/35) flagAdjudicatedOutcome.orig: (9/355)
(13/35) normalPrimaryOutcome: (364/0)
(14/35) normalPrimaryOutcome.orig: (285/79)
(15/35) BayleyIIILanguage: (364/0)
(16/35) BayleyIIILanguage.orig: (275/89)
(17/35) BayleyIIIMotor: (364/0)
(18/35) BayleyIIIMotor.orig: (277/87)
(19/35) BayleyIIICognitive: (364/0)
(20/35) BayleyIIICognitive.orig: (283/81)
(21/35) deathBeforeFollowup: (364/0)
(22/35) deathBeforeFollowup.orig: (354/10)
(23/35) deathBeforeDischarge: (364/0)
(24/35) deathBeforeDischarge.orig: (364/0)
(25/35) disabilityLevelSurvivor: (364/0)
(26/35) disabilityLevelSurvivor.orig: (285/79)
(27/35) disabilityLevelDeath4Catego

## 20-07. Readmission

In [37]:
# Column name(s) passed to normalize_value() as its third positional argument
# (logged there as "flatten_ids") for this file.
_FLATTEN_IDS = ['readmissionNumber']
# CSV handled in this section; the same file name is used under input_dir and out_dir.
base_filename = '20-07-readmission.csv'

In [38]:
# Source and destination share the same base file name.
filename = os.sep.join((input_dir, base_filename))
out_filename = os.sep.join((out_dir, base_filename))

# Read every column with dtype='O' (no numeric type inference), then normalize,
# additionally passing _FLATTEN_IDS as the third positional argument.
# NOTE(review): unlike the other sections, no uniqueFollowupID column is added and
# postprocess() is not called here — confirm this is intentional.
df = COMBINE_harmonizer.normalize_value(
    pd.read_csv(filename, dtype='O'),
    _VALUE_MAP,
    _FLATTEN_IDS,
    order_map=_ORDER_MAP,
)

df.to_csv(out_filename, index=False)

(0/11) normalize_value: column: center
(1/11) normalize_value: column: subjectID
(2/11) normalize_value: column: uniqueID
(3/11) normalize_value: column: followupCenter
(4/11) normalize_value: column: followupID
(5/11) normalize_value: column: readmissionPrimaryCauseOtherText
(6/11) normalize_value: column: readmissionICU
(7/11) normalize_value: column: readmissionNumber
(8/11) normalize_value: column: readmissionTimePeriod
(9/11) normalize_value: column: readmissionPrimaryCause
(10/11) normalize_value: column: readmissionLengthOfStay


flatten_index: flatten_ids: ['readmissionNumber'] unique_id_map: {np.int64(1): np.int64(1), np.int64(2): np.int64(2), np.int64(3): np.int64(3), np.int64(4): np.int64(4), np.int64(5): np.int64(5), np.int64(6): np.int64(6), np.int64(7): np.int64(7), np.int64(8): np.int64(8), np.int64(9): np.int64(9), np.int64(10): np.int64(10), np.int64(11): np.int64(11), np.int64(12): np.int64(12), np.int64(13): np.int64(13), np.int64(14): np.int64(14)} the_type: int64


In [39]:
# Print a per-column summary of the frame written above
# (the paired counts are presumably non-null/null — confirm in COMBINE_harmonizer.column_info).
COMBINE_harmonizer.column_info(df)

(0/23) center: (147/0)
(1/23) center.orig: (147/0)
(2/23) subjectID: (147/0)
(3/23) subjectID.orig: (147/0)
(4/23) uniqueID: (147/0)
(5/23) uniqueID.orig: (147/0)
(6/23) followupCenter: (147/0)
(7/23) followupCenter.orig: (147/0)
(8/23) followupID: (147/0)
(9/23) followupID.orig: (147/0)
(10/23) _flatten_index: (147/0)
(11/23) readmissionNumber: (147/0)
(12/23) readmissionNumber.orig: (147/0)
(13/23) readmissionTimePeriod: (147/0)
(14/23) readmissionTimePeriod.orig: (147/0)
(15/23) readmissionPrimaryCause: (147/0)
(16/23) readmissionPrimaryCause.orig: (147/0)
(17/23) readmissionPrimaryCauseOtherText: (147/0)
(18/23) readmissionPrimaryCauseOtherText.orig: (11/136)
(19/23) readmissionLengthOfStay: (147/0)
(20/23) readmissionLengthOfStay.orig: (147/0)
(21/23) readmissionICU: (147/0)
(22/23) readmissionICU.orig: (147/0)
