In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import os
import re
import pydoc

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 00-0. Variables

In [2]:
study_name = COMBINE_harmonizer.STUDY_LH
sheet_name = COMBINE_harmonizer.SHEET_MAIN

root_dir = '..'

In [3]:
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

In [4]:
_FILENAMES = [
    'lh01.csv',
    'lh02.csv',
    'lh04.csv',
    'lh05.csv',
    'lh06.csv',
    'lh06a.csv',
    'lh06bg.csv',
    'lh06cv.csv',
    'lh06f.csv',
    'lh06he.csv',
    'lh06ht.csv',
    'lh06om.csv',
    'lh06rs.csv',
    'lh06tr.csv',
    'lh07.csv',
    'lh09.csv',
    'lh10.csv',
    'lh11.csv',
    'lh12.csv',
    'lh13.csv',
    'lh14.csv',
    'lhmr01.csv',
    'lhmr02.csv',
    'lhmr03.csv',
]

# intended for single record per unique-id.
_FILENAMES_MERGE = [
    'lh01.csv',
    'lh02.csv',
    'lh04.csv',
    'lh05.csv',
    'lh06.csv',
    'lh10.csv',
    'lh11.csv',
    'lh12.csv',
    'lh13.csv',
]

# MRI intended for single record per unique-id.
_MRI_FILENAMES_MERGE = [
    'lhmr01.csv',
    'lhmr02.csv',
]

_MRI_FILENAME = 'lhmr03.csv'

In [5]:
input_dir = cfg.config[f'{study_name}_dir']
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"
out_dir = f"{cfg.config['out_dir']}/out-{study_name}"

os.makedirs(out_dir, exist_ok=True)

### 00-1. Column Map

In [6]:
df_data_dict = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=sheet_name)
all_valid_columns = list(df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME])
column_map = {each[study_name]: each[COMBINE_harmonizer.DATA_DICT_VAR_NAME] for _, each in df_data_dict.iterrows()}

### 00-2. df-dict from _FILENAMES

In [7]:
df_dict = {filename: pd.read_csv(os.sep.join([input_dir, filename]), dtype='O').rename(columns=column_map) for filename in _FILENAMES}

### 00-3. df-all and df-main

In [8]:
df_all = None
for idx, each_filename in enumerate(_FILENAMES_MERGE):
    each_full_filename = os.sep.join([root_dir, each_filename])
    each_df = df_dict[each_filename]
    columns = list(each_df.columns)
    each_filename_prefix = re.sub(r'\.csv$', '', each_filename)

    if df_all is None:
        df_all = each_df
    else:
        df_all = df_all.merge(each_df, on=['center', 'subjectID'], how='outer', suffixes=['', ':' + each_filename_prefix])

len(df_all), len(df_all.columns)

(3088, 406)

In [9]:
# main
print('to set main')
is_main = df_all['randomNumber'].isnull() == False

df_main = df_all[is_main]
len(df_main), len(df_main.columns)

to set main


(168, 406)

### 00-4. identifier-column

In [10]:
id_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Identity')
id_columns

['center', 'subjectID']

## 01-02. screening

In [11]:
screening_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Screening')
all_screening_columns = id_columns + screening_columns
screening_columns

['siteID',
 'birthDate',
 'birthNumber',
 'screenComment',
 'coreTempLess32p5COverEq2Hr_e',
 'coreTempLess33p5COver1Hr_e',
 'coreTempLess34COver1Hr_e',
 'first6HrCoolByClinicalProtocol_e',
 'chromosomalAbnormality_e',
 'majorCongenitalAnomaly_e',
 'birthWeightLessEq1800g_e',
 'infantUnlikelySurvive_e',
 'first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e',
 'postnatalAgeLess6HrOrGreater24Hr_e',
 'enrolledConflictingTrial_e',
 'first60MinAnyBloodGasPHLessEq7_i',
 'first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i',
 'historyPerinatalEvent_i',
 'at10MinApgarLessEq5OrVent_i',
 'randomEligible',
 'consentStatus',
 'noConsentReason',
 'noInStudyReason',
 'random',
 'noRandomReason',
 'noRandomReasonText',
 'randomDate',
 'randomTime',
 'randomNumber',
 'randomTreatmentAssign',
 'randomTreatmentReceive',
 'treatmentBlanketType',
 'inOtherTrial',
 'inOtherTrialText']

In [12]:
# screening
print('to set screening')
df_screening = COMBINE_harmonizer.valid_columns(df_all, all_screening_columns)
df_screening = COMBINE_harmonizer.postprocess(df_screening)

out_filename = os.sep.join([out_dir, '00-02-screening.csv'])
df_screening.to_csv(out_filename, index=False)

df_main_screening = COMBINE_harmonizer.valid_columns(df_main, all_screening_columns)
df_main_screening = COMBINE_harmonizer.postprocess(df_main_screening)

out_filename = os.sep.join([out_dir, '01-02-screening.csv'])
df_main_screening.to_csv(out_filename, index=False)

to set screening
(5/36) screenComment not in df
(6/36) coreTempLess32p5COverEq2Hr_e not in df
(7/36) coreTempLess33p5COver1Hr_e not in df
(21/36) randomEligible not in df
(25/36) random not in df
(26/36) noRandomReason not in df
(27/36) noRandomReasonText not in df
(32/36) randomTreatmentReceive not in df
(33/36) treatmentBlanketType not in df
(5/36) screenComment not in df
(6/36) coreTempLess32p5COverEq2Hr_e not in df
(7/36) coreTempLess33p5COver1Hr_e not in df
(21/36) randomEligible not in df
(25/36) random not in df
(26/36) noRandomReason not in df
(27/36) noRandomReasonText not in df
(32/36) randomTreatmentReceive not in df
(33/36) treatmentBlanketType not in df


### 01-02-1. check screening

In [13]:
len(df_screening['center'].unique()), len(df_screening['subjectID'].unique()), len(df_screening['uniqueID'].unique()), len(df_screening)

(21, 656, 3088, 3088)

In [14]:
df_screening_groupby = df_screening.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_screening_groupby['_count'] > 1
df_screening_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 01-02-2. check empty cells

In [15]:
COMBINE_harmonizer.check_empty(df_screening)

(0/28) column: center (3088 / 0)
(1/28) column: subjectID (3088 / 0)
(2/28) column: uniqueID (3088 / 0)
(3/28) column: siteID (3088 / 0)
(4/28) column: birthDate (3088 / 0)
(5/28) column: birthNumber (3087 / 1)
(6/28) column: coreTempLess34COver1Hr_e (3088 / 0)
(7/28) column: first6HrCoolByClinicalProtocol_e (1295 / 1793)
(8/28) column: chromosomalAbnormality_e (3088 / 0)
(9/28) column: majorCongenitalAnomaly_e (3088 / 0)
(10/28) column: birthWeightLessEq1800g_e (3088 / 0)
(11/28) column: infantUnlikelySurvive_e (3088 / 0)
(12/28) column: first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e (2774 / 314)
(13/28) column: postnatalAgeLess6HrOrGreater24Hr_e (3088 / 0)
(14/28) column: enrolledConflictingTrial_e (2968 / 120)
(15/28) column: first60MinAnyBloodGasPHLessEq7_i (1121 / 1967)
(16/28) column: first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i (1080 / 2008)
(17/28) column: historyPerinatalEvent_i (632 / 2456)
(18/28) column: at10MinApgarLessEq5OrVent_i (632 / 2456)
(19/28)

### 01-02-3. check main-screening

In [16]:
len(df_main_screening['center'].unique()), len(df_main_screening['subjectID'].unique()), len(df_main_screening['uniqueID'].unique()), len(df_main_screening)

(21, 122, 168, 168)

In [17]:
df_main_screening_groupby = df_main_screening.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_main_screening_groupby['_count'] > 1
df_main_screening_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 01-02-4. check empty cells

In [18]:
COMBINE_harmonizer.check_empty(df_main_screening)

(0/28) column: center (168 / 0)
(1/28) column: subjectID (168 / 0)
(2/28) column: uniqueID (168 / 0)
(3/28) column: siteID (168 / 0)
(4/28) column: birthDate (168 / 0)
(5/28) column: birthNumber (168 / 0)
(6/28) column: coreTempLess34COver1Hr_e (168 / 0)
(7/28) column: first6HrCoolByClinicalProtocol_e (0 / 168)
(8/28) column: chromosomalAbnormality_e (168 / 0)
(9/28) column: majorCongenitalAnomaly_e (168 / 0)
(10/28) column: birthWeightLessEq1800g_e (168 / 0)
(11/28) column: infantUnlikelySurvive_e (168 / 0)
(12/28) column: first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e (153 / 15)
(13/28) column: postnatalAgeLess6HrOrGreater24Hr_e (168 / 0)
(14/28) column: enrolledConflictingTrial_e (162 / 6)
(15/28) column: first60MinAnyBloodGasPHLessEq7_i (156 / 12)
(16/28) column: first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i (141 / 27)
(17/28) column: historyPerinatalEvent_i (44 / 124)
(18/28) column: at10MinApgarLessEq5OrVent_i (44 / 124)
(19/28) column: noConsentReason (0 / 1

In [19]:
COMBINE_harmonizer.column_info(df_main_screening)

(0/28) center: (168/0)
(1/28) subjectID: (168/0)
(2/28) uniqueID: (168/0)
(3/28) siteID: (168/0)
(4/28) birthDate: (168/0)
(5/28) birthNumber: (168/0)
(6/28) coreTempLess34COver1Hr_e: (168/0)
(7/28) first6HrCoolByClinicalProtocol_e: (0/168)
(8/28) chromosomalAbnormality_e: (168/0)
(9/28) majorCongenitalAnomaly_e: (168/0)
(10/28) birthWeightLessEq1800g_e: (168/0)
(11/28) infantUnlikelySurvive_e: (168/0)
(12/28) first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e: (153/15)
(13/28) postnatalAgeLess6HrOrGreater24Hr_e: (168/0)
(14/28) enrolledConflictingTrial_e: (162/6)
(15/28) first60MinAnyBloodGasPHLessEq7_i: (156/12)
(16/28) first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i: (141/27)
(17/28) historyPerinatalEvent_i: (44/124)
(18/28) at10MinApgarLessEq5OrVent_i: (44/124)
(19/28) noConsentReason: (0/168)
(20/28) noInStudyReason: (0/168)
(21/28) consentStatus: (168/0)
(22/28) randomDate: (168/0)
(23/28) randomTime: (168/0)
(24/28) randomNumber: (168/0)
(25/28) randomTreatmentAss

## 01-12. Pre-intervention Neuro Exam

In [20]:
pre_neuro_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Neuro Exam')
all_screening_neuro_columns = id_columns + pre_neuro_columns
pre_neuro_columns

['pre_NeuroExam',
 'pre_NoNeuroExamReason',
 'pre_NeuroExamSignModerateSevereHIE3Category',
 'pre_NeuroExamLevelConsciousness',
 'pre_NeuroExamSpontaneousActivity',
 'pre_NeuroExamPosture',
 'pre_NeuroExamTone',
 'pre_NeuroExamSuck',
 'pre_NeuroExamMoro',
 'pre_NeuroExamPupils',
 'pre_NeuroExamHeartRate',
 'pre_NeuroExamRespiration',
 'pre_NeuroExamDate',
 'pre_NeuroExamTime',
 'pre_NeuroExamSedate',
 'pre_NeuroExamSeizure']

In [21]:
print('to set pre-intervention neuro exam')
df_pre_neuro = COMBINE_harmonizer.valid_columns(df_all, all_screening_neuro_columns)
df_pre_neuro = COMBINE_harmonizer.postprocess(df_pre_neuro)

out_filename = os.sep.join([out_dir, '00-12-neuro-exam.csv'])
df_pre_neuro.to_csv(out_filename, index=False)

df_main_pre_neuro = COMBINE_harmonizer.valid_columns(df_main, all_screening_neuro_columns)
df_main_pre_neuro = COMBINE_harmonizer.postprocess(df_main_pre_neuro)

out_filename = os.sep.join([out_dir, '01-12-neuro-exam.csv'])
df_main_pre_neuro.to_csv(out_filename, index=False)

to set pre-intervention neuro exam


### 01-12-1. check screening-neuro

In [22]:
len(df_pre_neuro['center'].unique()), len(df_pre_neuro['subjectID'].unique()), len(df_pre_neuro['uniqueID'].unique()), len(df_pre_neuro)

(21, 656, 3088, 3088)

In [23]:
df_pre_neuro_groupby = df_pre_neuro.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_pre_neuro_groupby['_count'] > 1
df_pre_neuro_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 01-12-2. check empty cells

In [24]:
COMBINE_harmonizer.check_empty(df_pre_neuro)

(0/19) column: center (3088 / 0)
(1/19) column: subjectID (3088 / 0)
(2/19) column: uniqueID (3088 / 0)
(3/19) column: pre_NeuroExamSeizure (816 / 2272)
(4/19) column: pre_NeuroExam (816 / 2272)
(5/19) column: pre_NeuroExamSignModerateSevereHIE3Category (734 / 2354)
(6/19) column: pre_NeuroExamRespiration (734 / 2354)
(7/19) column: pre_NeuroExamSedate (734 / 2354)
(8/19) column: pre_NoNeuroExamReason (63 / 3025)
(9/19) column: pre_NeuroExamLevelConsciousness (730 / 2358)
(10/19) column: pre_NeuroExamSpontaneousActivity (730 / 2358)
(11/19) column: pre_NeuroExamPosture (727 / 2361)
(12/19) column: pre_NeuroExamTone (730 / 2358)
(13/19) column: pre_NeuroExamSuck (719 / 2369)
(14/19) column: pre_NeuroExamMoro (706 / 2382)
(15/19) column: pre_NeuroExamPupils (711 / 2377)
(16/19) column: pre_NeuroExamHeartRate (729 / 2359)
(17/19) column: pre_NeuroExamDate (732 / 2356)
(18/19) column: pre_NeuroExamTime (716 / 2372)


### 01-12-3. check main-screening-neuro

In [25]:
len(df_main_pre_neuro['center'].unique()), len(df_main_pre_neuro['subjectID'].unique()), len(df_main_pre_neuro['uniqueID'].unique()), len(df_main_pre_neuro)

(21, 122, 168, 168)

In [26]:
df_main_pre_neuro_groupby = df_main_pre_neuro.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_main_pre_neuro_groupby['_count'] > 1
df_main_pre_neuro_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 01-12-4. check empty cells

In [27]:
COMBINE_harmonizer.check_empty(df_main_pre_neuro)

(0/19) column: center (168 / 0)
(1/19) column: subjectID (168 / 0)
(2/19) column: uniqueID (168 / 0)
(3/19) column: pre_NeuroExamSeizure (168 / 0)
(4/19) column: pre_NeuroExam (168 / 0)
(5/19) column: pre_NeuroExamSignModerateSevereHIE3Category (168 / 0)
(6/19) column: pre_NeuroExamRespiration (168 / 0)
(7/19) column: pre_NeuroExamSedate (168 / 0)
(8/19) column: pre_NoNeuroExamReason (0 / 168)
(9/19) column: pre_NeuroExamLevelConsciousness (168 / 0)
(10/19) column: pre_NeuroExamSpontaneousActivity (168 / 0)
(11/19) column: pre_NeuroExamPosture (167 / 1)
(12/19) column: pre_NeuroExamTone (168 / 0)
(13/19) column: pre_NeuroExamSuck (168 / 0)
(14/19) column: pre_NeuroExamMoro (164 / 4)
(15/19) column: pre_NeuroExamPupils (164 / 4)
(16/19) column: pre_NeuroExamHeartRate (167 / 1)
(17/19) column: pre_NeuroExamDate (165 / 3)
(18/19) column: pre_NeuroExamTime (164 / 4)


In [28]:
COMBINE_harmonizer.column_info(df_main_pre_neuro)

(0/19) center: (168/0)
(1/19) subjectID: (168/0)
(2/19) uniqueID: (168/0)
(3/19) pre_NeuroExamSeizure: (168/0)
(4/19) pre_NeuroExam: (168/0)
(5/19) pre_NeuroExamSignModerateSevereHIE3Category: (168/0)
(6/19) pre_NeuroExamRespiration: (168/0)
(7/19) pre_NeuroExamSedate: (168/0)
(8/19) pre_NoNeuroExamReason: (0/168)
(9/19) pre_NeuroExamLevelConsciousness: (168/0)
(10/19) pre_NeuroExamSpontaneousActivity: (168/0)
(11/19) pre_NeuroExamPosture: (167/1)
(12/19) pre_NeuroExamTone: (168/0)
(13/19) pre_NeuroExamSuck: (168/0)
(14/19) pre_NeuroExamMoro: (164/4)
(15/19) pre_NeuroExamPupils: (164/4)
(16/19) pre_NeuroExamHeartRate: (167/1)
(17/19) pre_NeuroExamDate: (165/3)
(18/19) pre_NeuroExamTime: (164/4)


## 01-03. Maternal Demographics

In [29]:
maternal_demographics_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Maternal Demographics')
all_maternal_demographics_columns = id_columns + maternal_demographics_columns
maternal_demographics_columns

['motherAge_year',
 'motherRace',
 'motherRaceOther1',
 'motherRaceOther2',
 'motherRaceOther3',
 'motherRaceOther4',
 'motherRaceOther5',
 'motherRaceOther6',
 'motherEthnicity',
 'motherMaritalStatus',
 'motherEducation',
 'motherInsurance']

In [30]:
print('to set maternal demographics')
df_maternal_demographics = COMBINE_harmonizer.valid_columns(df_main, all_maternal_demographics_columns)
df_maternal_demographics = COMBINE_harmonizer.postprocess(df_maternal_demographics)

out_filename = os.sep.join([out_dir, '01-03-maternal-demographics.csv'])
df_maternal_demographics.to_csv(out_filename, index=False)

to set maternal demographics
(8/14) motherRaceOther5 not in df
(9/14) motherRaceOther6 not in df
(13/14) motherInsurance not in df


### 01-03-2. check empty cells

In [31]:
COMBINE_harmonizer.check_empty(df_maternal_demographics)

(0/12) column: center (168 / 0)
(1/12) column: subjectID (168 / 0)
(2/12) column: uniqueID (168 / 0)
(3/12) column: motherAge_year (168 / 0)
(4/12) column: motherRace (168 / 0)
(5/12) column: motherRaceOther1 (0 / 168)
(6/12) column: motherRaceOther2 (0 / 168)
(7/12) column: motherRaceOther3 (0 / 168)
(8/12) column: motherRaceOther4 (0 / 168)
(9/12) column: motherEthnicity (168 / 0)
(10/12) column: motherMaritalStatus (168 / 0)
(11/12) column: motherEducation (167 / 1)


In [32]:
COMBINE_harmonizer.column_info(df_maternal_demographics)

(0/12) center: (168/0)
(1/12) subjectID: (168/0)
(2/12) uniqueID: (168/0)
(3/12) motherAge_year: (168/0)
(4/12) motherRace: (168/0)
(5/12) motherRaceOther1: (0/168)
(6/12) motherRaceOther2: (0/168)
(7/12) motherRaceOther3: (0/168)
(8/12) motherRaceOther4: (0/168)
(9/12) motherEthnicity: (168/0)
(10/12) motherMaritalStatus: (168/0)
(11/12) motherEducation: (167/1)


## 01-04. Pregnancy History

In [33]:
pregnancy_history_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Pregnancy History')
all_pregnancy_history_columns = id_columns + pregnancy_history_columns
pregnancy_history_columns

['gravida',
 'parity',
 'multipleBirth',
 'numFetus',
 'prenatalCare',
 'hypertensionEclampsia',
 'antepartumHemorrhage',
 'thyroidMalfunction',
 'diabetes']

In [34]:
print('to set pregnancy history')
df_pregnancy_history = COMBINE_harmonizer.valid_columns(df_main, all_pregnancy_history_columns)
df_pregnancy_history = COMBINE_harmonizer.postprocess(df_pregnancy_history)

out_filename = os.sep.join([out_dir, '01-04-pregnancy-history.csv'])
df_pregnancy_history.to_csv(out_filename, index=False)

to set pregnancy history


### 01-04-2. check empty cells

In [35]:
COMBINE_harmonizer.check_empty(df_pregnancy_history)

(0/12) column: center (168 / 0)
(1/12) column: subjectID (168 / 0)
(2/12) column: uniqueID (168 / 0)
(3/12) column: multipleBirth (168 / 0)
(4/12) column: prenatalCare (168 / 0)
(5/12) column: hypertensionEclampsia (168 / 0)
(6/12) column: antepartumHemorrhage (168 / 0)
(7/12) column: thyroidMalfunction (168 / 0)
(8/12) column: diabetes (168 / 0)
(9/12) column: gravida (168 / 0)
(10/12) column: parity (168 / 0)
(11/12) column: numFetus (3 / 165)


In [36]:
COMBINE_harmonizer.column_info(df_pregnancy_history)

(0/12) center: (168/0)
(1/12) subjectID: (168/0)
(2/12) uniqueID: (168/0)
(3/12) multipleBirth: (168/0)
(4/12) prenatalCare: (168/0)
(5/12) hypertensionEclampsia: (168/0)
(6/12) antepartumHemorrhage: (168/0)
(7/12) thyroidMalfunction: (168/0)
(8/12) diabetes: (168/0)
(9/12) gravida: (168/0)
(10/12) parity: (168/0)
(11/12) numFetus: (3/165)


## 01-05. Labor Delivery

In [37]:
labor_delivery_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Labor Delivery')
all_labor_delivery_columns = id_columns + labor_delivery_columns
labor_delivery_columns

['maternalAdmissionDate',
 'maternalAdmissionTime',
 'ruptureDate',
 'ruptureTime',
 'ruptureOver18Hr',
 'ruptureBeforeDelivery',
 'labor',
 'laborOnsetDate',
 'laborOnsetTime',
 'deliveryMode',
 'fetalDecelerate',
 'cordMishap',
 'uterineRupture',
 'shoulderDystocia',
 'placentalProblem',
 'maternalHemorrhage',
 'maternalTrauma',
 'maternalCardioRespiratoryArrest',
 'maternalSeizure',
 'pyrexiaOver37p6C',
 'chorioamnionitis',
 'placentalPathologyPerformed',
 'histologicChorioamionitis',
 'laborAntibiotics',
 'laborAntibioticsCode1',
 'laborAntibioticsCode2',
 'laborAntibioticsCode3',
 'laborAntibioticsCode4',
 'laborAntibioticsCode5',
 'laborAntibioticsCode6']

In [38]:
print('to set labor delivery')
df_labor_delivery = COMBINE_harmonizer.valid_columns(df_main, all_labor_delivery_columns)
df_labor_delivery = COMBINE_harmonizer.postprocess(df_labor_delivery)

out_filename = os.sep.join([out_dir, '01-05-labor-delivery.csv'])
df_labor_delivery.to_csv(out_filename, index=False)

to set labor delivery
(7/32) ruptureBeforeDelivery not in df
(29/32) laborAntibioticsCode4 not in df
(30/32) laborAntibioticsCode5 not in df
(31/32) laborAntibioticsCode6 not in df


### 01-05-2. check empty cells

In [39]:
COMBINE_harmonizer.check_empty(df_labor_delivery)

(0/29) column: center (168 / 0)
(1/29) column: subjectID (168 / 0)
(2/29) column: uniqueID (168 / 0)
(3/29) column: fetalDecelerate (168 / 0)
(4/29) column: cordMishap (168 / 0)
(5/29) column: uterineRupture (168 / 0)
(6/29) column: shoulderDystocia (168 / 0)
(7/29) column: placentalProblem (168 / 0)
(8/29) column: maternalHemorrhage (168 / 0)
(9/29) column: maternalTrauma (168 / 0)
(10/29) column: maternalCardioRespiratoryArrest (168 / 0)
(11/29) column: maternalSeizure (168 / 0)
(12/29) column: pyrexiaOver37p6C (168 / 0)
(13/29) column: chorioamnionitis (168 / 0)
(14/29) column: placentalPathologyPerformed (168 / 0)
(15/29) column: histologicChorioamionitis (63 / 105)
(16/29) column: laborAntibiotics (168 / 0)
(17/29) column: ruptureOver18Hr (18 / 150)
(18/29) column: labor (168 / 0)
(19/29) column: laborAntibioticsCode1 (41 / 127)
(20/29) column: laborAntibioticsCode2 (13 / 155)
(21/29) column: laborAntibioticsCode3 (3 / 165)
(22/29) column: ruptureDate (160 / 8)
(23/29) column: rup

In [40]:
COMBINE_harmonizer.column_info(df_labor_delivery)

(0/29) center: (168/0)
(1/29) subjectID: (168/0)
(2/29) uniqueID: (168/0)
(3/29) fetalDecelerate: (168/0)
(4/29) cordMishap: (168/0)
(5/29) uterineRupture: (168/0)
(6/29) shoulderDystocia: (168/0)
(7/29) placentalProblem: (168/0)
(8/29) maternalHemorrhage: (168/0)
(9/29) maternalTrauma: (168/0)
(10/29) maternalCardioRespiratoryArrest: (168/0)
(11/29) maternalSeizure: (168/0)
(12/29) pyrexiaOver37p6C: (168/0)
(13/29) chorioamnionitis: (168/0)
(14/29) placentalPathologyPerformed: (168/0)
(15/29) histologicChorioamionitis: (63/105)
(16/29) laborAntibiotics: (168/0)
(17/29) ruptureOver18Hr: (18/150)
(18/29) labor: (168/0)
(19/29) laborAntibioticsCode1: (41/127)
(20/29) laborAntibioticsCode2: (13/155)
(21/29) laborAntibioticsCode3: (3/165)
(22/29) ruptureDate: (160/8)
(23/29) ruptureTime: (150/18)
(24/29) laborOnsetDate: (116/52)
(25/29) laborOnsetTime: (85/83)
(26/29) deliveryMode: (168/0)
(27/29) maternalAdmissionDate: (155/13)
(28/29) maternalAdmissionTime: (121/47)


## 01-06. Birth

In [41]:
birth_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Birth')
all_birth_columns = id_columns + birth_columns
birth_columns

['encephalopathyLevel',
 'randomInfantAge',
 'birthDate',
 'birthTime',
 'birthWeight_g',
 'birthLength_cm',
 'birthHeadCircumference_cm',
 'birthGestationalAge_week',
 'infantSex',
 'infantOutborn',
 'outbornInHospital',
 'outbornOutHospital',
 'neonateAdmissionDate',
 'neonateAdmissionTime',
 'Apgar1min',
 'Apgar5min',
 'Apgar10min',
 'Apgar15min',
 'Apgar20min',
 'deliveryResuscitation',
 'deliveryOxygen',
 'deliveryBaggingAndMask',
 'deliveryChestCompression',
 'deliveryIntubation',
 'deliveryDrug',
 'at10MinContinueResuscitation',
 'at10MinOxygen',
 'at10MinBaggingAndMask',
 'at10MinChestCompression',
 'at10MinIntubation',
 'at10MinDrug',
 'spontaneousRespirationTime',
 'cordBloodGas',
 'cordBloodGasSrc',
 'cordBloodGasPH',
 'cordBloodGasPCO2_mmHg',
 'cordBloodGasPO2_mmHg',
 'cordBloodGasHCO3_mEqPerL',
 'cordBloodGasBaseDeficit_mEqPerL',
 'firstPostnatalBloodGas',
 'firstPostnatalBloodGasSrc',
 'firstPostnatalBloodGasDate',
 'firstPostnatalBloodGasTime',
 'firstPostnatalBloodGasPH

In [42]:
print('to set birth')
df_birth = COMBINE_harmonizer.valid_columns(df_main, all_birth_columns)
df_birth = COMBINE_harmonizer.postprocess(df_birth)

out_filename = os.sep.join([out_dir, '01-06-birth.csv'])
df_birth.to_csv(out_filename, index=False)

to set birth
(19/50) Apgar15min not in df


### 01-06-2. check empty cells

In [43]:
COMBINE_harmonizer.check_empty(df_birth)

(0/50) column: center (168 / 0)
(1/50) column: subjectID (168 / 0)
(2/50) column: uniqueID (168 / 0)
(3/50) column: birthDate (168 / 0)
(4/50) column: randomInfantAge (168 / 0)
(5/50) column: encephalopathyLevel (168 / 0)
(6/50) column: infantOutborn (168 / 0)
(7/50) column: outbornInHospital (146 / 22)
(8/50) column: outbornOutHospital (146 / 22)
(9/50) column: deliveryResuscitation (168 / 0)
(10/50) column: deliveryOxygen (166 / 2)
(11/50) column: deliveryBaggingAndMask (166 / 2)
(12/50) column: deliveryChestCompression (166 / 2)
(13/50) column: deliveryIntubation (166 / 2)
(14/50) column: deliveryDrug (166 / 2)
(15/50) column: at10MinContinueResuscitation (166 / 2)
(16/50) column: at10MinOxygen (116 / 52)
(17/50) column: at10MinBaggingAndMask (116 / 52)
(18/50) column: at10MinChestCompression (116 / 52)
(19/50) column: at10MinIntubation (116 / 52)
(20/50) column: at10MinDrug (116 / 52)
(21/50) column: cordBloodGas (168 / 0)
(22/50) column: firstPostnatalBloodGas (168 / 0)
(23/50) co

In [44]:
COMBINE_harmonizer.column_info(df_birth)

(0/50) center: (168/0)
(1/50) subjectID: (168/0)
(2/50) uniqueID: (168/0)
(3/50) birthDate: (168/0)
(4/50) randomInfantAge: (168/0)
(5/50) encephalopathyLevel: (168/0)
(6/50) infantOutborn: (168/0)
(7/50) outbornInHospital: (146/22)
(8/50) outbornOutHospital: (146/22)
(9/50) deliveryResuscitation: (168/0)
(10/50) deliveryOxygen: (166/2)
(11/50) deliveryBaggingAndMask: (166/2)
(12/50) deliveryChestCompression: (166/2)
(13/50) deliveryIntubation: (166/2)
(14/50) deliveryDrug: (166/2)
(15/50) at10MinContinueResuscitation: (166/2)
(16/50) at10MinOxygen: (116/52)
(17/50) at10MinBaggingAndMask: (116/52)
(18/50) at10MinChestCompression: (116/52)
(19/50) at10MinIntubation: (116/52)
(20/50) at10MinDrug: (116/52)
(21/50) cordBloodGas: (168/0)
(22/50) firstPostnatalBloodGas: (168/0)
(23/50) birthTime: (168/0)
(24/50) neonateAdmissionDate: (146/22)
(25/50) neonateAdmissionTime: (146/22)
(26/50) Apgar1min: (167/1)
(27/50) Apgar5min: (168/0)
(28/50) Apgar10min: (140/28)
(29/50) Apgar20min: (32/136)


## 01-07. Pre Intervention - Temperature

In [45]:
temperature_pre_intervention_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Temperature')
all_temperature_pre_intervention_columns = id_columns + temperature_pre_intervention_columns
temperature_pre_intervention_columns

['targetTreatmentTemperature_C',
 'pre_CoolInitiate',
 'pre_CoolbyIceGelPack',
 'pre_CoolPassively',
 'pre_CoolClinically',
 'pre_CoolInitiateDate',
 'pre_CoolInitiateTime',
 'pre_AfterOvershootReach33p5C',
 'pre_AfterOvershootReach33p5CDate',
 'pre_AfterOvershootReach33p5CTime',
 'pre_TemperatureMinDate',
 'pre_TemperatureMinTime',
 'pre_SkinTemperatureMin_C',
 'pre_AxillaryTemperatureMin_C',
 'pre_EsophagealTemperatureMin_C',
 'pre_ServoSetMin_C',
 'pre_TemperatureMaxDate',
 'pre_TemperatureMaxTime',
 'pre_SkinTemperatureMax_C',
 'pre_AxillaryTemperatureMax_C',
 'pre_EsophagealTemperatureMax_C',
 'pre_ServoSetMax_C']

In [46]:
df_temperature = df_dict['lh06tr.csv'].copy()

temperatureTimeSlot_int = df_temperature['temperatureTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre_intervention_min = temperatureTimeSlot_int == 0
is_pre_intervention_max = temperatureTimeSlot_int == 1

# before_baseline_min
df_temperature_pre_intervention_min = df_temperature[is_pre_intervention_min]
pre_intervention_min_rename_map = {
    'temperatureDate': 'pre_TemperatureMinDate',
    'temperatureTime': 'pre_TemperatureMinTime',
    'skinTemperature_C': 'pre_SkinTemperatureMin_C',
    'axillaryTemperature_C': 'pre_AxillaryTemperatureMin_C',
    'esophagealTemperature_C': 'pre_EsophagealTemperatureMin_C',
    'servoSetTemperature_C': 'pre_ServoSetMin_C',
}

df_temperature_pre_intervention_min = df_temperature_pre_intervention_min.rename(columns=pre_intervention_min_rename_map)

# before_baseline_max
df_temperature_before_baseline_max = df_temperature[is_pre_intervention_max]
before_baseline_max_rename_map = {
    'temperatureDate': 'pre_TemperatureMaxDate',
    'temperatureTime': 'pre_TemperatureMaxTime',
    'skinTemperature_C': 'pre_SkinTemperatureMax_C',
    'axillaryTemperature_C': 'pre_AxillaryTemperatureMax_C',
    'esophagealTemperature_C': 'pre_EsophagealTemperatureMax_C',
    'servoSetTemperature_C': 'pre_ServoSetMax_C',
}
df_temperature_before_baseline_max = df_temperature_before_baseline_max.rename(columns=before_baseline_max_rename_map)

# merge
merge_columns = ['center', 'subjectID']
df_temperature_pre_intervention = df_temperature_pre_intervention_min.merge(df_temperature_before_baseline_max, on=merge_columns, how='outer')


In [47]:
print('to set pre-baseline temperature')
df_temperature_pre_intervention = COMBINE_harmonizer.valid_columns(df_temperature_pre_intervention, all_temperature_pre_intervention_columns)
df_temperature_pre_intervention = COMBINE_harmonizer.postprocess(df_temperature_pre_intervention)

out_filename = os.sep.join([out_dir, '01-07-pre-temperature.csv'])
df_temperature_pre_intervention.to_csv(out_filename, index=False)

to set pre-baseline temperature
(2/24) targetTreatmentTemperature_C not in df
(3/24) pre_CoolInitiate not in df
(4/24) pre_CoolbyIceGelPack not in df
(5/24) pre_CoolPassively not in df
(6/24) pre_CoolClinically not in df
(7/24) pre_CoolInitiateDate not in df
(8/24) pre_CoolInitiateTime not in df
(9/24) pre_AfterOvershootReach33p5C not in df
(10/24) pre_AfterOvershootReach33p5CDate not in df
(11/24) pre_AfterOvershootReach33p5CTime not in df


### 01-07-2. check empty cells

In [48]:
COMBINE_harmonizer.check_empty(df_temperature_pre_intervention)

(0/15) column: center (155 / 0)
(1/15) column: subjectID (155 / 0)
(2/15) column: uniqueID (155 / 0)
(3/15) column: pre_TemperatureMinDate (155 / 0)
(4/15) column: pre_TemperatureMinTime (153 / 2)
(5/15) column: pre_SkinTemperatureMin_C (66 / 89)
(6/15) column: pre_AxillaryTemperatureMin_C (147 / 8)
(7/15) column: pre_EsophagealTemperatureMin_C (0 / 155)
(8/15) column: pre_ServoSetMin_C (69 / 86)
(9/15) column: pre_TemperatureMaxDate (151 / 4)
(10/15) column: pre_TemperatureMaxTime (148 / 7)
(11/15) column: pre_SkinTemperatureMax_C (70 / 85)
(12/15) column: pre_AxillaryTemperatureMax_C (139 / 16)
(13/15) column: pre_EsophagealTemperatureMax_C (0 / 155)
(14/15) column: pre_ServoSetMax_C (68 / 87)


In [49]:
COMBINE_harmonizer.column_info(df_temperature_pre_intervention)

(0/15) center: (155/0)
(1/15) subjectID: (155/0)
(2/15) uniqueID: (155/0)
(3/15) pre_TemperatureMinDate: (155/0)
(4/15) pre_TemperatureMinTime: (153/2)
(5/15) pre_SkinTemperatureMin_C: (66/89)
(6/15) pre_AxillaryTemperatureMin_C: (147/8)
(7/15) pre_EsophagealTemperatureMin_C: (0/155)
(8/15) pre_ServoSetMin_C: (69/86)
(9/15) pre_TemperatureMaxDate: (151/4)
(10/15) pre_TemperatureMaxTime: (148/7)
(11/15) pre_SkinTemperatureMax_C: (70/85)
(12/15) pre_AxillaryTemperatureMax_C: (139/16)
(13/15) pre_EsophagealTemperatureMax_C: (0/155)
(14/15) pre_ServoSetMax_C: (68/87)


## 01-08. Pre-intervention - Cardio

In [50]:
cardio_pre_intervention_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Cardiovascular')
all_cardio_pre_intervention_columns = id_columns + cardio_pre_intervention_columns
cardio_pre_intervention_columns

['pre_CardioDate',
 'pre_CardioTime',
 'pre_CardioSystolicBloodPressure_mmHg',
 'pre_CardioDiastolicBloodPressure_mmHg',
 'pre_CardioHeartRate_BPM',
 'pre_CardioVolumeExpand',
 'pre_CardioInotropicAgent',
 'pre_CardioBloodTransfusion',
 'pre_CardioPlatelets']

In [51]:
df_cardio = df_dict['lh06cv.csv'].copy()

cardioTimeSlot_int = df_cardio['cardioTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = cardioTimeSlot_int == 0
df_cardio_pre_intervention = df_cardio[is_pre]
pre_rename_map = {
    'cardioDate': 'pre_CardioDate',
    'cardioTime': 'pre_CardioTime',
    'cardioSystolicBloodPressure_mmHg': 'pre_CardioSystolicBloodPressure_mmHg',
    'cardioDiastolicBloodPressure_mmHg': 'pre_CardioDiastolicBloodPressure_mmHg',
    'cardioHeartRate_BPM': 'pre_CardioHeartRate_BPM',
    'cardioVolumeExpand': 'pre_CardioVolumeExpand',
    'cardioInotropicAgent': 'pre_CardioInotropicAgent',
    'cardioBloodTransfusion': 'pre_CardioBloodTransfusion',
    'cardioPlatelets': 'pre_CardioPlatelets',
}
df_cardio_pre_intervention = df_cardio_pre_intervention.rename(columns=pre_rename_map)

df_cardio_pre_intervention = COMBINE_harmonizer.valid_columns(df_cardio_pre_intervention, all_cardio_pre_intervention_columns, debug_df=True, debug_columns=True)
df_cardio_pre_intervention = COMBINE_harmonizer.postprocess(df_cardio_pre_intervention)

out_filename = os.sep.join([out_dir, '01-08-pre-cardiovascular.csv'])
df_cardio_pre_intervention.to_csv(out_filename, index=False)

(5/15) REC_CMP not in columns
(7/15) cardioTimeSlot not in columns
(13/15) CMP_DATE not in columns
(14/15) CRT_DATE not in columns


### 01-08-2. Check empty

In [52]:
COMBINE_harmonizer.check_empty(df_cardio_pre_intervention)

(0/12) column: center (167 / 0)
(1/12) column: subjectID (167 / 0)
(2/12) column: uniqueID (167 / 0)
(3/12) column: pre_CardioVolumeExpand (167 / 0)
(4/12) column: pre_CardioInotropicAgent (167 / 0)
(5/12) column: pre_CardioBloodTransfusion (167 / 0)
(6/12) column: pre_CardioPlatelets (167 / 0)
(7/12) column: pre_CardioDate (167 / 0)
(8/12) column: pre_CardioTime (164 / 3)
(9/12) column: pre_CardioSystolicBloodPressure_mmHg (158 / 9)
(10/12) column: pre_CardioDiastolicBloodPressure_mmHg (158 / 9)
(11/12) column: pre_CardioHeartRate_BPM (162 / 5)


In [53]:
COMBINE_harmonizer.column_info(df_cardio_pre_intervention)

(0/12) center: (167/0)
(1/12) subjectID: (167/0)
(2/12) uniqueID: (167/0)
(3/12) pre_CardioVolumeExpand: (167/0)
(4/12) pre_CardioInotropicAgent: (167/0)
(5/12) pre_CardioBloodTransfusion: (167/0)
(6/12) pre_CardioPlatelets: (167/0)
(7/12) pre_CardioDate: (167/0)
(8/12) pre_CardioTime: (164/3)
(9/12) pre_CardioSystolicBloodPressure_mmHg: (158/9)
(10/12) pre_CardioDiastolicBloodPressure_mmHg: (158/9)
(11/12) pre_CardioHeartRate_BPM: (162/5)


## 01-09. Pre-intervention - Infection

In [54]:
infection_pre_intervention_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Infection')
all_infection_pre_intervention_columns = id_columns + infection_pre_intervention_columns
infection_pre_intervention_columns

['pre_PositiveCulture',
 'pre_PositiveCultureSrc',
 'pre_PositiveCultureDate',
 'pre_PositiveCultureTime',
 'pre_PositiveCultureOrganismCode1',
 'pre_PositiveCultureOrganismCode2',
 'pre_PositiveCultureOrganismCode3',
 'pre_Antibiotics',
 'pre_AntibioticsCode1',
 'pre_AntibioticsCode2',
 'pre_AntibioticsCode3']

In [55]:
df_infection_pre_intervention = df_dict['lh06.csv']
df_infection_pre_intervention = COMBINE_harmonizer.valid_columns(df_infection_pre_intervention, all_infection_pre_intervention_columns)
df_infection_pre_intervention = COMBINE_harmonizer.postprocess(df_infection_pre_intervention)

out_filename = os.sep.join([out_dir, '01-09-pre-infection.csv'])
df_infection_pre_intervention.to_csv(out_filename, index=False)

### 01-09-2. check empty cells

In [56]:
COMBINE_harmonizer.check_empty(df_infection_pre_intervention)

(0/14) column: center (168 / 0)
(1/14) column: subjectID (168 / 0)
(2/14) column: uniqueID (168 / 0)
(3/14) column: pre_PositiveCulture (168 / 0)
(4/14) column: pre_Antibiotics (166 / 2)
(5/14) column: pre_PositiveCultureSrc (1 / 167)
(6/14) column: pre_PositiveCultureDate (1 / 167)
(7/14) column: pre_PositiveCultureTime (0 / 168)
(8/14) column: pre_PositiveCultureOrganismCode1 (1 / 167)
(9/14) column: pre_PositiveCultureOrganismCode2 (0 / 168)
(10/14) column: pre_PositiveCultureOrganismCode3 (0 / 168)
(11/14) column: pre_AntibioticsCode1 (146 / 22)
(12/14) column: pre_AntibioticsCode2 (138 / 30)
(13/14) column: pre_AntibioticsCode3 (10 / 158)


In [57]:
COMBINE_harmonizer.column_info(df_infection_pre_intervention)

(0/14) center: (168/0)
(1/14) subjectID: (168/0)
(2/14) uniqueID: (168/0)
(3/14) pre_PositiveCulture: (168/0)
(4/14) pre_Antibiotics: (166/2)
(5/14) pre_PositiveCultureSrc: (1/167)
(6/14) pre_PositiveCultureDate: (1/167)
(7/14) pre_PositiveCultureTime: (0/168)
(8/14) pre_PositiveCultureOrganismCode1: (1/167)
(9/14) pre_PositiveCultureOrganismCode2: (0/168)
(10/14) pre_PositiveCultureOrganismCode3: (0/168)
(11/14) pre_AntibioticsCode1: (146/22)
(12/14) pre_AntibioticsCode2: (138/30)
(13/14) pre_AntibioticsCode3: (10/158)


## 01-10. Pre-intervention - Other Medication

In [58]:
other_med_pre_intervention_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Other Medication')
all_other_med_pre_intervention_columns = id_columns + other_med_pre_intervention_columns
other_med_pre_intervention_columns

['pre_OtherMedTargetDate',
 'pre_OtherMedTargetTime',
 'pre_Anticonvulsants',
 'pre_Anticonvulsants1',
 'pre_Anticonvulsants2',
 'pre_Anticonvulsants3',
 'pre_Analgesics',
 'pre_AnalgesicsSedatives1',
 'pre_AnalgesicsSedatives2',
 'pre_AnalgesicsSedatives3',
 'pre_Antipyretics',
 'pre_Antipyretics1',
 'pre_Antipyretics2',
 'pre_Antipyretics3',
 'pre_Paralytics',
 'pre_Paralytics1',
 'pre_Paralytics2',
 'pre_Paralytics3',
 'pre_OtherMedFluidIntake_ccPerKg',
 'pre_OtherMedUrineOutput_ccPerKg']

In [59]:
df_other_med = df_dict['lh06om.csv'].copy()

otherMedTimeSlot_int = df_other_med['otherMedTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = otherMedTimeSlot_int == 0
df_other_med_pre_intervention = df_other_med[is_pre]
pre_rename_map = {
    'otherMedTargetDate': 'pre_OtherMedTargetDate',
    'otherMedTargetTime': 'pre_OtherMedTargetTime',
    'anticonvulsants': 'pre_Anticonvulsants',
    'anticonvulsants1': 'pre_Anticonvulsants1',
    'anticonvulsants2': 'pre_Anticonvulsants2',
    'anticonvulsants3': 'pre_Anticonvulsants3',
    'analgesics': 'pre_Analgesics',
    'analgesicsSedatives1': 'pre_AnalgesicsSedatives1',
    'analgesicsSedatives2': 'pre_AnalgesicsSedatives2',
    'analgesicsSedatives3': 'pre_AnalgesicsSedatives3',
    'antipyretics': 'pre_Antipyretics',
    'antipyretics1': 'pre_Antipyretics1',
    'antipyretics2': 'pre_Antipyretics2',
    'antipyretics3': 'pre_Antipyretics3',
    'paralytics': 'pre_Paralytics',
    'paralytics1': 'pre_Paralytics1',
    'paralytics2': 'pre_Paralytics2',
    'paralytics3': 'pre_Paralytics3',
    'otherMedFluidIntake': 'pre_OtherMedFluidIntake',
    'otherMedUrineOutput': 'pre_OtherMedUrineOutput',
}
df_other_med_pre_intervention = df_other_med_pre_intervention.rename(columns=pre_rename_map)

df_other_med_pre_intervention = COMBINE_harmonizer.valid_columns(df_other_med_pre_intervention, all_other_med_pre_intervention_columns, debug_df=True, debug_columns=True)
df_other_med_pre_intervention = COMBINE_harmonizer.postprocess(df_other_med_pre_intervention)

out_filename = os.sep.join([out_dir, '01-10-pre-other-med.csv'])
df_other_med_pre_intervention.to_csv(out_filename, index=False)

(1/26) REC_CMP not in columns
(3/26) otherMedTimeSlot not in columns
(22/26) otherMedFluidIntake_ccPerKg not in columns
(23/26) otherMedUrineOutput_ccPerKg not in columns
(24/26) CMP_DATE not in columns
(25/26) CRT_DATE not in columns
(20/22) pre_OtherMedFluidIntake_ccPerKg not in df
(21/22) pre_OtherMedUrineOutput_ccPerKg not in df


### 01-10-2. check empty cells

In [60]:
COMBINE_harmonizer.check_empty(df_other_med_pre_intervention)

(0/21) column: center (142 / 0)
(1/21) column: subjectID (142 / 0)
(2/21) column: uniqueID (142 / 0)
(3/21) column: pre_OtherMedTargetDate (3 / 139)
(4/21) column: pre_OtherMedTargetTime (2 / 140)
(5/21) column: pre_Anticonvulsants (1 / 141)
(6/21) column: pre_Anticonvulsants1 (97 / 45)
(7/21) column: pre_Anticonvulsants2 (15 / 127)
(8/21) column: pre_Anticonvulsants3 (0 / 142)
(9/21) column: pre_Analgesics (1 / 141)
(10/21) column: pre_AnalgesicsSedatives1 (51 / 91)
(11/21) column: pre_AnalgesicsSedatives2 (23 / 119)
(12/21) column: pre_AnalgesicsSedatives3 (1 / 141)
(13/21) column: pre_Antipyretics (0 / 142)
(14/21) column: pre_Antipyretics1 (2 / 140)
(15/21) column: pre_Antipyretics2 (0 / 142)
(16/21) column: pre_Antipyretics3 (0 / 142)
(17/21) column: pre_Paralytics (0 / 142)
(18/21) column: pre_Paralytics1 (8 / 134)
(19/21) column: pre_Paralytics2 (0 / 142)
(20/21) column: pre_Paralytics3 (0 / 142)


In [61]:
COMBINE_harmonizer.column_info(df_other_med_pre_intervention)

(0/21) center: (142/0)
(1/21) subjectID: (142/0)
(2/21) uniqueID: (142/0)
(3/21) pre_OtherMedTargetDate: (3/139)
(4/21) pre_OtherMedTargetTime: (2/140)
(5/21) pre_Anticonvulsants: (1/141)
(6/21) pre_Anticonvulsants1: (97/45)
(7/21) pre_Anticonvulsants2: (15/127)
(8/21) pre_Anticonvulsants3: (0/142)
(9/21) pre_Analgesics: (1/141)
(10/21) pre_AnalgesicsSedatives1: (51/91)
(11/21) pre_AnalgesicsSedatives2: (23/119)
(12/21) pre_AnalgesicsSedatives3: (1/141)
(13/21) pre_Antipyretics: (0/142)
(14/21) pre_Antipyretics1: (2/140)
(15/21) pre_Antipyretics2: (0/142)
(16/21) pre_Antipyretics3: (0/142)
(17/21) pre_Paralytics: (0/142)
(18/21) pre_Paralytics1: (8/134)
(19/21) pre_Paralytics2: (0/142)
(20/21) pre_Paralytics3: (0/142)


## 01-11. Pre-intervention - Imaging

In [62]:
imaging_pre_intervention_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Imaging')
all_imaging_pre_intervention_columns = id_columns + imaging_pre_intervention_columns
imaging_pre_intervention_columns

['pre_HeadSonogram',
 'pre_HeadSonogramDate',
 'pre_HeadSonogramTime',
 'pre_HeadSonogramResult1',
 'pre_HeadSonogramResult2',
 'pre_HeadSonogramResult3',
 'pre_HeadSonogramResult4',
 'pre_HeadSonogramResult5',
 'pre_HeadSonogramResult6',
 'pre_HeadSonogramResult7',
 'pre_HeadSonogramResult8',
 'pre_HeadSonogramResultText',
 'pre_HeadCT',
 'pre_HeadCTDate',
 'pre_HeadCTTime',
 'pre_HeadCTResult1',
 'pre_HeadCTResult2',
 'pre_HeadCTResult3',
 'pre_HeadCTResult4',
 'pre_HeadCTResult5',
 'pre_HeadCTResult6',
 'pre_HeadCTResult7',
 'pre_HeadCTResult8',
 'pre_HeadCTResultText',
 'pre_BrainMRI',
 'pre_BrainMRIDate',
 'pre_BrainMRITime',
 'pre_BrainMRIResult1',
 'pre_BrainMRIResult2',
 'pre_BrainMRIResult3',
 'pre_BrainMRIResult4',
 'pre_BrainMRIResult5',
 'pre_BrainMRIResult6',
 'pre_BrainMRIResult7',
 'pre_BrainMRIResult8',
 'pre_BrainMRIResultText']

In [63]:
df_imaging = df_dict['lh09.csv'].copy()

imagingTimeSlot_int = df_imaging['imagingTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = imagingTimeSlot_int == 1
df_imaging_pre_intervention = df_imaging[is_pre]
pre_rename_map = {
    'headSonogram': 'pre_HeadSonogram',
    'headSonogramDate': 'pre_HeadSonogramDate',
    'headSonogramTime': 'pre_HeadSonogramTime',
    'headSonogramResult1': 'pre_HeadSonogramResult1',
    'headSonogramResult2': 'pre_HeadSonogramResult2',
    'headSonogramResult3': 'pre_HeadSonogramResult3',
    'headSonogramResult4': 'pre_HeadSonogramResult4',
    'headSonogramResult5': 'pre_HeadSonogramResult5',
    'headSonogramResult6': 'pre_HeadSonogramResult6',
    'headSonogramResult7': 'pre_HeadSonogramResult7',
    'headSonogramResult8': 'pre_HeadSonogramResult8',
    'headSonogramResultText': 'pre_HeadSonogramResultText',
    'headCT': 'pre_HeadCT',
    'headCTDate': 'pre_HeadCTDate',
    'headCTTime': 'pre_HeadCTTime',
    'headCTResult1': 'pre_HeadCTResult1',
    'headCTResult2': 'pre_HeadCTResult2',
    'headCTResult3': 'pre_HeadCTResult3',
    'headCTResult4': 'pre_HeadCTResult4',
    'headCTResult5': 'pre_HeadCTResult5',
    'headCTResult6': 'pre_HeadCTResult6',
    'headCTResult7': 'pre_HeadCTResult7',
    'headCTResult8': 'pre_HeadCTResult8',
    'headCTResultText': 'pre_HeadCTResultText',
    'brainMRI': 'pre_BrainMRI',
    'brainMRIDate': 'pre_BrainMRIDate',
    'brainMRITime': 'pre_BrainMRITime',
    'brainMRIResult1': 'pre_BrainMRIResult1',
    'brainMRIResult2': 'pre_BrainMRIResult2',
    'brainMRIResult3': 'pre_BrainMRIResult3',
    'brainMRIResult4': 'pre_BrainMRIResult4',
    'brainMRIResult5': 'pre_BrainMRIResult5',
    'brainMRIResult6': 'pre_BrainMRIResult6',
    'brainMRIResult7': 'pre_BrainMRIResult7',
    'brainMRIResult8': 'pre_BrainMRIResult8',
    'brainMRIResultText': 'pre_BrainMRIResultText'
}
df_imaging_pre_intervention = df_imaging_pre_intervention.rename(columns=pre_rename_map)

df_imaging_pre_intervention = COMBINE_harmonizer.valid_columns(df_imaging_pre_intervention, all_imaging_pre_intervention_columns, debug_df=True, debug_columns=True)
df_imaging_pre_intervention = COMBINE_harmonizer.postprocess(df_imaging_pre_intervention)

out_filename = os.sep.join([out_dir, '01-11-pre-imaging.csv'])
df_imaging_pre_intervention.to_csv(out_filename, index=False)

(7/42) REC_CMP not in columns
(9/42) imagingTimeSlot not in columns
(40/42) CMP_DATE not in columns
(41/42) CRT_DATE not in columns


### 01-11-2. check empty cells

In [64]:
COMBINE_harmonizer.check_empty(df_imaging_pre_intervention)

(0/39) column: center (168 / 0)
(1/39) column: subjectID (168 / 0)
(2/39) column: uniqueID (168 / 0)
(3/39) column: pre_HeadSonogram (168 / 0)
(4/39) column: pre_HeadSonogramResultText (4 / 164)
(5/39) column: pre_HeadCT (168 / 0)
(6/39) column: pre_HeadCTResultText (4 / 164)
(7/39) column: pre_BrainMRI (168 / 0)
(8/39) column: pre_BrainMRIResultText (2 / 166)
(9/39) column: pre_HeadSonogramDate (30 / 138)
(10/39) column: pre_HeadSonogramTime (27 / 141)
(11/39) column: pre_HeadSonogramResult1 (29 / 139)
(12/39) column: pre_HeadSonogramResult2 (5 / 163)
(13/39) column: pre_HeadSonogramResult3 (1 / 167)
(14/39) column: pre_HeadSonogramResult4 (1 / 167)
(15/39) column: pre_HeadSonogramResult5 (0 / 168)
(16/39) column: pre_HeadSonogramResult6 (0 / 168)
(17/39) column: pre_HeadSonogramResult7 (0 / 168)
(18/39) column: pre_HeadSonogramResult8 (0 / 168)
(19/39) column: pre_HeadCTDate (14 / 154)
(20/39) column: pre_HeadCTTime (12 / 156)
(21/39) column: pre_HeadCTResult1 (14 / 154)
(22/39) colu

In [65]:
COMBINE_harmonizer.column_info(df_imaging_pre_intervention)

(0/39) center: (168/0)
(1/39) subjectID: (168/0)
(2/39) uniqueID: (168/0)
(3/39) pre_HeadSonogram: (168/0)
(4/39) pre_HeadSonogramResultText: (4/164)
(5/39) pre_HeadCT: (168/0)
(6/39) pre_HeadCTResultText: (4/164)
(7/39) pre_BrainMRI: (168/0)
(8/39) pre_BrainMRIResultText: (2/166)
(9/39) pre_HeadSonogramDate: (30/138)
(10/39) pre_HeadSonogramTime: (27/141)
(11/39) pre_HeadSonogramResult1: (29/139)
(12/39) pre_HeadSonogramResult2: (5/163)
(13/39) pre_HeadSonogramResult3: (1/167)
(14/39) pre_HeadSonogramResult4: (1/167)
(15/39) pre_HeadSonogramResult5: (0/168)
(16/39) pre_HeadSonogramResult6: (0/168)
(17/39) pre_HeadSonogramResult7: (0/168)
(18/39) pre_HeadSonogramResult8: (0/168)
(19/39) pre_HeadCTDate: (14/154)
(20/39) pre_HeadCTTime: (12/156)
(21/39) pre_HeadCTResult1: (14/154)
(22/39) pre_HeadCTResult2: (7/161)
(23/39) pre_HeadCTResult3: (4/164)
(24/39) pre_HeadCTResult4: (1/167)
(25/39) pre_HeadCTResult5: (0/168)
(26/39) pre_HeadCTResult6: (0/168)
(27/39) pre_HeadCTResult7: (0/168)


## 02-01. Temperature

In [66]:
df_temperature = df_dict['lh06tr.csv'].copy()

temperatureTimeSlot_int = df_temperature['temperatureTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre_intervention_min = temperatureTimeSlot_int == 0
is_pre_intervention_max = temperatureTimeSlot_int == 1
is_clinical_course = (is_pre_intervention_min == False) & (is_pre_intervention_max == False)

# after_baseline
df_temperature = df_temperature[is_clinical_course]


In [67]:
temperatures_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Temperature')
all_temperatures_columns = id_columns + temperatures_columns
temperatures_columns

['temperatureTimeSlot',
 'temperatureTimeSlotNoForm',
 'temperatureDate',
 'temperatureTime',
 'skinTemperature_C',
 'axillaryTemperature_C',
 'esophagealTemperature_C',
 'blanketTemperature_C',
 'servoSetTemperature_C',
 'alterationSkinIntegrity',
 'shiver']

In [68]:
df_temperature = COMBINE_harmonizer.valid_columns(df_temperature, all_temperatures_columns, debug_df=True, debug_columns=True)
df_temperature = COMBINE_harmonizer.postprocess(df_temperature)

# XXX hack for center14 LH039 temperatureInterval == 90
is_center14_LH039_temperatureInterval90 = (df_temperature['center'] == '14') & (df_temperature['subjectID'] == 'LH039') & (df_temperature['temperatureTimeSlot'].isin(['90', '90.0']))
df_temperature.loc[is_center14_LH039_temperatureInterval90, 'temperatureDate'] = '2009-09-07'

out_filename = os.sep.join([out_dir, '02-01-temperature.csv'])
df_temperature.to_csv(out_filename, index=False)


(1/13) REC_CMP not in columns
(11/13) CMP_DATE not in columns
(12/13) CRT_DATE not in columns
(3/13) temperatureTimeSlotNoForm not in df
(11/13) alterationSkinIntegrity not in df
(12/13) shiver not in df


### 02-01-1. check temperature

In [69]:
df_temperature_groupby = df_temperature.groupby(['uniqueID', 'temperatureTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_temperature_groupby['_count'] > 1
df_temperature_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,temperatureTimeSlot,Unnamed: 2_level_1


### 02-01-2. check empty cells

In [70]:
COMBINE_harmonizer.check_empty(df_temperature)

(0/11) column: center (6164 / 0)
(1/11) column: subjectID (6164 / 0)
(2/11) column: uniqueID (6164 / 0)
(3/11) column: temperatureTimeSlot (6164 / 0)
(4/11) column: temperatureDate (6137 / 27)
(5/11) column: temperatureTime (6127 / 37)
(6/11) column: skinTemperature_C (5683 / 481)
(7/11) column: axillaryTemperature_C (4569 / 1595)
(8/11) column: esophagealTemperature_C (5814 / 350)
(9/11) column: blanketTemperature_C (3497 / 2667)
(10/11) column: servoSetTemperature_C (4959 / 1205)


In [71]:
COMBINE_harmonizer.column_info(df_temperature)

(0/11) center: (6164/0)
(1/11) subjectID: (6164/0)
(2/11) uniqueID: (6164/0)
(3/11) temperatureTimeSlot: (6164/0)
(4/11) temperatureDate: (6137/27)
(5/11) temperatureTime: (6127/37)
(6/11) skinTemperature_C: (5683/481)
(7/11) axillaryTemperature_C: (4569/1595)
(8/11) esophagealTemperature_C: (5814/350)
(9/11) blanketTemperature_C: (3497/2667)
(10/11) servoSetTemperature_C: (4959/1205)


## 02-02. cardio

In [72]:
cardio_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Cardiovascular')
all_cardio_columns = id_columns + cardio_columns
cardio_columns

['cardioTimeSlot',
 'cardioDate',
 'cardioTime',
 'cardioSystolicBloodPressure_mmHg',
 'cardioDiastolicBloodPressure_mmHg',
 'cardioHeartRate_BPM',
 'cardioVolumeExpand',
 'cardioInotropicAgent',
 'cardioBloodTransfusion',
 'cardioPlatelets']

In [73]:
df_cardio = df_dict['lh06cv.csv'].copy()

cardioTimeSlot_int = df_cardio['cardioTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = cardioTimeSlot_int == 0
is_clinical_course = is_pre == False
# after_baseline
df_cardio = df_cardio[is_clinical_course]

df_cardio = COMBINE_harmonizer.valid_columns(df_cardio, all_cardio_columns, debug_df=True, debug_columns=True)
df_cardio = COMBINE_harmonizer.postprocess(df_cardio)

out_filename = os.sep.join([out_dir, '02-02-cardiovascular.csv'])
df_cardio.to_csv(out_filename, index=False)

(5/15) REC_CMP not in columns
(13/15) CMP_DATE not in columns
(14/15) CRT_DATE not in columns


### 02-02-1. check cardio

In [74]:
df_cardio_groupby = df_cardio.groupby(['uniqueID', 'cardioTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_cardio_groupby['_count'] > 1
df_cardio_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,cardioTimeSlot,Unnamed: 2_level_1


### 02-02-2. check empty cells

In [75]:
COMBINE_harmonizer.check_empty(df_cardio)

(0/13) column: center (4056 / 0)
(1/13) column: subjectID (4056 / 0)
(2/13) column: uniqueID (4056 / 0)
(3/13) column: cardioVolumeExpand (4056 / 0)
(4/13) column: cardioInotropicAgent (4056 / 0)
(5/13) column: cardioBloodTransfusion (4056 / 0)
(6/13) column: cardioPlatelets (4056 / 0)
(7/13) column: cardioTimeSlot (4056 / 0)
(8/13) column: cardioDate (4053 / 3)
(9/13) column: cardioTime (4053 / 3)
(10/13) column: cardioSystolicBloodPressure_mmHg (3657 / 399)
(11/13) column: cardioDiastolicBloodPressure_mmHg (3656 / 400)
(12/13) column: cardioHeartRate_BPM (4025 / 31)


In [76]:
COMBINE_harmonizer.column_info(df_cardio)

(0/13) center: (4056/0)
(1/13) subjectID: (4056/0)
(2/13) uniqueID: (4056/0)
(3/13) cardioVolumeExpand: (4056/0)
(4/13) cardioInotropicAgent: (4056/0)
(5/13) cardioBloodTransfusion: (4056/0)
(6/13) cardioPlatelets: (4056/0)
(7/13) cardioTimeSlot: (4056/0)
(8/13) cardioDate: (4053/3)
(9/13) cardioTime: (4053/3)
(10/13) cardioSystolicBloodPressure_mmHg: (3657/399)
(11/13) cardioDiastolicBloodPressure_mmHg: (3656/400)
(12/13) cardioHeartRate_BPM: (4025/31)


## 02-03. respiratory

In [77]:
respiratory_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Respiratory')
all_respiratory_columns = id_columns + respiratory_columns
respiratory_columns

['respiratoryTimeSlot',
 'respiratoryDate',
 'respiratoryTime',
 'respiratorySupportType',
 'respiratoryFiO2',
 'respiratoryRate_Hz',
 'respiratoryPIP_cmH2O',
 'respiratoryMAP_cmH2O',
 'respiratoryPEEP_cmH2O']

In [78]:
df_respiratory = df_dict['lh06rs.csv'].copy()
df_respiratory = COMBINE_harmonizer.valid_columns(df_respiratory, all_respiratory_columns, debug_df=True, debug_columns=True)
df_respiratory = COMBINE_harmonizer.postprocess(df_respiratory)

out_filename = os.sep.join([out_dir, '02-03-respiratory.csv'])
df_respiratory.to_csv(out_filename, index=False)

(1/14) REC_CMP not in columns
(12/14) CMP_DATE not in columns
(13/14) CRT_DATE not in columns


### 02-03-1. check respiratory

In [79]:
df_respiratory_groupby = df_respiratory.groupby(['uniqueID', 'respiratoryTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_respiratory_groupby['_count'] > 1
df_respiratory_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,respiratoryTimeSlot,Unnamed: 2_level_1


### 02-03-2. check empty cells

In [80]:
COMBINE_harmonizer.check_empty(df_respiratory)

(0/12) column: center (815 / 0)
(1/12) column: subjectID (815 / 0)
(2/12) column: uniqueID (815 / 0)
(3/12) column: respiratoryTimeSlot (815 / 0)
(4/12) column: respiratoryDate (814 / 1)
(5/12) column: respiratoryTime (814 / 1)
(6/12) column: respiratorySupportType (813 / 2)
(7/12) column: respiratoryFiO2 (412 / 403)
(8/12) column: respiratoryRate_Hz (275 / 540)
(9/12) column: respiratoryPIP_cmH2O (269 / 546)
(10/12) column: respiratoryMAP_cmH2O (262 / 553)
(11/12) column: respiratoryPEEP_cmH2O (227 / 588)


In [81]:
COMBINE_harmonizer.column_info(df_respiratory)

(0/12) center: (815/0)
(1/12) subjectID: (815/0)
(2/12) uniqueID: (815/0)
(3/12) respiratoryTimeSlot: (815/0)
(4/12) respiratoryDate: (814/1)
(5/12) respiratoryTime: (814/1)
(6/12) respiratorySupportType: (813/2)
(7/12) respiratoryFiO2: (412/403)
(8/12) respiratoryRate_Hz: (275/540)
(9/12) respiratoryPIP_cmH2O: (269/546)
(10/12) respiratoryMAP_cmH2O: (262/553)
(11/12) respiratoryPEEP_cmH2O: (227/588)


## 02-04. blood-gas

In [82]:
blood_gas_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Blood Gas')
all_blood_gas_columns = id_columns + blood_gas_columns
blood_gas_columns

['bloodGasTimeSlot',
 'bloodGasDate',
 'bloodGasTime',
 'bloodGasSrc',
 'bloodGasPH',
 'bloodGasPCO2_mmHg',
 'bloodGasPO2_mmHg',
 'bloodGasHCO3_mEqPerL',
 'bloodGasBaseDeficit_mEqPerL',
 'bloodGasPHCorrect',
 'bloodGasPCO2Correct_mmHg',
 'bloodGasPO2Correct_mmHg',
 'bloodGasHCO3Correct_mEqPerL',
 'bloodGasBaseDeficitCorrect_mEqPerL']

In [83]:
df_blood_gas = df_dict['lh06bg.csv']
df_blood_gas = COMBINE_harmonizer.valid_columns(df_blood_gas, all_blood_gas_columns, debug_df=True, debug_columns=True)
df_blood_gas = COMBINE_harmonizer.postprocess(df_blood_gas)

out_filename = os.sep.join([out_dir, '02-04-blood-gas.csv'])
df_blood_gas.to_csv(out_filename, index=False)

(1/13) REC_CMP not in columns
(11/13) CMP_DATE not in columns
(12/13) CRT_DATE not in columns
(5/16) bloodGasSrc not in df
(11/16) bloodGasPHCorrect not in df
(12/16) bloodGasPCO2Correct_mmHg not in df
(13/16) bloodGasPO2Correct_mmHg not in df
(14/16) bloodGasHCO3Correct_mEqPerL not in df
(15/16) bloodGasBaseDeficitCorrect_mEqPerL not in df


### 02-04-1. check blood-gas

In [84]:
df_blood_gas_groupby = df_blood_gas.groupby(['uniqueID', 'bloodGasTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_blood_gas_groupby['_count'] > 1
df_blood_gas_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,bloodGasTimeSlot,Unnamed: 2_level_1


### 02-04-2. check empty cells

In [85]:
COMBINE_harmonizer.check_empty(df_blood_gas)

(0/11) column: center (1195 / 0)
(1/11) column: subjectID (1195 / 0)
(2/11) column: uniqueID (1195 / 0)
(3/11) column: bloodGasTimeSlot (1195 / 0)
(4/11) column: bloodGasDate (943 / 252)
(5/11) column: bloodGasTime (891 / 304)
(6/11) column: bloodGasPH (788 / 407)
(7/11) column: bloodGasPCO2_mmHg (789 / 406)
(8/11) column: bloodGasPO2_mmHg (784 / 411)
(9/11) column: bloodGasHCO3_mEqPerL (775 / 420)
(10/11) column: bloodGasBaseDeficit_mEqPerL (770 / 425)


In [86]:
COMBINE_harmonizer.column_info(df_blood_gas)

(0/11) center: (1195/0)
(1/11) subjectID: (1195/0)
(2/11) uniqueID: (1195/0)
(3/11) bloodGasTimeSlot: (1195/0)
(4/11) bloodGasDate: (943/252)
(5/11) bloodGasTime: (891/304)
(6/11) bloodGasPH: (788/407)
(7/11) bloodGasPCO2_mmHg: (789/406)
(8/11) bloodGasPO2_mmHg: (784/411)
(9/11) bloodGasHCO3_mEqPerL: (775/420)
(10/11) bloodGasBaseDeficit_mEqPerL: (770/425)


## 02-05. hematology

In [87]:
hematology_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Hematology CBC')
all_hematology_columns = id_columns + hematology_columns
hematology_columns

['hematology',
 'hematologyTimeSlot',
 'hematologyDate',
 'hematologyTime',
 'hematologyWBC',
 'hematologyHemoglobin',
 'hematologyPolymorphNeutrophilsDifferentialCount',
 'hematologyMonocytes',
 'hematologyLymphocytes',
 'hematologyPlateletCount',
 'hematologyPT_s',
 'hematologyPTT_s',
 'hematologyHematocritMin',
 'hematologyHematocritMinDate',
 'hematologyPlateletCountMin',
 'hematologyPlateletCountMinDate']

In [88]:
df_hematology = df_dict['lh06he.csv'].copy()

df_hematology = COMBINE_harmonizer.valid_columns(df_hematology, all_hematology_columns, debug_df=True, debug_columns=True)
df_hematology = COMBINE_harmonizer.postprocess(df_hematology)

out_filename = os.sep.join([out_dir, '02-05-hematology.csv'])
df_hematology.to_csv(out_filename, index=False)

(1/16) REC_CMP not in columns
(14/16) CMP_DATE not in columns
(15/16) CRT_DATE not in columns
(2/18) hematology not in df
(14/18) hematologyHematocritMin not in df
(15/18) hematologyHematocritMinDate not in df
(16/18) hematologyPlateletCountMin not in df
(17/18) hematologyPlateletCountMinDate not in df


### 02-05-1. check hematology

In [89]:
df_hematology_groupby = df_hematology.groupby(['uniqueID', 'hematologyTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_hematology_groupby['_count'] > 1
df_hematology_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,hematologyTimeSlot,Unnamed: 2_level_1


### 02-05-2. check empty cells

In [90]:
COMBINE_harmonizer.check_empty(df_hematology)

(0/14) column: center (763 / 0)
(1/14) column: subjectID (763 / 0)
(2/14) column: uniqueID (763 / 0)
(3/14) column: hematologyTimeSlot (763 / 0)
(4/14) column: hematologyDate (625 / 138)
(5/14) column: hematologyTime (564 / 199)
(6/14) column: hematologyWBC (454 / 309)
(7/14) column: hematologyHemoglobin (475 / 288)
(8/14) column: hematologyPolymorphNeutrophilsDifferentialCount (400 / 363)
(9/14) column: hematologyMonocytes (399 / 364)
(10/14) column: hematologyLymphocytes (405 / 358)
(11/14) column: hematologyPlateletCount (469 / 294)
(12/14) column: hematologyPT_s (190 / 573)
(13/14) column: hematologyPTT_s (179 / 584)


In [91]:
COMBINE_harmonizer.column_info(df_hematology)

(0/14) center: (763/0)
(1/14) subjectID: (763/0)
(2/14) uniqueID: (763/0)
(3/14) hematologyTimeSlot: (763/0)
(4/14) hematologyDate: (625/138)
(5/14) hematologyTime: (564/199)
(6/14) hematologyWBC: (454/309)
(7/14) hematologyHemoglobin: (475/288)
(8/14) hematologyPolymorphNeutrophilsDifferentialCount: (400/363)
(9/14) hematologyMonocytes: (399/364)
(10/14) hematologyLymphocytes: (405/358)
(11/14) hematologyPlateletCount: (469/294)
(12/14) hematologyPT_s: (190/573)
(13/14) hematologyPTT_s: (179/584)


### 02-05-3. Hematology summary

In [92]:
df_main_hematology = COMBINE_harmonizer.valid_columns(df_main, all_hematology_columns)
df_main_hematology = COMBINE_harmonizer.postprocess(df_main_hematology)

out_filename = os.sep.join([out_dir, '02-05_s-hematology.csv'])
df_main_hematology.to_csv(out_filename, index=False)

(2/18) hematology not in df
(3/18) hematologyTimeSlot not in df
(4/18) hematologyDate not in df
(5/18) hematologyTime not in df
(6/18) hematologyWBC not in df
(7/18) hematologyHemoglobin not in df
(8/18) hematologyPolymorphNeutrophilsDifferentialCount not in df
(9/18) hematologyMonocytes not in df
(10/18) hematologyLymphocytes not in df
(11/18) hematologyPlateletCount not in df
(12/18) hematologyPT_s not in df
(13/18) hematologyPTT_s not in df
(15/18) hematologyHematocritMinDate not in df
(17/18) hematologyPlateletCountMinDate not in df


### 02-05-4. check empty cells

In [93]:
COMBINE_harmonizer.check_empty(df_main_hematology)

(0/5) column: center (168 / 0)
(1/5) column: subjectID (168 / 0)
(2/5) column: uniqueID (168 / 0)
(3/5) column: hematologyHematocritMin (163 / 5)
(4/5) column: hematologyPlateletCountMin (161 / 7)


In [94]:
COMBINE_harmonizer.column_info(df_main_hematology)

(0/5) center: (168/0)
(1/5) subjectID: (168/0)
(2/5) uniqueID: (168/0)
(3/5) hematologyHematocritMin: (163/5)
(4/5) hematologyPlateletCountMin: (161/7)


## 02-06. Blood Value

In [95]:
blood_value_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Blood Value')
all_blood_value_columns = id_columns + blood_value_columns
blood_value_columns

['bloodValueBunBaseline_mgPerdL',
 'bloodValueBunBaseline_mgPerdLDate',
 'bloodValueCreatinineBaseline_mgPerdL',
 'bloodValueCreatinineBaseline_mgPerdLDate',
 'bloodValueASTSGOTBaseline_UPerL',
 'bloodValueASTSGOTBaseline_UPerLDate',
 'bloodValueALTSGPTBaseline_UPerL',
 'bloodValueALTSGPTBaseline_UPerLDate',
 'bloodValueTotalBilirubinBaseline_mgPerdL',
 'bloodValueTotalBilirubinBaseline_mgPerdLDate',
 'bloodValuePHMin',
 'bloodValuePHMinDate',
 'bloodValueHCO3Min_mEqPerL',
 'bloodValueHCO3Min_mEqPerLDate',
 'bloodValueSerumNaMin_mEqPerL',
 'bloodValueSerumNaMin_mEqPerLDate',
 'bloodValueSerumKMin_mEqPerL',
 'bloodValueSerumKMin_mEqPerLDate',
 'bloodValueClMin_mEqPerL',
 'bloodValueClMin_mEqPerLDate',
 'bloodValueGlucoseMin_mgPerdL',
 'bloodValueGlucoseMin_mgPerdLDate',
 'bloodValueTotalCaMin_mgPerdL',
 'bloodValueTotalCaMin_mgPerdLDate',
 'bloodValueIonCaMin_mgPerdL',
 'bloodValueIonCaMin_mgPerdLDate',
 'bloodValueASTSGOTMin_UPerL',
 'bloodValueASTSGOTMin_UPerLDate',
 'bloodValueALTSGP

In [96]:
df_main_blood_value = COMBINE_harmonizer.valid_columns(df_main, all_blood_value_columns)
df_main_blood_value = COMBINE_harmonizer.postprocess(df_main_blood_value)

out_filename = os.sep.join([out_dir, '02-06_s-blood-value.csv'])
df_main_blood_value.to_csv(out_filename, index=False)


(2/60) bloodValueBunBaseline_mgPerdL not in df
(3/60) bloodValueBunBaseline_mgPerdLDate not in df
(4/60) bloodValueCreatinineBaseline_mgPerdL not in df
(5/60) bloodValueCreatinineBaseline_mgPerdLDate not in df
(6/60) bloodValueASTSGOTBaseline_UPerL not in df
(7/60) bloodValueASTSGOTBaseline_UPerLDate not in df
(8/60) bloodValueALTSGPTBaseline_UPerL not in df
(9/60) bloodValueALTSGPTBaseline_UPerLDate not in df
(10/60) bloodValueTotalBilirubinBaseline_mgPerdL not in df
(11/60) bloodValueTotalBilirubinBaseline_mgPerdLDate not in df
(28/60) bloodValueASTSGOTMin_UPerL not in df
(29/60) bloodValueASTSGOTMin_UPerLDate not in df
(30/60) bloodValueALTSGPTMin_UPerL not in df
(31/60) bloodValueALTSGPTMin_UPerLDate not in df
(32/60) bloodValueTotalBilirubinMin_mgPerdL not in df
(33/60) bloodValueTotalBilirubinMin_mgPerdLDate not in df
(50/60) bloodValueTotalCaMax_mgPerdL not in df
(51/60) bloodValueTotalCaMax_mgPerdLDate not in df
(52/60) bloodValueIonCaMax_mgPerdL not in df
(53/60) bloodValueIon

### 02-06-2. check empty cells

In [97]:
COMBINE_harmonizer.check_empty(df_main_blood_value)

(0/39) column: center (168 / 0)
(1/39) column: subjectID (168 / 0)
(2/39) column: uniqueID (168 / 0)
(3/39) column: bloodValuePHMin (156 / 12)
(4/39) column: bloodValuePHMinDate (156 / 12)
(5/39) column: bloodValuePHMax (155 / 13)
(6/39) column: bloodValuePHMaxDate (155 / 13)
(7/39) column: bloodValueHCO3Min_mEqPerL (155 / 13)
(8/39) column: bloodValueHCO3Min_mEqPerLDate (155 / 13)
(9/39) column: bloodValueBaseDeficitMax_mEqPerL (154 / 14)
(10/39) column: bloodValueBaseDeficitMax_mEqPerLDate (154 / 14)
(11/39) column: bloodValueSerumNaMin_mEqPerL (168 / 0)
(12/39) column: bloodValueSerumNaMin_mEqPerLDate (168 / 0)
(13/39) column: bloodValueSerumNaMax_mEqPerL (168 / 0)
(14/39) column: bloodValueSerumNaMax_mEqPerLDate (168 / 0)
(15/39) column: bloodValueSerumKMin_mEqPerL (168 / 0)
(16/39) column: bloodValueSerumKMin_mEqPerLDate (168 / 0)
(17/39) column: bloodValueSerumKMax_mEqPerL (166 / 2)
(18/39) column: bloodValueSerumKMax_mEqPerLDate (167 / 1)
(19/39) column: bloodValueClMin_mEqPerL 

In [98]:
COMBINE_harmonizer.column_info(df_main_blood_value)

(0/39) center: (168/0)
(1/39) subjectID: (168/0)
(2/39) uniqueID: (168/0)
(3/39) bloodValuePHMin: (156/12)
(4/39) bloodValuePHMinDate: (156/12)
(5/39) bloodValuePHMax: (155/13)
(6/39) bloodValuePHMaxDate: (155/13)
(7/39) bloodValueHCO3Min_mEqPerL: (155/13)
(8/39) bloodValueHCO3Min_mEqPerLDate: (155/13)
(9/39) bloodValueBaseDeficitMax_mEqPerL: (154/14)
(10/39) bloodValueBaseDeficitMax_mEqPerLDate: (154/14)
(11/39) bloodValueSerumNaMin_mEqPerL: (168/0)
(12/39) bloodValueSerumNaMin_mEqPerLDate: (168/0)
(13/39) bloodValueSerumNaMax_mEqPerL: (168/0)
(14/39) bloodValueSerumNaMax_mEqPerLDate: (168/0)
(15/39) bloodValueSerumKMin_mEqPerL: (168/0)
(16/39) bloodValueSerumKMin_mEqPerLDate: (168/0)
(17/39) bloodValueSerumKMax_mEqPerL: (166/2)
(18/39) bloodValueSerumKMax_mEqPerLDate: (167/1)
(19/39) bloodValueClMin_mEqPerL: (166/2)
(20/39) bloodValueClMin_mEqPerLDate: (166/2)
(21/39) bloodValueClMax_mEqPerL: (165/3)
(22/39) bloodValueClMax_mEqPerLDate: (165/3)
(23/39) bloodValueBunMax_mgPerdL: (164/

## 02-07. infection

In [99]:
infection_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Infection')
all_infection_columns = id_columns + infection_columns
infection_columns

['positiveCultureNumber',
 'positiveCulture',
 'positiveCultureSrc',
 'positiveCultureDate',
 'positiveCultureTime',
 'positiveCultureOrganismCode1',
 'positiveCultureOrganismCode2',
 'positiveCultureOrganismCode3',
 'antibiotics',
 'antibioticsCode1',
 'antibioticsCode2',
 'antibioticsCode3',
 'rewarmingAntibiotics',
 'rewarmingAntibioticsCode1',
 'rewarmingAntibioticsCode2',
 'rewarmingAntibioticsCode3']

In [100]:
df_infection = df_dict['lh06.csv'].copy()
df_infection = COMBINE_harmonizer.valid_columns(df_infection, all_infection_columns)
df_infection = COMBINE_harmonizer.postprocess(df_infection)

### XXX add positive culture number as 1
df_infection['positiveCultureNumber'] = 1

out_filename = os.sep.join([out_dir, '02-07-infection.csv'])
df_infection.to_csv(out_filename, index=False)

(2/18) positiveCultureNumber not in df
(14/18) rewarmingAntibiotics not in df
(15/18) rewarmingAntibioticsCode1 not in df
(16/18) rewarmingAntibioticsCode2 not in df
(17/18) rewarmingAntibioticsCode3 not in df


### 02-07-2. check empty cells

In [101]:
COMBINE_harmonizer.check_empty(df_infection)

(0/15) column: center (168 / 0)
(1/15) column: subjectID (168 / 0)
(2/15) column: uniqueID (168 / 0)
(3/15) column: positiveCulture (163 / 5)
(4/15) column: antibiotics (157 / 11)
(5/15) column: positiveCultureSrc (2 / 166)
(6/15) column: positiveCultureDate (2 / 166)
(7/15) column: positiveCultureTime (2 / 166)
(8/15) column: positiveCultureOrganismCode1 (2 / 166)
(9/15) column: positiveCultureOrganismCode2 (0 / 168)
(10/15) column: positiveCultureOrganismCode3 (0 / 168)
(11/15) column: antibioticsCode1 (57 / 111)
(12/15) column: antibioticsCode2 (30 / 138)
(13/15) column: antibioticsCode3 (6 / 162)
(14/15) column: positiveCultureNumber (168 / 0)


In [102]:
COMBINE_harmonizer.column_info(df_infection)

(0/15) center: (168/0)
(1/15) subjectID: (168/0)
(2/15) uniqueID: (168/0)
(3/15) positiveCulture: (163/5)
(4/15) antibiotics: (157/11)
(5/15) positiveCultureSrc: (2/166)
(6/15) positiveCultureDate: (2/166)
(7/15) positiveCultureTime: (2/166)
(8/15) positiveCultureOrganismCode1: (2/166)
(9/15) positiveCultureOrganismCode2: (0/168)
(10/15) positiveCultureOrganismCode3: (0/168)
(11/15) antibioticsCode1: (57/111)
(12/15) antibioticsCode2: (30/138)
(13/15) antibioticsCode3: (6/162)
(14/15) positiveCultureNumber: (168/0)


## 02-08. other-med

In [103]:
other_med_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Other Medication')
all_other_med_columns = id_columns + other_med_columns
other_med_columns

['otherMedTimeSlot',
 'otherMedTargetDate',
 'otherMedTargetTime',
 'anticonvulsants',
 'anticonvulsants1',
 'anticonvulsants2',
 'anticonvulsants3',
 'analgesics',
 'analgesicsSedatives1',
 'analgesicsSedatives2',
 'analgesicsSedatives3',
 'antipyretics',
 'antipyretics1',
 'antipyretics2',
 'antipyretics3',
 'paralytics',
 'paralytics1',
 'paralytics2',
 'paralytics3',
 'otherMedFluidIntake_ccPerKg',
 'otherMedUrineOutput_ccPerKg']

In [104]:
df_other_med = df_dict['lh06om.csv'].copy()

otherMedTimeSlot_int = df_other_med['otherMedTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = otherMedTimeSlot_int == 0
is_clinical_course = is_pre == False
# after_baseline
df_other_med = df_other_med[is_clinical_course]

df_other_med = COMBINE_harmonizer.valid_columns(df_other_med, all_other_med_columns, debug_df=True, debug_columns=True)
df_other_med = COMBINE_harmonizer.postprocess(df_other_med)

out_filename = os.sep.join([out_dir, '02-08-other-med.csv'])
df_other_med.to_csv(out_filename, index=False)

(1/26) REC_CMP not in columns
(24/26) CMP_DATE not in columns
(25/26) CRT_DATE not in columns


### 02-08-1. check other-med

In [105]:
df_other_med_groupby = df_other_med.groupby(['uniqueID', 'otherMedTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_other_med_groupby['_count'] > 1
df_other_med_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,otherMedTimeSlot,Unnamed: 2_level_1


### 02-08-2. check empty cells

In [106]:
COMBINE_harmonizer.check_empty(df_other_med)

(0/24) column: center (778 / 0)
(1/24) column: subjectID (778 / 0)
(2/24) column: uniqueID (778 / 0)
(3/24) column: otherMedTimeSlot (778 / 0)
(4/24) column: otherMedTargetDate (25 / 753)
(5/24) column: otherMedTargetTime (25 / 753)
(6/24) column: anticonvulsants (5 / 773)
(7/24) column: anticonvulsants1 (421 / 357)
(8/24) column: anticonvulsants2 (68 / 710)
(9/24) column: anticonvulsants3 (10 / 768)
(10/24) column: analgesics (6 / 772)
(11/24) column: analgesicsSedatives1 (231 / 547)
(12/24) column: analgesicsSedatives2 (76 / 702)
(13/24) column: analgesicsSedatives3 (11 / 767)
(14/24) column: antipyretics (0 / 778)
(15/24) column: antipyretics1 (7 / 771)
(16/24) column: antipyretics2 (0 / 778)
(17/24) column: antipyretics3 (0 / 778)
(18/24) column: paralytics (0 / 778)
(19/24) column: paralytics1 (30 / 748)
(20/24) column: paralytics2 (0 / 778)
(21/24) column: paralytics3 (0 / 778)
(22/24) column: otherMedFluidIntake_ccPerKg (630 / 148)
(23/24) column: otherMedUrineOutput_ccPerKg (63

In [107]:
COMBINE_harmonizer.column_info(df_other_med)

(0/24) center: (778/0)
(1/24) subjectID: (778/0)
(2/24) uniqueID: (778/0)
(3/24) otherMedTimeSlot: (778/0)
(4/24) otherMedTargetDate: (25/753)
(5/24) otherMedTargetTime: (25/753)
(6/24) anticonvulsants: (5/773)
(7/24) anticonvulsants1: (421/357)
(8/24) anticonvulsants2: (68/710)
(9/24) anticonvulsants3: (10/768)
(10/24) analgesics: (6/772)
(11/24) analgesicsSedatives1: (231/547)
(12/24) analgesicsSedatives2: (76/702)
(13/24) analgesicsSedatives3: (11/767)
(14/24) antipyretics: (0/778)
(15/24) antipyretics1: (7/771)
(16/24) antipyretics2: (0/778)
(17/24) antipyretics3: (0/778)
(18/24) paralytics: (0/778)
(19/24) paralytics1: (30/748)
(20/24) paralytics2: (0/778)
(21/24) paralytics3: (0/778)
(22/24) otherMedFluidIntake_ccPerKg: (630/148)
(23/24) otherMedUrineOutput_ccPerKg: (630/148)


## 02-09. Imaging

In [108]:
imaging_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Imaging')
all_imaging_columns = id_columns + imaging_columns
imaging_columns

['imagingTimeSlot',
 'headSonogram',
 'headSonogramDate',
 'headSonogramTime',
 'headSonogramResult1',
 'headSonogramResult2',
 'headSonogramResult3',
 'headSonogramResult4',
 'headSonogramResult5',
 'headSonogramResult6',
 'headSonogramResult7',
 'headSonogramResult8',
 'headSonogramResultText',
 'headCT',
 'headCTDate',
 'headCTTime',
 'headCTResult1',
 'headCTResult2',
 'headCTResult3',
 'headCTResult4',
 'headCTResult5',
 'headCTResult6',
 'headCTResult7',
 'headCTResult8',
 'headCTResultText',
 'brainMRI',
 'brainMRIDate',
 'brainMRITime',
 'brainMRIResult1',
 'brainMRIResult2',
 'brainMRIResult3',
 'brainMRIResult4',
 'brainMRIResult5',
 'brainMRIResult6',
 'brainMRIResult7',
 'brainMRIResult8',
 'brainMRIResultText']

In [109]:
df_imaging = df_dict['lh09.csv'].copy()

imagingTimeSlot_int = df_imaging['imagingTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = imagingTimeSlot_int == 1
is_post = imagingTimeSlot_int == 3
is_clinical_course = (is_pre == False) & (is_post == False)
df_imaging = df_imaging[is_clinical_course]

df_imaging = COMBINE_harmonizer.valid_columns(df_imaging, all_imaging_columns, debug_df=True, debug_columns=True)
df_imaging = COMBINE_harmonizer.postprocess(df_imaging)

out_filename = os.sep.join([out_dir, '02-09-imaging.csv'])
df_imaging.to_csv(out_filename, index=False)

(7/42) REC_CMP not in columns
(40/42) CMP_DATE not in columns
(41/42) CRT_DATE not in columns


### 02-09-1. check imaging

In [110]:
df_imaging_groupby = df_imaging.groupby(['uniqueID', 'imagingTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_imaging_groupby['_count'] > 1
df_imaging_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,imagingTimeSlot,Unnamed: 2_level_1


### 02-09-2. check empty cells

In [111]:
COMBINE_harmonizer.check_empty(df_imaging)

(0/40) column: center (168 / 0)
(1/40) column: subjectID (168 / 0)
(2/40) column: uniqueID (168 / 0)
(3/40) column: headSonogram (168 / 0)
(4/40) column: headSonogramResultText (9 / 159)
(5/40) column: headCT (168 / 0)
(6/40) column: headCTResultText (1 / 167)
(7/40) column: brainMRI (168 / 0)
(8/40) column: brainMRIResultText (10 / 158)
(9/40) column: imagingTimeSlot (168 / 0)
(10/40) column: headSonogramDate (79 / 89)
(11/40) column: headSonogramTime (79 / 89)
(12/40) column: headSonogramResult1 (79 / 89)
(13/40) column: headSonogramResult2 (11 / 157)
(14/40) column: headSonogramResult3 (3 / 165)
(15/40) column: headSonogramResult4 (1 / 167)
(16/40) column: headSonogramResult5 (0 / 168)
(17/40) column: headSonogramResult6 (0 / 168)
(18/40) column: headSonogramResult7 (0 / 168)
(19/40) column: headSonogramResult8 (0 / 168)
(20/40) column: headCTDate (5 / 163)
(21/40) column: headCTTime (5 / 163)
(22/40) column: headCTResult1 (5 / 163)
(23/40) column: headCTResult2 (3 / 165)
(24/40) co

In [112]:
COMBINE_harmonizer.column_info(df_imaging)

(0/40) center: (168/0)
(1/40) subjectID: (168/0)
(2/40) uniqueID: (168/0)
(3/40) headSonogram: (168/0)
(4/40) headSonogramResultText: (9/159)
(5/40) headCT: (168/0)
(6/40) headCTResultText: (1/167)
(7/40) brainMRI: (168/0)
(8/40) brainMRIResultText: (10/158)
(9/40) imagingTimeSlot: (168/0)
(10/40) headSonogramDate: (79/89)
(11/40) headSonogramTime: (79/89)
(12/40) headSonogramResult1: (79/89)
(13/40) headSonogramResult2: (11/157)
(14/40) headSonogramResult3: (3/165)
(15/40) headSonogramResult4: (1/167)
(16/40) headSonogramResult5: (0/168)
(17/40) headSonogramResult6: (0/168)
(18/40) headSonogramResult7: (0/168)
(19/40) headSonogramResult8: (0/168)
(20/40) headCTDate: (5/163)
(21/40) headCTTime: (5/163)
(22/40) headCTResult1: (5/163)
(23/40) headCTResult2: (3/165)
(24/40) headCTResult3: (1/167)
(25/40) headCTResult4: (0/168)
(26/40) headCTResult5: (0/168)
(27/40) headCTResult6: (0/168)
(28/40) headCTResult7: (0/168)
(29/40) headCTResult8: (0/168)
(30/40) brainMRIDate: (21/147)
(31/40) b

## 03-01. Post-intervention Temperature

In [113]:
temperature_post_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'Temperature')
all_temperature_post_columns = id_columns + temperature_post_columns
temperature_post_columns

['post_TemperatureTimeSlot',
 'post_TemperatureDate',
 'post_TemperatureTime',
 'post_SkinTemperature_C',
 'post_AxillaryTemperature_C',
 'post_AlterationSkinIntegrity',
 'post_Shiver',
 'normothermiaAtEndIntervention',
 'normothermiaDate',
 'normothermiaTime',
 'normothermiaAxillaryTemperature_C',
 'noNormothermiaReason',
 'coolAfterIntervention',
 'coolAfterInterventionText']

In [114]:
df_temperature_post = df_dict['lh06ht.csv'].copy()
df_temperature_post = COMBINE_harmonizer.valid_columns(df_temperature_post, all_temperature_post_columns, debug_df=True, debug_columns=True)
df_temperature_post = COMBINE_harmonizer.postprocess(df_temperature_post)

out_filename = os.sep.join([out_dir, '03-01-post-temperature.csv'])
df_temperature_post.to_csv(out_filename, index=False)

(1/10) REC_CMP not in columns
(8/10) CMP_DATE not in columns
(9/10) CRT_DATE not in columns
(7/16) post_AlterationSkinIntegrity not in df
(8/16) post_Shiver not in df
(9/16) normothermiaAtEndIntervention not in df
(10/16) normothermiaDate not in df
(11/16) normothermiaTime not in df
(12/16) normothermiaAxillaryTemperature_C not in df
(13/16) noNormothermiaReason not in df
(14/16) coolAfterIntervention not in df
(15/16) coolAfterInterventionText not in df


### 03-01-1. check post-temperature

In [115]:
df_temperature_post_groupby = df_temperature_post.groupby(['uniqueID', 'post_TemperatureTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_temperature_post_groupby['_count'] > 1
df_temperature_post_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,post_TemperatureTimeSlot,Unnamed: 2_level_1


### 03-01-2. check empty cells

In [116]:
COMBINE_harmonizer.check_empty(df_temperature_post)

(0/8) column: center (450 / 0)
(1/8) column: subjectID (450 / 0)
(2/8) column: uniqueID (450 / 0)
(3/8) column: post_TemperatureTimeSlot (450 / 0)
(4/8) column: post_TemperatureDate (437 / 13)
(5/8) column: post_TemperatureTime (432 / 18)
(6/8) column: post_SkinTemperature_C (229 / 221)
(7/8) column: post_AxillaryTemperature_C (417 / 33)


In [117]:
COMBINE_harmonizer.column_info(df_temperature_post)

(0/8) center: (450/0)
(1/8) subjectID: (450/0)
(2/8) uniqueID: (450/0)
(3/8) post_TemperatureTimeSlot: (450/0)
(4/8) post_TemperatureDate: (437/13)
(5/8) post_TemperatureTime: (432/18)
(6/8) post_SkinTemperature_C: (229/221)
(7/8) post_AxillaryTemperature_C: (417/33)


## 03-02. Post-intervention Blood Value

## 03-03. Post-intervention Imaging

In [118]:
imaging_post_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'Imaging')
all_imaging_post_columns = id_columns + imaging_post_columns
imaging_post_columns

['post_HeadSonogram',
 'post_HeadSonogramDate',
 'post_HeadSonogramTime',
 'post_HeadSonogramResult1',
 'post_HeadSonogramResult2',
 'post_HeadSonogramResult3',
 'post_HeadSonogramResult4',
 'post_HeadSonogramResult5',
 'post_HeadSonogramResult6',
 'post_HeadSonogramResult7',
 'post_HeadSonogramResult8',
 'post_HeadSonogramResultText',
 'post_HeadCT',
 'post_HeadCTDate',
 'post_HeadCTTime',
 'post_HeadCTResult1',
 'post_HeadCTResult2',
 'post_HeadCTResult3',
 'post_HeadCTResult4',
 'post_HeadCTResult5',
 'post_HeadCTResult6',
 'post_HeadCTResult7',
 'post_HeadCTResult8',
 'post_HeadCTResultText',
 'post_BrainMRI',
 'post_BrainMRIDate',
 'post_BrainMRITime',
 'post_BrainMRIResult1',
 'post_BrainMRIResult2',
 'post_BrainMRIResult3',
 'post_BrainMRIResult4',
 'post_BrainMRIResult5',
 'post_BrainMRIResult6',
 'post_BrainMRIResult7',
 'post_BrainMRIResult8',
 'post_BrainMRIResultText']

In [119]:
df_imaging = df_dict['lh09.csv'].copy()

imagingTimeSlot_int = df_imaging['imagingTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_post = imagingTimeSlot_int == 3
df_imaging_post = df_imaging[is_post]
post_rename_map = {
    'headSonogram': 'post_HeadSonogram',
    'headSonogramDate': 'post_HeadSonogramDate',
    'headSonogramTime': 'post_HeadSonogramTime',
    'headSonogramResult1': 'post_HeadSonogramResult1',
    'headSonogramResult2': 'post_HeadSonogramResult2',
    'headSonogramResult3': 'post_HeadSonogramResult3',
    'headSonogramResult4': 'post_HeadSonogramResult4',
    'headSonogramResult5': 'post_HeadSonogramResult5',
    'headSonogramResult6': 'post_HeadSonogramResult6',
    'headSonogramResult7': 'post_HeadSonogramResult7',
    'headSonogramResult8': 'post_HeadSonogramResult8',
    'headSonogramResultText': 'post_HeadSonogramResultText',
    'headCT': 'post_HeadCT',
    'headCTDate': 'post_HeadCTDate',
    'headCTTime': 'post_HeadCTTime',
    'headCTResult1': 'post_HeadCTResult1',
    'headCTResult2': 'post_HeadCTResult2',
    'headCTResult3': 'post_HeadCTResult3',
    'headCTResult4': 'post_HeadCTResult4',
    'headCTResult5': 'post_HeadCTResult5',
    'headCTResult6': 'post_HeadCTResult6',
    'headCTResult7': 'post_HeadCTResult7',
    'headCTResult8': 'post_HeadCTResult8',
    'headCTResultText': 'post_HeadCTResultText',
    'brainMRI': 'post_BrainMRI',
    'brainMRIDate': 'post_BrainMRIDate',
    'brainMRITime': 'post_BrainMRITime',
    'brainMRIResult1': 'post_BrainMRIResult1',
    'brainMRIResult2': 'post_BrainMRIResult2',
    'brainMRIResult3': 'post_BrainMRIResult3',
    'brainMRIResult4': 'post_BrainMRIResult4',
    'brainMRIResult5': 'post_BrainMRIResult5',
    'brainMRIResult6': 'post_BrainMRIResult6',
    'brainMRIResult7': 'post_BrainMRIResult7',
    'brainMRIResult8': 'post_BrainMRIResult8',
    'brainMRIResultText': 'post_BrainMRIResultText'
}
df_imaging_post = df_imaging_post.rename(columns=post_rename_map)

df_imaging_post = COMBINE_harmonizer.valid_columns(df_imaging_post, all_imaging_post_columns, debug_df=True, debug_columns=True)
df_imaging_post = COMBINE_harmonizer.postprocess(df_imaging_post)

out_filename = os.sep.join([out_dir, '03-03-post-imaging.csv'])
df_imaging_post.to_csv(out_filename, index=False)

(7/42) REC_CMP not in columns
(9/42) imagingTimeSlot not in columns
(40/42) CMP_DATE not in columns
(41/42) CRT_DATE not in columns


### 03-03-2. check empty cells

In [120]:
COMBINE_harmonizer.check_empty(df_imaging_post)

(0/39) column: center (168 / 0)
(1/39) column: subjectID (168 / 0)
(2/39) column: uniqueID (168 / 0)
(3/39) column: post_HeadSonogram (168 / 0)
(4/39) column: post_HeadSonogramResultText (1 / 167)
(5/39) column: post_HeadCT (168 / 0)
(6/39) column: post_HeadCTResultText (0 / 168)
(7/39) column: post_BrainMRI (168 / 0)
(8/39) column: post_BrainMRIResultText (37 / 131)
(9/39) column: post_HeadSonogramDate (14 / 154)
(10/39) column: post_HeadSonogramTime (14 / 154)
(11/39) column: post_HeadSonogramResult1 (14 / 154)
(12/39) column: post_HeadSonogramResult2 (3 / 165)
(13/39) column: post_HeadSonogramResult3 (1 / 167)
(14/39) column: post_HeadSonogramResult4 (0 / 168)
(15/39) column: post_HeadSonogramResult5 (0 / 168)
(16/39) column: post_HeadSonogramResult6 (0 / 168)
(17/39) column: post_HeadSonogramResult7 (0 / 168)
(18/39) column: post_HeadSonogramResult8 (0 / 168)
(19/39) column: post_HeadCTDate (2 / 166)
(20/39) column: post_HeadCTTime (2 / 166)
(21/39) column: post_HeadCTResult1 (2 / 

In [121]:
COMBINE_harmonizer.column_info(df_imaging_post)

(0/39) center: (168/0)
(1/39) subjectID: (168/0)
(2/39) uniqueID: (168/0)
(3/39) post_HeadSonogram: (168/0)
(4/39) post_HeadSonogramResultText: (1/167)
(5/39) post_HeadCT: (168/0)
(6/39) post_HeadCTResultText: (0/168)
(7/39) post_BrainMRI: (168/0)
(8/39) post_BrainMRIResultText: (37/131)
(9/39) post_HeadSonogramDate: (14/154)
(10/39) post_HeadSonogramTime: (14/154)
(11/39) post_HeadSonogramResult1: (14/154)
(12/39) post_HeadSonogramResult2: (3/165)
(13/39) post_HeadSonogramResult3: (1/167)
(14/39) post_HeadSonogramResult4: (0/168)
(15/39) post_HeadSonogramResult5: (0/168)
(16/39) post_HeadSonogramResult6: (0/168)
(17/39) post_HeadSonogramResult7: (0/168)
(18/39) post_HeadSonogramResult8: (0/168)
(19/39) post_HeadCTDate: (2/166)
(20/39) post_HeadCTTime: (2/166)
(21/39) post_HeadCTResult1: (2/166)
(22/39) post_HeadCTResult2: (0/168)
(23/39) post_HeadCTResult3: (0/168)
(24/39) post_HeadCTResult4: (0/168)
(25/39) post_HeadCTResult5: (0/168)
(26/39) post_HeadCTResult6: (0/168)
(27/39) post_

## 03-04. Post-intervention Neuro Exam

In [122]:
neuro_exam_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'Neuro Exam')
all_neuro_exam_columns = id_columns + neuro_exam_columns
neuro_exam_columns

['post_NeuroExamSectionID',
 'post_NeuroExam',
 'post_NeuroExamDate',
 'post_NeuroExamTime',
 'post_NeuroExamLevelConsciousness',
 'post_NeuroExamSpontaneousActivity',
 'post_NeuroExamPosture',
 'post_NeuroExamTone',
 'post_NeuroExamSuck',
 'post_NeuroExamMoro',
 'post_NeuroExamPupils',
 'post_NeuroExamHeartRate',
 'post_NeuroExamRespiration',
 'post_NeuroExamSeizure',
 'post_NeuroExamSedate',
 'post_NeuroExamClonusSustained',
 'post_NeuroExamFistedHand',
 'post_NeuroExamAbnormalMovement',
 'post_NeuroExamGagReflexAbsent',
 'post_NeuroExamHypertonia',
 'post_NeuroExamAsymTonicNeckReflex']

In [123]:
df_neuro_exam = df_dict['lh11.csv'].copy()
df_neuro_exam = COMBINE_harmonizer.valid_columns(df_neuro_exam, all_neuro_exam_columns, debug_df=True, debug_columns=True)
df_neuro_exam = COMBINE_harmonizer.postprocess(df_neuro_exam)

out_filename = os.sep.join([out_dir, '03-04-post-neuro-exam.csv'])
df_neuro_exam.to_csv(out_filename, index=False)

(8/42) LH11NA_1 not in columns
(9/42) dischargeNeuroExamSeizure not in columns
(10/42) dischargeNeuroExamHypertonia not in columns
(11/42) dischargeNeuroExamClonusSustained not in columns
(12/42) dischargeNeuroExamFistedHand not in columns
(13/42) dischargeNeuroExamAbnormalMovement not in columns
(14/42) dischargeNeuroExamGagReflexAbsent not in columns
(15/42) LH11NA_2 not in columns
(16/42) REC_CMP not in columns
(29/42) dischargeNeuroExamDate not in columns
(30/42) dischargeNeuroExamTime not in columns
(31/42) dischargeNeuroExamLevelConsciousness not in columns
(32/42) dischargeNeuroExamSpontaneousActivity not in columns
(33/42) dischargeNeuroExamPosture not in columns
(34/42) dischargeNeuroExamTone not in columns
(35/42) dischargeNeuroExamSuck not in columns
(36/42) dischargeNeuroExamMoro not in columns
(37/42) dischargeNeuroExamPupils not in columns
(38/42) dischargeNeuroExamHeartRate not in columns
(39/42) dischargeNeuroExamRespiration not in columns
(40/42) CMP_DATE not in column

### 03-04-1 check neuro exam

In [124]:
df_neuro_exam_groupby = df_neuro_exam.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_neuro_exam_groupby['_count'] > 1
df_neuro_exam_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 03-04-2. check empty cells

In [125]:
COMBINE_harmonizer.check_empty(df_neuro_exam)

(0/21) column: center (163 / 0)
(1/21) column: subjectID (163 / 0)
(2/21) column: uniqueID (163 / 0)
(3/21) column: post_NeuroExamSeizure (163 / 0)
(4/21) column: post_NeuroExamSedate (163 / 0)
(5/21) column: post_NeuroExamHypertonia (163 / 0)
(6/21) column: post_NeuroExamClonusSustained (163 / 0)
(7/21) column: post_NeuroExamFistedHand (163 / 0)
(8/21) column: post_NeuroExamAbnormalMovement (163 / 0)
(9/21) column: post_NeuroExamGagReflexAbsent (163 / 0)
(10/21) column: post_NeuroExamDate (152 / 11)
(11/21) column: post_NeuroExamTime (150 / 13)
(12/21) column: post_NeuroExamLevelConsciousness (152 / 11)
(13/21) column: post_NeuroExamSpontaneousActivity (152 / 11)
(14/21) column: post_NeuroExamPosture (152 / 11)
(15/21) column: post_NeuroExamTone (124 / 39)
(16/21) column: post_NeuroExamSuck (149 / 14)
(17/21) column: post_NeuroExamMoro (147 / 16)
(18/21) column: post_NeuroExamPupils (151 / 12)
(19/21) column: post_NeuroExamHeartRate (152 / 11)
(20/21) column: post_NeuroExamRespiration

In [126]:
COMBINE_harmonizer.column_info(df_neuro_exam)

(0/21) center: (163/0)
(1/21) subjectID: (163/0)
(2/21) uniqueID: (163/0)
(3/21) post_NeuroExamSeizure: (163/0)
(4/21) post_NeuroExamSedate: (163/0)
(5/21) post_NeuroExamHypertonia: (163/0)
(6/21) post_NeuroExamClonusSustained: (163/0)
(7/21) post_NeuroExamFistedHand: (163/0)
(8/21) post_NeuroExamAbnormalMovement: (163/0)
(9/21) post_NeuroExamGagReflexAbsent: (163/0)
(10/21) post_NeuroExamDate: (152/11)
(11/21) post_NeuroExamTime: (150/13)
(12/21) post_NeuroExamLevelConsciousness: (152/11)
(13/21) post_NeuroExamSpontaneousActivity: (152/11)
(14/21) post_NeuroExamPosture: (152/11)
(15/21) post_NeuroExamTone: (124/39)
(16/21) post_NeuroExamSuck: (149/14)
(17/21) post_NeuroExamMoro: (147/16)
(18/21) post_NeuroExamPupils: (151/12)
(19/21) post_NeuroExamHeartRate: (152/11)
(20/21) post_NeuroExamRespiration: (152/11)


## 03-05. MRI

In [127]:
df_mri = df_dict[_MRI_FILENAME].copy()
# split subjectID and subjectID-postfix
df_mri['subjectID_with_postfix'] = df_mri['subjectID'].copy()
df_mri['_subjectID_with_postfix_list'] = df_mri['subjectID'].apply(lambda x: x.split('-'))
df_mri['subjectID'] = df_mri['_subjectID_with_postfix_list'].apply(lambda x: x[0])
df_mri['subjectID_postfix'] = df_mri['_subjectID_with_postfix_list'].apply(lambda x: '' if len(x) == 1 else x[1])
del df_mri['_subjectID_with_postfix_list']

## XXX hack for removing duplicated record
center_int = df_mri['center'].apply(COMBINE_harmonizer.to_int)
is_invalid_mri = (center_int == 26) & (df_mri['subjectID'] == 'LH076') & (df_mri['MRIReadDate'] == '2017-02-28')
is_valid_mri = is_invalid_mri == False
df_mri = df_mri[is_valid_mri]


In [128]:
mri3_columns = ['subjectID_with_postfix', 'subjectID_postfix']

mri_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'MRI')
all_mri_columns = list(dict.fromkeys(id_columns + mri_columns + mri3_columns))
all_mri_columns

['center',
 'subjectID',
 'MRIAvailable',
 'MRIAvailable_c',
 'MRIObtain',
 'MRIObtainWindow',
 'MRIObtainWindow_c',
 'MRIDate',
 'MRITime',
 'MRIObtainComment',
 'MRISendRTIDate',
 'MRIReceiveRTIDate',
 'MRINoObtainReason',
 'MRINoObtainReason_c',
 'MRINoObtainReasonText',
 'MRIRead',
 'MRIScore',
 'MRIIteration',
 'MRIIncrement',
 'MRIID',
 'MRIReader',
 'MRIReadDate',
 'MRIStrength_T',
 'MRIStrength_c',
 'MRIAdequateQuality',
 'MRIAdequateQuality_c',
 'MRIT1Axial',
 'MRIT1Axial_c',
 'MRIT1Coronal',
 'MRIT1Coronal_c',
 'MRIT1Sagittal',
 'MRIT1Sagittal_c',
 'MRIT1',
 'MRIT2Axial',
 'MRIT2Axial_c',
 'MRIT2Coronal',
 'MRIT2Coronal_c',
 'MRIT2Sagittal',
 'MRIT2Sagittal_c',
 'MRIT2',
 'MRIT2FLAIRAxial',
 'MRIT2FLAIRAxial_c',
 'MRIT2FLAIRCoronal',
 'MRIT2FLAIRCoronal_c',
 'MRIT2FLAIRSagittal',
 'MRIT2FLAIRSagittal_c',
 'MRIT2FLAIR',
 'MRIGRESWIAxial',
 'MRIGRESWIAxial_c',
 'MRIGRESWICoronal',
 'MRIGRESWICoronal_c',
 'MRIGRESWISagittal',
 'MRIGRESWISagittal_c',
 'MRIGRESWI',
 'MRISPGRAxial'

In [129]:
df_mri = COMBINE_harmonizer.valid_columns(df_mri, all_mri_columns, debug_df=True, debug_columns=True)
df_mri = COMBINE_harmonizer.postprocess(df_mri)
df_mri['MRI_ID'] = df_mri.apply(lambda x: f"{x['subjectID']}_Site{x['center']}", axis=1)

out_filename = os.sep.join([out_dir, '03-05-mri.csv'])
df_mri.to_csv(out_filename, index=False)

(2/96) LM3READ not in columns
(52/96) siteID not in columns
(90/96) FormStatus not in columns
(91/96) DateComplete not in columns
(92/96) DateCreated not in columns
(93/96) KeyedUser not in columns
(2/141) MRIAvailable not in df
(3/141) MRIAvailable_c not in df
(4/141) MRIObtain not in df
(5/141) MRIObtainWindow not in df
(6/141) MRIObtainWindow_c not in df
(8/141) MRITime not in df
(9/141) MRIObtainComment not in df
(10/141) MRISendRTIDate not in df
(11/141) MRIReceiveRTIDate not in df
(12/141) MRINoObtainReason not in df
(13/141) MRINoObtainReason_c not in df
(14/141) MRINoObtainReasonText not in df
(15/141) MRIRead not in df
(16/141) MRIScore not in df
(17/141) MRIIteration not in df
(18/141) MRIIncrement not in df
(32/141) MRIT1 not in df
(39/141) MRIT2 not in df
(46/141) MRIT2FLAIR not in df
(53/141) MRIGRESWI not in df
(60/141) MRISPGR not in df
(65/141) MRIMRS not in df
(66/141) MRIMRS_c not in df
(75/141) MRILesionNumber not in df
(76/141) MRILesionHemisphere not in df
(77/141)

### 03-05-1. check MRI

In [130]:
df_mri_groupby = df_mri.groupby(['uniqueID', 'MRIReader']).agg(_count=('uniqueID', 'count'))

is_invalid = df_mri_groupby['_count'] > 1
df_mri_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,MRIReader,Unnamed: 2_level_1


### 03-05-2. check empty

In [131]:
COMBINE_harmonizer.check_empty(df_mri)

(0/92) column: center (244 / 0)
(1/92) column: subjectID (244 / 0)
(2/92) column: uniqueID (244 / 0)
(3/92) column: MRIReader (244 / 0)
(4/92) column: MRIReadDate (244 / 0)
(5/92) column: MRIDate (244 / 0)
(6/92) column: MRIStrength_T (244 / 0)
(7/92) column: MRIStrength_c (114 / 130)
(8/92) column: MRIAdequateQuality (239 / 5)
(9/92) column: MRIAdequateQuality_c (244 / 0)
(10/92) column: MRIT1Axial (244 / 0)
(11/92) column: MRIT1Axial_c (172 / 72)
(12/92) column: MRIT1Coronal (244 / 0)
(13/92) column: MRIT1Coronal_c (47 / 197)
(14/92) column: MRIT1Sagittal (244 / 0)
(15/92) column: MRIT1Sagittal_c (227 / 17)
(16/92) column: MRIT2Axial (244 / 0)
(17/92) column: MRIT2Axial_c (223 / 21)
(18/92) column: MRIT2Coronal (244 / 0)
(19/92) column: MRIT2Coronal_c (129 / 115)
(20/92) column: MRIT2Sagittal (244 / 0)
(21/92) column: MRIT2Sagittal_c (39 / 205)
(22/92) column: MRIT2FLAIRAxial (244 / 0)
(23/92) column: MRIT2FLAIRAxial_c (167 / 77)
(24/92) column: MRIT2FLAIRCoronal (244 / 0)
(25/92) co

In [132]:
COMBINE_harmonizer.column_info(df_mri)

(0/92) center: (244/0)
(1/92) subjectID: (244/0)
(2/92) uniqueID: (244/0)
(3/92) MRIReader: (244/0)
(4/92) MRIReadDate: (244/0)
(5/92) MRIDate: (244/0)
(6/92) MRIStrength_T: (244/0)
(7/92) MRIStrength_c: (114/130)
(8/92) MRIAdequateQuality: (239/5)
(9/92) MRIAdequateQuality_c: (244/0)
(10/92) MRIT1Axial: (244/0)
(11/92) MRIT1Axial_c: (172/72)
(12/92) MRIT1Coronal: (244/0)
(13/92) MRIT1Coronal_c: (47/197)
(14/92) MRIT1Sagittal: (244/0)
(15/92) MRIT1Sagittal_c: (227/17)
(16/92) MRIT2Axial: (244/0)
(17/92) MRIT2Axial_c: (223/21)
(18/92) MRIT2Coronal: (244/0)
(19/92) MRIT2Coronal_c: (129/115)
(20/92) MRIT2Sagittal: (244/0)
(21/92) MRIT2Sagittal_c: (39/205)
(22/92) MRIT2FLAIRAxial: (244/0)
(23/92) MRIT2FLAIRAxial_c: (167/77)
(24/92) MRIT2FLAIRCoronal: (244/0)
(25/92) MRIT2FLAIRCoronal_c: (22/222)
(26/92) MRIT2FLAIRSagittal: (244/0)
(27/92) MRIT2FLAIRSagittal_c: (3/241)
(28/92) MRIGRESWIAxial: (244/0)
(29/92) MRIGRESWIAxial_c: (169/75)
(30/92) MRIGRESWICoronal: (244/0)
(31/92) MRIGRESWICoron

### 03-05-3. MRI summary

In [133]:
df_mri_s = None
for idx, each_filename in enumerate(_MRI_FILENAMES_MERGE):
    each_df = df_dict[each_filename]
    columns = list(each_df.columns)
    each_filename_prefix = re.sub(r'\.csv$', '', each_filename)
    print(f'({idx}/{len(_MRI_FILENAMES_MERGE)}) filename: {each_filename} columns: {columns}')

    if df_mri_s is None:
        df_mri_s = each_df
    else:
        df_mri_s = df_mri_s.merge(each_df, on=['center', 'subjectID'], how='outer', suffixes=['', ':' + each_filename_prefix])

# postprocess
df_mri_s = COMBINE_harmonizer.valid_columns(df_mri_s, all_mri_columns, debug_df=True, debug_columns=True)
df_mri_s = COMBINE_harmonizer.postprocess(df_mri_s)

out_filename = os.sep.join([out_dir, '03-05_s-mri.csv'])
df_mri_s.to_csv(out_filename, index=False)

(0/2) filename: lhmr01.csv columns: ['subjectID', 'MRIAvailable', 'MRIObtain', 'MRINoObtainReason', 'LM1INIT', 'REC_CMP', 'center', 'MRIDate', 'MRITime', 'MRISendRTIDate', 'CMP_DATE', 'CRT_DATE']
(1/2) filename: lhmr02.csv columns: ['subjectID', 'MRIRead', 'MRIScore', 'LM2INIT', 'REC_CMP', 'center', 'MRIDate', 'MRITime', 'CMP_DATE', 'CRT_DATE']


(4/20) LM1INIT not in columns
(5/20) REC_CMP not in columns
(10/20) CMP_DATE not in columns
(11/20) CRT_DATE not in columns
(14/20) LM2INIT not in columns
(15/20) REC_CMP:lhmr02 not in columns
(16/20) MRIDate:lhmr02 not in columns
(17/20) MRITime:lhmr02 not in columns
(18/20) CMP_DATE:lhmr02 not in columns
(19/20) CRT_DATE:lhmr02 not in columns
(3/141) MRIAvailable_c not in df
(5/141) MRIObtainWindow not in df
(6/141) MRIObtainWindow_c not in df
(9/141) MRIObtainComment not in df
(11/141) MRIReceiveRTIDate not in df
(13/141) MRINoObtainReason_c not in df
(14/141) MRINoObtainReasonText not in df
(17/141) MRIIteration not in df
(18/141) MRIIncrement not in df
(19/141) MRIID not in df
(20/141) MRIReader not in df
(21/141) MRIReadDate not in df
(22/141) MRIStrength_T not in df
(23/141) MRIStrength_c not in df
(24/141) MRIAdequateQuality not in df
(25/141) MRIAdequateQuality_c not in df
(26/141) MRIT1Axial not in df
(27/141) MRIT1Axial_c not in df
(28/141) MRIT1Coronal not in df
(29/141) MR

### 03-05-4. check MRI Summary

In [134]:
df_mri_s_groupby = df_mri_s.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_mri_s_groupby['_count'] > 1
df_mri_s_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 03-05-5. check empty cells

In [135]:
COMBINE_harmonizer.check_empty(df_mri_s)

(0/11) column: center (150 / 0)
(1/11) column: subjectID (150 / 0)
(2/11) column: uniqueID (150 / 0)
(3/11) column: MRIAvailable (150 / 0)
(4/11) column: MRIObtain (148 / 2)
(5/11) column: MRINoObtainReason (1 / 149)
(6/11) column: MRIDate (147 / 3)
(7/11) column: MRITime (147 / 3)
(8/11) column: MRISendRTIDate (146 / 4)
(9/11) column: MRIRead (147 / 3)
(10/11) column: MRIScore (147 / 3)


In [136]:
COMBINE_harmonizer.column_info(df_mri_s)

(0/11) center: (150/0)
(1/11) subjectID: (150/0)
(2/11) uniqueID: (150/0)
(3/11) MRIAvailable: (150/0)
(4/11) MRIObtain: (148/2)
(5/11) MRINoObtainReason: (1/149)
(6/11) MRIDate: (147/3)
(7/11) MRITime: (147/3)
(8/11) MRISendRTIDate: (146/4)
(9/11) MRIRead: (147/3)
(10/11) MRIScore: (147/3)


## 02-11. elevated temperature

In [137]:
elevated_temperature_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Elevated Temperature')
all_elevated_temperature_columns = id_columns + elevated_temperature_columns
elevated_temperature_columns

['elevatedTempNumber',
 'elevatedTempMin',
 'elevatedTempDate',
 'elevatedTempTime',
 'elevatedTempSkinTemperature_C',
 'elevatedTempAxillaryTemperature_C',
 'elevatedTempEsophagealTemperature_C',
 'elevatedTempServoSet_C',
 'elevatedTempDevice',
 'elevatedTempDeviceMode',
 'elevatedTempAirTemperature_C',
 'elevatedTempBath',
 'elevatedTempNoBathReason',
 'elevatedTempBlanketrol']

In [138]:
df_elevated_temperature = df_dict['lh06a.csv'].copy()
df_elevated_temperature = COMBINE_harmonizer.valid_columns(df_elevated_temperature, all_elevated_temperature_columns, debug_df=True, debug_columns=True)
df_elevated_temperature = COMBINE_harmonizer.postprocess(df_elevated_temperature)

out_filename = os.sep.join([out_dir, '02-11-elevated-temperature.csv'])
df_elevated_temperature.to_csv(out_filename, index=False)

(3/19) REC_CMP not in columns
(17/19) CMP_DATE not in columns
(18/19) CRT_DATE not in columns


### 02-11-1. check elevated temperature

In [139]:
df_elevated_temperature_groupby = df_elevated_temperature.groupby(['uniqueID', 'elevatedTempNumber', 'elevatedTempMin']).agg(_count=('uniqueID', 'count'))

is_invalid = df_elevated_temperature_groupby['_count'] > 1
df_elevated_temperature_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,_count
uniqueID,elevatedTempNumber,elevatedTempMin,Unnamed: 3_level_1


### 02-11-2. check empty cells

In [140]:
COMBINE_harmonizer.check_empty(df_elevated_temperature)

(0/17) column: center (445 / 0)
(1/17) column: subjectID (445 / 0)
(2/17) column: uniqueID (445 / 0)
(3/17) column: elevatedTempBath (423 / 22)
(4/17) column: elevatedTempBlanketrol (423 / 22)
(5/17) column: elevatedTempNumber (445 / 0)
(6/17) column: elevatedTempMin (445 / 0)
(7/17) column: elevatedTempDate (407 / 38)
(8/17) column: elevatedTempTime (396 / 49)
(9/17) column: elevatedTempSkinTemperature_C (265 / 180)
(10/17) column: elevatedTempAxillaryTemperature_C (259 / 186)
(11/17) column: elevatedTempEsophagealTemperature_C (299 / 146)
(12/17) column: elevatedTempServoSet_C (225 / 220)
(13/17) column: elevatedTempDevice (334 / 111)
(14/17) column: elevatedTempDeviceMode (334 / 111)
(15/17) column: elevatedTempAirTemperature_C (0 / 445)
(16/17) column: elevatedTempNoBathReason (302 / 143)


In [141]:
COMBINE_harmonizer.column_info(df_elevated_temperature)

(0/17) center: (445/0)
(1/17) subjectID: (445/0)
(2/17) uniqueID: (445/0)
(3/17) elevatedTempBath: (423/22)
(4/17) elevatedTempBlanketrol: (423/22)
(5/17) elevatedTempNumber: (445/0)
(6/17) elevatedTempMin: (445/0)
(7/17) elevatedTempDate: (407/38)
(8/17) elevatedTempTime: (396/49)
(9/17) elevatedTempSkinTemperature_C: (265/180)
(10/17) elevatedTempAxillaryTemperature_C: (259/186)
(11/17) elevatedTempEsophagealTemperature_C: (299/146)
(12/17) elevatedTempServoSet_C: (225/220)
(13/17) elevatedTempDevice: (334/111)
(14/17) elevatedTempDeviceMode: (334/111)
(15/17) elevatedTempAirTemperature_C: (0/445)
(16/17) elevatedTempNoBathReason: (302/143)


## 02-12. fluctuated temperature

In [142]:
fluctuated_temperature_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Fluctuated Temperature')
all_elevated_temperature_columns = id_columns + fluctuated_temperature_columns
fluctuated_temperature_columns

['fluctuateTempNumber',
 'fluctuateTempMin',
 'fluctuateTempDate',
 'fluctuateTempTime',
 'fluctuateTempSkinTemperature_C',
 'fluctuateTempAxillaryTemperature_C',
 'fluctuateTempEsophagealTemperature_C',
 'fluctuateTempBlanketrol_C',
 'fluctuateTempServoSet_C']

In [143]:
df_fluctuated_temperature = df_dict['lh06f.csv'].copy()
df_fluctuated_temperature = COMBINE_harmonizer.valid_columns(df_fluctuated_temperature, all_elevated_temperature_columns, debug_df=True, debug_columns=True)
df_fluctuated_temperature = COMBINE_harmonizer.postprocess(df_fluctuated_temperature)

out_filename = os.sep.join([out_dir, '02-12-fluctuated-temperature.csv'])
df_fluctuated_temperature.to_csv(out_filename, index=False)

### 02-12-1. check fluctuated temperature

In [144]:
df_fluctuated_temperature_groupby = df_fluctuated_temperature.groupby(['uniqueID', 'fluctuateTempNumber', 'fluctuateTempMin']).agg(_count=('uniqueID', 'count'))

is_invalid = df_fluctuated_temperature_groupby['_count'] > 1
df_fluctuated_temperature_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,_count
uniqueID,fluctuateTempNumber,fluctuateTempMin,Unnamed: 3_level_1


### 02-12-2. check empty cells

In [145]:
COMBINE_harmonizer.check_empty(df_fluctuated_temperature)

(0/12) column: center (54 / 0)
(1/12) column: subjectID (54 / 0)
(2/12) column: uniqueID (54 / 0)
(3/12) column: fluctuateTempNumber (54 / 0)
(4/12) column: fluctuateTempMin (54 / 0)
(5/12) column: fluctuateTempDate (54 / 0)
(6/12) column: fluctuateTempTime (54 / 0)
(7/12) column: fluctuateTempSkinTemperature_C (33 / 21)
(8/12) column: fluctuateTempAxillaryTemperature_C (11 / 43)
(9/12) column: fluctuateTempEsophagealTemperature_C (38 / 16)
(10/12) column: fluctuateTempBlanketrol_C (39 / 15)
(11/12) column: fluctuateTempServoSet_C (40 / 14)


In [146]:
COMBINE_harmonizer.column_info(df_fluctuated_temperature)

(0/12) center: (54/0)
(1/12) subjectID: (54/0)
(2/12) uniqueID: (54/0)
(3/12) fluctuateTempNumber: (54/0)
(4/12) fluctuateTempMin: (54/0)
(5/12) fluctuateTempDate: (54/0)
(6/12) fluctuateTempTime: (54/0)
(7/12) fluctuateTempSkinTemperature_C: (33/21)
(8/12) fluctuateTempAxillaryTemperature_C: (11/43)
(9/12) fluctuateTempEsophagealTemperature_C: (38/16)
(10/12) fluctuateTempBlanketrol_C: (39/15)
(11/12) fluctuateTempServoSet_C: (40/14)


## 02-13. bradycardia

## 02-14. adverse event

In [147]:
adverse_event_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Adverse Event')
all_adverse_event_columns = id_columns + adverse_event_columns
adverse_event_columns

['adverseEventNumber',
 'SAECardiacExperienceOnsetDate',
 'SAECardiacExperienceOnsetTime',
 'SAECardiacExperienceResolveDate',
 'SAECardiacExperienceResolveTime',
 'SAECardiacExperienceDueToHypothermia',
 'SAECardiacExperienceActionTaken',
 'SAECardiacExperienceOutcome',
 'SAECardiacExperienceComment',
 'SAEMetabolicAcidosisOnsetDate',
 'SAEMetabolicAcidosisOnsetTime',
 'SAEMetabolicAcidosisResolveDate',
 'SAEMetabolicAcidosisResolveTime',
 'SAEMetabolicAcidosisDueToHypothermia',
 'SAEMetabolicAcidosisActionTaken',
 'SAEMetabolicAcidosisOutcome',
 'SAEMetabolicAcidosisComment',
 'SAEThrombosisExperienceOnsetDate',
 'SAEThrombosisExperienceOnsetTime',
 'SAEThrombosisExperienceResolveDate',
 'SAEThrombosisExperienceResolveTime',
 'SAEThrombosisExperienceDueToHypothermia',
 'SAEThrombosisExperienceActionTaken',
 'SAEThrombosisExperienceOutcome',
 'SAEThrombosisExperienceComment',
 'SAEBleedingExperienceOnsetDate',
 'SAEBleedingExperienceOnsetTime',
 'SAEBleedingExperienceResolveDate',
 'S

In [148]:
df_adverse_event = df_dict['lh07.csv'].copy()
df_adverse_event = COMBINE_harmonizer.valid_columns(df_adverse_event, all_adverse_event_columns, debug_df=True, debug_columns=True)
df_adverse_event = COMBINE_harmonizer.postprocess(df_adverse_event)

out_filename = os.sep.join([out_dir, '02-14-adverse-event.csv'])
df_adverse_event.to_csv(out_filename, index=False)

(8/59) LH7INIT not in columns
(9/59) REC_CMP not in columns
(48/59) CMP_DATE not in columns
(49/59) CRT_DATE not in columns
(36/57) SAEAlterationSkinIntegrityOnsetDate not in df
(37/57) SAEAlterationSkinIntegrityResolveDate not in df


### 02-14-1. check adverse event

In [149]:
df_adverse_event_groupby = df_adverse_event.groupby(['uniqueID', 'adverseEventNumber']).agg(_count=('uniqueID', 'count'))

is_invalid = df_adverse_event_groupby['_count'] > 1
df_adverse_event_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,adverseEventNumber,Unnamed: 2_level_1


### 02-14-2. check empty cells

In [150]:
COMBINE_harmonizer.check_empty(df_adverse_event)

(0/56) column: center (18 / 0)
(1/56) column: subjectID (18 / 0)
(2/56) column: uniqueID (18 / 0)
(3/56) column: SAECardiacExperienceComment (0 / 18)
(4/56) column: SAEMetabolicAcidosisComment (0 / 18)
(5/56) column: SAEThrombosisExperienceComment (0 / 18)
(6/56) column: SAEBleedingExperienceComment (4 / 14)
(7/56) column: SAEAlterationSkinIntegrity (2 / 16)
(8/56) column: SAEAlterationSkinIntegrityComment (2 / 16)
(9/56) column: SAEDeathComment (6 / 12)
(10/56) column: adverseEventNumber (18 / 0)
(11/56) column: SAECardiacExperienceOnsetDate (0 / 18)
(12/56) column: SAECardiacExperienceOnsetTime (0 / 18)
(13/56) column: SAECardiacExperienceResolveDate (0 / 18)
(14/56) column: SAECardiacExperienceResolveTime (0 / 18)
(15/56) column: SAECardiacExperienceActionTaken (0 / 18)
(16/56) column: SAECardiacExperienceOutcome (0 / 18)
(17/56) column: SAECardiacExperienceDueToHypothermia (0 / 18)
(18/56) column: SAEMetabolicAcidosisOnsetDate (0 / 18)
(19/56) column: SAEMetabolicAcidosisOnsetTime 

In [151]:
COMBINE_harmonizer.column_info(df_adverse_event)

(0/56) center: (18/0)
(1/56) subjectID: (18/0)
(2/56) uniqueID: (18/0)
(3/56) SAECardiacExperienceComment: (0/18)
(4/56) SAEMetabolicAcidosisComment: (0/18)
(5/56) SAEThrombosisExperienceComment: (0/18)
(6/56) SAEBleedingExperienceComment: (4/14)
(7/56) SAEAlterationSkinIntegrity: (2/16)
(8/56) SAEAlterationSkinIntegrityComment: (2/16)
(9/56) SAEDeathComment: (6/12)
(10/56) adverseEventNumber: (18/0)
(11/56) SAECardiacExperienceOnsetDate: (0/18)
(12/56) SAECardiacExperienceOnsetTime: (0/18)
(13/56) SAECardiacExperienceResolveDate: (0/18)
(14/56) SAECardiacExperienceResolveTime: (0/18)
(15/56) SAECardiacExperienceActionTaken: (0/18)
(16/56) SAECardiacExperienceOutcome: (0/18)
(17/56) SAECardiacExperienceDueToHypothermia: (0/18)
(18/56) SAEMetabolicAcidosisOnsetDate: (0/18)
(19/56) SAEMetabolicAcidosisOnsetTime: (0/18)
(20/56) SAEMetabolicAcidosisResolveDate: (0/18)
(21/56) SAEMetabolicAcidosisResolveTime: (0/18)
(22/56) SAEMetabolicAcidosisActionTaken: (0/18)
(23/56) SAEMetabolicAcidosi

## 02-15. violation

In [152]:
violation_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Violation')
all_violation_columns = id_columns + violation_columns
violation_columns

['violationNumber',
 'violationDate',
 'violationNature',
 'violationTreatmentAssign',
 'violationTreatmentReceive',
 'violationOtherText',
 'violationCircumstance',
 'violationOtherCirumstanceText',
 'violationComment']

In [153]:
df_violation = df_dict['lh14.csv'].copy()
df_violation = COMBINE_harmonizer.valid_columns(df_violation, all_violation_columns, debug_df=True, debug_columns=True)
df_violation = COMBINE_harmonizer.postprocess(df_violation)

out_filename = os.sep.join([out_dir, '02-15-violation.csv'])
df_violation.to_csv(out_filename, index=False)

(4/14) LH14NAME not in columns
(5/14) LH14INIT not in columns
(6/14) REC_CMP not in columns
(12/14) CMP_DATE not in columns
(13/14) CRT_DATE not in columns
(5/11) violationTreatmentAssign not in df
(6/11) violationTreatmentReceive not in df


### 02-15-1. check violation

In [154]:
df_violation_groupby = df_violation.groupby(['uniqueID', 'violationNumber']).agg(_count=('uniqueID', 'count'))

is_invalid = df_violation_groupby['_count'] > 1
df_violation_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,violationNumber,Unnamed: 2_level_1


### 02-15-2. check empty cells

In [155]:
COMBINE_harmonizer.check_empty(df_violation)

(0/10) column: center (52 / 0)
(1/10) column: subjectID (52 / 0)
(2/10) column: uniqueID (52 / 0)
(3/10) column: violationOtherText (36 / 16)
(4/10) column: violationOtherCirumstanceText (41 / 11)
(5/10) column: violationComment (34 / 18)
(6/10) column: violationNumber (52 / 0)
(7/10) column: violationDate (52 / 0)
(8/10) column: violationNature (52 / 0)
(9/10) column: violationCircumstance (52 / 0)


In [156]:
COMBINE_harmonizer.column_info(df_violation)

(0/10) center: (52/0)
(1/10) subjectID: (52/0)
(2/10) uniqueID: (52/0)
(3/10) violationOtherText: (36/16)
(4/10) violationOtherCirumstanceText: (41/11)
(5/10) violationComment: (34/18)
(6/10) violationNumber: (52/0)
(7/10) violationDate: (52/0)
(8/10) violationNature: (52/0)
(9/10) violationCircumstance: (52/0)


## 02-16. Interrupt

## 02-17. Discontinue

In [157]:
discontinue_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Discontinue')
all_discontinue_columns = id_columns + discontinue_columns
discontinue_columns

['discontinueDate',
 'discontinueTime',
 'discontinueBeforeEndPeriod',
 'discontinueParentsWithdraw',
 'discontinuePhysicianWithdraw',
 'discontinueAdverseEvent',
 'discontinueECMO',
 'discontinueDNR',
 'discontinueWdrawSupport',
 'discontinueDeath',
 'discontinueOther',
 'discontinueOtherText']

In [158]:
df_discontinue = COMBINE_harmonizer.valid_columns(df_main, all_discontinue_columns, debug_df=False, debug_columns=True)
df_discontinue = COMBINE_harmonizer.postprocess(df_discontinue)

out_filename = os.sep.join([out_dir, '02-17-discontinue.csv'])
df_discontinue.to_csv(out_filename, index=False)

(9/14) discontinueDNR not in df
(10/14) discontinueWdrawSupport not in df


### 02-17-1. check discontinue

In [159]:
df_continue_groupby = df_discontinue.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_continue_groupby['_count'] > 1
df_continue_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 02-17-2. check empty cells

In [160]:
COMBINE_harmonizer.check_empty(df_discontinue)

(0/13) column: center (168 / 0)
(1/13) column: subjectID (168 / 0)
(2/13) column: uniqueID (168 / 0)
(3/13) column: discontinueBeforeEndPeriod (168 / 0)
(4/13) column: discontinueParentsWithdraw (23 / 145)
(5/13) column: discontinuePhysicianWithdraw (23 / 145)
(6/13) column: discontinueAdverseEvent (23 / 145)
(7/13) column: discontinueECMO (23 / 145)
(8/13) column: discontinueDeath (23 / 145)
(9/13) column: discontinueOther (23 / 145)
(10/13) column: discontinueOtherText (9 / 159)
(11/13) column: discontinueDate (23 / 145)
(12/13) column: discontinueTime (23 / 145)


In [161]:
COMBINE_harmonizer.column_info(df_discontinue)

(0/13) center: (168/0)
(1/13) subjectID: (168/0)
(2/13) uniqueID: (168/0)
(3/13) discontinueBeforeEndPeriod: (168/0)
(4/13) discontinueParentsWithdraw: (23/145)
(5/13) discontinuePhysicianWithdraw: (23/145)
(6/13) discontinueAdverseEvent: (23/145)
(7/13) discontinueECMO: (23/145)
(8/13) discontinueDeath: (23/145)
(9/13) discontinueOther: (23/145)
(10/13) discontinueOtherText: (9/159)
(11/13) discontinueDate: (23/145)
(12/13) discontinueTime: (23/145)


## 04-16. Wdraw Support

In [162]:
wdraw_support_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Withdrawal of Support')
all_wdraw_support_columns = id_columns + wdraw_support_columns
wdraw_support_columns

['wdrawSupport',
 'wdrawSupportDate',
 'wdrawSupportTime',
 'wdrawSupportDiscussedWithFamily',
 'wdrawSupportRecommendSolelyByClinicalTeam',
 'wdrawSupportNeurologicalExam',
 'wdrawSupportImagingStudy',
 'wdrawSupportEEGFinding',
 'wdrawSupportMultisystemOrganFailureOtherThanCNS',
 'wdrawSupportBrainBloodFlowScan',
 'wdrawSupportParentWish',
 'wdrawSupportOther',
 'wdrawSupportOtherText']

In [163]:
df_wdraw_support = COMBINE_harmonizer.valid_columns(df_main, all_wdraw_support_columns, debug_df=False, debug_columns=True)
df_wdraw_support = COMBINE_harmonizer.postprocess(df_wdraw_support)

out_filename = os.sep.join([out_dir, '04-16-wdraw-support.csv'])
df_wdraw_support.to_csv(out_filename, index=False)

### 04-16-1. check wdraw support

In [164]:
df_wdraw_support_groupby = df_wdraw_support.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_wdraw_support_groupby['_count'] > 1
df_wdraw_support_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 04-16-2. check empty cells

In [165]:
COMBINE_harmonizer.check_empty(df_wdraw_support)

(0/16) column: center (168 / 0)
(1/16) column: subjectID (168 / 0)
(2/16) column: uniqueID (168 / 0)
(3/16) column: wdrawSupportDiscussedWithFamily (168 / 0)
(4/16) column: wdrawSupportNeurologicalExam (19 / 149)
(5/16) column: wdrawSupportImagingStudy (19 / 149)
(6/16) column: wdrawSupportEEGFinding (19 / 149)
(7/16) column: wdrawSupportMultisystemOrganFailureOtherThanCNS (19 / 149)
(8/16) column: wdrawSupportBrainBloodFlowScan (19 / 149)
(9/16) column: wdrawSupportParentWish (19 / 149)
(10/16) column: wdrawSupportOther (19 / 149)
(11/16) column: wdrawSupportOtherText (1 / 167)
(12/16) column: wdrawSupport (19 / 149)
(13/16) column: wdrawSupportRecommendSolelyByClinicalTeam (19 / 149)
(14/16) column: wdrawSupportDate (18 / 150)
(15/16) column: wdrawSupportTime (18 / 150)


In [166]:
COMBINE_harmonizer.column_info(df_wdraw_support)

(0/16) center: (168/0)
(1/16) subjectID: (168/0)
(2/16) uniqueID: (168/0)
(3/16) wdrawSupportDiscussedWithFamily: (168/0)
(4/16) wdrawSupportNeurologicalExam: (19/149)
(5/16) wdrawSupportImagingStudy: (19/149)
(6/16) wdrawSupportEEGFinding: (19/149)
(7/16) wdrawSupportMultisystemOrganFailureOtherThanCNS: (19/149)
(8/16) wdrawSupportBrainBloodFlowScan: (19/149)
(9/16) wdrawSupportParentWish: (19/149)
(10/16) wdrawSupportOther: (19/149)
(11/16) wdrawSupportOtherText: (1/167)
(12/16) wdrawSupport: (19/149)
(13/16) wdrawSupportRecommendSolelyByClinicalTeam: (19/149)
(14/16) wdrawSupportDate: (18/150)
(15/16) wdrawSupportTime: (18/150)


## 04-17. Limitation of Care

In [167]:
limit_care_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Limitation of Care')
all_limit_care_columns = id_columns + limit_care_columns
limit_care_columns

['limitCareDiscussedWithFamily',
 'limitCareRecommendSolelyByClinicalTeam',
 'limitCareAgreedByFamilyAndCareTeam',
 'limitCareNoFurtherMechanicalVentilationAndIntubation',
 'limitCareNoFurtherVentilationWithBagAndMask',
 'limitCareNoFurtherMedicationsToSupportBP',
 'limitCareNoFurtherChestCompression',
 'limitCareNoFurtherEmergencyMedication',
 'limitCareDNR',
 'limitCareDNRDate',
 'limitCareDNRTime']

In [168]:
df_limit_care = COMBINE_harmonizer.valid_columns(df_main, all_limit_care_columns, debug_df=False, debug_columns=True)
df_limit_care = COMBINE_harmonizer.postprocess(df_limit_care)

out_filename = os.sep.join([out_dir, '04-17-limit-care.csv'])
df_limit_care.to_csv(out_filename, index=False)

### 04-17-1. check limit care

In [169]:
df_limit_care_groupby = df_limit_care.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_limit_care_groupby['_count'] > 1
df_limit_care_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 04-17-2. check empty cells

In [170]:
COMBINE_harmonizer.check_empty(df_limit_care)

(0/14) column: center (168 / 0)
(1/14) column: subjectID (168 / 0)
(2/14) column: uniqueID (168 / 0)
(3/14) column: limitCareDiscussedWithFamily (168 / 0)
(4/14) column: limitCareAgreedByFamilyAndCareTeam (13 / 155)
(5/14) column: limitCareNoFurtherMechanicalVentilationAndIntubation (13 / 155)
(6/14) column: limitCareNoFurtherVentilationWithBagAndMask (13 / 155)
(7/14) column: limitCareNoFurtherMedicationsToSupportBP (13 / 155)
(8/14) column: limitCareNoFurtherChestCompression (13 / 155)
(9/14) column: limitCareNoFurtherEmergencyMedication (13 / 155)
(10/14) column: limitCareDNR (168 / 0)
(11/14) column: limitCareRecommendSolelyByClinicalTeam (14 / 154)
(12/14) column: limitCareDNRDate (15 / 153)
(13/14) column: limitCareDNRTime (15 / 153)


In [171]:
COMBINE_harmonizer.column_info(df_limit_care)

(0/14) center: (168/0)
(1/14) subjectID: (168/0)
(2/14) uniqueID: (168/0)
(3/14) limitCareDiscussedWithFamily: (168/0)
(4/14) limitCareAgreedByFamilyAndCareTeam: (13/155)
(5/14) limitCareNoFurtherMechanicalVentilationAndIntubation: (13/155)
(6/14) limitCareNoFurtherVentilationWithBagAndMask: (13/155)
(7/14) limitCareNoFurtherMedicationsToSupportBP: (13/155)
(8/14) limitCareNoFurtherChestCompression: (13/155)
(9/14) limitCareNoFurtherEmergencyMedication: (13/155)
(10/14) limitCareDNR: (168/0)
(11/14) limitCareRecommendSolelyByClinicalTeam: (14/154)
(12/14) limitCareDNRDate: (15/153)
(13/14) limitCareDNRTime: (15/153)


## 04-01. Status

In [172]:
status_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Status')
all_status_columns = id_columns + status_columns
status_columns

['status',
 'statusDate',
 'dischargeStatus',
 'dischargeDate',
 'dischargeWeight_g',
 'dischargeLength_cm',
 'dischargeHeadCircumference_cm',
 'transferReason',
 'transferDate',
 'transferWeight_g',
 'transferLength_cm',
 'transferHeadCircumference_cm',
 'transferOutcome',
 'homeTherapyStatus',
 'homeTherapyVentilator',
 'homeTherapyOxygen',
 'homeTherapyGavageTubeFeed',
 'homeTherapyGastrostomyTubeFeed',
 'homeTherapyTemperatureBlanket',
 'homeTherapyAnticonvulsantMedication',
 'homeTherapyOther',
 'homeTherapyOtherText',
 'deathDate',
 'deathTime',
 'deathAutopsy',
 'deathCause',
 'deathCauseText',
 'deathSrc']

In [173]:
df_status = COMBINE_harmonizer.valid_columns(df_main, all_status_columns, debug_df=False, debug_columns=True)


(3/30) statusDate not in df


In [174]:
df_status['status'].unique()

array(['1', '4', '2'], dtype=object)

In [175]:
# statusDate
is_discharge = df_status['status'].isin(['1', '1.0'])
df_status.loc[is_discharge, 'statusDate'] = df_status.loc[is_discharge, 'dischargeDate']

is_transfer = df_status['status'].isin(['2', '2.0', '3', '3.0'])
df_status.loc[is_transfer, 'statusDate'] = df_status.loc[is_transfer, 'transferDate']

is_died = df_status['status'].isin(['4', '4.0'])
df_status.loc[is_died, 'statusDate'] = df_status.loc[is_died, 'deathDate']

df_status = COMBINE_harmonizer.postprocess(df_status)

out_filename = os.sep.join([out_dir, '04-01-status.csv'])
df_status.to_csv(out_filename, index=False)

### 04-01-2. check empty cells

In [176]:
COMBINE_harmonizer.check_empty(df_status)

(0/31) column: center (168 / 0)
(1/31) column: subjectID (168 / 0)
(2/31) column: uniqueID (168 / 0)
(3/31) column: homeTherapyStatus (15 / 153)
(4/31) column: homeTherapyVentilator (13 / 155)
(5/31) column: homeTherapyOxygen (13 / 155)
(6/31) column: homeTherapyGavageTubeFeed (13 / 155)
(7/31) column: homeTherapyGastrostomyTubeFeed (13 / 155)
(8/31) column: homeTherapyTemperatureBlanket (13 / 155)
(9/31) column: homeTherapyAnticonvulsantMedication (13 / 155)
(10/31) column: homeTherapyOther (13 / 155)
(11/31) column: homeTherapyOtherText (1 / 167)
(12/31) column: deathAutopsy (16 / 152)
(13/31) column: deathCauseText (3 / 165)
(14/31) column: status (168 / 0)
(15/31) column: dischargeDate (136 / 32)
(16/31) column: dischargeWeight_g (136 / 32)
(17/31) column: dischargeLength_cm (126 / 42)
(18/31) column: dischargeHeadCircumference_cm (128 / 40)
(19/31) column: transferReason (16 / 152)
(20/31) column: transferDate (16 / 152)
(21/31) column: transferWeight_g (16 / 152)
(22/31) column: 

In [177]:
COMBINE_harmonizer.column_info(df_status)

(0/31) center: (168/0)
(1/31) subjectID: (168/0)
(2/31) uniqueID: (168/0)
(3/31) homeTherapyStatus: (15/153)
(4/31) homeTherapyVentilator: (13/155)
(5/31) homeTherapyOxygen: (13/155)
(6/31) homeTherapyGavageTubeFeed: (13/155)
(7/31) homeTherapyGastrostomyTubeFeed: (13/155)
(8/31) homeTherapyTemperatureBlanket: (13/155)
(9/31) homeTherapyAnticonvulsantMedication: (13/155)
(10/31) homeTherapyOther: (13/155)
(11/31) homeTherapyOtherText: (1/167)
(12/31) deathAutopsy: (16/152)
(13/31) deathCauseText: (3/165)
(14/31) status: (168/0)
(15/31) dischargeDate: (136/32)
(16/31) dischargeWeight_g: (136/32)
(17/31) dischargeLength_cm: (126/42)
(18/31) dischargeHeadCircumference_cm: (128/40)
(19/31) transferReason: (16/152)
(20/31) transferDate: (16/152)
(21/31) transferWeight_g: (16/152)
(22/31) transferLength_cm: (13/155)
(23/31) transferHeadCircumference_cm: (13/155)
(24/31) transferOutcome: (15/153)
(25/31) deathDate: (16/152)
(26/31) deathTime: (16/152)
(27/31) deathCause: (16/152)
(28/31) deat

## 04-02. Discharge Neuro Exam

In [178]:
discharge_neuro_exam_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Neuro Exam')
all_discharge_neuro_exam_columns = id_columns + discharge_neuro_exam_columns
discharge_neuro_exam_columns

['dischargeNeuroExam',
 'dischargeNeuroExamStatus',
 'dischargeNeuroExamDate',
 'dischargeNeuroExamTime',
 'dischargeNeuroExamLevelConsciousness',
 'dischargeNeuroExamSpontaneousActivity',
 'dischargeNeuroExamPosture',
 'dischargeNeuroExamTone',
 'dischargeNeuroExamSuck',
 'dischargeNeuroExamMoro',
 'dischargeNeuroExamPupils',
 'dischargeNeuroExamHeartRate',
 'dischargeNeuroExamRespiration',
 'dischargeNeuroExamSeizure',
 'dischargeNeuroExamClonusSustained',
 'dischargeNeuroExamFistedHand',
 'dischargeNeuroExamAbnormalMovement',
 'dischargeNeuroExamGagReflexAbsent',
 'dischargeNeuroExamSedate',
 'dischargeNeuroExamHypertonia',
 'dischargeNeuroExamAsymTonicNeckReflex']

In [179]:
df_discharge_neuro_exam = COMBINE_harmonizer.valid_columns(df_main, all_discharge_neuro_exam_columns, debug_df=False, debug_columns=True)
df_discharge_neuro_exam = COMBINE_harmonizer.postprocess(df_discharge_neuro_exam)

out_filename = os.sep.join([out_dir, '04-02-neuro-exam.csv'])
df_discharge_neuro_exam.to_csv(out_filename, index=False)

(2/23) dischargeNeuroExam not in df
(3/23) dischargeNeuroExamStatus not in df
(20/23) dischargeNeuroExamSedate not in df
(22/23) dischargeNeuroExamAsymTonicNeckReflex not in df


### 04-02-2. check empty cells

In [180]:
COMBINE_harmonizer.check_empty(df_discharge_neuro_exam)

(0/20) column: center (168 / 0)
(1/20) column: subjectID (168 / 0)
(2/20) column: uniqueID (168 / 0)
(3/20) column: dischargeNeuroExamSeizure (163 / 5)
(4/20) column: dischargeNeuroExamHypertonia (163 / 5)
(5/20) column: dischargeNeuroExamClonusSustained (163 / 5)
(6/20) column: dischargeNeuroExamFistedHand (163 / 5)
(7/20) column: dischargeNeuroExamAbnormalMovement (163 / 5)
(8/20) column: dischargeNeuroExamGagReflexAbsent (163 / 5)
(9/20) column: dischargeNeuroExamDate (157 / 11)
(10/20) column: dischargeNeuroExamTime (153 / 15)
(11/20) column: dischargeNeuroExamLevelConsciousness (157 / 11)
(12/20) column: dischargeNeuroExamSpontaneousActivity (157 / 11)
(13/20) column: dischargeNeuroExamPosture (157 / 11)
(14/20) column: dischargeNeuroExamTone (129 / 39)
(15/20) column: dischargeNeuroExamSuck (157 / 11)
(16/20) column: dischargeNeuroExamMoro (155 / 13)
(17/20) column: dischargeNeuroExamPupils (156 / 12)
(18/20) column: dischargeNeuroExamHeartRate (153 / 15)
(19/20) column: discharg

In [181]:
COMBINE_harmonizer.column_info(df_discharge_neuro_exam)

(0/20) center: (168/0)
(1/20) subjectID: (168/0)
(2/20) uniqueID: (168/0)
(3/20) dischargeNeuroExamSeizure: (163/5)
(4/20) dischargeNeuroExamHypertonia: (163/5)
(5/20) dischargeNeuroExamClonusSustained: (163/5)
(6/20) dischargeNeuroExamFistedHand: (163/5)
(7/20) dischargeNeuroExamAbnormalMovement: (163/5)
(8/20) dischargeNeuroExamGagReflexAbsent: (163/5)
(9/20) dischargeNeuroExamDate: (157/11)
(10/20) dischargeNeuroExamTime: (153/15)
(11/20) dischargeNeuroExamLevelConsciousness: (157/11)
(12/20) dischargeNeuroExamSpontaneousActivity: (157/11)
(13/20) dischargeNeuroExamPosture: (157/11)
(14/20) dischargeNeuroExamTone: (129/39)
(15/20) dischargeNeuroExamSuck: (157/11)
(16/20) dischargeNeuroExamMoro: (155/13)
(17/20) dischargeNeuroExamPupils: (156/12)
(18/20) dischargeNeuroExamHeartRate: (153/15)
(19/20) dischargeNeuroExamRespiration: (157/11)


## 04-03. Discharge Cardiovascular

In [182]:
discharge_cardiovascular_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Cardiovascular')
all_discharge_cardiovascular_columns = id_columns + discharge_cardiovascular_columns
discharge_cardiovascular_columns

['dischargeCardiomegaly',
 'dischargeCardiacFailure',
 'dischargeCardiacDysfunctionByEcho',
 'dischargeCardiacIschemiaByEKG',
 'dischargeHypotension',
 'dischargeArrhythmia']

In [183]:
df_discharge_cardiovascular = COMBINE_harmonizer.valid_columns(df_main, all_discharge_cardiovascular_columns, debug_df=False, debug_columns=True)
df_discharge_cardiovascular = COMBINE_harmonizer.postprocess(df_discharge_cardiovascular)

out_filename = os.sep.join([out_dir, '04-03-cardiovascular.csv'])
df_discharge_cardiovascular.to_csv(out_filename, index=False)

### 04-03-2. check empty cells

In [184]:
COMBINE_harmonizer.check_empty(df_discharge_cardiovascular)

(0/9) column: center (168 / 0)
(1/9) column: subjectID (168 / 0)
(2/9) column: uniqueID (168 / 0)
(3/9) column: dischargeCardiomegaly (168 / 0)
(4/9) column: dischargeCardiacFailure (168 / 0)
(5/9) column: dischargeCardiacDysfunctionByEcho (168 / 0)
(6/9) column: dischargeCardiacIschemiaByEKG (168 / 0)
(7/9) column: dischargeHypotension (168 / 0)
(8/9) column: dischargeArrhythmia (168 / 0)


In [185]:
COMBINE_harmonizer.column_info(df_discharge_cardiovascular)

(0/9) center: (168/0)
(1/9) subjectID: (168/0)
(2/9) uniqueID: (168/0)
(3/9) dischargeCardiomegaly: (168/0)
(4/9) dischargeCardiacFailure: (168/0)
(5/9) dischargeCardiacDysfunctionByEcho: (168/0)
(6/9) dischargeCardiacIschemiaByEKG: (168/0)
(7/9) dischargeHypotension: (168/0)
(8/9) dischargeArrhythmia: (168/0)


## 04-04. Discharge Respiratory

In [186]:
discharge_respiratory_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Respiratory')
all_discharge_respiratory_columns = id_columns + discharge_respiratory_columns
discharge_respiratory_columns

['dischargeMeconiumAspirationSyndrome',
 'dischargePPHN',
 'dischargePulmonaryHemorrhage',
 'dischargePenumonia',
 'dischargeChronicLungDisease',
 'dischargeECMO',
 'dischargeINO',
 'dischargeVentilator_day',
 'dischargeOxygen_day',
 'dischargeCPAP_day',
 'dischargePulmonaryStartDate1',
 'dischargePulmonaryStartTime1',
 'dischargePulmonaryEndDate1',
 'dischargePulmonaryEndTime1',
 'dischargePulmonaryStartDate2',
 'dischargePulmonaryStartTime2',
 'dischargePulmonaryEndDate2',
 'dischargePulmonaryEndTime2',
 'dischargePulmonaryStartDate3',
 'dischargePulmonaryStartTime3',
 'dischargePulmonaryEndDate3',
 'dischargePulmonaryEndTime3']

In [187]:
df_discharge_respiratory = COMBINE_harmonizer.valid_columns(df_main, all_discharge_respiratory_columns, debug_df=False, debug_columns=True)
df_discharge_respiratory = COMBINE_harmonizer.postprocess(df_discharge_respiratory)

out_filename = os.sep.join([out_dir, '04-04-respiratory.csv'])
df_discharge_respiratory.to_csv(out_filename, index=False)

### 04-04-2. check empty cells

In [188]:
COMBINE_harmonizer.check_empty(df_discharge_respiratory)

(0/25) column: center (168 / 0)
(1/25) column: subjectID (168 / 0)
(2/25) column: uniqueID (168 / 0)
(3/25) column: dischargeMeconiumAspirationSyndrome (168 / 0)
(4/25) column: dischargePPHN (168 / 0)
(5/25) column: dischargePulmonaryHemorrhage (168 / 0)
(6/25) column: dischargePenumonia (168 / 0)
(7/25) column: dischargeChronicLungDisease (168 / 0)
(8/25) column: dischargeECMO (168 / 0)
(9/25) column: dischargeINO (168 / 0)
(10/25) column: dischargeVentilator_day (168 / 0)
(11/25) column: dischargeOxygen_day (168 / 0)
(12/25) column: dischargeCPAP_day (168 / 0)
(13/25) column: dischargePulmonaryStartDate1 (29 / 139)
(14/25) column: dischargePulmonaryEndDate1 (28 / 140)
(15/25) column: dischargePulmonaryStartTime1 (28 / 140)
(16/25) column: dischargePulmonaryEndTime1 (27 / 141)
(17/25) column: dischargePulmonaryStartDate2 (2 / 166)
(18/25) column: dischargePulmonaryEndDate2 (2 / 166)
(19/25) column: dischargePulmonaryStartTime2 (2 / 166)
(20/25) column: dischargePulmonaryEndTime2 (2 / 

In [189]:
COMBINE_harmonizer.column_info(df_discharge_respiratory)

(0/25) center: (168/0)
(1/25) subjectID: (168/0)
(2/25) uniqueID: (168/0)
(3/25) dischargeMeconiumAspirationSyndrome: (168/0)
(4/25) dischargePPHN: (168/0)
(5/25) dischargePulmonaryHemorrhage: (168/0)
(6/25) dischargePenumonia: (168/0)
(7/25) dischargeChronicLungDisease: (168/0)
(8/25) dischargeECMO: (168/0)
(9/25) dischargeINO: (168/0)
(10/25) dischargeVentilator_day: (168/0)
(11/25) dischargeOxygen_day: (168/0)
(12/25) dischargeCPAP_day: (168/0)
(13/25) dischargePulmonaryStartDate1: (29/139)
(14/25) dischargePulmonaryEndDate1: (28/140)
(15/25) dischargePulmonaryStartTime1: (28/140)
(16/25) dischargePulmonaryEndTime1: (27/141)
(17/25) dischargePulmonaryStartDate2: (2/166)
(18/25) dischargePulmonaryEndDate2: (2/166)
(19/25) dischargePulmonaryStartTime2: (2/166)
(20/25) dischargePulmonaryEndTime2: (2/166)
(21/25) dischargePulmonaryStartDate3: (0/168)
(22/25) dischargePulmonaryEndDate3: (0/168)
(23/25) dischargePulmonaryStartTime3: (0/168)
(24/25) dischargePulmonaryEndTime3: (0/168)


## 04-05. Discharge Hematology

In [190]:
discharge_hematology_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Hematology')
all_discharge_hematology_columns = id_columns + discharge_hematology_columns
discharge_hematology_columns

['dischargeDIC']

In [191]:
df_discharge_hematology = COMBINE_harmonizer.valid_columns(df_main, all_discharge_hematology_columns, debug_df=False, debug_columns=True)
df_discharge_hematology = COMBINE_harmonizer.postprocess(df_discharge_hematology)

out_filename = os.sep.join([out_dir, '04-05-hematology.csv'])
df_discharge_hematology.to_csv(out_filename, index=False)

### 04-05-2. check empty cells

In [192]:
COMBINE_harmonizer.check_empty(df_discharge_hematology)

(0/4) column: center (168 / 0)
(1/4) column: subjectID (168 / 0)
(2/4) column: uniqueID (168 / 0)
(3/4) column: dischargeDIC (168 / 0)


In [193]:
COMBINE_harmonizer.column_info(df_discharge_hematology)

(0/4) center: (168/0)
(1/4) subjectID: (168/0)
(2/4) uniqueID: (168/0)
(3/4) dischargeDIC: (168/0)


## 04-06. Discharge Metabolic

In [194]:
discharge_metabolic_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Metabolic')
all_discharge_metabolic_columns = id_columns + discharge_metabolic_columns
discharge_metabolic_columns

['dischargeHypoglycemia', 'dischargeHypocalcemia', 'dischargeHypomagnesemia']

In [195]:
df_discharge_metabolic = COMBINE_harmonizer.valid_columns(df_main, all_discharge_metabolic_columns, debug_df=False, debug_columns=True)
df_discharge_metabolic = COMBINE_harmonizer.postprocess(df_discharge_metabolic)

out_filename = os.sep.join([out_dir, '04-06-metabolic.csv'])
df_discharge_metabolic.to_csv(out_filename, index=False)

### 04-06-2. check empty cells

In [196]:
COMBINE_harmonizer.check_empty(df_discharge_metabolic)

(0/6) column: center (168 / 0)
(1/6) column: subjectID (168 / 0)
(2/6) column: uniqueID (168 / 0)
(3/6) column: dischargeHypoglycemia (168 / 0)
(4/6) column: dischargeHypocalcemia (168 / 0)
(5/6) column: dischargeHypomagnesemia (168 / 0)


In [197]:
COMBINE_harmonizer.column_info(df_discharge_metabolic)

(0/6) center: (168/0)
(1/6) subjectID: (168/0)
(2/6) uniqueID: (168/0)
(3/6) dischargeHypoglycemia: (168/0)
(4/6) dischargeHypocalcemia: (168/0)
(5/6) dischargeHypomagnesemia: (168/0)


## 04-07. Discharge Renal

In [198]:
discharge_renal_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Renal')
all_discharge_renal_columns = id_columns + discharge_renal_columns
discharge_renal_columns

['dischargeOliguria', 'dischargeAnuria', 'dischargeDialysis']

In [199]:
df_discharge_renal = COMBINE_harmonizer.valid_columns(df_main, all_discharge_renal_columns, debug_df=False, debug_columns=True)
df_discharge_renal = COMBINE_harmonizer.postprocess(df_discharge_renal)

out_filename = os.sep.join([out_dir, '04-07-renal.csv'])
df_discharge_renal.to_csv(out_filename, index=False)

### 04-07-2. check empty cells

In [200]:
COMBINE_harmonizer.check_empty(df_discharge_renal)

(0/6) column: center (168 / 0)
(1/6) column: subjectID (168 / 0)
(2/6) column: uniqueID (168 / 0)
(3/6) column: dischargeOliguria (168 / 0)
(4/6) column: dischargeAnuria (168 / 0)
(5/6) column: dischargeDialysis (168 / 0)


In [201]:
COMBINE_harmonizer.column_info(df_discharge_renal)

(0/6) center: (168/0)
(1/6) subjectID: (168/0)
(2/6) uniqueID: (168/0)
(3/6) dischargeOliguria: (168/0)
(4/6) dischargeAnuria: (168/0)
(5/6) dischargeDialysis: (168/0)


## 04-08. Discharge Gastrointestinal

In [202]:
discharge_gastrointestinal_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Gastrointestinal')
all_discharge_gastrointestinal_columns = id_columns + discharge_gastrointestinal_columns
discharge_gastrointestinal_columns

['dischargeEnteralFeedStart_day',
 'dischargeTubeFeedingDuration_day',
 'dischargeFullNippleFeed',
 'dischargeFullNippleFeed_day',
 'dischargeNEC',
 'dischargeHepaticDysfunction']

In [203]:
df_discharge_gastrointestinal = COMBINE_harmonizer.valid_columns(df_main, all_discharge_gastrointestinal_columns, debug_df=False, debug_columns=True)
df_discharge_gastrointestinal = COMBINE_harmonizer.postprocess(df_discharge_gastrointestinal)

out_filename = os.sep.join([out_dir, '04-08-gastrointestinal.csv'])
df_discharge_gastrointestinal.to_csv(out_filename, index=False)

(2/8) dischargeEnteralFeedStart_day not in df
(4/8) dischargeFullNippleFeed not in df


### 04-08-2. check empty cells

In [204]:
COMBINE_harmonizer.check_empty(df_discharge_gastrointestinal)

(0/7) column: center (168 / 0)
(1/7) column: subjectID (168 / 0)
(2/7) column: uniqueID (168 / 0)
(3/7) column: dischargeNEC (168 / 0)
(4/7) column: dischargeHepaticDysfunction (168 / 0)
(5/7) column: dischargeTubeFeedingDuration_day (165 / 3)
(6/7) column: dischargeFullNippleFeed_day (137 / 31)


In [205]:
COMBINE_harmonizer.column_info(df_discharge_gastrointestinal)

(0/7) center: (168/0)
(1/7) subjectID: (168/0)
(2/7) uniqueID: (168/0)
(3/7) dischargeNEC: (168/0)
(4/7) dischargeHepaticDysfunction: (168/0)
(5/7) dischargeTubeFeedingDuration_day: (165/3)
(6/7) dischargeFullNippleFeed_day: (137/31)


## 04-09. Skin

In [206]:
discharge_skin_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Skin')
all_discharge_skin_columns = id_columns + discharge_skin_columns
discharge_skin_columns

['dischargeAlteredSkinItegrityPostIntervention',
 'dischargeErythema',
 'dischargeErythemaOnsetDate',
 'dischargeErythemaResolveDate',
 'dischargeSclerema',
 'dischargeScleremaOnsetDate',
 'dischargeScleremaResolveDate',
 'dischargeCyanosis',
 'dischargeCyanosisOnsetDate',
 'dischargeCyanosisResolveDate',
 'dischargeSubFatNecrosis',
 'dischargeSubFatNecrosisOnsetDate',
 'dischargeSubFatNecrosisResolveDate']

In [207]:
df_discharge_skin = COMBINE_harmonizer.valid_columns(df_main, all_discharge_skin_columns, debug_df=False, debug_columns=True)
df_discharge_skin = COMBINE_harmonizer.postprocess(df_discharge_skin)

out_filename = os.sep.join([out_dir, '04-09-skin.csv'])
df_discharge_skin.to_csv(out_filename, index=False)

(2/15) dischargeAlteredSkinItegrityPostIntervention not in df
(3/15) dischargeErythema not in df
(4/15) dischargeErythemaOnsetDate not in df
(5/15) dischargeErythemaResolveDate not in df
(6/15) dischargeSclerema not in df
(7/15) dischargeScleremaOnsetDate not in df
(8/15) dischargeScleremaResolveDate not in df
(9/15) dischargeCyanosis not in df
(10/15) dischargeCyanosisOnsetDate not in df
(11/15) dischargeCyanosisResolveDate not in df
(12/15) dischargeSubFatNecrosis not in df
(13/15) dischargeSubFatNecrosisOnsetDate not in df
(14/15) dischargeSubFatNecrosisResolveDate not in df


### 04-09-2. check empty cells

In [208]:
COMBINE_harmonizer.check_empty(df_discharge_skin)

(0/3) column: center (168 / 0)
(1/3) column: subjectID (168 / 0)
(2/3) column: uniqueID (168 / 0)


In [209]:
COMBINE_harmonizer.column_info(df_discharge_skin)

(0/3) center: (168/0)
(1/3) subjectID: (168/0)
(2/3) uniqueID: (168/0)


## 04-10. Auditory

In [210]:
discharge_auditory_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Auditory')
all_discharge_auditory_columns = id_columns + discharge_auditory_columns
discharge_auditory_columns

['dischargeHearingTest', 'dischargeHearingTestNormal']

In [211]:
df_discharge_auditory = COMBINE_harmonizer.valid_columns(df_main, all_discharge_auditory_columns, debug_df=False, debug_columns=True)
df_discharge_auditory = COMBINE_harmonizer.postprocess(df_discharge_auditory)

out_filename = os.sep.join([out_dir, '04-10-auditory.csv'])
df_discharge_auditory.to_csv(out_filename, index=False)

### 04-10-2. check empty cells

In [212]:
COMBINE_harmonizer.check_empty(df_discharge_auditory)

(0/5) column: center (168 / 0)
(1/5) column: subjectID (168 / 0)
(2/5) column: uniqueID (168 / 0)
(3/5) column: dischargeHearingTest (168 / 0)
(4/5) column: dischargeHearingTestNormal (131 / 37)


In [213]:
COMBINE_harmonizer.column_info(df_discharge_auditory)

(0/5) center: (168/0)
(1/5) subjectID: (168/0)
(2/5) uniqueID: (168/0)
(3/5) dischargeHearingTest: (168/0)
(4/5) dischargeHearingTestNormal: (131/37)


## 04-11. Surgery

In [214]:
discharge_surgery_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Surgery')
all_discharge_surgery_columns = id_columns + discharge_surgery_columns
discharge_surgery_columns

['dischargeMajorSurgery',
 'dischargeSurgeryCode1',
 'dischargeSurgeryCode2',
 'dischargeSurgeryCode3']

In [215]:
df_discharge_surgery = COMBINE_harmonizer.valid_columns(df_main, all_discharge_surgery_columns, debug_df=False, debug_columns=True)
df_discharge_surgery = COMBINE_harmonizer.postprocess(df_discharge_surgery)

out_filename = os.sep.join([out_dir, '04-11-surgery.csv'])
df_discharge_surgery.to_csv(out_filename, index=False)

### 04-11-2. check empty cells

In [216]:
COMBINE_harmonizer.check_empty(df_discharge_surgery)

(0/7) column: center (168 / 0)
(1/7) column: subjectID (168 / 0)
(2/7) column: uniqueID (168 / 0)
(3/7) column: dischargeMajorSurgery (168 / 0)
(4/7) column: dischargeSurgeryCode1 (9 / 159)
(5/7) column: dischargeSurgeryCode2 (3 / 165)
(6/7) column: dischargeSurgeryCode3 (0 / 168)


In [217]:
COMBINE_harmonizer.column_info(df_discharge_surgery)

(0/7) center: (168/0)
(1/7) subjectID: (168/0)
(2/7) uniqueID: (168/0)
(3/7) dischargeMajorSurgery: (168/0)
(4/7) dischargeSurgeryCode1: (9/159)
(5/7) dischargeSurgeryCode2: (3/165)
(6/7) dischargeSurgeryCode3: (0/168)


## 04-12. Infection

In [218]:
discharge_infection_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Infection')
all_discharge_infection_columns = id_columns + discharge_infection_columns
discharge_infection_columns

['dischargeSepticemia',
 'dischargeSepticemiaOrganismCode1',
 'dischargeSepticemiaOrganismCode2',
 'dischargeSepticemiaOrganismCode3',
 'dischargeMeningitisEncephalitis',
 'dischargeMeningitisOrganismCode1',
 'dischargeMeningitisOrganismCode2',
 'dischargeMeningitisOrganismCode3']

In [219]:
df_discharge_infection = COMBINE_harmonizer.valid_columns(df_main, all_discharge_infection_columns, debug_df=False, debug_columns=True)
df_discharge_infection = COMBINE_harmonizer.postprocess(df_discharge_infection)

out_filename = os.sep.join([out_dir, '04-12-infection.csv'])
df_discharge_infection.to_csv(out_filename, index=False)

### 04-12-2. check empty cells

In [220]:
COMBINE_harmonizer.check_empty(df_discharge_infection)

(0/11) column: center (168 / 0)
(1/11) column: subjectID (168 / 0)
(2/11) column: uniqueID (168 / 0)
(3/11) column: dischargeSepticemia (168 / 0)
(4/11) column: dischargeMeningitisEncephalitis (168 / 0)
(5/11) column: dischargeSepticemiaOrganismCode1 (3 / 165)
(6/11) column: dischargeSepticemiaOrganismCode2 (1 / 167)
(7/11) column: dischargeSepticemiaOrganismCode3 (1 / 167)
(8/11) column: dischargeMeningitisOrganismCode1 (2 / 166)
(9/11) column: dischargeMeningitisOrganismCode2 (0 / 168)
(10/11) column: dischargeMeningitisOrganismCode3 (0 / 168)


In [221]:
COMBINE_harmonizer.column_info(df_discharge_infection)

(0/11) center: (168/0)
(1/11) subjectID: (168/0)
(2/11) uniqueID: (168/0)
(3/11) dischargeSepticemia: (168/0)
(4/11) dischargeMeningitisEncephalitis: (168/0)
(5/11) dischargeSepticemiaOrganismCode1: (3/165)
(6/11) dischargeSepticemiaOrganismCode2: (1/167)
(7/11) dischargeSepticemiaOrganismCode3: (1/167)
(8/11) dischargeMeningitisOrganismCode1: (2/166)
(9/11) dischargeMeningitisOrganismCode2: (0/168)
(10/11) dischargeMeningitisOrganismCode3: (0/168)


## 04-13. Seizure

In [222]:
discharge_neuro_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Seizure')
all_discharge_neuro_columns = id_columns + discharge_neuro_columns
discharge_neuro_columns

['dischargeSeizure',
 'dischargeSeizurePreIntervention',
 'dischargeSeizureAfterBaseline',
 'dischargeSeizureMaintenance',
 'dischargeSeizureRewarming',
 'dischargeSeizurePostIntervention',
 'dischargeEEG',
 'dischargeEEGFindingConsistentWithSeizure',
 'dischargeEEGFindingConsistentWithSeizureDate',
 'dischargeEEGFindingConsistentWithSeizureTime',
 'dischargeEEGAbnormalBackgroundActivity',
 'dischargeEEGAbnormalBackgroundActivityDate',
 'dischargeEEGAbnormalBackgroundActivityTime',
 'dischargeAnticonvulsantsOver72H']

In [223]:
df_discharge_neuro = COMBINE_harmonizer.valid_columns(df_main, all_discharge_neuro_columns, debug_df=False, debug_columns=True)
df_discharge_neuro = COMBINE_harmonizer.postprocess(df_discharge_neuro)

out_filename = os.sep.join([out_dir, '04-13-seizure.csv'])
df_discharge_neuro.to_csv(out_filename, index=False)

(4/16) dischargeSeizureAfterBaseline not in df


### 04-13-2. check empty cells

In [224]:
COMBINE_harmonizer.check_empty(df_discharge_neuro)

(0/16) column: center (168 / 0)
(1/16) column: subjectID (168 / 0)
(2/16) column: uniqueID (168 / 0)
(3/16) column: dischargeSeizure (168 / 0)
(4/16) column: dischargeSeizurePreIntervention (128 / 40)
(5/16) column: dischargeSeizureMaintenance (128 / 40)
(6/16) column: dischargeSeizureRewarming (128 / 40)
(7/16) column: dischargeSeizurePostIntervention (128 / 40)
(8/16) column: dischargeEEG (168 / 0)
(9/16) column: dischargeEEGFindingConsistentWithSeizure (139 / 29)
(10/16) column: dischargeEEGAbnormalBackgroundActivity (168 / 0)
(11/16) column: dischargeAnticonvulsantsOver72H (168 / 0)
(12/16) column: dischargeEEGFindingConsistentWithSeizureDate (136 / 32)
(13/16) column: dischargeEEGFindingConsistentWithSeizureTime (124 / 44)
(14/16) column: dischargeEEGAbnormalBackgroundActivityDate (97 / 71)
(15/16) column: dischargeEEGAbnormalBackgroundActivityTime (81 / 87)


In [225]:
COMBINE_harmonizer.column_info(df_discharge_neuro)

(0/16) center: (168/0)
(1/16) subjectID: (168/0)
(2/16) uniqueID: (168/0)
(3/16) dischargeSeizure: (168/0)
(4/16) dischargeSeizurePreIntervention: (128/40)
(5/16) dischargeSeizureMaintenance: (128/40)
(6/16) dischargeSeizureRewarming: (128/40)
(7/16) dischargeSeizurePostIntervention: (128/40)
(8/16) dischargeEEG: (168/0)
(9/16) dischargeEEGFindingConsistentWithSeizure: (139/29)
(10/16) dischargeEEGAbnormalBackgroundActivity: (168/0)
(11/16) dischargeAnticonvulsantsOver72H: (168/0)
(12/16) dischargeEEGFindingConsistentWithSeizureDate: (136/32)
(13/16) dischargeEEGFindingConsistentWithSeizureTime: (124/44)
(14/16) dischargeEEGAbnormalBackgroundActivityDate: (97/71)
(15/16) dischargeEEGAbnormalBackgroundActivityTime: (81/87)


## 04-14. Birth Defect

In [226]:
discharge_birth_defect_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Birth Defect')
all_discharge_birth_defect_columns = id_columns + discharge_birth_defect_columns
discharge_birth_defect_columns

['dischargeSyndromeMalformation',
 'dischargeBirthDefectCode1',
 'dischargeBirthDefectCode2',
 'dischargeBirthDefectCode3']

In [227]:
df_discharge_birth_defect = COMBINE_harmonizer.valid_columns(df_main, all_discharge_birth_defect_columns, debug_df=False, debug_columns=True)
df_discharge_birth_defect = COMBINE_harmonizer.postprocess(df_discharge_birth_defect)

out_filename = os.sep.join([out_dir, '04-14-birth-defect.csv'])
df_discharge_birth_defect.to_csv(out_filename, index=False)

### 04-14-2. check empty cells

In [228]:
COMBINE_harmonizer.check_empty(df_discharge_birth_defect)

(0/7) column: center (168 / 0)
(1/7) column: subjectID (168 / 0)
(2/7) column: uniqueID (168 / 0)
(3/7) column: dischargeSyndromeMalformation (168 / 0)
(4/7) column: dischargeBirthDefectCode1 (3 / 165)
(5/7) column: dischargeBirthDefectCode2 (0 / 168)
(6/7) column: dischargeBirthDefectCode3 (0 / 168)


In [229]:
COMBINE_harmonizer.column_info(df_discharge_birth_defect)

(0/7) center: (168/0)
(1/7) subjectID: (168/0)
(2/7) uniqueID: (168/0)
(3/7) dischargeSyndromeMalformation: (168/0)
(4/7) dischargeBirthDefectCode1: (3/165)
(5/7) dischargeBirthDefectCode2: (0/168)
(6/7) dischargeBirthDefectCode3: (0/168)


## 04-15. Home Therapy

In [230]:
discharge_home_therapy_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Home Therapy')
all_discharge_home_therapy_columns = id_columns + discharge_home_therapy_columns
discharge_home_therapy_columns

['dischargeHomeTherapy',
 'dischargeHomeTherapyVentilator',
 'dischargeHomeTherapyOxygen',
 'dischargeHomeTherapyGavageTubeFeed',
 'dischargeHomeTherapyGastrostomyTubeFeed',
 'dischargeHomeTherapyTemperatureBlanket',
 'dischargeHomeTherapyAnticonvulsantMedication',
 'dischargeHomeTherapyOther',
 'dischargeHomeTherapyOtherText']

In [231]:
df_discharge_home_therapy = COMBINE_harmonizer.valid_columns(df_main, all_discharge_home_therapy_columns, debug_df=False, debug_columns=True)
df_discharge_home_therapy = COMBINE_harmonizer.postprocess(df_discharge_home_therapy)

out_filename = os.sep.join([out_dir, '04-15-home-therapy.csv'])
df_discharge_home_therapy.to_csv(out_filename, index=False)

(2/11) dischargeHomeTherapy not in df


### 04-15-2. check empty cells

In [232]:
COMBINE_harmonizer.check_empty(df_discharge_home_therapy)

(0/11) column: center (168 / 0)
(1/11) column: subjectID (168 / 0)
(2/11) column: uniqueID (168 / 0)
(3/11) column: dischargeHomeTherapyVentilator (140 / 28)
(4/11) column: dischargeHomeTherapyOxygen (140 / 28)
(5/11) column: dischargeHomeTherapyGavageTubeFeed (140 / 28)
(6/11) column: dischargeHomeTherapyGastrostomyTubeFeed (140 / 28)
(7/11) column: dischargeHomeTherapyTemperatureBlanket (140 / 28)
(8/11) column: dischargeHomeTherapyAnticonvulsantMedication (140 / 28)
(9/11) column: dischargeHomeTherapyOther (140 / 28)
(10/11) column: dischargeHomeTherapyOtherText (11 / 157)


In [233]:
COMBINE_harmonizer.column_info(df_discharge_home_therapy)

(0/11) center: (168/0)
(1/11) subjectID: (168/0)
(2/11) uniqueID: (168/0)
(3/11) dischargeHomeTherapyVentilator: (140/28)
(4/11) dischargeHomeTherapyOxygen: (140/28)
(5/11) dischargeHomeTherapyGavageTubeFeed: (140/28)
(6/11) dischargeHomeTherapyGastrostomyTubeFeed: (140/28)
(7/11) dischargeHomeTherapyTemperatureBlanket: (140/28)
(8/11) dischargeHomeTherapyAnticonvulsantMedication: (140/28)
(9/11) dischargeHomeTherapyOther: (140/28)
(10/11) dischargeHomeTherapyOtherText: (11/157)
