In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import os
import re
import pydoc

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 00-0. Variables

In [2]:
study_name = COMBINE_harmonizer.STUDY_OC
sheet_name = COMBINE_harmonizer.SHEET_MAIN

root_dir = '..'

In [3]:
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')


In [4]:
_FILENAMES = [
    'oc01.csv',
    'oc02.csv',
    'oc04.csv',
    'oc05.csv',
    'oc06d.csv',
    'oc06t.csv',
    'oc06t120.csv',
    'oc06t72.csv',
    'oc07.csv',
    'oc08.csv',
    'oc09.csv',
    'oc09i.csv',
    'oc10.csv',
    'oc11.csv',
    'oc12.csv',
    'oc13.csv',
    'oc14.csv',
    'oc15.csv',
    'oc17.csv',

    'ocmr01.csv',
    'ocmr02.csv',
    'ocmr03.csv',
]

# intended for single record per unique-id.
_FILENAMES_MERGE = [
    'oc01.csv',
    'oc02.csv',
    'oc04.csv',
    'oc05.csv',
    'oc09.csv',
    # 'oc11.csv',
    'oc13.csv',
]

# MRI intended for single record per unique-id.
_MRI_FILENAMES_MERGE = [
    'ocmr01.csv',
    'ocmr02.csv',
]

_MRI_FILENAME = 'ocmr03.csv'


In [5]:
input_dir = cfg.config[f'{study_name}_dir']
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"
out_dir = f"{cfg.config['out_dir']}/out-{study_name}"

os.makedirs(out_dir, exist_ok=True)

## 00-1. Column Map

In [6]:
df_data_dict = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=sheet_name)
all_valid_columns = list(df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME])
column_map = {each[study_name]: each[COMBINE_harmonizer.DATA_DICT_VAR_NAME] for _, each in df_data_dict.iterrows()}

## 00-2. df-dict from _FILENAMES

In [7]:
df_dict = {filename: pd.read_csv(os.sep.join([input_dir, filename]), dtype='O').rename(columns=column_map) for filename in _FILENAMES}

In [8]:
### XXX change unit from cc to cc/Kg
columns = ['otherMedFluidIntake_ccPerKg', 'otherMedUrineOutput_ccPerKg']
df_dict['oc08.csv'] = COMBINE_harmonizer.cc_to_cc_per_kg(df_dict['oc08.csv'], df_dict['oc05.csv'], columns, 'birthWeight_g')

## 00-3. df-all and df-main

In [9]:
df_all = None
for idx, each_filename in enumerate(_FILENAMES_MERGE):
    each_full_filename = os.sep.join([root_dir, each_filename])
    each_df = df_dict[each_filename]
    columns = list(each_df.columns)
    each_filename_prefix = re.sub(r'\.csv$', '', each_filename)

    if df_all is None:
        df_all = each_df
    else:
        df_all = df_all.merge(each_df, on=['center', 'subjectID'], how='outer', suffixes=['', ':' + each_filename_prefix])

len(df_all), len(df_all.columns)

(1261, 368)

### 00-3-1. neuro exam

In [10]:
# parse oc11
_NEURO_EXAM_SECTION_ID_POST_NORMO = 'A'

def _parse_oc11_rename_column(column: str, prefix: str) -> str:
    if column in ['post_NeuroExamSectionID', 'post_NeuroExam', 'dischargeNeuroExam']:
        return column

    if 'NeuroExam' not in column:
        return column

    column_list = column.split('NeuroExam')

    return prefix + 'NeuroExam' + column_list[1]


def _parse_oc11_rename_columns(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    column_map = {each: _parse_oc11_rename_column(each, prefix) for each in df.columns}
    return df.rename(columns=column_map)


def _parse_oc11() -> pd.DataFrame:
    df_oc11 = df_dict['oc11.csv'].copy()
    is_post = df_oc11['post_NeuroExamSectionID'] == _NEURO_EXAM_SECTION_ID_POST_NORMO
    df_oc11_post = df_oc11[is_post]
    df_oc11_discharge = df_oc11[is_post == False]

    df_oc11_post = _parse_oc11_rename_columns(df_oc11_post, 'post_')
    df_oc11_discharge = _parse_oc11_rename_columns(df_oc11_discharge, 'discharge')

    df_oc11_merge = df_oc11_post.merge(df_oc11_discharge, on=['center', 'subjectID'], how='outer')

    return df_oc11_merge

In [11]:
df_oc11 = _parse_oc11()

In [12]:
df_oc11.columns

Index(['subjectID', 'post_NeuroExamSectionID_x', 'dischargeNeuroExam_x',
       'post_NeuroExamTone', 'post_NeuroExamRespiration',
       'post_NeuroExamSeizure', 'post_NeuroExamSedate',
       'post_NeuroExamClonusSustained', 'post_NeuroExamFistedHand',
       'post_NeuroExamAbnormalMovement', 'post_NeuroExamGagReflexAbsent',
       'post_NeuroExamAsymTonicNeckReflex', 'OC11ENAM_x', 'REC_CMP_x',
       'center', 'post_NeuroExamStatus', 'post_NeuroExamDate',
       'post_NeuroExamTime', 'post_NeuroExamLevelConsciousness',
       'post_NeuroExamSpontaneousActivity', 'post_NeuroExamPosture',
       'post_NeuroExamSuck', 'post_NeuroExamMoro', 'post_NeuroExamPupils',
       'post_NeuroExamHeartRate', 'CMP_DATE_x', 'CRT_DATE_x',
       'post_NeuroExamSectionID_y', 'dischargeNeuroExam_y',
       'dischargeNeuroExamTone', 'dischargeNeuroExamRespiration',
       'dischargeNeuroExamSeizure', 'dischargeNeuroExamSedate',
       'dischargeNeuroExamClonusSustained', 'dischargeNeuroExamFistedHand',


#### 00-3-1-1. check df_oc11

In [13]:
df_oc11_groupby = df_oc11.groupby(['center', 'subjectID']).agg(_count=('subjectID', 'count'))
is_invalid = df_oc11_groupby['_count'] > 1
df_oc11_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
center,subjectID,Unnamed: 2_level_1


#### 00-3-1-2. check empty cells

In [14]:
COMBINE_harmonizer.check_empty(df_oc11)

(0/52) column: subjectID (364 / 0)
(1/52) column: post_NeuroExamSectionID_x (357 / 7)
(2/52) column: dischargeNeuroExam_x (357 / 7)
(3/52) column: post_NeuroExamTone (308 / 56)
(4/52) column: post_NeuroExamRespiration (308 / 56)
(5/52) column: post_NeuroExamSeizure (308 / 56)
(6/52) column: post_NeuroExamSedate (308 / 56)
(7/52) column: post_NeuroExamClonusSustained (308 / 56)
(8/52) column: post_NeuroExamFistedHand (308 / 56)
(9/52) column: post_NeuroExamAbnormalMovement (308 / 56)
(10/52) column: post_NeuroExamGagReflexAbsent (308 / 56)
(11/52) column: post_NeuroExamAsymTonicNeckReflex (0 / 364)
(12/52) column: OC11ENAM_x (308 / 56)
(13/52) column: REC_CMP_x (357 / 7)
(14/52) column: center (364 / 0)
(15/52) column: post_NeuroExamStatus (0 / 364)
(16/52) column: post_NeuroExamDate (308 / 56)
(17/52) column: post_NeuroExamTime (307 / 57)
(18/52) column: post_NeuroExamLevelConsciousness (308 / 56)
(19/52) column: post_NeuroExamSpontaneousActivity (308 / 56)
(20/52) column: post_NeuroEx

In [15]:
COMBINE_harmonizer.column_info(df_oc11)

(0/52) subjectID: (364/0)
(1/52) post_NeuroExamSectionID_x: (357/7)
(2/52) dischargeNeuroExam_x: (357/7)
(3/52) post_NeuroExamTone: (308/56)
(4/52) post_NeuroExamRespiration: (308/56)
(5/52) post_NeuroExamSeizure: (308/56)
(6/52) post_NeuroExamSedate: (308/56)
(7/52) post_NeuroExamClonusSustained: (308/56)
(8/52) post_NeuroExamFistedHand: (308/56)
(9/52) post_NeuroExamAbnormalMovement: (308/56)
(10/52) post_NeuroExamGagReflexAbsent: (308/56)
(11/52) post_NeuroExamAsymTonicNeckReflex: (0/364)
(12/52) OC11ENAM_x: (308/56)
(13/52) REC_CMP_x: (357/7)
(14/52) center: (364/0)
(15/52) post_NeuroExamStatus: (0/364)
(16/52) post_NeuroExamDate: (308/56)
(17/52) post_NeuroExamTime: (307/57)
(18/52) post_NeuroExamLevelConsciousness: (308/56)
(19/52) post_NeuroExamSpontaneousActivity: (308/56)
(20/52) post_NeuroExamPosture: (307/57)
(21/52) post_NeuroExamSuck: (306/58)
(22/52) post_NeuroExamMoro: (289/75)
(23/52) post_NeuroExamPupils: (290/74)
(24/52) post_NeuroExamHeartRate: (306/58)
(25/52) CMP_D

### 00-3-2. df-main

In [16]:
# main
print('to set main')
is_main = df_all['randomNumber'].isnull() == False

df_main_from_all = df_all[is_main]
df_main = df_main_from_all.merge(df_oc11, on=['center', 'subjectID'], how='outer', suffixes=['main', 'oc11'])
len(df_main), len(df_main.columns), len(df_main_from_all), len(df_main_from_all.columns), len(df_oc11), len(df_oc11.columns)

to set main


(364, 418, 364, 368, 364, 52)

## 00-4. identifier-column

In [17]:
id_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Identity')
id_columns

['center', 'subjectID']

## 01-02. screening

In [18]:
screening_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Screening')
all_screening_columns = id_columns + screening_columns
screening_columns

['siteID',
 'birthDate',
 'birthNumber',
 'screenComment',
 'coreTempLess32p5COverEq2Hr_e',
 'coreTempLess33p5COver1Hr_e',
 'coreTempLess34COver1Hr_e',
 'first6HrCoolByClinicalProtocol_e',
 'chromosomalAbnormality_e',
 'majorCongenitalAnomaly_e',
 'birthWeightLessEq1800g_e',
 'infantUnlikelySurvive_e',
 'first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e',
 'postnatalAgeLess6HrOrGreater24Hr_e',
 'enrolledConflictingTrial_e',
 'first60MinAnyBloodGasPHLessEq7_i',
 'first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i',
 'historyPerinatalEvent_i',
 'at10MinApgarLessEq5OrVent_i',
 'randomEligible',
 'consentStatus',
 'noConsentReason',
 'noInStudyReason',
 'random',
 'noRandomReason',
 'noRandomReasonText',
 'randomDate',
 'randomTime',
 'randomNumber',
 'randomTreatmentAssign',
 'randomTreatmentReceive',
 'treatmentBlanketType',
 'inOtherTrial',
 'inOtherTrialText']

In [19]:
# screening
print('to set screening')
df_screening = COMBINE_harmonizer.valid_columns(df_all, all_screening_columns)
df_screening = COMBINE_harmonizer.postprocess(df_screening)

out_filename = os.sep.join([out_dir, '00-02-screening.csv'])
df_screening.to_csv(out_filename, index=False)

df_main_screening = COMBINE_harmonizer.valid_columns(df_main, all_screening_columns)
df_main_screening = COMBINE_harmonizer.postprocess(df_main_screening)

out_filename = os.sep.join([out_dir, '01-02-screening.csv'])
df_main_screening.to_csv(out_filename, index=False)

to set screening
(8/36) coreTempLess34COver1Hr_e not in df
(15/36) postnatalAgeLess6HrOrGreater24Hr_e not in df
(16/36) enrolledConflictingTrial_e not in df
(24/36) noInStudyReason not in df
(8/36) coreTempLess34COver1Hr_e not in df
(15/36) postnatalAgeLess6HrOrGreater24Hr_e not in df
(16/36) enrolledConflictingTrial_e not in df
(24/36) noInStudyReason not in df


### 01-02-1. check screening

In [20]:
len(df_screening['center'].unique()), len(df_screening['subjectID'].unique()), len(df_screening['uniqueID'].unique()), len(df_screening)

(19, 395, 1261, 1261)

In [21]:
df_screening_groupby = df_screening.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_screening_groupby['_count'] > 1
df_screening_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 01-02-2. check empty cells

In [22]:
COMBINE_harmonizer.check_empty(df_screening)

(0/33) column: center (1261 / 0)
(1/33) column: subjectID (1261 / 0)
(2/33) column: uniqueID (1261 / 0)
(3/33) column: siteID (1261 / 0)
(4/33) column: screenComment (204 / 1057)
(5/33) column: birthDate (1261 / 0)
(6/33) column: birthNumber (1261 / 0)
(7/33) column: coreTempLess33p5COver1Hr_e (848 / 413)
(8/33) column: first6HrCoolByClinicalProtocol_e (1261 / 0)
(9/33) column: chromosomalAbnormality_e (1261 / 0)
(10/33) column: majorCongenitalAnomaly_e (1261 / 0)
(11/33) column: birthWeightLessEq1800g_e (1261 / 0)
(12/33) column: infantUnlikelySurvive_e (1261 / 0)
(13/33) column: first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e (1198 / 63)
(14/33) column: first60MinAnyBloodGasPHLessEq7_i (970 / 291)
(15/33) column: first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i (933 / 328)
(16/33) column: historyPerinatalEvent_i (301 / 960)
(17/33) column: at10MinApgarLessEq5OrVent_i (301 / 960)
(18/33) column: randomEligible (1261 / 0)
(19/33) column: noConsentReason (45 / 1216)
(20

### 01-02-3. check main

In [23]:
len(df_main_screening['center'].unique()), len(df_main_screening['subjectID'].unique()), len(df_main_screening['uniqueID'].unique()), len(df_main_screening)

(18, 164, 364, 364)

In [24]:
df_main_screening_groupby = df_main_screening.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_main_screening_groupby['_count'] > 1
df_main_screening_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 01-02-4. check empty cells

In [25]:
COMBINE_harmonizer.check_empty(df_main_screening)

(0/33) column: center (364 / 0)
(1/33) column: subjectID (364 / 0)
(2/33) column: uniqueID (364 / 0)
(3/33) column: siteID (364 / 0)
(4/33) column: screenComment (48 / 316)
(5/33) column: birthDate (364 / 0)
(6/33) column: birthNumber (364 / 0)
(7/33) column: coreTempLess33p5COver1Hr_e (269 / 95)
(8/33) column: first6HrCoolByClinicalProtocol_e (364 / 0)
(9/33) column: chromosomalAbnormality_e (364 / 0)
(10/33) column: majorCongenitalAnomaly_e (364 / 0)
(11/33) column: birthWeightLessEq1800g_e (364 / 0)
(12/33) column: infantUnlikelySurvive_e (364 / 0)
(13/33) column: first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e (351 / 13)
(14/33) column: first60MinAnyBloodGasPHLessEq7_i (350 / 14)
(15/33) column: first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i (340 / 24)
(16/33) column: historyPerinatalEvent_i (64 / 300)
(17/33) column: at10MinApgarLessEq5OrVent_i (64 / 300)
(18/33) column: randomEligible (364 / 0)
(19/33) column: noConsentReason (0 / 364)
(20/33) column: random (3

In [26]:
COMBINE_harmonizer.column_info(df_main_screening)

(0/33) center: (364/0)
(1/33) subjectID: (364/0)
(2/33) uniqueID: (364/0)
(3/33) siteID: (364/0)
(4/33) screenComment: (48/316)
(5/33) birthDate: (364/0)
(6/33) birthNumber: (364/0)
(7/33) coreTempLess33p5COver1Hr_e: (269/95)
(8/33) first6HrCoolByClinicalProtocol_e: (364/0)
(9/33) chromosomalAbnormality_e: (364/0)
(10/33) majorCongenitalAnomaly_e: (364/0)
(11/33) birthWeightLessEq1800g_e: (364/0)
(12/33) infantUnlikelySurvive_e: (364/0)
(13/33) first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e: (351/13)
(14/33) first60MinAnyBloodGasPHLessEq7_i: (350/14)
(15/33) first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i: (340/24)
(16/33) historyPerinatalEvent_i: (64/300)
(17/33) at10MinApgarLessEq5OrVent_i: (64/300)
(18/33) randomEligible: (364/0)
(19/33) noConsentReason: (0/364)
(20/33) random: (364/0)
(21/33) noRandomReasonText: (0/364)
(22/33) randomTreatmentReceive: (364/0)
(23/33) consentStatus: (364/0)
(24/33) noRandomReason: (0/364)
(25/33) randomDate: (364/0)
(26/33) random

## 01-12. Pre-intervention Neuro Exam

In [27]:
pre_neuro_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Neuro Exam')
all_pre_neuro_columns = id_columns + pre_neuro_columns
pre_neuro_columns

['pre_NeuroExam',
 'pre_NoNeuroExamReason',
 'pre_NeuroExamSignModerateSevereHIE3Category',
 'pre_NeuroExamLevelConsciousness',
 'pre_NeuroExamSpontaneousActivity',
 'pre_NeuroExamPosture',
 'pre_NeuroExamTone',
 'pre_NeuroExamSuck',
 'pre_NeuroExamMoro',
 'pre_NeuroExamPupils',
 'pre_NeuroExamHeartRate',
 'pre_NeuroExamRespiration',
 'pre_NeuroExamDate',
 'pre_NeuroExamTime',
 'pre_NeuroExamSedate',
 'pre_NeuroExamSeizure']

In [28]:
print('to set pre-intervention neuro exam')
df_pre_neuro = COMBINE_harmonizer.valid_columns(df_all, all_pre_neuro_columns)
df_pre_neuro = COMBINE_harmonizer.postprocess(df_pre_neuro)

out_filename = os.sep.join([out_dir, '00-12-neuro-exam.csv'])
df_pre_neuro.to_csv(out_filename, index=False)

df_main_pre_neuro = COMBINE_harmonizer.valid_columns(df_main, all_pre_neuro_columns)
df_main_pre_neuro = COMBINE_harmonizer.postprocess(df_main_pre_neuro)

out_filename = os.sep.join([out_dir, '01-12-neuro-exam.csv'])
df_main_pre_neuro.to_csv(out_filename, index=False)

to set pre-intervention neuro exam


### 01-12-1. check pre-intervention neuro

In [29]:
len(df_pre_neuro['center'].unique()), len(df_pre_neuro['subjectID'].unique()), len(df_pre_neuro['uniqueID'].unique()), len(df_pre_neuro)

(19, 395, 1261, 1261)

In [30]:
df_pre_neuro_groupby = df_pre_neuro.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_pre_neuro_groupby['_count'] > 1
df_pre_neuro_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 01-12-2. check empty cells

In [31]:
COMBINE_harmonizer.check_empty(df_pre_neuro)

(0/19) column: center (1261 / 0)
(1/19) column: subjectID (1261 / 0)
(2/19) column: uniqueID (1261 / 0)
(3/19) column: pre_NeuroExamSeizure (882 / 379)
(4/19) column: pre_NeuroExam (882 / 379)
(5/19) column: pre_NeuroExamTone (797 / 464)
(6/19) column: pre_NeuroExamRespiration (797 / 464)
(7/19) column: pre_NeuroExamSignModerateSevereHIE3Category (797 / 464)
(8/19) column: pre_NeuroExamSedate (797 / 464)
(9/19) column: pre_NoNeuroExamReason (59 / 1202)
(10/19) column: pre_NeuroExamLevelConsciousness (797 / 464)
(11/19) column: pre_NeuroExamSpontaneousActivity (797 / 464)
(12/19) column: pre_NeuroExamPosture (791 / 470)
(13/19) column: pre_NeuroExamSuck (786 / 475)
(14/19) column: pre_NeuroExamMoro (774 / 487)
(15/19) column: pre_NeuroExamPupils (775 / 486)
(16/19) column: pre_NeuroExamHeartRate (797 / 464)
(17/19) column: pre_NeuroExamDate (797 / 464)
(18/19) column: pre_NeuroExamTime (782 / 479)


### 01-12-3. check main-screening-neuro

In [32]:
len(df_main_pre_neuro['center'].unique()), len(df_main_pre_neuro['subjectID'].unique()), len(df_main_pre_neuro['uniqueID'].unique()), len(df_main_pre_neuro)

(18, 164, 364, 364)

In [33]:
df_main_pre_neuro_groupby = df_main_pre_neuro.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_main_pre_neuro_groupby['_count'] > 1
df_main_pre_neuro_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 01-12-4. check empty cells

In [34]:
COMBINE_harmonizer.check_empty(df_main_pre_neuro)

(0/19) column: center (364 / 0)
(1/19) column: subjectID (364 / 0)
(2/19) column: uniqueID (364 / 0)
(3/19) column: pre_NeuroExamSeizure (364 / 0)
(4/19) column: pre_NeuroExam (364 / 0)
(5/19) column: pre_NeuroExamTone (362 / 2)
(6/19) column: pre_NeuroExamRespiration (362 / 2)
(7/19) column: pre_NeuroExamSignModerateSevereHIE3Category (362 / 2)
(8/19) column: pre_NeuroExamSedate (362 / 2)
(9/19) column: pre_NoNeuroExamReason (2 / 362)
(10/19) column: pre_NeuroExamLevelConsciousness (362 / 2)
(11/19) column: pre_NeuroExamSpontaneousActivity (362 / 2)
(12/19) column: pre_NeuroExamPosture (360 / 4)
(13/19) column: pre_NeuroExamSuck (357 / 7)
(14/19) column: pre_NeuroExamMoro (348 / 16)
(15/19) column: pre_NeuroExamPupils (351 / 13)
(16/19) column: pre_NeuroExamHeartRate (362 / 2)
(17/19) column: pre_NeuroExamDate (362 / 2)
(18/19) column: pre_NeuroExamTime (361 / 3)


In [35]:
COMBINE_harmonizer.column_info(df_main_pre_neuro)

(0/19) center: (364/0)
(1/19) subjectID: (364/0)
(2/19) uniqueID: (364/0)
(3/19) pre_NeuroExamSeizure: (364/0)
(4/19) pre_NeuroExam: (364/0)
(5/19) pre_NeuroExamTone: (362/2)
(6/19) pre_NeuroExamRespiration: (362/2)
(7/19) pre_NeuroExamSignModerateSevereHIE3Category: (362/2)
(8/19) pre_NeuroExamSedate: (362/2)
(9/19) pre_NoNeuroExamReason: (2/362)
(10/19) pre_NeuroExamLevelConsciousness: (362/2)
(11/19) pre_NeuroExamSpontaneousActivity: (362/2)
(12/19) pre_NeuroExamPosture: (360/4)
(13/19) pre_NeuroExamSuck: (357/7)
(14/19) pre_NeuroExamMoro: (348/16)
(15/19) pre_NeuroExamPupils: (351/13)
(16/19) pre_NeuroExamHeartRate: (362/2)
(17/19) pre_NeuroExamDate: (362/2)
(18/19) pre_NeuroExamTime: (361/3)


## 01-03. Maternal Demographics

In [36]:
maternal_demographics_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Maternal Demographics')
all_maternal_demographics_columns = id_columns + maternal_demographics_columns
maternal_demographics_columns

['motherAge_year',
 'motherRace',
 'motherRaceOther1',
 'motherRaceOther2',
 'motherRaceOther3',
 'motherRaceOther4',
 'motherRaceOther5',
 'motherRaceOther6',
 'motherEthnicity',
 'motherMaritalStatus',
 'motherEducation',
 'motherInsurance']

In [37]:
print('to set maternal demographics')
df_maternal_demographics = COMBINE_harmonizer.valid_columns(df_main, all_maternal_demographics_columns)
df_maternal_demographics = COMBINE_harmonizer.postprocess(df_maternal_demographics)

out_filename = os.sep.join([out_dir, '01-03-maternal-demographics.csv'])
df_maternal_demographics.to_csv(out_filename, index=False)

to set maternal demographics


### 01-03-2. check empty cells

In [38]:
COMBINE_harmonizer.check_empty(df_maternal_demographics)

(0/15) column: center (364 / 0)
(1/15) column: subjectID (364 / 0)
(2/15) column: uniqueID (364 / 0)
(3/15) column: motherAge_year (364 / 0)
(4/15) column: motherRace (361 / 3)
(5/15) column: motherRaceOther1 (1 / 363)
(6/15) column: motherRaceOther2 (1 / 363)
(7/15) column: motherRaceOther3 (0 / 364)
(8/15) column: motherRaceOther4 (0 / 364)
(9/15) column: motherRaceOther5 (0 / 364)
(10/15) column: motherRaceOther6 (0 / 364)
(11/15) column: motherEthnicity (364 / 0)
(12/15) column: motherMaritalStatus (364 / 0)
(13/15) column: motherEducation (363 / 1)
(14/15) column: motherInsurance (362 / 2)


In [39]:
COMBINE_harmonizer.column_info(df_maternal_demographics)

(0/15) center: (364/0)
(1/15) subjectID: (364/0)
(2/15) uniqueID: (364/0)
(3/15) motherAge_year: (364/0)
(4/15) motherRace: (361/3)
(5/15) motherRaceOther1: (1/363)
(6/15) motherRaceOther2: (1/363)
(7/15) motherRaceOther3: (0/364)
(8/15) motherRaceOther4: (0/364)
(9/15) motherRaceOther5: (0/364)
(10/15) motherRaceOther6: (0/364)
(11/15) motherEthnicity: (364/0)
(12/15) motherMaritalStatus: (364/0)
(13/15) motherEducation: (363/1)
(14/15) motherInsurance: (362/2)


## 01-04. Pregnancy History

In [40]:
pregnancy_history_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Pregnancy History')
all_pregnancy_history_columns = id_columns + pregnancy_history_columns
pregnancy_history_columns

['gravida',
 'parity',
 'multipleBirth',
 'numFetus',
 'prenatalCare',
 'hypertensionEclampsia',
 'antepartumHemorrhage',
 'thyroidMalfunction',
 'diabetes']

In [41]:
print('to set pregnancy history')
df_pregnancy_history = COMBINE_harmonizer.valid_columns(df_main, all_pregnancy_history_columns)
df_pregnancy_history = COMBINE_harmonizer.postprocess(df_pregnancy_history)

out_filename = os.sep.join([out_dir, '01-04-pregnancy-history.csv'])
df_pregnancy_history.to_csv(out_filename, index=False)

to set pregnancy history


### 01-04-2. check empty cells

In [42]:
COMBINE_harmonizer.check_empty(df_pregnancy_history)

(0/12) column: center (364 / 0)
(1/12) column: subjectID (364 / 0)
(2/12) column: uniqueID (364 / 0)
(3/12) column: multipleBirth (364 / 0)
(4/12) column: prenatalCare (364 / 0)
(5/12) column: hypertensionEclampsia (364 / 0)
(6/12) column: antepartumHemorrhage (364 / 0)
(7/12) column: thyroidMalfunction (364 / 0)
(8/12) column: diabetes (364 / 0)
(9/12) column: gravida (364 / 0)
(10/12) column: parity (364 / 0)
(11/12) column: numFetus (10 / 354)


In [43]:
COMBINE_harmonizer.column_info(df_pregnancy_history)

(0/12) center: (364/0)
(1/12) subjectID: (364/0)
(2/12) uniqueID: (364/0)
(3/12) multipleBirth: (364/0)
(4/12) prenatalCare: (364/0)
(5/12) hypertensionEclampsia: (364/0)
(6/12) antepartumHemorrhage: (364/0)
(7/12) thyroidMalfunction: (364/0)
(8/12) diabetes: (364/0)
(9/12) gravida: (364/0)
(10/12) parity: (364/0)
(11/12) numFetus: (10/354)


## 01-05. Labor Delivery

In [44]:
labor_delivery_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Labor Delivery')
all_labor_delivery_columns = id_columns + labor_delivery_columns
labor_delivery_columns

['maternalAdmissionDate',
 'maternalAdmissionTime',
 'ruptureDate',
 'ruptureTime',
 'ruptureOver18Hr',
 'ruptureBeforeDelivery',
 'labor',
 'laborOnsetDate',
 'laborOnsetTime',
 'deliveryMode',
 'fetalDecelerate',
 'cordMishap',
 'uterineRupture',
 'shoulderDystocia',
 'placentalProblem',
 'maternalHemorrhage',
 'maternalTrauma',
 'maternalCardioRespiratoryArrest',
 'maternalSeizure',
 'pyrexiaOver37p6C',
 'chorioamnionitis',
 'placentalPathologyPerformed',
 'histologicChorioamionitis',
 'laborAntibiotics',
 'laborAntibioticsCode1',
 'laborAntibioticsCode2',
 'laborAntibioticsCode3',
 'laborAntibioticsCode4',
 'laborAntibioticsCode5',
 'laborAntibioticsCode6']

In [45]:
print('to set labor delivery')
df_labor_delivery = COMBINE_harmonizer.valid_columns(df_main, all_labor_delivery_columns)
df_labor_delivery = COMBINE_harmonizer.postprocess(df_labor_delivery)

out_filename = os.sep.join([out_dir, '01-05-labor-delivery.csv'])
df_labor_delivery.to_csv(out_filename, index=False)

to set labor delivery
(8/32) labor not in df
(9/32) laborOnsetDate not in df
(10/32) laborOnsetTime not in df


### 01-05-2. check empty cells

In [46]:
COMBINE_harmonizer.check_empty(df_labor_delivery)

(0/30) column: center (364 / 0)
(1/30) column: subjectID (364 / 0)
(2/30) column: uniqueID (364 / 0)
(3/30) column: fetalDecelerate (364 / 0)
(4/30) column: cordMishap (364 / 0)
(5/30) column: uterineRupture (364 / 0)
(6/30) column: shoulderDystocia (364 / 0)
(7/30) column: placentalProblem (364 / 0)
(8/30) column: maternalHemorrhage (364 / 0)
(9/30) column: maternalTrauma (364 / 0)
(10/30) column: maternalCardioRespiratoryArrest (364 / 0)
(11/30) column: maternalSeizure (364 / 0)
(12/30) column: pyrexiaOver37p6C (364 / 0)
(13/30) column: chorioamnionitis (364 / 0)
(14/30) column: placentalPathologyPerformed (364 / 0)
(15/30) column: histologicChorioamionitis (198 / 166)
(16/30) column: laborAntibiotics (364 / 0)
(17/30) column: ruptureBeforeDelivery (364 / 0)
(18/30) column: ruptureOver18Hr (47 / 317)
(19/30) column: maternalAdmissionDate (340 / 24)
(20/30) column: maternalAdmissionTime (288 / 76)
(21/30) column: laborAntibioticsCode1 (85 / 279)
(22/30) column: laborAntibioticsCode2 (

In [47]:
COMBINE_harmonizer.column_info(df_labor_delivery)

(0/30) center: (364/0)
(1/30) subjectID: (364/0)
(2/30) uniqueID: (364/0)
(3/30) fetalDecelerate: (364/0)
(4/30) cordMishap: (364/0)
(5/30) uterineRupture: (364/0)
(6/30) shoulderDystocia: (364/0)
(7/30) placentalProblem: (364/0)
(8/30) maternalHemorrhage: (364/0)
(9/30) maternalTrauma: (364/0)
(10/30) maternalCardioRespiratoryArrest: (364/0)
(11/30) maternalSeizure: (364/0)
(12/30) pyrexiaOver37p6C: (364/0)
(13/30) chorioamnionitis: (364/0)
(14/30) placentalPathologyPerformed: (364/0)
(15/30) histologicChorioamionitis: (198/166)
(16/30) laborAntibiotics: (364/0)
(17/30) ruptureBeforeDelivery: (364/0)
(18/30) ruptureOver18Hr: (47/317)
(19/30) maternalAdmissionDate: (340/24)
(20/30) maternalAdmissionTime: (288/76)
(21/30) laborAntibioticsCode1: (85/279)
(22/30) laborAntibioticsCode2: (33/331)
(23/30) laborAntibioticsCode3: (8/356)
(24/30) laborAntibioticsCode4: (0/364)
(25/30) laborAntibioticsCode5: (0/364)
(26/30) laborAntibioticsCode6: (0/364)
(27/30) ruptureDate: (245/119)
(28/30) ru

## 01-06. Birth

In [48]:
birth_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Birth')
all_birth_columns = id_columns + birth_columns
birth_columns

['encephalopathyLevel',
 'randomInfantAge',
 'birthDate',
 'birthTime',
 'birthWeight_g',
 'birthLength_cm',
 'birthHeadCircumference_cm',
 'birthGestationalAge_week',
 'infantSex',
 'infantOutborn',
 'outbornInHospital',
 'outbornOutHospital',
 'neonateAdmissionDate',
 'neonateAdmissionTime',
 'Apgar1min',
 'Apgar5min',
 'Apgar10min',
 'Apgar15min',
 'Apgar20min',
 'deliveryResuscitation',
 'deliveryOxygen',
 'deliveryBaggingAndMask',
 'deliveryChestCompression',
 'deliveryIntubation',
 'deliveryDrug',
 'at10MinContinueResuscitation',
 'at10MinOxygen',
 'at10MinBaggingAndMask',
 'at10MinChestCompression',
 'at10MinIntubation',
 'at10MinDrug',
 'spontaneousRespirationTime',
 'cordBloodGas',
 'cordBloodGasSrc',
 'cordBloodGasPH',
 'cordBloodGasPCO2_mmHg',
 'cordBloodGasPO2_mmHg',
 'cordBloodGasHCO3_mEqPerL',
 'cordBloodGasBaseDeficit_mEqPerL',
 'firstPostnatalBloodGas',
 'firstPostnatalBloodGasSrc',
 'firstPostnatalBloodGasDate',
 'firstPostnatalBloodGasTime',
 'firstPostnatalBloodGasPH

In [49]:
print('to set birth')
df_birth = COMBINE_harmonizer.valid_columns(df_main, all_birth_columns)
df_birth = COMBINE_harmonizer.postprocess(df_birth)

out_filename = os.sep.join([out_dir, '01-06-birth.csv'])
df_birth.to_csv(out_filename, index=False)

to set birth
(3/50) randomInfantAge not in df
(21/50) deliveryResuscitation not in df


### 01-06-2. check empty cells

In [50]:
COMBINE_harmonizer.check_empty(df_birth)

(0/49) column: center (364 / 0)
(1/49) column: subjectID (364 / 0)
(2/49) column: uniqueID (364 / 0)
(3/49) column: birthDate (364 / 0)
(4/49) column: encephalopathyLevel (364 / 0)
(5/49) column: infantOutborn (364 / 0)
(6/49) column: outbornInHospital (234 / 130)
(7/49) column: outbornOutHospital (234 / 130)
(8/49) column: deliveryOxygen (364 / 0)
(9/49) column: deliveryBaggingAndMask (364 / 0)
(10/49) column: deliveryChestCompression (364 / 0)
(11/49) column: deliveryIntubation (364 / 0)
(12/49) column: deliveryDrug (364 / 0)
(13/49) column: at10MinContinueResuscitation (364 / 0)
(14/49) column: at10MinOxygen (315 / 49)
(15/49) column: at10MinBaggingAndMask (315 / 49)
(16/49) column: at10MinChestCompression (315 / 49)
(17/49) column: at10MinIntubation (315 / 49)
(18/49) column: at10MinDrug (315 / 49)
(19/49) column: cordBloodGas (364 / 0)
(20/49) column: firstPostnatalBloodGas (364 / 0)
(21/49) column: birthTime (364 / 0)
(22/49) column: neonateAdmissionDate (234 / 130)
(23/49) colum

In [51]:
COMBINE_harmonizer.column_info(df_birth)

(0/49) center: (364/0)
(1/49) subjectID: (364/0)
(2/49) uniqueID: (364/0)
(3/49) birthDate: (364/0)
(4/49) encephalopathyLevel: (364/0)
(5/49) infantOutborn: (364/0)
(6/49) outbornInHospital: (234/130)
(7/49) outbornOutHospital: (234/130)
(8/49) deliveryOxygen: (364/0)
(9/49) deliveryBaggingAndMask: (364/0)
(10/49) deliveryChestCompression: (364/0)
(11/49) deliveryIntubation: (364/0)
(12/49) deliveryDrug: (364/0)
(13/49) at10MinContinueResuscitation: (364/0)
(14/49) at10MinOxygen: (315/49)
(15/49) at10MinBaggingAndMask: (315/49)
(16/49) at10MinChestCompression: (315/49)
(17/49) at10MinIntubation: (315/49)
(18/49) at10MinDrug: (315/49)
(19/49) cordBloodGas: (364/0)
(20/49) firstPostnatalBloodGas: (364/0)
(21/49) birthTime: (364/0)
(22/49) neonateAdmissionDate: (234/130)
(23/49) neonateAdmissionTime: (231/133)
(24/49) Apgar1min: (361/3)
(25/49) Apgar5min: (361/3)
(26/49) Apgar10min: (324/40)
(27/49) Apgar15min: (123/241)
(28/49) Apgar20min: (75/289)
(29/49) birthWeight_g: (363/1)
(30/49)

## 01-07. Pre Intervention - Temperature

In [52]:
temperature_pre_intervention_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Temperature')
all_temperature_pre_intervention_columns = id_columns + temperature_pre_intervention_columns
temperature_pre_intervention_columns

['targetTreatmentTemperature_C',
 'pre_CoolInitiate',
 'pre_CoolbyIceGelPack',
 'pre_CoolPassively',
 'pre_CoolClinically',
 'pre_CoolInitiateDate',
 'pre_CoolInitiateTime',
 'pre_AfterOvershootReach33p5C',
 'pre_AfterOvershootReach33p5CDate',
 'pre_AfterOvershootReach33p5CTime',
 'pre_TemperatureMinDate',
 'pre_TemperatureMinTime',
 'pre_SkinTemperatureMin_C',
 'pre_AxillaryTemperatureMin_C',
 'pre_EsophagealTemperatureMin_C',
 'pre_ServoSetMin_C',
 'pre_TemperatureMaxDate',
 'pre_TemperatureMaxTime',
 'pre_SkinTemperatureMax_C',
 'pre_AxillaryTemperatureMax_C',
 'pre_EsophagealTemperatureMax_C',
 'pre_ServoSetMax_C']

In [53]:
df_temperature_pre_intervention = pd.concat([df_dict['oc06t72.csv'], df_dict['oc06t120.csv']])
df_temperature_pre_intervention = COMBINE_harmonizer.valid_columns(df_temperature_pre_intervention, all_temperature_pre_intervention_columns)
df_temperature_pre_intervention = COMBINE_harmonizer.postprocess(df_temperature_pre_intervention)

out_filename = os.sep.join([out_dir, '01-07-pre-temperature.csv'])
df_temperature_pre_intervention.to_csv(out_filename, index=False)

### 01-07-2. check empty cells

In [54]:
COMBINE_harmonizer.check_empty(df_temperature_pre_intervention)

(0/25) column: center (364 / 0)
(1/25) column: subjectID (364 / 0)
(2/25) column: uniqueID (364 / 0)
(3/25) column: pre_CoolInitiate (364 / 0)
(4/25) column: pre_CoolbyIceGelPack (252 / 112)
(5/25) column: pre_CoolPassively (252 / 112)
(6/25) column: pre_CoolClinically (252 / 112)
(7/25) column: pre_AfterOvershootReach33p5C (364 / 0)
(8/25) column: targetTreatmentTemperature_C (364 / 0)
(9/25) column: pre_CoolInitiateDate (201 / 163)
(10/25) column: pre_CoolInitiateTime (200 / 164)
(11/25) column: pre_AfterOvershootReach33p5CDate (70 / 294)
(12/25) column: pre_AfterOvershootReach33p5CTime (69 / 295)
(13/25) column: pre_TemperatureMinDate (351 / 13)
(14/25) column: pre_TemperatureMinTime (348 / 16)
(15/25) column: pre_SkinTemperatureMin_C (168 / 196)
(16/25) column: pre_AxillaryTemperatureMin_C (231 / 133)
(17/25) column: pre_EsophagealTemperatureMin_C (138 / 226)
(18/25) column: pre_ServoSetMin_C (164 / 200)
(19/25) column: pre_TemperatureMaxDate (350 / 14)
(20/25) column: pre_Temperat

In [55]:
COMBINE_harmonizer.column_info(df_temperature_pre_intervention)

(0/25) center: (364/0)
(1/25) subjectID: (364/0)
(2/25) uniqueID: (364/0)
(3/25) pre_CoolInitiate: (364/0)
(4/25) pre_CoolbyIceGelPack: (252/112)
(5/25) pre_CoolPassively: (252/112)
(6/25) pre_CoolClinically: (252/112)
(7/25) pre_AfterOvershootReach33p5C: (364/0)
(8/25) targetTreatmentTemperature_C: (364/0)
(9/25) pre_CoolInitiateDate: (201/163)
(10/25) pre_CoolInitiateTime: (200/164)
(11/25) pre_AfterOvershootReach33p5CDate: (70/294)
(12/25) pre_AfterOvershootReach33p5CTime: (69/295)
(13/25) pre_TemperatureMinDate: (351/13)
(14/25) pre_TemperatureMinTime: (348/16)
(15/25) pre_SkinTemperatureMin_C: (168/196)
(16/25) pre_AxillaryTemperatureMin_C: (231/133)
(17/25) pre_EsophagealTemperatureMin_C: (138/226)
(18/25) pre_ServoSetMin_C: (164/200)
(19/25) pre_TemperatureMaxDate: (350/14)
(20/25) pre_TemperatureMaxTime: (340/24)
(21/25) pre_SkinTemperatureMax_C: (133/231)
(22/25) pre_AxillaryTemperatureMax_C: (279/85)
(23/25) pre_EsophagealTemperatureMax_C: (68/296)
(24/25) pre_ServoSetMax_C: 

## 01-08. Pre-intervention - Cardio

In [56]:
cardio_pre_intervention_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Cardiovascular')
all_cardio_pre_intervention_columns = id_columns + cardio_pre_intervention_columns
cardio_pre_intervention_columns

['pre_CardioDate',
 'pre_CardioTime',
 'pre_CardioSystolicBloodPressure_mmHg',
 'pre_CardioDiastolicBloodPressure_mmHg',
 'pre_CardioHeartRate_BPM',
 'pre_CardioVolumeExpand',
 'pre_CardioInotropicAgent',
 'pre_CardioBloodTransfusion',
 'pre_CardioPlatelets']

In [57]:
df_cardio = df_dict['oc07.csv'].copy()

cardioTimeSlot_int = df_cardio['cardioTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = cardioTimeSlot_int == 0
df_cardio_pre_intervention = df_cardio[is_pre]
pre_rename_map = {
    'cardioDate': 'pre_CardioDate',
    'cardioTime': 'pre_CardioTime',
    'cardioSystolicBloodPressure_mmHg': 'pre_CardioSystolicBloodPressure_mmHg',
    'cardioDiastolicBloodPressure_mmHg': 'pre_CardioDiastolicBloodPressure_mmHg',
    'cardioHeartRate_BPM': 'pre_CardioHeartRate_BPM',
    'cardioVolumeExpand': 'pre_CardioVolumeExpand',
    'cardioInotropicAgent': 'pre_CardioInotropicAgent',
    'cardioBloodTransfusion': 'pre_CardioBloodTransfusion',
    'cardioPlatelets': 'pre_CardioPlatelets',
}
df_cardio_pre_intervention = df_cardio_pre_intervention.rename(columns=pre_rename_map)

df_cardio_pre_intervention = COMBINE_harmonizer.valid_columns(df_cardio_pre_intervention, all_cardio_pre_intervention_columns)
df_cardio_pre_intervention = COMBINE_harmonizer.postprocess(df_cardio_pre_intervention)

out_filename = os.sep.join([out_dir, '01-08-pre-cardiovascular.csv'])
df_cardio_pre_intervention.to_csv(out_filename, index=False)

### 01-08-2. Check empty

In [58]:
COMBINE_harmonizer.check_empty(df_cardio_pre_intervention)

(0/12) column: center (357 / 0)
(1/12) column: subjectID (357 / 0)
(2/12) column: uniqueID (357 / 0)
(3/12) column: pre_CardioVolumeExpand (357 / 0)
(4/12) column: pre_CardioInotropicAgent (357 / 0)
(5/12) column: pre_CardioBloodTransfusion (357 / 0)
(6/12) column: pre_CardioPlatelets (357 / 0)
(7/12) column: pre_CardioDate (357 / 0)
(8/12) column: pre_CardioTime (355 / 2)
(9/12) column: pre_CardioSystolicBloodPressure_mmHg (350 / 7)
(10/12) column: pre_CardioDiastolicBloodPressure_mmHg (350 / 7)
(11/12) column: pre_CardioHeartRate_BPM (351 / 6)


In [59]:
COMBINE_harmonizer.column_info(df_cardio_pre_intervention)

(0/12) center: (357/0)
(1/12) subjectID: (357/0)
(2/12) uniqueID: (357/0)
(3/12) pre_CardioVolumeExpand: (357/0)
(4/12) pre_CardioInotropicAgent: (357/0)
(5/12) pre_CardioBloodTransfusion: (357/0)
(6/12) pre_CardioPlatelets: (357/0)
(7/12) pre_CardioDate: (357/0)
(8/12) pre_CardioTime: (355/2)
(9/12) pre_CardioSystolicBloodPressure_mmHg: (350/7)
(10/12) pre_CardioDiastolicBloodPressure_mmHg: (350/7)
(11/12) pre_CardioHeartRate_BPM: (351/6)


## 01-13. Pre-intervention - Respiratory

In [60]:
respiratory_pre_intervention_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Respiratory')
all_respiratory_pre_intervention_columns = id_columns + respiratory_pre_intervention_columns
respiratory_pre_intervention_columns

['pre_RespiratoryDate',
 'pre_RespiratoryTime',
 'pre_RespiratorySupportType',
 'pre_RespiratoryFiO2',
 'pre_RespiratoryRate_Hz',
 'pre_RespiratoryPIP_cmH2O',
 'pre_RespiratoryMAP_cmH2O',
 'pre_RespiratoryPEEP_cmH2O']

In [61]:
df_respiratory = df_dict['oc08.csv'].copy()

respiratoryTimeSlot_int = df_respiratory['respiratoryTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = respiratoryTimeSlot_int == 0
df_respiratory_pre = df_respiratory[is_pre]
pre_rename_map = {
    'respiratoryDate': 'pre_RespiratoryDate',
    'respiratoryTime': 'pre_RespiratoryTime',
    'respiratorySupportType': 'pre_RespiratorySupportType',
    'respiratoryFiO2': 'pre_RespiratoryFiO2',
    'respiratoryRate_Hz': 'pre_RespiratoryRate_Hz',
    'respiratoryPIP_cmH2O': 'pre_RespiratoryPIP_cmH2O',
    'respiratoryMAP_cmH2O': 'pre_RespiratoryMAP_cmH2O',
    'respiratoryPEEP_cmH2O': 'pre_RespiratoryPEEP_cmH2O',
}
df_respiratory_pre = df_respiratory_pre.rename(columns=pre_rename_map)

df_respiratory_pre = COMBINE_harmonizer.valid_columns(df_respiratory_pre, all_respiratory_pre_intervention_columns)
df_respiratory_pre = COMBINE_harmonizer.postprocess(df_respiratory_pre)

# XXX skip pre-respiratory because there is no data.
# out_filename = os.sep.join([out_dir, '01-13-pre-respiratory.csv'])
# df_respiratory_before_baseline.to_csv(out_filename, index=False)

### 01-13-2. check empty cells

In [62]:
COMBINE_harmonizer.check_empty(df_respiratory_pre)

(0/11) column: center (350 / 0)
(1/11) column: subjectID (350 / 0)
(2/11) column: uniqueID (350 / 0)
(3/11) column: pre_RespiratoryDate (348 / 2)
(4/11) column: pre_RespiratoryTime (321 / 29)
(5/11) column: pre_RespiratorySupportType (0 / 350)
(6/11) column: pre_RespiratoryFiO2 (0 / 350)
(7/11) column: pre_RespiratoryRate_Hz (0 / 350)
(8/11) column: pre_RespiratoryPIP_cmH2O (0 / 350)
(9/11) column: pre_RespiratoryMAP_cmH2O (0 / 350)
(10/11) column: pre_RespiratoryPEEP_cmH2O (0 / 350)


In [63]:
COMBINE_harmonizer.column_info(df_respiratory_pre)

(0/11) center: (350/0)
(1/11) subjectID: (350/0)
(2/11) uniqueID: (350/0)
(3/11) pre_RespiratoryDate: (348/2)
(4/11) pre_RespiratoryTime: (321/29)
(5/11) pre_RespiratorySupportType: (0/350)
(6/11) pre_RespiratoryFiO2: (0/350)
(7/11) pre_RespiratoryRate_Hz: (0/350)
(8/11) pre_RespiratoryPIP_cmH2O: (0/350)
(9/11) pre_RespiratoryMAP_cmH2O: (0/350)
(10/11) pre_RespiratoryPEEP_cmH2O: (0/350)


## 01-14. Pre-intervention - Blood Gas

In [64]:
blood_gas_pre_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Blood Gas')
all_blood_gas_pre_columns = id_columns + blood_gas_pre_columns
blood_gas_pre_columns

['pre_BloodGasDate',
 'pre_BloodGasTime',
 'pre_BloodGasSrc',
 'pre_BloodGasPH',
 'pre_BloodGasPCO2_mmHg',
 'pre_BloodGasPO2_mmHg',
 'pre_BloodGasHCO3_mEqPerL',
 'pre_BloodGasBaseDeficit_mEqPerL',
 'pre_BloodGasPHCorrect',
 'pre_BloodGasPCO2Correct_mmHg',
 'pre_BloodGasPO2Correct_mmHg',
 'pre_BloodGasHCO3Correct_mEqPerL',
 'pre_BloodGasBaseDeficitCorrect_mEqPerL']

In [65]:
df_blood_gas = df_dict['oc07.csv'].copy().rename(columns={'cardioTimeSlot': 'bloodGasTimeSlot'})

bloodGasTimeSlot_int = df_blood_gas['bloodGasTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = bloodGasTimeSlot_int == 0
df_blood_gas_pre = df_blood_gas[is_pre]
pre_rename_map = {
    'bloodGasDate': 'pre_BloodGasDate',
    'bloodGasTime': 'pre_BloodGasTime',
    'bloodGasSrc': 'pre_BloodGasSrc',
    'bloodGasPH': 'pre_BloodGasPH',
    'bloodGasPCO2_mmHg': 'pre_BloodGasPCO2_mmHg',
    'bloodGasPO2_mmHg': 'pre_BloodGasPO2_mmHg',
    'bloodGasHCO3_mEqPerL': 'pre_BloodGasHCO3_mEqPerL',
    'bloodGasBaseDeficit_mEqPerL': 'pre_BloodGasBaseDeficit_mEqPerL',
    'bloodGasPHCorrect': 'pre_BloodGasPHCorrect',
    'bloodGasPCO2Correct_mmHg': 'pre_BloodGasPCO2Correct_mmHg',
    'bloodGasPO2Correct_mmHg': 'pre_BloodGasPO2Correct_mmHg',
    'bloodGasHCO3Correct_mEqPerL': 'pre_BloodGasHCO3Correct_mEqPerL',
    'bloodGasBaseDeficitCorrect_mEqPerL': 'pre_BloodGasBaseDeficitCorrect_mEqPerL',
}
df_blood_gas_pre = df_blood_gas_pre.rename(columns=pre_rename_map)

df_blood_gas_pre = COMBINE_harmonizer.valid_columns(df_blood_gas_pre, all_blood_gas_pre_columns)
df_blood_gas_pre = COMBINE_harmonizer.postprocess(df_blood_gas_pre)

# XXX skip pre-blood-gas because there is only 1 data.
# out_filename = os.sep.join([out_dir, '01-14-pre-blood-gas.csv'])
# df_blood_gas_pre.to_csv(out_filename, index=False)

### 01-14-2. check empty

In [66]:
COMBINE_harmonizer.check_empty(df_blood_gas_pre)

(0/16) column: center (357 / 0)
(1/16) column: subjectID (357 / 0)
(2/16) column: uniqueID (357 / 0)
(3/16) column: pre_BloodGasSrc (0 / 357)
(4/16) column: pre_BloodGasDate (1 / 356)
(5/16) column: pre_BloodGasTime (1 / 356)
(6/16) column: pre_BloodGasPH (1 / 356)
(7/16) column: pre_BloodGasPCO2_mmHg (1 / 356)
(8/16) column: pre_BloodGasPO2_mmHg (1 / 356)
(9/16) column: pre_BloodGasHCO3_mEqPerL (1 / 356)
(10/16) column: pre_BloodGasBaseDeficit_mEqPerL (1 / 356)
(11/16) column: pre_BloodGasPHCorrect (0 / 357)
(12/16) column: pre_BloodGasPCO2Correct_mmHg (0 / 357)
(13/16) column: pre_BloodGasPO2Correct_mmHg (0 / 357)
(14/16) column: pre_BloodGasHCO3Correct_mEqPerL (0 / 357)
(15/16) column: pre_BloodGasBaseDeficitCorrect_mEqPerL (0 / 357)


In [67]:
COMBINE_harmonizer.column_info(df_blood_gas_pre)

(0/16) center: (357/0)
(1/16) subjectID: (357/0)
(2/16) uniqueID: (357/0)
(3/16) pre_BloodGasSrc: (0/357)
(4/16) pre_BloodGasDate: (1/356)
(5/16) pre_BloodGasTime: (1/356)
(6/16) pre_BloodGasPH: (1/356)
(7/16) pre_BloodGasPCO2_mmHg: (1/356)
(8/16) pre_BloodGasPO2_mmHg: (1/356)
(9/16) pre_BloodGasHCO3_mEqPerL: (1/356)
(10/16) pre_BloodGasBaseDeficit_mEqPerL: (1/356)
(11/16) pre_BloodGasPHCorrect: (0/357)
(12/16) pre_BloodGasPCO2Correct_mmHg: (0/357)
(13/16) pre_BloodGasPO2Correct_mmHg: (0/357)
(14/16) pre_BloodGasHCO3Correct_mEqPerL: (0/357)
(15/16) pre_BloodGasBaseDeficitCorrect_mEqPerL: (0/357)


## 01-15. Pre-intervention - Hematology CBC

In [68]:
hematology_before_baseline_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Hematology CBC')
all_hematology_before_baseline_columns = id_columns + hematology_before_baseline_columns
hematology_before_baseline_columns

['pre_Hematology',
 'pre_HematologyDate',
 'pre_HematologyTime',
 'pre_HematologyWBC',
 'pre_HematologyHemoglobin',
 'pre_HematologyPolymorphNeutrophilsDifferentialCount',
 'pre_HematologyMonocytes',
 'pre_HematologyLymphocytes',
 'pre_HematologyPlateletCount',
 'pre_HematologyPT_s',
 'pre_HematologyPTT_s']

In [69]:
list(df_dict['oc08.csv'].columns)

['subjectID',
 'hematology',
 'REC_CMP',
 'center',
 'respiratoryTimeSlot',
 'respiratoryDate',
 'respiratoryTime',
 'respiratorySupportType',
 'respiratoryFiO2',
 'respiratoryRate_Hz',
 'respiratoryPIP_cmH2O',
 'respiratoryMAP_cmH2O',
 'respiratoryPEEP_cmH2O',
 'hematologyWBC',
 'hematologyHemoglobin',
 'hematologyPolymorphNeutrophilsDifferentialCount',
 'hematologyMonocytes',
 'hematologyLymphocytes',
 'hematologyPlateletCount',
 'hematologyPT_s',
 'hematologyPTT_s',
 'anticonvulsants1',
 'anticonvulsants2',
 'anticonvulsants3',
 'analgesicsSedatives1',
 'analgesicsSedatives2',
 'analgesicsSedatives3',
 'antipyretics1',
 'antipyretics2',
 'antipyretics3',
 'paralytics1',
 'paralytics2',
 'paralytics3',
 'otherMedFluidIntake_ccPerKg',
 'otherMedUrineOutput_ccPerKg',
 'CMP_DATE',
 'CRT_DATE']

In [70]:
df_hematology = df_dict['oc08.csv'].copy().rename(columns={'respiratoryTimeSlot': 'hematologyTimeSlot', 'respiratoryDate': 'hematologyDate', 'respiratoryTime': 'hematologyTime'})

hematology_time_slot_int = df_hematology['hematologyTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = hematology_time_slot_int == 0
df_hematology_pre = df_hematology[is_pre]
pre_rename_map = {
    'hematology': 'pre_Hematology',
    'hematologyDate': 'pre_HematologyDate',
    'hematologyTime': 'pre_HematologyTime',
    'hematologyWBC': 'pre_HematologyWBC',
    'hematologyHemoglobin': 'pre_HematologyHemoglobin',
    'hematologyPolymorphNeutrophilsDifferentialCount': 'pre_HematologyPolymorphNeutrophilsDifferentialCount',
    'hematologyMonocytes': 'pre_HematologyMonocytes',
    'hematologyLymphocytes': 'pre_HematologyLymphocytes',
    'hematologyPlateletCount': 'pre_HematologyPlateletCount',
    'hematologyPT_s': 'pre_HematologyPT_s',
    'hematologyPTT_s': 'pre_HematologyPTT_s',
}
df_hematology_pre = df_hematology_pre.rename(columns=pre_rename_map)

df_hematology_pre = COMBINE_harmonizer.valid_columns(df_hematology_pre, all_hematology_before_baseline_columns, debug_df=True, debug_columns=True)
df_hematology_pre = COMBINE_harmonizer.postprocess(df_hematology_pre)

# XXX skip pre-hematology because there is no data.
# out_filename = os.sep.join([out_dir, '01-15-pre-hematology.csv'])
# df_hematology_pre.to_csv(out_filename, index=False)

(2/37) REC_CMP not in columns
(4/37) hematologyTimeSlot not in columns
(7/37) respiratorySupportType not in columns
(8/37) respiratoryFiO2 not in columns
(9/37) respiratoryRate_Hz not in columns
(10/37) respiratoryPIP_cmH2O not in columns
(11/37) respiratoryMAP_cmH2O not in columns
(12/37) respiratoryPEEP_cmH2O not in columns
(21/37) anticonvulsants1 not in columns
(22/37) anticonvulsants2 not in columns
(23/37) anticonvulsants3 not in columns
(24/37) analgesicsSedatives1 not in columns
(25/37) analgesicsSedatives2 not in columns
(26/37) analgesicsSedatives3 not in columns
(27/37) antipyretics1 not in columns
(28/37) antipyretics2 not in columns
(29/37) antipyretics3 not in columns
(30/37) paralytics1 not in columns
(31/37) paralytics2 not in columns
(32/37) paralytics3 not in columns
(33/37) otherMedFluidIntake_ccPerKg not in columns
(34/37) otherMedUrineOutput_ccPerKg not in columns
(35/37) CMP_DATE not in columns
(36/37) CRT_DATE not in columns


### 01-15-2. check empty

In [71]:
COMBINE_harmonizer.check_empty(df_hematology_pre)

(0/14) column: center (350 / 0)
(1/14) column: subjectID (350 / 0)
(2/14) column: uniqueID (350 / 0)
(3/14) column: pre_Hematology (0 / 350)
(4/14) column: pre_HematologyDate (348 / 2)
(5/14) column: pre_HematologyTime (321 / 29)
(6/14) column: pre_HematologyWBC (0 / 350)
(7/14) column: pre_HematologyHemoglobin (0 / 350)
(8/14) column: pre_HematologyPolymorphNeutrophilsDifferentialCount (0 / 350)
(9/14) column: pre_HematologyMonocytes (0 / 350)
(10/14) column: pre_HematologyLymphocytes (0 / 350)
(11/14) column: pre_HematologyPlateletCount (0 / 350)
(12/14) column: pre_HematologyPT_s (0 / 350)
(13/14) column: pre_HematologyPTT_s (0 / 350)


In [72]:
COMBINE_harmonizer.column_info(df_hematology_pre)

(0/14) center: (350/0)
(1/14) subjectID: (350/0)
(2/14) uniqueID: (350/0)
(3/14) pre_Hematology: (0/350)
(4/14) pre_HematologyDate: (348/2)
(5/14) pre_HematologyTime: (321/29)
(6/14) pre_HematologyWBC: (0/350)
(7/14) pre_HematologyHemoglobin: (0/350)
(8/14) pre_HematologyPolymorphNeutrophilsDifferentialCount: (0/350)
(9/14) pre_HematologyMonocytes: (0/350)
(10/14) pre_HematologyLymphocytes: (0/350)
(11/14) pre_HematologyPlateletCount: (0/350)
(12/14) pre_HematologyPT_s: (0/350)
(13/14) pre_HematologyPTT_s: (0/350)


## 01-09. Pre-intervention - Infection

In [73]:
infection_pre_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Infection')
all_infection_pre_columns = id_columns + infection_pre_columns
infection_pre_columns

['pre_PositiveCulture',
 'pre_PositiveCultureSrc',
 'pre_PositiveCultureDate',
 'pre_PositiveCultureTime',
 'pre_PositiveCultureOrganismCode1',
 'pre_PositiveCultureOrganismCode2',
 'pre_PositiveCultureOrganismCode3',
 'pre_Antibiotics',
 'pre_AntibioticsCode1',
 'pre_AntibioticsCode2',
 'pre_AntibioticsCode3']

In [74]:
df_infection_1 = df_dict['oc09i.csv'].copy()
df_infection_2 = df_dict['oc09.csv'].copy()
df_infection = df_infection_1.merge(df_infection_2, on=['center', 'subjectID'], how='outer')
df_infection_pre_intervention = COMBINE_harmonizer.valid_columns(df_infection, all_infection_pre_columns)
df_infection_pre_intervention = COMBINE_harmonizer.postprocess(df_infection_pre_intervention)

out_filename = os.sep.join([out_dir, '01-09-pre-infection.csv'])
df_infection_pre_intervention.to_csv(out_filename, index=False)

(2/13) pre_PositiveCulture not in df
(3/13) pre_PositiveCultureSrc not in df
(4/13) pre_PositiveCultureDate not in df
(5/13) pre_PositiveCultureTime not in df
(6/13) pre_PositiveCultureOrganismCode1 not in df
(7/13) pre_PositiveCultureOrganismCode2 not in df
(8/13) pre_PositiveCultureOrganismCode3 not in df


In [75]:
df_infection_pre_intervention.columns

Index(['center', 'subjectID', 'uniqueID', 'pre_Antibiotics',
       'pre_AntibioticsCode1', 'pre_AntibioticsCode2', 'pre_AntibioticsCode3'],
      dtype='object')

### 01-09-2. check empty cells

In [76]:
COMBINE_harmonizer.check_empty(df_infection_pre_intervention)

(0/7) column: center (364 / 0)
(1/7) column: subjectID (364 / 0)
(2/7) column: uniqueID (364 / 0)
(3/7) column: pre_Antibiotics (364 / 0)
(4/7) column: pre_AntibioticsCode1 (308 / 56)
(5/7) column: pre_AntibioticsCode2 (273 / 91)
(6/7) column: pre_AntibioticsCode3 (8 / 356)


In [77]:
COMBINE_harmonizer.column_info(df_infection_pre_intervention)

(0/7) center: (364/0)
(1/7) subjectID: (364/0)
(2/7) uniqueID: (364/0)
(3/7) pre_Antibiotics: (364/0)
(4/7) pre_AntibioticsCode1: (308/56)
(5/7) pre_AntibioticsCode2: (273/91)
(6/7) pre_AntibioticsCode3: (8/356)


## 01-10. Pre-intervention - Other Medication

In [78]:
other_med_before_baseline_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Other Medication')
all_other_med_pre_baseline_columns = id_columns + other_med_before_baseline_columns
other_med_before_baseline_columns

['pre_OtherMedTargetDate',
 'pre_OtherMedTargetTime',
 'pre_Anticonvulsants',
 'pre_Anticonvulsants1',
 'pre_Anticonvulsants2',
 'pre_Anticonvulsants3',
 'pre_Analgesics',
 'pre_AnalgesicsSedatives1',
 'pre_AnalgesicsSedatives2',
 'pre_AnalgesicsSedatives3',
 'pre_Antipyretics',
 'pre_Antipyretics1',
 'pre_Antipyretics2',
 'pre_Antipyretics3',
 'pre_Paralytics',
 'pre_Paralytics1',
 'pre_Paralytics2',
 'pre_Paralytics3',
 'pre_OtherMedFluidIntake_ccPerKg',
 'pre_OtherMedUrineOutput_ccPerKg']

In [79]:
df_other_med_pre = df_dict['oc08.csv'].copy().rename(columns={'respiratoryTimeSlot': 'otherMedTimeSlot'})

other_med_time_slot_int = df_other_med_pre['otherMedTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = other_med_time_slot_int == 0
df_other_med_pre = df_other_med_pre[is_pre]
pre_rename_map = {
    'otherMedTargetDate': 'pre_OtherMedTargetDate',
    'otherMedTargetTime': 'pre_OtherMedTargetTime',
    'anticonvulsants': 'pre_Anticonvulsants',
    'anticonvulsants1': 'pre_Anticonvulsants1',
    'anticonvulsants2': 'pre_Anticonvulsants2',
    'anticonvulsants3': 'pre_Anticonvulsants3',
    'analgesics': 'pre_Analgesics',
    'analgesicsSedatives1': 'pre_AnalgesicsSedatives1',
    'analgesicsSedatives2': 'pre_AnalgesicsSedatives2',
    'analgesicsSedatives3': 'pre_AnalgesicsSedatives3',
    'antipyretics': 'pre_Antipyretics',
    'antipyretics1': 'pre_Antipyretics1',
    'antipyretics2': 'pre_Antipyretics2',
    'antipyretics3': 'pre_Antipyretics3',
    'paralytics': 'pre_Paralytics',
    'paralytics1': 'pre_Paralytics1',
    'paralytics2': 'pre_Paralytics2',
    'paralytics3': 'pre_Paralytics3',
    'otherMedFluidIntake_ccPerKg': 'pre_OtherMedFluidIntake_ccPerKg',
    'otherMedUrineOutput_ccPerKg': 'pre_OtherMedUrineOutput_ccPerKg',
}
df_other_med_pre = df_other_med_pre.rename(columns=pre_rename_map)

df_other_med_pre = COMBINE_harmonizer.valid_columns(df_other_med_pre, all_other_med_pre_baseline_columns, debug_df=True, debug_columns=True)
df_other_med_pre = COMBINE_harmonizer.postprocess(df_other_med_pre)

out_filename = os.sep.join([out_dir, '01-10-pre-other-med.csv'])
df_other_med_pre.to_csv(out_filename, index=False)

(1/37) hematology not in columns
(2/37) REC_CMP not in columns
(4/37) otherMedTimeSlot not in columns
(5/37) respiratoryDate not in columns
(6/37) respiratoryTime not in columns
(7/37) respiratorySupportType not in columns
(8/37) respiratoryFiO2 not in columns
(9/37) respiratoryRate_Hz not in columns
(10/37) respiratoryPIP_cmH2O not in columns
(11/37) respiratoryMAP_cmH2O not in columns
(12/37) respiratoryPEEP_cmH2O not in columns
(13/37) hematologyWBC not in columns
(14/37) hematologyHemoglobin not in columns
(15/37) hematologyPolymorphNeutrophilsDifferentialCount not in columns
(16/37) hematologyMonocytes not in columns
(17/37) hematologyLymphocytes not in columns
(18/37) hematologyPlateletCount not in columns
(19/37) hematologyPT_s not in columns
(20/37) hematologyPTT_s not in columns
(35/37) CMP_DATE not in columns
(36/37) CRT_DATE not in columns
(2/22) pre_OtherMedTargetDate not in df
(3/22) pre_OtherMedTargetTime not in df
(4/22) pre_Anticonvulsants not in df
(8/22) pre_Analgesic

### 01-10-2. check empty cells

In [80]:
COMBINE_harmonizer.check_empty(df_other_med_pre)

(0/17) column: center (350 / 0)
(1/17) column: subjectID (350 / 0)
(2/17) column: uniqueID (350 / 0)
(3/17) column: pre_Anticonvulsants1 (85 / 265)
(4/17) column: pre_Anticonvulsants2 (13 / 337)
(5/17) column: pre_Anticonvulsants3 (0 / 350)
(6/17) column: pre_AnalgesicsSedatives1 (74 / 276)
(7/17) column: pre_AnalgesicsSedatives2 (19 / 331)
(8/17) column: pre_AnalgesicsSedatives3 (1 / 349)
(9/17) column: pre_Antipyretics1 (1 / 349)
(10/17) column: pre_Antipyretics2 (0 / 350)
(11/17) column: pre_Antipyretics3 (0 / 350)
(12/17) column: pre_Paralytics1 (2 / 348)
(13/17) column: pre_Paralytics2 (0 / 350)
(14/17) column: pre_Paralytics3 (0 / 350)
(15/17) column: pre_OtherMedFluidIntake_ccPerKg (0 / 350)
(16/17) column: pre_OtherMedUrineOutput_ccPerKg (0 / 350)


In [81]:
COMBINE_harmonizer.column_info(df_other_med_pre)

(0/17) center: (350/0)
(1/17) subjectID: (350/0)
(2/17) uniqueID: (350/0)
(3/17) pre_Anticonvulsants1: (85/265)
(4/17) pre_Anticonvulsants2: (13/337)
(5/17) pre_Anticonvulsants3: (0/350)
(6/17) pre_AnalgesicsSedatives1: (74/276)
(7/17) pre_AnalgesicsSedatives2: (19/331)
(8/17) pre_AnalgesicsSedatives3: (1/349)
(9/17) pre_Antipyretics1: (1/349)
(10/17) pre_Antipyretics2: (0/350)
(11/17) pre_Antipyretics3: (0/350)
(12/17) pre_Paralytics1: (2/348)
(13/17) pre_Paralytics2: (0/350)
(14/17) pre_Paralytics3: (0/350)
(15/17) pre_OtherMedFluidIntake_ccPerKg: (0/350)
(16/17) pre_OtherMedUrineOutput_ccPerKg: (0/350)


## 01-11. Pre-intervention - Imaging

In [82]:
imaging_pre_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Pre-intervention', 'Imaging')
all_imaging_pre_columns = id_columns + imaging_pre_columns
imaging_pre_columns

['pre_HeadSonogram',
 'pre_HeadSonogramDate',
 'pre_HeadSonogramTime',
 'pre_HeadSonogramResult1',
 'pre_HeadSonogramResult2',
 'pre_HeadSonogramResult3',
 'pre_HeadSonogramResult4',
 'pre_HeadSonogramResult5',
 'pre_HeadSonogramResult6',
 'pre_HeadSonogramResult7',
 'pre_HeadSonogramResult8',
 'pre_HeadSonogramResultText',
 'pre_HeadCT',
 'pre_HeadCTDate',
 'pre_HeadCTTime',
 'pre_HeadCTResult1',
 'pre_HeadCTResult2',
 'pre_HeadCTResult3',
 'pre_HeadCTResult4',
 'pre_HeadCTResult5',
 'pre_HeadCTResult6',
 'pre_HeadCTResult7',
 'pre_HeadCTResult8',
 'pre_HeadCTResultText',
 'pre_BrainMRI',
 'pre_BrainMRIDate',
 'pre_BrainMRITime',
 'pre_BrainMRIResult1',
 'pre_BrainMRIResult2',
 'pre_BrainMRIResult3',
 'pre_BrainMRIResult4',
 'pre_BrainMRIResult5',
 'pre_BrainMRIResult6',
 'pre_BrainMRIResult7',
 'pre_BrainMRIResult8',
 'pre_BrainMRIResultText']

In [83]:
df_imaging = df_dict['oc12.csv'].copy()

imaging_time_slot_int = df_imaging['imagingTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = imaging_time_slot_int == 1
df_imaging_pre = df_imaging[is_pre]
pre_rename_map = {
    'headSonogram': 'pre_HeadSonogram',
    'headSonogramDate': 'pre_HeadSonogramDate',
    'headSonogramTime': 'pre_HeadSonogramTime',
    'headSonogramResult1': 'pre_HeadSonogramResult1',
    'headSonogramResult2': 'pre_HeadSonogramResult2',
    'headSonogramResult3': 'pre_HeadSonogramResult3',
    'headSonogramResult4': 'pre_HeadSonogramResult4',
    'headSonogramResult5': 'pre_HeadSonogramResult5',
    'headSonogramResult6': 'pre_HeadSonogramResult6',
    'headSonogramResult7': 'pre_HeadSonogramResult7',
    'headSonogramResult8': 'pre_HeadSonogramResult8',
    'headSonogramResultText': 'pre_HeadSonogramResultText',
    'headCT': 'pre_HeadCT',
    'headCTDate': 'pre_HeadCTDate',
    'headCTTime': 'pre_HeadCTTime',
    'headCTResult1': 'pre_HeadCTResult1',
    'headCTResult2': 'pre_HeadCTResult2',
    'headCTResult3': 'pre_HeadCTResult3',
    'headCTResult4': 'pre_HeadCTResult4',
    'headCTResult5': 'pre_HeadCTResult5',
    'headCTResult6': 'pre_HeadCTResult6',
    'headCTResult7': 'pre_HeadCTResult7',
    'headCTResult8': 'pre_HeadCTResult8',
    'headCTResultText': 'pre_HeadCTResultText',
    'brainMRI': 'pre_BrainMRI',
    'brainMRIDate': 'pre_BrainMRIDate',
    'brainMRITime': 'pre_BrainMRITime',
    'brainMRIResult1': 'pre_BrainMRIResult1',
    'brainMRIResult2': 'pre_BrainMRIResult2',
    'brainMRIResult3': 'pre_BrainMRIResult3',
    'brainMRIResult4': 'pre_BrainMRIResult4',
    'brainMRIResult5': 'pre_BrainMRIResult5',
    'brainMRIResult6': 'pre_BrainMRIResult6',
    'brainMRIResult7': 'pre_BrainMRIResult7',
    'brainMRIResult8': 'pre_BrainMRIResult8',
    'brainMRIResultText': 'pre_BrainMRIResultText'
}
df_imaging_pre = df_imaging_pre.rename(columns=pre_rename_map)

df_imaging_pre = COMBINE_harmonizer.valid_columns(df_imaging_pre, all_imaging_pre_columns, debug_df=True, debug_columns=False)
df_imaging_pre = COMBINE_harmonizer.postprocess(df_imaging_pre)
out_filename = os.sep.join([out_dir, '01-11-pre-imaging.csv'])
df_imaging_pre.to_csv(out_filename, index=False)

(7/42) REC_CMP not in columns
(9/42) imagingTimeSlot not in columns
(40/42) CMP_DATE not in columns
(41/42) CRT_DATE not in columns


### 01-11-2. check empty cells

In [84]:
COMBINE_harmonizer.check_empty(df_imaging_pre)

(0/39) column: center (350 / 0)
(1/39) column: subjectID (350 / 0)
(2/39) column: uniqueID (350 / 0)
(3/39) column: pre_HeadSonogram (350 / 0)
(4/39) column: pre_HeadSonogramResultText (3 / 347)
(5/39) column: pre_HeadCT (350 / 0)
(6/39) column: pre_HeadCTResultText (2 / 348)
(7/39) column: pre_BrainMRI (350 / 0)
(8/39) column: pre_BrainMRIResultText (0 / 350)
(9/39) column: pre_HeadSonogramDate (17 / 333)
(10/39) column: pre_HeadSonogramTime (17 / 333)
(11/39) column: pre_HeadSonogramResult1 (16 / 334)
(12/39) column: pre_HeadSonogramResult2 (1 / 349)
(13/39) column: pre_HeadSonogramResult3 (0 / 350)
(14/39) column: pre_HeadSonogramResult4 (0 / 350)
(15/39) column: pre_HeadSonogramResult5 (0 / 350)
(16/39) column: pre_HeadSonogramResult6 (0 / 350)
(17/39) column: pre_HeadSonogramResult7 (0 / 350)
(18/39) column: pre_HeadSonogramResult8 (0 / 350)
(19/39) column: pre_HeadCTDate (2 / 348)
(20/39) column: pre_HeadCTTime (2 / 348)
(21/39) column: pre_HeadCTResult1 (2 / 348)
(22/39) column:

In [85]:
COMBINE_harmonizer.column_info(df_imaging_pre)

(0/39) center: (350/0)
(1/39) subjectID: (350/0)
(2/39) uniqueID: (350/0)
(3/39) pre_HeadSonogram: (350/0)
(4/39) pre_HeadSonogramResultText: (3/347)
(5/39) pre_HeadCT: (350/0)
(6/39) pre_HeadCTResultText: (2/348)
(7/39) pre_BrainMRI: (350/0)
(8/39) pre_BrainMRIResultText: (0/350)
(9/39) pre_HeadSonogramDate: (17/333)
(10/39) pre_HeadSonogramTime: (17/333)
(11/39) pre_HeadSonogramResult1: (16/334)
(12/39) pre_HeadSonogramResult2: (1/349)
(13/39) pre_HeadSonogramResult3: (0/350)
(14/39) pre_HeadSonogramResult4: (0/350)
(15/39) pre_HeadSonogramResult5: (0/350)
(16/39) pre_HeadSonogramResult6: (0/350)
(17/39) pre_HeadSonogramResult7: (0/350)
(18/39) pre_HeadSonogramResult8: (0/350)
(19/39) pre_HeadCTDate: (2/348)
(20/39) pre_HeadCTTime: (2/348)
(21/39) pre_HeadCTResult1: (2/348)
(22/39) pre_HeadCTResult2: (1/349)
(23/39) pre_HeadCTResult3: (1/349)
(24/39) pre_HeadCTResult4: (1/349)
(25/39) pre_HeadCTResult5: (0/350)
(26/39) pre_HeadCTResult6: (0/350)
(27/39) pre_HeadCTResult7: (0/350)
(28

## 02-01. Temperature

In [86]:
temperatures_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Temperature')
all_temperatures_columns = id_columns + temperatures_columns
temperatures_columns

['temperatureTimeSlot',
 'temperatureTimeSlotNoForm',
 'temperatureDate',
 'temperatureTime',
 'skinTemperature_C',
 'axillaryTemperature_C',
 'esophagealTemperature_C',
 'blanketTemperature_C',
 'servoSetTemperature_C',
 'alterationSkinIntegrity',
 'shiver']

In [87]:
df_dict['oc06t.csv'].columns

Index(['subjectID', 'alterationSkinIntegrity', 'shiver', 'center',
       'temperatureTimeSlot', 'temperatureTimeSlotNoForm', 'temperatureDate',
       'temperatureTime', 'skinTemperature_C', 'axillaryTemperature_C',
       'esophagealTemperature_C', 'blanketTemperature_C',
       'servoSetTemperature_C'],
      dtype='object')

In [88]:
df_dict['oc06t120.csv'].columns

Index(['subjectID', 'pre_CoolInitiate', 'pre_CoolbyIceGelPack',
       'pre_CoolPassively', 'pre_CoolClinically',
       'pre_AfterOvershootReach33p5C', 'normothermiaAtEndIntervention',
       'noNormothermiaReason', 'discontinueBeforeEndPeriod',
       'discontinueOtherText', 'coolAfterInterventionText', 'OC6INIT',
       'center', 'targetTreatmentTemperature_C', 'pre_CoolInitiateDate',
       'pre_CoolInitiateTime', 'pre_AfterOvershootReach33p5CDate',
       'pre_AfterOvershootReach33p5CTime', 'pre_TemperatureMinDate',
       'pre_TemperatureMinTime', 'pre_SkinTemperatureMin_C',
       'pre_AxillaryTemperatureMin_C', 'pre_EsophagealTemperatureMin_C',
       'pre_ServoSetMin_C', 'pre_TemperatureMaxDate', 'pre_TemperatureMaxTime',
       'pre_SkinTemperatureMax_C', 'pre_AxillaryTemperatureMax_C',
       'pre_EsophagealTemperatureMax_C', 'pre_ServoSetMax_C',
       'normothermiaDate', 'normothermiaTime',
       'normothermiaAxillaryTemperature_C', 'OC6NCEPR', 'discontinueDate',
       '

In [89]:
df_temperature = df_dict['oc06t.csv'].copy()
df_temperature = COMBINE_harmonizer.valid_columns(df_temperature, all_temperatures_columns, debug_df=True, debug_columns=True)
df_temperature = COMBINE_harmonizer.postprocess(df_temperature)
out_filename = os.sep.join([out_dir, '02-01-temperature.csv'])
df_temperature.to_csv(out_filename, index=False)

### 02-01-1. check temperature

In [90]:
df_temperature_groupby = df_temperature.groupby(['uniqueID', 'temperatureTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_temperature_groupby['_count'] > 1
df_temperature_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,temperatureTimeSlot,Unnamed: 2_level_1


### 02-01-2. check empty cells

In [91]:
COMBINE_harmonizer.check_empty(df_temperature)

(0/14) column: center (18275 / 0)
(1/14) column: subjectID (18275 / 0)
(2/14) column: uniqueID (18275 / 0)
(3/14) column: alterationSkinIntegrity (1335 / 16940)
(4/14) column: shiver (1335 / 16940)
(5/14) column: temperatureTimeSlot (18275 / 0)
(6/14) column: temperatureTimeSlotNoForm (0 / 18275)
(7/14) column: temperatureDate (18226 / 49)
(8/14) column: temperatureTime (18168 / 107)
(9/14) column: skinTemperature_C (17237 / 1038)
(10/14) column: axillaryTemperature_C (250 / 18025)
(11/14) column: esophagealTemperature_C (17723 / 552)
(12/14) column: blanketTemperature_C (17242 / 1033)
(13/14) column: servoSetTemperature_C (17677 / 598)


In [92]:
COMBINE_harmonizer.column_info(df_temperature)

(0/14) center: (18275/0)
(1/14) subjectID: (18275/0)
(2/14) uniqueID: (18275/0)
(3/14) alterationSkinIntegrity: (1335/16940)
(4/14) shiver: (1335/16940)
(5/14) temperatureTimeSlot: (18275/0)
(6/14) temperatureTimeSlotNoForm: (0/18275)
(7/14) temperatureDate: (18226/49)
(8/14) temperatureTime: (18168/107)
(9/14) skinTemperature_C: (17237/1038)
(10/14) axillaryTemperature_C: (250/18025)
(11/14) esophagealTemperature_C: (17723/552)
(12/14) blanketTemperature_C: (17242/1033)
(13/14) servoSetTemperature_C: (17677/598)


## 02-02. cardio

In [93]:
cardio_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Cardiovascular')
all_cardio_columns = id_columns + cardio_columns
cardio_columns

['cardioTimeSlot',
 'cardioDate',
 'cardioTime',
 'cardioSystolicBloodPressure_mmHg',
 'cardioDiastolicBloodPressure_mmHg',
 'cardioHeartRate_BPM',
 'cardioVolumeExpand',
 'cardioInotropicAgent',
 'cardioBloodTransfusion',
 'cardioPlatelets']

In [94]:
df_cardio = df_dict['oc07.csv'].copy()

cardioTimeSlot_int = df_cardio['cardioTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = cardioTimeSlot_int == 0
is_intervention = is_pre == False
# after_baseline
df_cardio = df_cardio[is_intervention]

df_cardio = COMBINE_harmonizer.valid_columns(df_cardio, all_cardio_columns, debug_df=True, debug_columns=True)
df_cardio = COMBINE_harmonizer.postprocess(df_cardio)

out_filename = os.sep.join([out_dir, '02-02-cardiovascular.csv'])
df_cardio.to_csv(out_filename, index=False)

(5/28) bloodGasSrc not in columns
(6/28) REC_CMP not in columns
(14/28) bloodGasDate not in columns
(15/28) bloodGasTime not in columns
(16/28) bloodGasPH not in columns
(17/28) bloodGasPCO2_mmHg not in columns
(18/28) bloodGasPO2_mmHg not in columns
(19/28) bloodGasHCO3_mEqPerL not in columns
(20/28) bloodGasBaseDeficit_mEqPerL not in columns
(21/28) bloodGasPHCorrect not in columns
(22/28) bloodGasPCO2Correct_mmHg not in columns
(23/28) bloodGasPO2Correct_mmHg not in columns
(24/28) bloodGasHCO3Correct_mEqPerL not in columns
(25/28) bloodGasBaseDeficitCorrect_mEqPerL not in columns
(26/28) CMP_DATE not in columns
(27/28) CRT_DATE not in columns


### 02-02-1. check cardio

In [95]:
df_cardio_groupby = df_cardio.groupby(['uniqueID', 'cardioTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_cardio_groupby['_count'] > 1
df_cardio_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,cardioTimeSlot,Unnamed: 2_level_1


### 02-02-2. check empty cells

In [96]:
COMBINE_harmonizer.check_empty(df_cardio)

(0/13) column: center (9573 / 0)
(1/13) column: subjectID (9573 / 0)
(2/13) column: uniqueID (9573 / 0)
(3/13) column: cardioVolumeExpand (9565 / 8)
(4/13) column: cardioInotropicAgent (9565 / 8)
(5/13) column: cardioBloodTransfusion (9565 / 8)
(6/13) column: cardioPlatelets (9565 / 8)
(7/13) column: cardioTimeSlot (9573 / 0)
(8/13) column: cardioDate (9565 / 8)
(9/13) column: cardioTime (9551 / 22)
(10/13) column: cardioSystolicBloodPressure_mmHg (9087 / 486)
(11/13) column: cardioDiastolicBloodPressure_mmHg (9087 / 486)
(12/13) column: cardioHeartRate_BPM (9460 / 113)


In [97]:
COMBINE_harmonizer.column_info(df_cardio)

(0/13) center: (9573/0)
(1/13) subjectID: (9573/0)
(2/13) uniqueID: (9573/0)
(3/13) cardioVolumeExpand: (9565/8)
(4/13) cardioInotropicAgent: (9565/8)
(5/13) cardioBloodTransfusion: (9565/8)
(6/13) cardioPlatelets: (9565/8)
(7/13) cardioTimeSlot: (9573/0)
(8/13) cardioDate: (9565/8)
(9/13) cardioTime: (9551/22)
(10/13) cardioSystolicBloodPressure_mmHg: (9087/486)
(11/13) cardioDiastolicBloodPressure_mmHg: (9087/486)
(12/13) cardioHeartRate_BPM: (9460/113)


## 02-03. respiratory

In [98]:
respiratory_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Respiratory')
all_respiratory_columns = id_columns + respiratory_columns
respiratory_columns

['respiratoryTimeSlot',
 'respiratoryDate',
 'respiratoryTime',
 'respiratorySupportType',
 'respiratoryFiO2',
 'respiratoryRate_Hz',
 'respiratoryPIP_cmH2O',
 'respiratoryMAP_cmH2O',
 'respiratoryPEEP_cmH2O']

In [99]:
df_respiratory = df_dict['oc08.csv'].copy()

respiratoryTimeSlot_int = df_respiratory['respiratoryTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = respiratoryTimeSlot_int == 0
is_intervention = is_pre == False
# after_baseline
df_respiratory = df_respiratory[is_intervention]

df_respiratory = COMBINE_harmonizer.valid_columns(df_respiratory, all_respiratory_columns, debug_df=True, debug_columns=True)
df_respiratory = COMBINE_harmonizer.postprocess(df_respiratory)

out_filename = os.sep.join([out_dir, '02-03-respiratory.csv'])
df_respiratory.to_csv(out_filename, index=False)

(1/37) hematology not in columns
(2/37) REC_CMP not in columns
(13/37) hematologyWBC not in columns
(14/37) hematologyHemoglobin not in columns
(15/37) hematologyPolymorphNeutrophilsDifferentialCount not in columns
(16/37) hematologyMonocytes not in columns
(17/37) hematologyLymphocytes not in columns
(18/37) hematologyPlateletCount not in columns
(19/37) hematologyPT_s not in columns
(20/37) hematologyPTT_s not in columns
(21/37) anticonvulsants1 not in columns
(22/37) anticonvulsants2 not in columns
(23/37) anticonvulsants3 not in columns
(24/37) analgesicsSedatives1 not in columns
(25/37) analgesicsSedatives2 not in columns
(26/37) analgesicsSedatives3 not in columns
(27/37) antipyretics1 not in columns
(28/37) antipyretics2 not in columns
(29/37) antipyretics3 not in columns
(30/37) paralytics1 not in columns
(31/37) paralytics2 not in columns
(32/37) paralytics3 not in columns
(33/37) otherMedFluidIntake_ccPerKg not in columns
(34/37) otherMedUrineOutput_ccPerKg not in columns
(35

### 02-03-1. check respiratory

In [100]:
df_respiratory_groupby = df_respiratory.groupby(['uniqueID', 'respiratoryTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_respiratory_groupby['_count'] > 1
df_respiratory_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,respiratoryTimeSlot,Unnamed: 2_level_1


### 02-03-2. check empty cells

In [101]:
COMBINE_harmonizer.check_empty(df_respiratory)

(0/12) column: center (1979 / 0)
(1/12) column: subjectID (1979 / 0)
(2/12) column: uniqueID (1979 / 0)
(3/12) column: respiratoryTimeSlot (1979 / 0)
(4/12) column: respiratoryDate (1979 / 0)
(5/12) column: respiratoryTime (1974 / 5)
(6/12) column: respiratorySupportType (1974 / 5)
(7/12) column: respiratoryFiO2 (1416 / 563)
(8/12) column: respiratoryRate_Hz (1052 / 927)
(9/12) column: respiratoryPIP_cmH2O (1041 / 938)
(10/12) column: respiratoryMAP_cmH2O (801 / 1178)
(11/12) column: respiratoryPEEP_cmH2O (964 / 1015)


In [102]:
COMBINE_harmonizer.column_info(df_respiratory)

(0/12) center: (1979/0)
(1/12) subjectID: (1979/0)
(2/12) uniqueID: (1979/0)
(3/12) respiratoryTimeSlot: (1979/0)
(4/12) respiratoryDate: (1979/0)
(5/12) respiratoryTime: (1974/5)
(6/12) respiratorySupportType: (1974/5)
(7/12) respiratoryFiO2: (1416/563)
(8/12) respiratoryRate_Hz: (1052/927)
(9/12) respiratoryPIP_cmH2O: (1041/938)
(10/12) respiratoryMAP_cmH2O: (801/1178)
(11/12) respiratoryPEEP_cmH2O: (964/1015)


## 02-04. blood-gas

In [103]:
blood_gas_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Blood Gas')
all_blood_gas_columns = id_columns + blood_gas_columns
blood_gas_columns

['bloodGasTimeSlot',
 'bloodGasDate',
 'bloodGasTime',
 'bloodGasSrc',
 'bloodGasPH',
 'bloodGasPCO2_mmHg',
 'bloodGasPO2_mmHg',
 'bloodGasHCO3_mEqPerL',
 'bloodGasBaseDeficit_mEqPerL',
 'bloodGasPHCorrect',
 'bloodGasPCO2Correct_mmHg',
 'bloodGasPO2Correct_mmHg',
 'bloodGasHCO3Correct_mEqPerL',
 'bloodGasBaseDeficitCorrect_mEqPerL']

In [104]:
df_blood_gas = df_dict['oc07.csv'].copy().rename(columns={'cardioTimeSlot': 'bloodGasTimeSlot'})

bloodGasTimeSlot_int = df_blood_gas['bloodGasTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = bloodGasTimeSlot_int == 0
is_intervention = is_pre == False
df_blood_gas = df_blood_gas[is_intervention]

df_blood_gas = COMBINE_harmonizer.valid_columns(df_blood_gas, all_blood_gas_columns)
df_blood_gas = COMBINE_harmonizer.postprocess(df_blood_gas)

# XXX hack for center30 OC0071 bloodGasTimeSlot = 36
is_center30_OC0071_bloodGasTimeSlot36 = (df_blood_gas['center'] == '30') & (df_blood_gas['subjectID'] == 'OC0071') & (df_blood_gas['bloodGasTimeSlot'].isin(['36', '36.0']))
df_blood_gas.loc[is_center30_OC0071_bloodGasTimeSlot36, 'bloodGasDate'] = '2012-01-03'

out_filename = os.sep.join([out_dir, '02-04-blood-gas.csv'])
df_blood_gas.to_csv(out_filename, index=False)

### 02-04-1. check blood-gas

In [105]:
df_blood_gas_groupby = df_blood_gas.groupby(['uniqueID', 'bloodGasTimeSlot']).agg(_count=('uniqueID', 'count'))


is_invalid = df_blood_gas_groupby['_count'] > 1
df_blood_gas_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,bloodGasTimeSlot,Unnamed: 2_level_1


### 02-04-2. check empty cells

In [106]:
COMBINE_harmonizer.check_empty(df_blood_gas)

(0/17) column: center (9573 / 0)
(1/17) column: subjectID (9573 / 0)
(2/17) column: uniqueID (9573 / 0)
(3/17) column: bloodGasSrc (3775 / 5798)
(4/17) column: bloodGasTimeSlot (9573 / 0)
(5/17) column: bloodGasDate (5127 / 4446)
(6/17) column: bloodGasTime (4887 / 4686)
(7/17) column: bloodGasPH (2575 / 6998)
(8/17) column: bloodGasPCO2_mmHg (2576 / 6997)
(9/17) column: bloodGasPO2_mmHg (2570 / 7003)
(10/17) column: bloodGasHCO3_mEqPerL (2502 / 7071)
(11/17) column: bloodGasBaseDeficit_mEqPerL (2449 / 7124)
(12/17) column: bloodGasPHCorrect (3334 / 6239)
(13/17) column: bloodGasPCO2Correct_mmHg (3336 / 6237)
(14/17) column: bloodGasPO2Correct_mmHg (3291 / 6282)
(15/17) column: bloodGasHCO3Correct_mEqPerL (2748 / 6825)
(16/17) column: bloodGasBaseDeficitCorrect_mEqPerL (2674 / 6899)


In [107]:
COMBINE_harmonizer.column_info(df_blood_gas)

(0/17) center: (9573/0)
(1/17) subjectID: (9573/0)
(2/17) uniqueID: (9573/0)
(3/17) bloodGasSrc: (3775/5798)
(4/17) bloodGasTimeSlot: (9573/0)
(5/17) bloodGasDate: (5127/4446)
(6/17) bloodGasTime: (4887/4686)
(7/17) bloodGasPH: (2575/6998)
(8/17) bloodGasPCO2_mmHg: (2576/6997)
(9/17) bloodGasPO2_mmHg: (2570/7003)
(10/17) bloodGasHCO3_mEqPerL: (2502/7071)
(11/17) bloodGasBaseDeficit_mEqPerL: (2449/7124)
(12/17) bloodGasPHCorrect: (3334/6239)
(13/17) bloodGasPCO2Correct_mmHg: (3336/6237)
(14/17) bloodGasPO2Correct_mmHg: (3291/6282)
(15/17) bloodGasHCO3Correct_mEqPerL: (2748/6825)
(16/17) bloodGasBaseDeficitCorrect_mEqPerL: (2674/6899)


## 02-05. hematology

In [108]:
hematology_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Hematology CBC')
all_hematology_columns = id_columns + hematology_columns
hematology_columns

['hematology',
 'hematologyTimeSlot',
 'hematologyDate',
 'hematologyTime',
 'hematologyWBC',
 'hematologyHemoglobin',
 'hematologyPolymorphNeutrophilsDifferentialCount',
 'hematologyMonocytes',
 'hematologyLymphocytes',
 'hematologyPlateletCount',
 'hematologyPT_s',
 'hematologyPTT_s',
 'hematologyHematocritMin',
 'hematologyHematocritMinDate',
 'hematologyPlateletCountMin',
 'hematologyPlateletCountMinDate']

In [109]:
df_hematology = df_dict['oc08.csv'].copy().rename(columns={'respiratoryTimeSlot': 'hematologyTimeSlot', 'respiratoryDate': 'hematologyDate', 'respiratoryTime': 'hematologyTime'})

hematology_time_slot_int =df_hematology['hematologyTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = hematology_time_slot_int == 0
is_intervention = is_pre == False
# after_baseline
df_hematology = df_hematology[is_intervention]

df_hematology = COMBINE_harmonizer.valid_columns(df_hematology, all_hematology_columns, debug_df=True, debug_columns=True)
df_hematology = COMBINE_harmonizer.postprocess(df_hematology)
out_filename = os.sep.join([out_dir, '02-05-hematology.csv'])
df_hematology.to_csv(out_filename, index=False)

(2/37) REC_CMP not in columns
(7/37) respiratorySupportType not in columns
(8/37) respiratoryFiO2 not in columns
(9/37) respiratoryRate_Hz not in columns
(10/37) respiratoryPIP_cmH2O not in columns
(11/37) respiratoryMAP_cmH2O not in columns
(12/37) respiratoryPEEP_cmH2O not in columns
(21/37) anticonvulsants1 not in columns
(22/37) anticonvulsants2 not in columns
(23/37) anticonvulsants3 not in columns
(24/37) analgesicsSedatives1 not in columns
(25/37) analgesicsSedatives2 not in columns
(26/37) analgesicsSedatives3 not in columns
(27/37) antipyretics1 not in columns
(28/37) antipyretics2 not in columns
(29/37) antipyretics3 not in columns
(30/37) paralytics1 not in columns
(31/37) paralytics2 not in columns
(32/37) paralytics3 not in columns
(33/37) otherMedFluidIntake_ccPerKg not in columns
(34/37) otherMedUrineOutput_ccPerKg not in columns
(35/37) CMP_DATE not in columns
(36/37) CRT_DATE not in columns
(14/18) hematologyHematocritMin not in df
(15/18) hematologyHematocritMinDate n

### 02-05-1. check hematology

In [110]:
df_hematology_groupby = df_hematology.groupby(['uniqueID', 'hematologyTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_hematology_groupby['_count'] > 1
df_hematology_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,hematologyTimeSlot,Unnamed: 2_level_1


### 02-05-2. check empty cells

In [111]:
COMBINE_harmonizer.check_empty(df_hematology)

(0/15) column: center (1979 / 0)
(1/15) column: subjectID (1979 / 0)
(2/15) column: uniqueID (1979 / 0)
(3/15) column: hematology (1951 / 28)
(4/15) column: hematologyTimeSlot (1979 / 0)
(5/15) column: hematologyDate (1979 / 0)
(6/15) column: hematologyTime (1974 / 5)
(7/15) column: hematologyWBC (1239 / 740)
(8/15) column: hematologyHemoglobin (1340 / 639)
(9/15) column: hematologyPolymorphNeutrophilsDifferentialCount (1067 / 912)
(10/15) column: hematologyMonocytes (1089 / 890)
(11/15) column: hematologyLymphocytes (1112 / 867)
(12/15) column: hematologyPlateletCount (1305 / 674)
(13/15) column: hematologyPT_s (810 / 1169)
(14/15) column: hematologyPTT_s (776 / 1203)


In [112]:
COMBINE_harmonizer.column_info(df_hematology)

(0/15) center: (1979/0)
(1/15) subjectID: (1979/0)
(2/15) uniqueID: (1979/0)
(3/15) hematology: (1951/28)
(4/15) hematologyTimeSlot: (1979/0)
(5/15) hematologyDate: (1979/0)
(6/15) hematologyTime: (1974/5)
(7/15) hematologyWBC: (1239/740)
(8/15) hematologyHemoglobin: (1340/639)
(9/15) hematologyPolymorphNeutrophilsDifferentialCount: (1067/912)
(10/15) hematologyMonocytes: (1089/890)
(11/15) hematologyLymphocytes: (1112/867)
(12/15) hematologyPlateletCount: (1305/674)
(13/15) hematologyPT_s: (810/1169)
(14/15) hematologyPTT_s: (776/1203)


### 02-05-3. Hematology summary

In [113]:
df_main_hematology = COMBINE_harmonizer.valid_columns(df_main, all_hematology_columns)
df_main_hematology = COMBINE_harmonizer.postprocess(df_main_hematology)

out_filename = os.sep.join([out_dir, '02-05_s-hematology.csv'])
df_main_hematology.to_csv(out_filename, index=False)

(2/18) hematology not in df
(3/18) hematologyTimeSlot not in df
(4/18) hematologyDate not in df
(5/18) hematologyTime not in df
(6/18) hematologyWBC not in df
(7/18) hematologyHemoglobin not in df
(8/18) hematologyPolymorphNeutrophilsDifferentialCount not in df
(9/18) hematologyMonocytes not in df
(10/18) hematologyLymphocytes not in df
(11/18) hematologyPlateletCount not in df
(12/18) hematologyPT_s not in df
(13/18) hematologyPTT_s not in df


### 02-05-4. check empty cells

In [114]:
COMBINE_harmonizer.check_empty(df_main_hematology)

(0/7) column: center (364 / 0)
(1/7) column: subjectID (364 / 0)
(2/7) column: uniqueID (364 / 0)
(3/7) column: hematologyHematocritMin (364 / 0)
(4/7) column: hematologyHematocritMinDate (364 / 0)
(5/7) column: hematologyPlateletCountMin (361 / 3)
(6/7) column: hematologyPlateletCountMinDate (361 / 3)


In [115]:
COMBINE_harmonizer.column_info(df_main_hematology)

(0/7) center: (364/0)
(1/7) subjectID: (364/0)
(2/7) uniqueID: (364/0)
(3/7) hematologyHematocritMin: (364/0)
(4/7) hematologyHematocritMinDate: (364/0)
(5/7) hematologyPlateletCountMin: (361/3)
(6/7) hematologyPlateletCountMinDate: (361/3)


## 02-06. Blood Value

In [116]:
blood_value_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Blood Value')
all_blood_value_columns = id_columns + blood_value_columns
blood_value_columns

['bloodValueBunBaseline_mgPerdL',
 'bloodValueBunBaseline_mgPerdLDate',
 'bloodValueCreatinineBaseline_mgPerdL',
 'bloodValueCreatinineBaseline_mgPerdLDate',
 'bloodValueASTSGOTBaseline_UPerL',
 'bloodValueASTSGOTBaseline_UPerLDate',
 'bloodValueALTSGPTBaseline_UPerL',
 'bloodValueALTSGPTBaseline_UPerLDate',
 'bloodValueTotalBilirubinBaseline_mgPerdL',
 'bloodValueTotalBilirubinBaseline_mgPerdLDate',
 'bloodValuePHMin',
 'bloodValuePHMinDate',
 'bloodValueHCO3Min_mEqPerL',
 'bloodValueHCO3Min_mEqPerLDate',
 'bloodValueSerumNaMin_mEqPerL',
 'bloodValueSerumNaMin_mEqPerLDate',
 'bloodValueSerumKMin_mEqPerL',
 'bloodValueSerumKMin_mEqPerLDate',
 'bloodValueClMin_mEqPerL',
 'bloodValueClMin_mEqPerLDate',
 'bloodValueGlucoseMin_mgPerdL',
 'bloodValueGlucoseMin_mgPerdLDate',
 'bloodValueTotalCaMin_mgPerdL',
 'bloodValueTotalCaMin_mgPerdLDate',
 'bloodValueIonCaMin_mgPerdL',
 'bloodValueIonCaMin_mgPerdLDate',
 'bloodValueASTSGOTMin_UPerL',
 'bloodValueASTSGOTMin_UPerLDate',
 'bloodValueALTSGP

In [117]:
df_main_blood_value = COMBINE_harmonizer.valid_columns(df_main, all_blood_value_columns)
df_main_blood_value = COMBINE_harmonizer.postprocess(df_main_blood_value)

out_filename = os.sep.join([out_dir, '02-06_s-blood-value.csv'])
df_main_blood_value.to_csv(out_filename, index=False)


### 02-06-2. check empty cells

In [118]:
COMBINE_harmonizer.check_empty(df_main_blood_value)

(0/61) column: center (364 / 0)
(1/61) column: subjectID (364 / 0)
(2/61) column: uniqueID (364 / 0)
(3/61) column: bloodValuePHMin (360 / 4)
(4/61) column: bloodValuePHMinDate (360 / 4)
(5/61) column: bloodValuePHMax (360 / 4)
(6/61) column: bloodValuePHMaxDate (360 / 4)
(7/61) column: bloodValueHCO3Min_mEqPerL (361 / 3)
(8/61) column: bloodValueHCO3Min_mEqPerLDate (361 / 3)
(9/61) column: bloodValueBaseDeficitMax_mEqPerL (360 / 4)
(10/61) column: bloodValueBaseDeficitMax_mEqPerLDate (360 / 4)
(11/61) column: bloodValueSerumNaMin_mEqPerL (363 / 1)
(12/61) column: bloodValueSerumNaMin_mEqPerLDate (364 / 0)
(13/61) column: bloodValueSerumNaMax_mEqPerL (363 / 1)
(14/61) column: bloodValueSerumNaMax_mEqPerLDate (363 / 1)
(15/61) column: bloodValueSerumKMin_mEqPerL (363 / 1)
(16/61) column: bloodValueSerumKMin_mEqPerLDate (364 / 0)
(17/61) column: bloodValueSerumKMax_mEqPerL (362 / 2)
(18/61) column: bloodValueSerumKMax_mEqPerLDate (363 / 1)
(19/61) column: bloodValueClMin_mEqPerL (363 / 1

In [119]:
COMBINE_harmonizer.column_info(df_main_blood_value)

(0/61) center: (364/0)
(1/61) subjectID: (364/0)
(2/61) uniqueID: (364/0)
(3/61) bloodValuePHMin: (360/4)
(4/61) bloodValuePHMinDate: (360/4)
(5/61) bloodValuePHMax: (360/4)
(6/61) bloodValuePHMaxDate: (360/4)
(7/61) bloodValueHCO3Min_mEqPerL: (361/3)
(8/61) bloodValueHCO3Min_mEqPerLDate: (361/3)
(9/61) bloodValueBaseDeficitMax_mEqPerL: (360/4)
(10/61) bloodValueBaseDeficitMax_mEqPerLDate: (360/4)
(11/61) bloodValueSerumNaMin_mEqPerL: (363/1)
(12/61) bloodValueSerumNaMin_mEqPerLDate: (364/0)
(13/61) bloodValueSerumNaMax_mEqPerL: (363/1)
(14/61) bloodValueSerumNaMax_mEqPerLDate: (363/1)
(15/61) bloodValueSerumKMin_mEqPerL: (363/1)
(16/61) bloodValueSerumKMin_mEqPerLDate: (364/0)
(17/61) bloodValueSerumKMax_mEqPerL: (362/2)
(18/61) bloodValueSerumKMax_mEqPerLDate: (363/1)
(19/61) bloodValueClMin_mEqPerL: (363/1)
(20/61) bloodValueClMin_mEqPerLDate: (363/1)
(21/61) bloodValueClMax_mEqPerL: (362/2)
(22/61) bloodValueClMax_mEqPerLDate: (362/2)
(23/61) bloodValueBunBaseline_mgPerdL: (307/57)

## 02-07. infection

In [120]:
infection_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Infection')
all_infection_columns = id_columns + infection_columns
infection_columns

['positiveCultureNumber',
 'positiveCulture',
 'positiveCultureSrc',
 'positiveCultureDate',
 'positiveCultureTime',
 'positiveCultureOrganismCode1',
 'positiveCultureOrganismCode2',
 'positiveCultureOrganismCode3',
 'antibiotics',
 'antibioticsCode1',
 'antibioticsCode2',
 'antibioticsCode3',
 'rewarmingAntibiotics',
 'rewarmingAntibioticsCode1',
 'rewarmingAntibioticsCode2',
 'rewarmingAntibioticsCode3']

In [121]:
df_infection_1 = df_dict['oc09i.csv'].copy()
df_infection_2 = df_dict['oc09.csv'].copy()
df_infection = df_infection_1.merge(df_infection_2, on=['center', 'subjectID'], how='outer')
df_infection = COMBINE_harmonizer.valid_columns(df_infection, all_infection_columns)
df_infection = COMBINE_harmonizer.postprocess(df_infection)

### XXX positive culture number as 1
df_infection['positiveCultureNumber'] = 1

out_filename = os.sep.join([out_dir, '02-07-infection.csv'])
df_infection.to_csv(out_filename, index=False)

(3/18) positiveCulture not in df


### 02-07-1. check infection

In [122]:
df_infection_groupby = df_infection.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_infection_groupby['_count'] > 1
df_infection_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 02-07-2. check empty cells

In [123]:
COMBINE_harmonizer.check_empty(df_infection)

(0/18) column: center (364 / 0)
(1/18) column: subjectID (364 / 0)
(2/18) column: uniqueID (364 / 0)
(3/18) column: positiveCultureNumber (364 / 0)
(4/18) column: positiveCultureSrc (5 / 359)
(5/18) column: positiveCultureDate (5 / 359)
(6/18) column: positiveCultureTime (5 / 359)
(7/18) column: positiveCultureOrganismCode1 (5 / 359)
(8/18) column: positiveCultureOrganismCode2 (0 / 364)
(9/18) column: positiveCultureOrganismCode3 (0 / 364)
(10/18) column: antibiotics (364 / 0)
(11/18) column: rewarmingAntibiotics (364 / 0)
(12/18) column: antibioticsCode1 (152 / 212)
(13/18) column: antibioticsCode2 (88 / 276)
(14/18) column: antibioticsCode3 (14 / 350)
(15/18) column: rewarmingAntibioticsCode1 (16 / 348)
(16/18) column: rewarmingAntibioticsCode2 (5 / 359)
(17/18) column: rewarmingAntibioticsCode3 (0 / 364)


In [124]:
COMBINE_harmonizer.column_info(df_infection)

(0/18) center: (364/0)
(1/18) subjectID: (364/0)
(2/18) uniqueID: (364/0)
(3/18) positiveCultureNumber: (364/0)
(4/18) positiveCultureSrc: (5/359)
(5/18) positiveCultureDate: (5/359)
(6/18) positiveCultureTime: (5/359)
(7/18) positiveCultureOrganismCode1: (5/359)
(8/18) positiveCultureOrganismCode2: (0/364)
(9/18) positiveCultureOrganismCode3: (0/364)
(10/18) antibiotics: (364/0)
(11/18) rewarmingAntibiotics: (364/0)
(12/18) antibioticsCode1: (152/212)
(13/18) antibioticsCode2: (88/276)
(14/18) antibioticsCode3: (14/350)
(15/18) rewarmingAntibioticsCode1: (16/348)
(16/18) rewarmingAntibioticsCode2: (5/359)
(17/18) rewarmingAntibioticsCode3: (0/364)


## 02-08. other-med

In [125]:
other_med_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Other Medication')
all_other_med_columns = id_columns + other_med_columns
other_med_columns

['otherMedTimeSlot',
 'otherMedTargetDate',
 'otherMedTargetTime',
 'anticonvulsants',
 'anticonvulsants1',
 'anticonvulsants2',
 'anticonvulsants3',
 'analgesics',
 'analgesicsSedatives1',
 'analgesicsSedatives2',
 'analgesicsSedatives3',
 'antipyretics',
 'antipyretics1',
 'antipyretics2',
 'antipyretics3',
 'paralytics',
 'paralytics1',
 'paralytics2',
 'paralytics3',
 'otherMedFluidIntake_ccPerKg',
 'otherMedUrineOutput_ccPerKg']

In [126]:
df_other_med = df_dict['oc08.csv'].copy().rename(columns={'respiratoryTimeSlot': 'otherMedTimeSlot'})

other_med_time_slot_int = df_other_med['otherMedTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = other_med_time_slot_int == 0
is_intervention = is_pre == False
# after_baseline
df_other_med = df_other_med[is_intervention]

df_other_med = COMBINE_harmonizer.valid_columns(df_other_med, all_other_med_columns, debug_df=True, debug_columns=True)
df_other_med = COMBINE_harmonizer.postprocess(df_other_med)

out_filename = os.sep.join([out_dir, '02-08-other-med.csv'])
df_other_med.to_csv(out_filename, index=False)

(1/37) hematology not in columns
(2/37) REC_CMP not in columns
(5/37) respiratoryDate not in columns
(6/37) respiratoryTime not in columns
(7/37) respiratorySupportType not in columns
(8/37) respiratoryFiO2 not in columns
(9/37) respiratoryRate_Hz not in columns
(10/37) respiratoryPIP_cmH2O not in columns
(11/37) respiratoryMAP_cmH2O not in columns
(12/37) respiratoryPEEP_cmH2O not in columns
(13/37) hematologyWBC not in columns
(14/37) hematologyHemoglobin not in columns
(15/37) hematologyPolymorphNeutrophilsDifferentialCount not in columns
(16/37) hematologyMonocytes not in columns
(17/37) hematologyLymphocytes not in columns
(18/37) hematologyPlateletCount not in columns
(19/37) hematologyPT_s not in columns
(20/37) hematologyPTT_s not in columns
(35/37) CMP_DATE not in columns
(36/37) CRT_DATE not in columns
(3/23) otherMedTargetDate not in df
(4/23) otherMedTargetTime not in df
(5/23) anticonvulsants not in df
(9/23) analgesics not in df
(13/23) antipyretics not in df
(17/23) para

### 02-08-1. check other-med

In [127]:
df_other_med_groupby = df_other_med.groupby(['uniqueID', 'otherMedTimeSlot']).agg(_count=('uniqueID', 'count'))


is_invalid = df_other_med_groupby['_count'] > 1
df_other_med_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,otherMedTimeSlot,Unnamed: 2_level_1


### 02-08-2. check empty cells

In [128]:
COMBINE_harmonizer.check_empty(df_other_med)

(0/18) column: center (1979 / 0)
(1/18) column: subjectID (1979 / 0)
(2/18) column: uniqueID (1979 / 0)
(3/18) column: otherMedTimeSlot (1979 / 0)
(4/18) column: anticonvulsants1 (628 / 1351)
(5/18) column: anticonvulsants2 (106 / 1873)
(6/18) column: anticonvulsants3 (12 / 1967)
(7/18) column: analgesicsSedatives1 (852 / 1127)
(8/18) column: analgesicsSedatives2 (279 / 1700)
(9/18) column: analgesicsSedatives3 (20 / 1959)
(10/18) column: antipyretics1 (9 / 1970)
(11/18) column: antipyretics2 (0 / 1979)
(12/18) column: antipyretics3 (0 / 1979)
(13/18) column: paralytics1 (64 / 1915)
(14/18) column: paralytics2 (1 / 1978)
(15/18) column: paralytics3 (0 / 1979)
(16/18) column: otherMedFluidIntake_ccPerKg (1606 / 373)
(17/18) column: otherMedUrineOutput_ccPerKg (1603 / 376)


In [129]:
COMBINE_harmonizer.column_info(df_other_med)

(0/18) center: (1979/0)
(1/18) subjectID: (1979/0)
(2/18) uniqueID: (1979/0)
(3/18) otherMedTimeSlot: (1979/0)
(4/18) anticonvulsants1: (628/1351)
(5/18) anticonvulsants2: (106/1873)
(6/18) anticonvulsants3: (12/1967)
(7/18) analgesicsSedatives1: (852/1127)
(8/18) analgesicsSedatives2: (279/1700)
(9/18) analgesicsSedatives3: (20/1959)
(10/18) antipyretics1: (9/1970)
(11/18) antipyretics2: (0/1979)
(12/18) antipyretics3: (0/1979)
(13/18) paralytics1: (64/1915)
(14/18) paralytics2: (1/1978)
(15/18) paralytics3: (0/1979)
(16/18) otherMedFluidIntake_ccPerKg: (1606/373)
(17/18) otherMedUrineOutput_ccPerKg: (1603/376)


## 02-09. imaging

In [130]:
imaging_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Imaging')
all_imaging_columns = id_columns + imaging_columns
imaging_columns

['imagingTimeSlot',
 'headSonogram',
 'headSonogramDate',
 'headSonogramTime',
 'headSonogramResult1',
 'headSonogramResult2',
 'headSonogramResult3',
 'headSonogramResult4',
 'headSonogramResult5',
 'headSonogramResult6',
 'headSonogramResult7',
 'headSonogramResult8',
 'headSonogramResultText',
 'headCT',
 'headCTDate',
 'headCTTime',
 'headCTResult1',
 'headCTResult2',
 'headCTResult3',
 'headCTResult4',
 'headCTResult5',
 'headCTResult6',
 'headCTResult7',
 'headCTResult8',
 'headCTResultText',
 'brainMRI',
 'brainMRIDate',
 'brainMRITime',
 'brainMRIResult1',
 'brainMRIResult2',
 'brainMRIResult3',
 'brainMRIResult4',
 'brainMRIResult5',
 'brainMRIResult6',
 'brainMRIResult7',
 'brainMRIResult8',
 'brainMRIResultText']

In [131]:
df_imaging = df_dict['oc12.csv'].copy()

imaging_time_slot_int = df_imaging['imagingTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_pre = imaging_time_slot_int == 1
is_post = imaging_time_slot_int == 3
is_intervention = (is_pre == False) & (is_post == False)
df_imaging = df_imaging[is_intervention]

df_imaging = COMBINE_harmonizer.valid_columns(df_imaging, all_valid_columns, debug_df=True, debug_columns=False)
df_imaging = COMBINE_harmonizer.postprocess(df_imaging)
out_filename = os.sep.join([out_dir, '02-09-imaging.csv'])
df_imaging.to_csv(out_filename, index=False)

(7/42) REC_CMP not in columns
(40/42) CMP_DATE not in columns
(41/42) CRT_DATE not in columns


### 02-09-1. check imaging

In [132]:
df_imaging_groupby = df_imaging.groupby(['uniqueID', 'imagingTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_imaging_groupby['_count'] > 1
df_imaging_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,imagingTimeSlot,Unnamed: 2_level_1


### 02-09-2. check empty cells

In [133]:
COMBINE_harmonizer.check_empty(df_imaging)

(0/40) column: center (357 / 0)
(1/40) column: subjectID (357 / 0)
(2/40) column: uniqueID (357 / 0)
(3/40) column: headSonogram (357 / 0)
(4/40) column: headSonogramResultText (23 / 334)
(5/40) column: headCT (357 / 0)
(6/40) column: headCTResultText (0 / 357)
(7/40) column: brainMRI (357 / 0)
(8/40) column: brainMRIResultText (0 / 357)
(9/40) column: imagingTimeSlot (357 / 0)
(10/40) column: headSonogramDate (176 / 181)
(11/40) column: headSonogramTime (174 / 183)
(12/40) column: headSonogramResult1 (174 / 183)
(13/40) column: headSonogramResult2 (25 / 332)
(14/40) column: headSonogramResult3 (3 / 354)
(15/40) column: headSonogramResult4 (0 / 357)
(16/40) column: headSonogramResult5 (0 / 357)
(17/40) column: headSonogramResult6 (0 / 357)
(18/40) column: headSonogramResult7 (0 / 357)
(19/40) column: headSonogramResult8 (0 / 357)
(20/40) column: headCTDate (2 / 355)
(21/40) column: headCTTime (2 / 355)
(22/40) column: headCTResult1 (2 / 355)
(23/40) column: headCTResult2 (0 / 357)
(24/

In [134]:
COMBINE_harmonizer.column_info(df_imaging)

(0/40) center: (357/0)
(1/40) subjectID: (357/0)
(2/40) uniqueID: (357/0)
(3/40) headSonogram: (357/0)
(4/40) headSonogramResultText: (23/334)
(5/40) headCT: (357/0)
(6/40) headCTResultText: (0/357)
(7/40) brainMRI: (357/0)
(8/40) brainMRIResultText: (0/357)
(9/40) imagingTimeSlot: (357/0)
(10/40) headSonogramDate: (176/181)
(11/40) headSonogramTime: (174/183)
(12/40) headSonogramResult1: (174/183)
(13/40) headSonogramResult2: (25/332)
(14/40) headSonogramResult3: (3/354)
(15/40) headSonogramResult4: (0/357)
(16/40) headSonogramResult5: (0/357)
(17/40) headSonogramResult6: (0/357)
(18/40) headSonogramResult7: (0/357)
(19/40) headSonogramResult8: (0/357)
(20/40) headCTDate: (2/355)
(21/40) headCTTime: (2/355)
(22/40) headCTResult1: (2/355)
(23/40) headCTResult2: (0/357)
(24/40) headCTResult3: (0/357)
(25/40) headCTResult4: (0/357)
(26/40) headCTResult5: (0/357)
(27/40) headCTResult6: (0/357)
(28/40) headCTResult7: (0/357)
(29/40) headCTResult8: (0/357)
(30/40) brainMRIDate: (1/356)
(31/

## 03-01. Post-intervention Temperature

In [135]:
temperature_post_treat_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'Temperature')
all_temperature_post_treat_columns = id_columns + temperature_post_treat_columns
temperature_post_treat_columns

['post_TemperatureTimeSlot',
 'post_TemperatureDate',
 'post_TemperatureTime',
 'post_SkinTemperature_C',
 'post_AxillaryTemperature_C',
 'post_AlterationSkinIntegrity',
 'post_Shiver',
 'normothermiaAtEndIntervention',
 'normothermiaDate',
 'normothermiaTime',
 'normothermiaAxillaryTemperature_C',
 'noNormothermiaReason',
 'coolAfterIntervention',
 'coolAfterInterventionText']

In [136]:
df_temperature_post = df_dict['oc06d.csv'].copy()
df_temperature_post = COMBINE_harmonizer.valid_columns(df_temperature_post, all_temperature_post_treat_columns, debug_df=True, debug_columns=False)
df_temperature_post = COMBINE_harmonizer.postprocess(df_temperature_post)

out_filename = os.sep.join([out_dir, '03-01-post-temperature.csv'])
df_temperature_post.to_csv(out_filename, index=False)

### 03-01-1. check post-treat temperature

In [137]:
df_temperature_post_groupby = df_temperature_post.groupby(['uniqueID', 'post_TemperatureTimeSlot']).agg(_count=('uniqueID', 'count'))

is_invalid = df_temperature_post_groupby['_count'] > 1
df_temperature_post_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,post_TemperatureTimeSlot,Unnamed: 2_level_1


### 03-01-2. check empty cells

In [138]:
COMBINE_harmonizer.check_empty(df_temperature_post)

(0/10) column: center (1883 / 0)
(1/10) column: subjectID (1883 / 0)
(2/10) column: uniqueID (1883 / 0)
(3/10) column: post_AlterationSkinIntegrity (1847 / 36)
(4/10) column: post_Shiver (1847 / 36)
(5/10) column: post_TemperatureTimeSlot (1883 / 0)
(6/10) column: post_TemperatureDate (1847 / 36)
(7/10) column: post_TemperatureTime (1835 / 48)
(8/10) column: post_SkinTemperature_C (850 / 1033)
(9/10) column: post_AxillaryTemperature_C (1739 / 144)


In [139]:
COMBINE_harmonizer.column_info(df_temperature_post)

(0/10) center: (1883/0)
(1/10) subjectID: (1883/0)
(2/10) uniqueID: (1883/0)
(3/10) post_AlterationSkinIntegrity: (1847/36)
(4/10) post_Shiver: (1847/36)
(5/10) post_TemperatureTimeSlot: (1883/0)
(6/10) post_TemperatureDate: (1847/36)
(7/10) post_TemperatureTime: (1835/48)
(8/10) post_SkinTemperature_C: (850/1033)
(9/10) post_AxillaryTemperature_C: (1739/144)


### 03-01-3. post-intervention temperature - summary

In [140]:
df_temperature_72_120 = pd.concat([df_dict['oc06t72.csv'], df_dict['oc06t120.csv']])
df_temperature_post_treat_s = COMBINE_harmonizer.valid_columns(df_temperature_72_120, all_temperature_post_treat_columns, debug_df=True, debug_columns=True)
df_temperature_post_treat_s = COMBINE_harmonizer.postprocess(df_temperature_post_treat_s)

out_filename = os.sep.join([out_dir, '03-01_s-post-temperature.csv'])
df_temperature_post_treat_s.to_csv(out_filename, index=False)


(1/37) pre_CoolInitiate not in columns
(2/37) pre_CoolbyIceGelPack not in columns
(3/37) pre_CoolPassively not in columns
(4/37) pre_CoolClinically not in columns
(5/37) pre_AfterOvershootReach33p5C not in columns
(8/37) discontinueBeforeEndPeriod not in columns
(9/37) discontinueOtherText not in columns
(11/37) OC6INIT not in columns
(13/37) targetTreatmentTemperature_C not in columns
(14/37) pre_CoolInitiateDate not in columns
(15/37) pre_CoolInitiateTime not in columns
(16/37) pre_AfterOvershootReach33p5CDate not in columns
(17/37) pre_AfterOvershootReach33p5CTime not in columns
(18/37) pre_TemperatureMinDate not in columns
(19/37) pre_TemperatureMinTime not in columns
(20/37) pre_SkinTemperatureMin_C not in columns
(21/37) pre_AxillaryTemperatureMin_C not in columns
(22/37) pre_EsophagealTemperatureMin_C not in columns
(23/37) pre_ServoSetMin_C not in columns
(24/37) pre_TemperatureMaxDate not in columns
(25/37) pre_TemperatureMaxTime not in columns
(26/37) pre_SkinTemperatureMax_C

### 03-01-4. check post-normo temperature - summary

In [141]:
COMBINE_harmonizer.check_empty(df_temperature_post_treat_s)

(0/10) column: center (364 / 0)
(1/10) column: subjectID (364 / 0)
(2/10) column: uniqueID (364 / 0)
(3/10) column: normothermiaAtEndIntervention (364 / 0)
(4/10) column: noNormothermiaReason (50 / 314)
(5/10) column: coolAfterInterventionText (25 / 339)
(6/10) column: normothermiaDate (313 / 51)
(7/10) column: normothermiaTime (313 / 51)
(8/10) column: normothermiaAxillaryTemperature_C (268 / 96)
(9/10) column: coolAfterIntervention (55 / 309)


In [142]:
COMBINE_harmonizer.column_info(df_temperature_post_treat_s)

(0/10) center: (364/0)
(1/10) subjectID: (364/0)
(2/10) uniqueID: (364/0)
(3/10) normothermiaAtEndIntervention: (364/0)
(4/10) noNormothermiaReason: (50/314)
(5/10) coolAfterInterventionText: (25/339)
(6/10) normothermiaDate: (313/51)
(7/10) normothermiaTime: (313/51)
(8/10) normothermiaAxillaryTemperature_C: (268/96)
(9/10) coolAfterIntervention: (55/309)


## 03-02. Post-intervention Blood Value

In [143]:
blood_value_post_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'Blood Value')
all_blood_value_post_columns = id_columns + blood_value_post_columns
blood_value_post_columns

['post_BloodValueASTSGOT_UPerL',
 'post_BloodValueASTSGOT_UPerLDate',
 'post_BloodValueALTSGPT_UPerL',
 'post_BloodValueALTSGPT_UPerLDate',
 'post_BloodValueTotalBilirubin_mgPerdL',
 'post_BloodValueTotalBilirubin_mgPerdLDate']

In [144]:
df_blood_value_post = COMBINE_harmonizer.valid_columns(df_main, all_blood_value_post_columns, debug_df=False, debug_columns=True)
df_blood_value_post = COMBINE_harmonizer.postprocess(df_blood_value_post)

out_filename = os.sep.join([out_dir, '03-02-post-blood-value.csv'])
df_blood_value_post.to_csv(out_filename, index=False)

### 03-02-1. check post-normo blood value 

In [145]:
COMBINE_harmonizer.check_empty(df_blood_value_post)

(0/9) column: center (364 / 0)
(1/9) column: subjectID (364 / 0)
(2/9) column: uniqueID (364 / 0)
(3/9) column: post_BloodValueASTSGOT_UPerL (272 / 92)
(4/9) column: post_BloodValueASTSGOT_UPerLDate (275 / 89)
(5/9) column: post_BloodValueALTSGPT_UPerL (278 / 86)
(6/9) column: post_BloodValueALTSGPT_UPerLDate (281 / 83)
(7/9) column: post_BloodValueTotalBilirubin_mgPerdL (281 / 83)
(8/9) column: post_BloodValueTotalBilirubin_mgPerdLDate (283 / 81)


In [146]:
COMBINE_harmonizer.column_info(df_blood_value_post)

(0/9) center: (364/0)
(1/9) subjectID: (364/0)
(2/9) uniqueID: (364/0)
(3/9) post_BloodValueASTSGOT_UPerL: (272/92)
(4/9) post_BloodValueASTSGOT_UPerLDate: (275/89)
(5/9) post_BloodValueALTSGPT_UPerL: (278/86)
(6/9) post_BloodValueALTSGPT_UPerLDate: (281/83)
(7/9) post_BloodValueTotalBilirubin_mgPerdL: (281/83)
(8/9) post_BloodValueTotalBilirubin_mgPerdLDate: (283/81)


## 03-03. Post-intervention Imaging

In [147]:
imaging_post_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'Imaging')
all_imaging_post_columns = id_columns + imaging_post_columns
imaging_post_columns

['post_HeadSonogram',
 'post_HeadSonogramDate',
 'post_HeadSonogramTime',
 'post_HeadSonogramResult1',
 'post_HeadSonogramResult2',
 'post_HeadSonogramResult3',
 'post_HeadSonogramResult4',
 'post_HeadSonogramResult5',
 'post_HeadSonogramResult6',
 'post_HeadSonogramResult7',
 'post_HeadSonogramResult8',
 'post_HeadSonogramResultText',
 'post_HeadCT',
 'post_HeadCTDate',
 'post_HeadCTTime',
 'post_HeadCTResult1',
 'post_HeadCTResult2',
 'post_HeadCTResult3',
 'post_HeadCTResult4',
 'post_HeadCTResult5',
 'post_HeadCTResult6',
 'post_HeadCTResult7',
 'post_HeadCTResult8',
 'post_HeadCTResultText',
 'post_BrainMRI',
 'post_BrainMRIDate',
 'post_BrainMRITime',
 'post_BrainMRIResult1',
 'post_BrainMRIResult2',
 'post_BrainMRIResult3',
 'post_BrainMRIResult4',
 'post_BrainMRIResult5',
 'post_BrainMRIResult6',
 'post_BrainMRIResult7',
 'post_BrainMRIResult8',
 'post_BrainMRIResultText']

In [148]:
df_imaging = df_dict['oc12.csv'].copy()

imaging_time_slot_int = df_imaging['imagingTimeSlot'].apply(COMBINE_harmonizer.to_int)
is_post = imaging_time_slot_int == 3
df_imaging_post = df_imaging[is_post]
post_treat_rename_map = {
    'headSonogram': 'post_HeadSonogram',
    'headSonogramDate': 'post_HeadSonogramDate',
    'headSonogramTime': 'post_HeadSonogramTime',
    'headSonogramResult1': 'post_HeadSonogramResult1',
    'headSonogramResult2': 'post_HeadSonogramResult2',
    'headSonogramResult3': 'post_HeadSonogramResult3',
    'headSonogramResult4': 'post_HeadSonogramResult4',
    'headSonogramResult5': 'post_HeadSonogramResult5',
    'headSonogramResult6': 'post_HeadSonogramResult6',
    'headSonogramResult7': 'post_HeadSonogramResult7',
    'headSonogramResult8': 'post_HeadSonogramResult8',
    'headSonogramResultText': 'post_HeadSonogramResultText',
    'headCT': 'post_HeadCT',
    'headCTDate': 'post_HeadCTDate',
    'headCTTime': 'post_HeadCTTime',
    'headCTResult1': 'post_HeadCTResult1',
    'headCTResult2': 'post_HeadCTResult2',
    'headCTResult3': 'post_HeadCTResult3',
    'headCTResult4': 'post_HeadCTResult4',
    'headCTResult5': 'post_HeadCTResult5',
    'headCTResult6': 'post_HeadCTResult6',
    'headCTResult7': 'post_HeadCTResult7',
    'headCTResult8': 'post_HeadCTResult8',
    'headCTResultText': 'post_HeadCTResultText',
    'brainMRI': 'post_BrainMRI',
    'brainMRIDate': 'post_BrainMRIDate',
    'brainMRITime': 'post_BrainMRITime',
    'brainMRIResult1': 'post_BrainMRIResult1',
    'brainMRIResult2': 'post_BrainMRIResult2',
    'brainMRIResult3': 'post_BrainMRIResult3',
    'brainMRIResult4': 'post_BrainMRIResult4',
    'brainMRIResult5': 'post_BrainMRIResult5',
    'brainMRIResult6': 'post_BrainMRIResult6',
    'brainMRIResult7': 'post_BrainMRIResult7',
    'brainMRIResult8': 'post_BrainMRIResult8',
    'brainMRIResultText': 'post_BrainMRIResultText'
}
df_imaging_post = df_imaging_post.rename(columns=post_treat_rename_map)

df_imaging_post = COMBINE_harmonizer.valid_columns(df_imaging_post, all_imaging_post_columns, debug_df=True, debug_columns=True)
df_imaging_post = COMBINE_harmonizer.postprocess(df_imaging_post)
out_filename = os.sep.join([out_dir, '03-03-post-imaging.csv'])
df_imaging_post.to_csv(out_filename, index=False)

(7/42) REC_CMP not in columns
(9/42) imagingTimeSlot not in columns
(40/42) CMP_DATE not in columns
(41/42) CRT_DATE not in columns


### 03-03-2. check empty cells

In [149]:
COMBINE_harmonizer.check_empty(df_imaging_post)

(0/39) column: center (361 / 0)
(1/39) column: subjectID (361 / 0)
(2/39) column: uniqueID (361 / 0)
(3/39) column: post_HeadSonogram (361 / 0)
(4/39) column: post_HeadSonogramResultText (14 / 347)
(5/39) column: post_HeadCT (361 / 0)
(6/39) column: post_HeadCTResultText (1 / 360)
(7/39) column: post_BrainMRI (361 / 0)
(8/39) column: post_BrainMRIResultText (83 / 278)
(9/39) column: post_HeadSonogramDate (148 / 213)
(10/39) column: post_HeadSonogramTime (146 / 215)
(11/39) column: post_HeadSonogramResult1 (147 / 214)
(12/39) column: post_HeadSonogramResult2 (21 / 340)
(13/39) column: post_HeadSonogramResult3 (8 / 353)
(14/39) column: post_HeadSonogramResult4 (2 / 359)
(15/39) column: post_HeadSonogramResult5 (0 / 361)
(16/39) column: post_HeadSonogramResult6 (0 / 361)
(17/39) column: post_HeadSonogramResult7 (0 / 361)
(18/39) column: post_HeadSonogramResult8 (0 / 361)
(19/39) column: post_HeadCTDate (2 / 359)
(20/39) column: post_HeadCTTime (2 / 359)
(21/39) column: post_HeadCTResult1 

In [150]:
COMBINE_harmonizer.column_info(df_imaging_post)

(0/39) center: (361/0)
(1/39) subjectID: (361/0)
(2/39) uniqueID: (361/0)
(3/39) post_HeadSonogram: (361/0)
(4/39) post_HeadSonogramResultText: (14/347)
(5/39) post_HeadCT: (361/0)
(6/39) post_HeadCTResultText: (1/360)
(7/39) post_BrainMRI: (361/0)
(8/39) post_BrainMRIResultText: (83/278)
(9/39) post_HeadSonogramDate: (148/213)
(10/39) post_HeadSonogramTime: (146/215)
(11/39) post_HeadSonogramResult1: (147/214)
(12/39) post_HeadSonogramResult2: (21/340)
(13/39) post_HeadSonogramResult3: (8/353)
(14/39) post_HeadSonogramResult4: (2/359)
(15/39) post_HeadSonogramResult5: (0/361)
(16/39) post_HeadSonogramResult6: (0/361)
(17/39) post_HeadSonogramResult7: (0/361)
(18/39) post_HeadSonogramResult8: (0/361)
(19/39) post_HeadCTDate: (2/359)
(20/39) post_HeadCTTime: (2/359)
(21/39) post_HeadCTResult1: (2/359)
(22/39) post_HeadCTResult2: (2/359)
(23/39) post_HeadCTResult3: (1/360)
(24/39) post_HeadCTResult4: (0/361)
(25/39) post_HeadCTResult5: (0/361)
(26/39) post_HeadCTResult6: (0/361)
(27/39) 

## 03-04. Post-intervention Neuro Exam

In [151]:
neuro_exam_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'Neuro Exam')
all_neuro_exam_columns = id_columns + neuro_exam_columns
neuro_exam_columns

['post_NeuroExamSectionID',
 'post_NeuroExam',
 'post_NeuroExamDate',
 'post_NeuroExamTime',
 'post_NeuroExamLevelConsciousness',
 'post_NeuroExamSpontaneousActivity',
 'post_NeuroExamPosture',
 'post_NeuroExamTone',
 'post_NeuroExamSuck',
 'post_NeuroExamMoro',
 'post_NeuroExamPupils',
 'post_NeuroExamHeartRate',
 'post_NeuroExamRespiration',
 'post_NeuroExamSeizure',
 'post_NeuroExamSedate',
 'post_NeuroExamClonusSustained',
 'post_NeuroExamFistedHand',
 'post_NeuroExamAbnormalMovement',
 'post_NeuroExamGagReflexAbsent',
 'post_NeuroExamHypertonia',
 'post_NeuroExamAsymTonicNeckReflex']

In [152]:
df_neuro_exam = COMBINE_harmonizer.valid_columns(df_main, all_neuro_exam_columns, debug_df=True, debug_columns=True)
df_neuro_exam = COMBINE_harmonizer.postprocess(df_neuro_exam)

out_filename = os.sep.join([out_dir, '03-04-post-neuro-exam.csv'])
df_neuro_exam.to_csv(out_filename, index=False)

(1/418) siteID not in columns
(2/418) LAST not in columns
(3/418) FIRST not in columns
(4/418) screenComment not in columns
(5/418) REC_CMP not in columns
(7/418) birthDate not in columns
(8/418) birthNumber not in columns
(9/418) CMP_DATE not in columns
(10/418) CRT_DATE not in columns
(11/418) coreTempLess33p5COver1Hr_e not in columns
(12/418) first6HrCoolByClinicalProtocol_e not in columns
(13/418) chromosomalAbnormality_e not in columns
(14/418) majorCongenitalAnomaly_e not in columns
(15/418) birthWeightLessEq1800g_e not in columns
(16/418) infantUnlikelySurvive_e not in columns
(17/418) first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e not in columns
(18/418) first60MinAnyBloodGasPHLessEq7_i not in columns
(19/418) first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i not in columns
(20/418) historyPerinatalEvent_i not in columns
(21/418) at10MinApgarLessEq5OrVent_i not in columns
(22/418) pre_NeuroExamSeizure not in columns
(23/418) pre_NeuroExam not in columns
(24/418

### 03-04-1. check neuro exam

In [153]:
df_neuro_exam_groupby = df_neuro_exam.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_neuro_exam_groupby['_count'] > 1
df_neuro_exam_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 03-04-2. check empty cells

In [154]:
COMBINE_harmonizer.check_empty(df_neuro_exam)

(0/21) column: center (364 / 0)
(1/21) column: subjectID (364 / 0)
(2/21) column: uniqueID (364 / 0)
(3/21) column: post_NeuroExamTone (308 / 56)
(4/21) column: post_NeuroExamRespiration (308 / 56)
(5/21) column: post_NeuroExamSeizure (308 / 56)
(6/21) column: post_NeuroExamSedate (308 / 56)
(7/21) column: post_NeuroExamClonusSustained (308 / 56)
(8/21) column: post_NeuroExamFistedHand (308 / 56)
(9/21) column: post_NeuroExamAbnormalMovement (308 / 56)
(10/21) column: post_NeuroExamGagReflexAbsent (308 / 56)
(11/21) column: post_NeuroExamAsymTonicNeckReflex (0 / 364)
(12/21) column: post_NeuroExamDate (308 / 56)
(13/21) column: post_NeuroExamTime (307 / 57)
(14/21) column: post_NeuroExamLevelConsciousness (308 / 56)
(15/21) column: post_NeuroExamSpontaneousActivity (308 / 56)
(16/21) column: post_NeuroExamPosture (307 / 57)
(17/21) column: post_NeuroExamSuck (306 / 58)
(18/21) column: post_NeuroExamMoro (289 / 75)
(19/21) column: post_NeuroExamPupils (290 / 74)
(20/21) column: post_Neu

In [155]:
COMBINE_harmonizer.column_info(df_neuro_exam)

(0/21) center: (364/0)
(1/21) subjectID: (364/0)
(2/21) uniqueID: (364/0)
(3/21) post_NeuroExamTone: (308/56)
(4/21) post_NeuroExamRespiration: (308/56)
(5/21) post_NeuroExamSeizure: (308/56)
(6/21) post_NeuroExamSedate: (308/56)
(7/21) post_NeuroExamClonusSustained: (308/56)
(8/21) post_NeuroExamFistedHand: (308/56)
(9/21) post_NeuroExamAbnormalMovement: (308/56)
(10/21) post_NeuroExamGagReflexAbsent: (308/56)
(11/21) post_NeuroExamAsymTonicNeckReflex: (0/364)
(12/21) post_NeuroExamDate: (308/56)
(13/21) post_NeuroExamTime: (307/57)
(14/21) post_NeuroExamLevelConsciousness: (308/56)
(15/21) post_NeuroExamSpontaneousActivity: (308/56)
(16/21) post_NeuroExamPosture: (307/57)
(17/21) post_NeuroExamSuck: (306/58)
(18/21) post_NeuroExamMoro: (289/75)
(19/21) post_NeuroExamPupils: (290/74)
(20/21) post_NeuroExamHeartRate: (306/58)


## 03-05. MRI

In [156]:
mri_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Post-intervention', 'MRI')
all_mri_columns = list(dict.fromkeys(id_columns + mri_columns))
all_mri_columns

['center',
 'subjectID',
 'MRIAvailable',
 'MRIAvailable_c',
 'MRIObtain',
 'MRIObtainWindow',
 'MRIObtainWindow_c',
 'MRIDate',
 'MRITime',
 'MRIObtainComment',
 'MRISendRTIDate',
 'MRIReceiveRTIDate',
 'MRINoObtainReason',
 'MRINoObtainReason_c',
 'MRINoObtainReasonText',
 'MRIRead',
 'MRIScore',
 'MRIIteration',
 'MRIIncrement',
 'MRIID',
 'MRIReader',
 'MRIReadDate',
 'MRIStrength_T',
 'MRIStrength_c',
 'MRIAdequateQuality',
 'MRIAdequateQuality_c',
 'MRIT1Axial',
 'MRIT1Axial_c',
 'MRIT1Coronal',
 'MRIT1Coronal_c',
 'MRIT1Sagittal',
 'MRIT1Sagittal_c',
 'MRIT1',
 'MRIT2Axial',
 'MRIT2Axial_c',
 'MRIT2Coronal',
 'MRIT2Coronal_c',
 'MRIT2Sagittal',
 'MRIT2Sagittal_c',
 'MRIT2',
 'MRIT2FLAIRAxial',
 'MRIT2FLAIRAxial_c',
 'MRIT2FLAIRCoronal',
 'MRIT2FLAIRCoronal_c',
 'MRIT2FLAIRSagittal',
 'MRIT2FLAIRSagittal_c',
 'MRIT2FLAIR',
 'MRIGRESWIAxial',
 'MRIGRESWIAxial_c',
 'MRIGRESWICoronal',
 'MRIGRESWICoronal_c',
 'MRIGRESWISagittal',
 'MRIGRESWISagittal_c',
 'MRIGRESWI',
 'MRISPGRAxial'

In [157]:
df_mri = df_dict[_MRI_FILENAME].copy()

df_mri = COMBINE_harmonizer.valid_columns(df_mri, all_valid_columns, debug_df=True, debug_columns=False)
df_mri = COMBINE_harmonizer.postprocess(df_mri)

### XXX hack to reset siteID to meet with the filename
df_mri['siteID'] = ''

is_center03 = df_mri['center'] == '03'
df_mri.loc[is_center03, 'siteID'] = 'A'
print(f"center03: {list(df_mri['subjectID'][is_center03].sort_values())}")

is_center04 = df_mri['center'] == '04'
df_mri.loc[is_center04, 'siteID'] = '1'
print(f"center04: {list(df_mri['subjectID'][is_center04].sort_values())}")

is_center05 = df_mri['center'] == '05'
df_mri.loc[is_center05, 'siteID'] = 'A'
print(f"center05: {list(df_mri['subjectID'][is_center05].sort_values())}")

is_center09 = df_mri['center'] == '09'
is_subjectA = df_mri['subjectID'].isin(['OC0191', 'OC0401', 'OC0441'])
is_center09_subjectA = is_center09 * is_subjectA
is_center09_subjectB = is_center09 * (is_subjectA == False)
df_mri.loc[is_center09_subjectA, 'siteID'] = 'A'
df_mri.loc[is_center09_subjectB, 'siteID'] = 'B'
print(f"center09: {list(df_mri['subjectID'][is_center09].sort_values())}")

is_center11 = df_mri['center'] == '11'
df_mri.loc[is_center11, 'siteID'] = 'B'
print(f"center11: {list(df_mri['subjectID'][is_center11].sort_values())}")

is_center12 = df_mri['center'] == '12'
df_mri.loc[is_center12, 'siteID'] = 'B'
print(f"center12: {list(df_mri['subjectID'][is_center12].sort_values())}")

is_center14 = df_mri['center'] == '14'
df_mri.loc[is_center14, 'siteID'] = 'A'
print(f"center14: {list(df_mri['subjectID'][is_center14].sort_values())}")

is_center15 = df_mri['center'] == '15'
df_mri.loc[is_center15, 'siteID'] = 'A'
print(f"center15: {list(df_mri['subjectID'][is_center15].sort_values())}")

is_center16 = df_mri['center'] == '16'
df_mri.loc[is_center16, 'siteID'] = 'A'
print(f"center16: {list(df_mri['subjectID'][is_center16].sort_values())}")

is_center18 = df_mri['center'] == '18'
df_mri.loc[is_center18, 'siteID'] = '1'
print(f"center18: {list(df_mri['subjectID'][is_center18].sort_values())}")

is_center19 = df_mri['center'] == '19'
is_subjectA = df_mri['subjectID'].isin(['OC0381', 'OC0391', 'OC0411', 'OC0421', 'OC0441', 'OC0481', 'OC0561', 'OC0661', 'OC0671', 'OC0691', 'OC0851', 'OC0861'])
is_center19_subjectA = is_center19 * is_subjectA
is_center19_subjectE = is_center19 * (is_subjectA == False)
df_mri.loc[is_center19_subjectA, 'siteID'] = 'A'
df_mri.loc[is_center19_subjectE, 'siteID'] = 'E'
print(f"center19: {list(df_mri['subjectID'][is_center19].sort_values())}")

is_center24 = df_mri['center'] == '24'
df_mri.loc[is_center24, 'siteID'] = 'A'
print(f"center24: {list(df_mri['subjectID'][is_center24].sort_values())}")

is_center26 = df_mri['center'] == '26'
df_mri.loc[is_center26, 'siteID'] = 'A'
print(f"center26: {list(df_mri['subjectID'][is_center26].sort_values())}")

is_center27 = df_mri['center'] == '27'
df_mri.loc[is_center27, 'siteID'] = 'C'
print(f"center27: {list(df_mri['subjectID'][is_center27].sort_values())}")

is_center28 = df_mri['center'] == '28'
is_subject1 = df_mri['subjectID'].isin(['OC3051', 'OC3061', 'OC3081', 'OC3091', 'OC3161'])
is_center28_subject1 = is_center28 * is_subject1
is_center28_subject2 = is_center28 * (is_subject1 == False)
df_mri.loc[is_center28_subject1, 'siteID'] = '1'
df_mri.loc[is_center28_subject2, 'siteID'] = '2'
print(f"center28: {list(df_mri['subjectID'][is_center28].sort_values())}")

is_center29 = df_mri['center'] == '29'
df_mri.loc[is_center29, 'siteID'] = 'W'
print(f"center29: {list(df_mri['subjectID'][is_center29].sort_values())}")

is_center30 = df_mri['center'] == '30'
df_mri.loc[is_center30, 'siteID'] = 'A'
print(f"center30: {list(df_mri['subjectID'][is_center30].sort_values())}")

is_center31 = df_mri['center'] == '31'
df_mri.loc[is_center31, 'siteID'] = 'A'
print(f"center31: {list(df_mri['subjectID'][is_center31].sort_values())}")

df_mri['siteID'] = df_mri['siteID'].fillna('')
df_mri['MRI_ID'] = df_mri.apply(lambda x: f"{x['subjectID'][:-1]}_Center{x['center']}Site{x['siteID']}", axis=1)

out_filename = os.sep.join([out_dir, '03-05-mri.csv'])
df_mri.to_csv(out_filename, index=False)

(3/82) OM3READR not in columns
(43/82) OM3RDRSTAT not in columns
(44/82) OM3RDRSTAT_code not in columns
(45/82) OM3RDRSTATSP not in columns
(78/82) FormStatus not in columns
(79/82) DateComplete not in columns
(80/82) DateCreated not in columns
(81/82) KeyedUser not in columns
center03: ['OC3261', 'OC3261', 'OC3261', 'OC3281', 'OC3281', 'OC3281', 'OC3291', 'OC3331', 'OC3371', 'OC3371', 'OC3371', 'OC3381', 'OC3381', 'OC3381', 'OC3421', 'OC3421', 'OC3421', 'OC3431', 'OC3451', 'OC3451', 'OC3451']
center04: ['OC0031', 'OC0031', 'OC0031', 'OC0041', 'OC0081', 'OC0091', 'OC0091', 'OC0091', 'OC0101', 'OC0111', 'OC0111', 'OC0111', 'OC0121', 'OC0211', 'OC0211', 'OC0211', 'OC0241', 'OC0361', 'OC0371', 'OC0371', 'OC0371', 'OC0581', 'OC0661', 'OC0661', 'OC0661', 'OC0691', 'OC0691', 'OC0691', 'OC0761', 'OC0761', 'OC0761', 'OC0871', 'OC0871', 'OC0871', 'OC0881', 'OC0891', 'OC0901', 'OC0901', 'OC0901', 'OC0921', 'OC0951', 'OC0951', 'OC0951', 'OC0981', 'OC0981', 'OC0981', 'OC1091', 'OC1111', 'OC1111', 

### 03-05-1. check MRI

In [158]:
df_mri_groupby = df_mri.groupby(['uniqueID', 'MRIReader']).agg(_count=('uniqueID', 'count'))

is_invalid = df_mri_groupby['_count'] > 1
df_mri_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,MRIReader,Unnamed: 2_level_1


### 03-05-2. check empty

In [159]:
COMBINE_harmonizer.check_empty(df_mri)

(0/77) column: center (671 / 0)
(1/77) column: subjectID (671 / 0)
(2/77) column: uniqueID (671 / 0)
(3/77) column: MRIIncrement (671 / 0)
(4/77) column: MRIReader (671 / 0)
(5/77) column: MRIReadDate (661 / 10)
(6/77) column: MRIDate (594 / 77)
(7/77) column: MRIStrength_T (256 / 415)
(8/77) column: MRIAdequateQuality (266 / 405)
(9/77) column: MRIAdequateQuality_c (670 / 1)
(10/77) column: MRIT1Axial (671 / 0)
(11/77) column: MRIT1Coronal (671 / 0)
(12/77) column: MRIT1Sagittal (671 / 0)
(13/77) column: MRIT1 (671 / 0)
(14/77) column: MRIT2Axial (671 / 0)
(15/77) column: MRIT2Coronal (671 / 0)
(16/77) column: MRIT2Sagittal (671 / 0)
(17/77) column: MRIT2 (671 / 0)
(18/77) column: MRIT2FLAIRAxial (671 / 0)
(19/77) column: MRIT2FLAIRCoronal (671 / 0)
(20/77) column: MRIT2FLAIRSagittal (671 / 0)
(21/77) column: MRIT2FLAIR (671 / 0)
(22/77) column: MRIGRESWIAxial (671 / 0)
(23/77) column: MRIGRESWICoronal (671 / 0)
(24/77) column: MRIGRESWISagittal (671 / 0)
(25/77) column: MRIGRESWI (67

In [160]:
COMBINE_harmonizer.column_info(df_mri)

(0/77) center: (671/0)
(1/77) subjectID: (671/0)
(2/77) uniqueID: (671/0)
(3/77) MRIIncrement: (671/0)
(4/77) MRIReader: (671/0)
(5/77) MRIReadDate: (661/10)
(6/77) MRIDate: (594/77)
(7/77) MRIStrength_T: (256/415)
(8/77) MRIAdequateQuality: (266/405)
(9/77) MRIAdequateQuality_c: (670/1)
(10/77) MRIT1Axial: (671/0)
(11/77) MRIT1Coronal: (671/0)
(12/77) MRIT1Sagittal: (671/0)
(13/77) MRIT1: (671/0)
(14/77) MRIT2Axial: (671/0)
(15/77) MRIT2Coronal: (671/0)
(16/77) MRIT2Sagittal: (671/0)
(17/77) MRIT2: (671/0)
(18/77) MRIT2FLAIRAxial: (671/0)
(19/77) MRIT2FLAIRCoronal: (671/0)
(20/77) MRIT2FLAIRSagittal: (671/0)
(21/77) MRIT2FLAIR: (671/0)
(22/77) MRIGRESWIAxial: (671/0)
(23/77) MRIGRESWICoronal: (671/0)
(24/77) MRIGRESWISagittal: (671/0)
(25/77) MRIGRESWI: (671/0)
(26/77) MRISPGRAxial: (671/0)
(27/77) MRISPGRCoronal: (671/0)
(28/77) MRISPGRSagittal: (671/0)
(29/77) MRISPGR: (671/0)
(30/77) MRIDWI: (475/196)
(31/77) MRIDWI_c: (671/0)
(32/77) MRIADC: (1/670)
(33/77) MRIADC_c: (669/2)
(34/7

### 03-05-3. MRI summary

In [161]:
df_mri_s = None
for idx, each_filename in enumerate(_MRI_FILENAMES_MERGE):
    each_df = df_dict[each_filename]
    columns = list(each_df.columns)
    each_filename_prefix = re.sub(r'\.csv$', '', each_filename)
    print(f'({idx}/{len(_MRI_FILENAMES_MERGE)}) filename: {each_filename} columns: {columns}')

    if df_mri_s is None:
        df_mri_s = each_df
    else:
        df_mri_s = df_mri_s.merge(each_df, on=['center', 'subjectID'], how='outer', suffixes=['', ':' + each_filename_prefix])

# postprocess
df_mri_s = COMBINE_harmonizer.valid_columns(df_mri_s, all_valid_columns, debug_df=True, debug_columns=False)
df_mri_s = df_mri_s.drop_duplicates(df_mri_s.columns)
df_mri_s = COMBINE_harmonizer.postprocess(df_mri_s)

out_filename = os.sep.join([out_dir, '03-05_s-mri.csv'])
df_mri_s.to_csv(out_filename, index=False)

(0/2) filename: ocmr01.csv columns: ['center', 'subjectID', 'birthNumber', 'siteID', 'MRIAvailable', 'MRIAvailable_c', 'MRINoObtainReason', 'MRINoObtainReason_c', 'MRINoObtainReasonText', 'MRIObtainWindow', 'MRIObtainWindow_c', 'MRIObtainComment', 'FormStatus', 'DateComplete', 'DateCreated', 'KeyedUser']
(1/2) filename: ocmr02.csv columns: ['center', 'subjectID', 'MRIDate', 'MRITime', 'MRISendRTIDate', 'OM2INIT', 'MRIReceiveRTIDate', 'FormStatus', 'DateComplete', 'DateCreated', 'KeyedUser']
(12/25) FormStatus not in columns
(13/25) DateComplete not in columns
(14/25) DateCreated not in columns
(15/25) KeyedUser not in columns
(19/25) OM2INIT not in columns
(21/25) FormStatus:ocmr02 not in columns
(22/25) DateComplete:ocmr02 not in columns
(23/25) DateCreated:ocmr02 not in columns
(24/25) KeyedUser:ocmr02 not in columns


### 03-05-4. check MRI

In [162]:
df_mri_s_groupby = df_mri_s.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_mri_groupby['_count'] > 1
df_mri_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,MRIReader,Unnamed: 2_level_1


### 03-05-5. check empty cells

In [163]:
COMBINE_harmonizer.check_empty(df_mri_s)

(0/17) column: center (327 / 0)
(1/17) column: subjectID (327 / 0)
(2/17) column: uniqueID (327 / 0)
(3/17) column: birthNumber (327 / 0)
(4/17) column: siteID (232 / 95)
(5/17) column: MRIAvailable (327 / 0)
(6/17) column: MRIAvailable_c (327 / 0)
(7/17) column: MRINoObtainReason (327 / 0)
(8/17) column: MRINoObtainReason_c (10 / 317)
(9/17) column: MRINoObtainReasonText (5 / 322)
(10/17) column: MRIObtainWindow (327 / 0)
(11/17) column: MRIObtainWindow_c (200 / 127)
(12/17) column: MRIObtainComment (1 / 326)
(13/17) column: MRIDate (200 / 127)
(14/17) column: MRITime (200 / 127)
(15/17) column: MRISendRTIDate (200 / 127)
(16/17) column: MRIReceiveRTIDate (0 / 327)


In [164]:
COMBINE_harmonizer.column_info(df_mri_s)

(0/17) center: (327/0)
(1/17) subjectID: (327/0)
(2/17) uniqueID: (327/0)
(3/17) birthNumber: (327/0)
(4/17) siteID: (232/95)
(5/17) MRIAvailable: (327/0)
(6/17) MRIAvailable_c: (327/0)
(7/17) MRINoObtainReason: (327/0)
(8/17) MRINoObtainReason_c: (10/317)
(9/17) MRINoObtainReasonText: (5/322)
(10/17) MRIObtainWindow: (327/0)
(11/17) MRIObtainWindow_c: (200/127)
(12/17) MRIObtainComment: (1/326)
(13/17) MRIDate: (200/127)
(14/17) MRITime: (200/127)
(15/17) MRISendRTIDate: (200/127)
(16/17) MRIReceiveRTIDate: (0/327)


## 02-11. elevated temperature

## 02-12. fluctuated temperature

## 02-13. bradycardia

In [165]:
bradycardia_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Bradycardia')
all_bradycardia_columns = id_columns + bradycardia_columns
bradycardia_columns

['bradycardiaEventNumber',
 'bradycardiaBelow70Over15min',
 'bradycardiaEKG',
 'bradycardiaEKGResult',
 'bradycardiaEKGResultOtherText',
 'bradycardiaAntiarrhythmiaMedication',
 'bradycardiaDate',
 'bradycardiaTime',
 'bradycardiaDuration',
 'bradycardiaHeartRateMin']

In [166]:
df_bradycardia = df_dict['oc17.csv']
df_bradycardia = COMBINE_harmonizer.valid_columns(df_bradycardia, all_bradycardia_columns, debug_df=True, debug_columns=False)
df_bradycardia = COMBINE_harmonizer.postprocess(df_bradycardia)

out_filename = os.sep.join([out_dir, '02-13-bradycardia.csv'])
df_bradycardia.to_csv(out_filename, index=False)

(5/13) O17INIT not in columns


### 02-13-1. check bradycardia

In [167]:
df_bradycardia_groupby = df_bradycardia.groupby(['uniqueID', 'bradycardiaEventNumber']).agg(_count=('uniqueID', 'count'))

is_invalid = df_bradycardia_groupby['_count'] > 1
df_bradycardia_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,bradycardiaEventNumber,Unnamed: 2_level_1


### 02-13-2. check empty cells

In [168]:
COMBINE_harmonizer.check_empty(df_bradycardia)

(0/13) column: center (210 / 0)
(1/13) column: subjectID (210 / 0)
(2/13) column: uniqueID (210 / 0)
(3/13) column: bradycardiaBelow70Over15min (210 / 0)
(4/13) column: bradycardiaEKG (57 / 153)
(5/13) column: bradycardiaEKGResultOtherText (1 / 209)
(6/13) column: bradycardiaAntiarrhythmiaMedication (57 / 153)
(7/13) column: bradycardiaEventNumber (210 / 0)
(8/13) column: bradycardiaEKGResult (3 / 207)
(9/13) column: bradycardiaDate (57 / 153)
(10/13) column: bradycardiaTime (57 / 153)
(11/13) column: bradycardiaDuration (54 / 156)
(12/13) column: bradycardiaHeartRateMin (54 / 156)


In [169]:
COMBINE_harmonizer.column_info(df_bradycardia)

(0/13) center: (210/0)
(1/13) subjectID: (210/0)
(2/13) uniqueID: (210/0)
(3/13) bradycardiaBelow70Over15min: (210/0)
(4/13) bradycardiaEKG: (57/153)
(5/13) bradycardiaEKGResultOtherText: (1/209)
(6/13) bradycardiaAntiarrhythmiaMedication: (57/153)
(7/13) bradycardiaEventNumber: (210/0)
(8/13) bradycardiaEKGResult: (3/207)
(9/13) bradycardiaDate: (57/153)
(10/13) bradycardiaTime: (57/153)
(11/13) bradycardiaDuration: (54/156)
(12/13) bradycardiaHeartRateMin: (54/156)


## 02-14. adverse event

In [170]:
adverse_event_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Adverse Event')
all_adverse_event_columns = id_columns + adverse_event_columns
adverse_event_columns

['adverseEventNumber',
 'SAECardiacExperienceOnsetDate',
 'SAECardiacExperienceOnsetTime',
 'SAECardiacExperienceResolveDate',
 'SAECardiacExperienceResolveTime',
 'SAECardiacExperienceDueToHypothermia',
 'SAECardiacExperienceActionTaken',
 'SAECardiacExperienceOutcome',
 'SAECardiacExperienceComment',
 'SAEMetabolicAcidosisOnsetDate',
 'SAEMetabolicAcidosisOnsetTime',
 'SAEMetabolicAcidosisResolveDate',
 'SAEMetabolicAcidosisResolveTime',
 'SAEMetabolicAcidosisDueToHypothermia',
 'SAEMetabolicAcidosisActionTaken',
 'SAEMetabolicAcidosisOutcome',
 'SAEMetabolicAcidosisComment',
 'SAEThrombosisExperienceOnsetDate',
 'SAEThrombosisExperienceOnsetTime',
 'SAEThrombosisExperienceResolveDate',
 'SAEThrombosisExperienceResolveTime',
 'SAEThrombosisExperienceDueToHypothermia',
 'SAEThrombosisExperienceActionTaken',
 'SAEThrombosisExperienceOutcome',
 'SAEThrombosisExperienceComment',
 'SAEBleedingExperienceOnsetDate',
 'SAEBleedingExperienceOnsetTime',
 'SAEBleedingExperienceResolveDate',
 'S

In [171]:
df_adverse_event = df_dict['oc15.csv']
df_adverse_event = COMBINE_harmonizer.valid_columns(df_adverse_event, all_adverse_event_columns, debug_df=True, debug_columns=True)
df_adverse_event = COMBINE_harmonizer.postprocess(df_adverse_event)

out_filename = os.sep.join([out_dir, '02-14-adverse-event.csv'])
df_adverse_event.to_csv(out_filename, index=False)

(7/61) OC15INIT not in columns
(8/61) REC_CMP not in columns
(50/61) CMP_DATE not in columns
(51/61) CRT_DATE not in columns


### 02-14-1. check adverse event

In [172]:
df_adverse_event_groupby = df_adverse_event.groupby(['uniqueID', 'adverseEventNumber']).agg(_count=('uniqueID', 'count'))

is_invalid = df_adverse_event_groupby['_count'] > 1
df_adverse_event_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,adverseEventNumber,Unnamed: 2_level_1


### 02-14-2. check empty cells

In [173]:
COMBINE_harmonizer.check_empty(df_adverse_event)

(0/58) column: center (93 / 0)
(1/58) column: subjectID (93 / 0)
(2/58) column: uniqueID (93 / 0)
(3/58) column: SAECardiacExperienceComment (10 / 83)
(4/58) column: SAEMetabolicAcidosisComment (6 / 87)
(5/58) column: SAEThrombosisExperienceComment (1 / 92)
(6/58) column: SAEBleedingExperienceComment (5 / 88)
(7/58) column: SAEAlterationSkinIntegrityComment (9 / 84)
(8/58) column: SAEDeathComment (5 / 88)
(9/58) column: adverseEventNumber (93 / 0)
(10/58) column: SAECardiacExperienceOnsetDate (12 / 81)
(11/58) column: SAECardiacExperienceOnsetTime (12 / 81)
(12/58) column: SAECardiacExperienceResolveDate (11 / 82)
(13/58) column: SAECardiacExperienceResolveTime (11 / 82)
(14/58) column: SAECardiacExperienceDueToHypothermia (12 / 81)
(15/58) column: SAECardiacExperienceActionTaken (12 / 81)
(16/58) column: SAECardiacExperienceOutcome (12 / 81)
(17/58) column: SAEMetabolicAcidosisOnsetDate (8 / 85)
(18/58) column: SAEMetabolicAcidosisOnsetTime (8 / 85)
(19/58) column: SAEMetabolicAcidosi

In [174]:
COMBINE_harmonizer.column_info(df_adverse_event)

(0/58) center: (93/0)
(1/58) subjectID: (93/0)
(2/58) uniqueID: (93/0)
(3/58) SAECardiacExperienceComment: (10/83)
(4/58) SAEMetabolicAcidosisComment: (6/87)
(5/58) SAEThrombosisExperienceComment: (1/92)
(6/58) SAEBleedingExperienceComment: (5/88)
(7/58) SAEAlterationSkinIntegrityComment: (9/84)
(8/58) SAEDeathComment: (5/88)
(9/58) adverseEventNumber: (93/0)
(10/58) SAECardiacExperienceOnsetDate: (12/81)
(11/58) SAECardiacExperienceOnsetTime: (12/81)
(12/58) SAECardiacExperienceResolveDate: (11/82)
(13/58) SAECardiacExperienceResolveTime: (11/82)
(14/58) SAECardiacExperienceDueToHypothermia: (12/81)
(15/58) SAECardiacExperienceActionTaken: (12/81)
(16/58) SAECardiacExperienceOutcome: (12/81)
(17/58) SAEMetabolicAcidosisOnsetDate: (8/85)
(18/58) SAEMetabolicAcidosisOnsetTime: (8/85)
(19/58) SAEMetabolicAcidosisResolveDate: (5/88)
(20/58) SAEMetabolicAcidosisResolveTime: (5/88)
(21/58) SAEMetabolicAcidosisDueToHypothermia: (8/85)
(22/58) SAEMetabolicAcidosisActionTaken: (7/86)
(23/58) S

## 02-15. violation

In [175]:
violation_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Violation')
all_violation_columns = id_columns + violation_columns
violation_columns

['violationNumber',
 'violationDate',
 'violationNature',
 'violationTreatmentAssign',
 'violationTreatmentReceive',
 'violationOtherText',
 'violationCircumstance',
 'violationOtherCirumstanceText',
 'violationComment']

In [176]:
df_violation = df_dict['oc14.csv']
df_violation = COMBINE_harmonizer.valid_columns(df_violation, all_violation_columns, debug_df=True, debug_columns=False)
df_violation = COMBINE_harmonizer.postprocess(df_violation)

out_filename = os.sep.join([out_dir, '02-15-violation.csv'])
df_violation.to_csv(out_filename, index=False)

(4/13) OC14NAME not in columns
(5/13) OC14INIT not in columns


### 02-15-1. check violation

In [177]:
df_violation_groupby = df_violation.groupby(['uniqueID', 'violationNumber']).agg(_count=('uniqueID', 'count'))

is_invalid = df_violation_groupby['_count'] > 1
df_violation_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,violationNumber,Unnamed: 2_level_1


### 02-15-2. check empty cells

In [178]:
COMBINE_harmonizer.check_empty(df_violation)

(0/12) column: center (104 / 0)
(1/12) column: subjectID (104 / 0)
(2/12) column: uniqueID (104 / 0)
(3/12) column: violationOtherText (102 / 2)
(4/12) column: violationOtherCirumstanceText (94 / 10)
(5/12) column: violationComment (51 / 53)
(6/12) column: violationNumber (104 / 0)
(7/12) column: violationDate (104 / 0)
(8/12) column: violationNature (104 / 0)
(9/12) column: violationTreatmentAssign (0 / 104)
(10/12) column: violationTreatmentReceive (0 / 104)
(11/12) column: violationCircumstance (104 / 0)


In [179]:
COMBINE_harmonizer.column_info(df_violation)

(0/12) center: (104/0)
(1/12) subjectID: (104/0)
(2/12) uniqueID: (104/0)
(3/12) violationOtherText: (102/2)
(4/12) violationOtherCirumstanceText: (94/10)
(5/12) violationComment: (51/53)
(6/12) violationNumber: (104/0)
(7/12) violationDate: (104/0)
(8/12) violationNature: (104/0)
(9/12) violationTreatmentAssign: (0/104)
(10/12) violationTreatmentReceive: (0/104)
(11/12) violationCircumstance: (104/0)


## 02-16. Interrupt

In [180]:
interrupt_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Interrupt')
all_interrupt_columns = id_columns + interrupt_columns
interrupt_columns

['interruptNumber',
 'interrupt',
 'interruptReason',
 'interruptReasonText',
 'interruptDate',
 'interruptTime',
 'interruptRestartDate',
 'interruptRestartTime',
 'interruptRestartEsophagealTemperature_C']

In [181]:
df_interrupt = df_dict['oc10.csv']
df_interrupt = COMBINE_harmonizer.valid_columns(df_interrupt, all_interrupt_columns, debug_df=True, debug_columns=False)
df_interrupt = COMBINE_harmonizer.postprocess(df_interrupt)

out_filename = os.sep.join([out_dir, '02-16-interrupt.csv'])
df_interrupt.to_csv(out_filename, index=False)

(3/15) OC10INIT not in columns
(4/15) REC_CMP not in columns
(13/15) CMP_DATE not in columns
(14/15) CRT_DATE not in columns


### 02-16-1. check interrupt

In [182]:
df_interrupt_groupby = df_interrupt.groupby(['uniqueID', 'interruptNumber']).agg(_count=('uniqueID', 'count'))

is_invalid = df_interrupt_groupby['_count'] > 1
df_interrupt_groupby[is_invalid]

Unnamed: 0_level_0,Unnamed: 1_level_0,_count
uniqueID,interruptNumber,Unnamed: 2_level_1


### 02-16-2. check empty cells

In [183]:
COMBINE_harmonizer.check_empty(df_interrupt)

(0/12) column: center (249 / 0)
(1/12) column: subjectID (249 / 0)
(2/12) column: uniqueID (249 / 0)
(3/12) column: interrupt (249 / 0)
(4/12) column: interruptReasonText (16 / 233)
(5/12) column: interruptNumber (249 / 0)
(6/12) column: interruptReason (20 / 229)
(7/12) column: interruptDate (20 / 229)
(8/12) column: interruptTime (19 / 230)
(9/12) column: interruptRestartDate (14 / 235)
(10/12) column: interruptRestartTime (14 / 235)
(11/12) column: interruptRestartEsophagealTemperature_C (12 / 237)


In [184]:
COMBINE_harmonizer.column_info(df_interrupt)

(0/12) center: (249/0)
(1/12) subjectID: (249/0)
(2/12) uniqueID: (249/0)
(3/12) interrupt: (249/0)
(4/12) interruptReasonText: (16/233)
(5/12) interruptNumber: (249/0)
(6/12) interruptReason: (20/229)
(7/12) interruptDate: (20/229)
(8/12) interruptTime: (19/230)
(9/12) interruptRestartDate: (14/235)
(10/12) interruptRestartTime: (14/235)
(11/12) interruptRestartEsophagealTemperature_C: (12/237)


## 02-17. Discontinue

In [185]:
discontinue_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Intervention', 'Discontinue')
all_discontinue_columns = id_columns + discontinue_columns
discontinue_columns

['discontinueDate',
 'discontinueTime',
 'discontinueBeforeEndPeriod',
 'discontinueParentsWithdraw',
 'discontinuePhysicianWithdraw',
 'discontinueAdverseEvent',
 'discontinueECMO',
 'discontinueDNR',
 'discontinueWdrawSupport',
 'discontinueDeath',
 'discontinueOther',
 'discontinueOtherText']

In [186]:
def _inverse_discontinue_before_end_period(x):
    if x == 'Y':
        return 'N'
    elif x == 'N':
        return 'Y'
    else:
        return x

In [187]:
df_temperature_72_120 = pd.concat([df_dict['oc06t72.csv'], df_dict['oc06t120.csv']]).reset_index()

### XXX discontinueBeforeEndPeriod is inversed
df_temperature_72_120['discontinueBeforeEndPeriod'] = df_temperature_72_120['discontinueBeforeEndPeriod'].apply(_inverse_discontinue_before_end_period)

### XXX discontinue reasons
is_discontinue_before_end_period = df_temperature_72_120['discontinueBeforeEndPeriod'] == 'Y'

df_temperature_72_120['OC6NCEPR'] = df_temperature_72_120['OC6NCEPR'].apply(COMBINE_harmonizer.to_int)

is_discontinue_parents_withdraw = df_temperature_72_120['OC6NCEPR'] == 1
df_temperature_72_120.loc[is_discontinue_before_end_period, 'discontinueParentsWithdraw'] = is_discontinue_parents_withdraw

is_discontinue_physician_withdraw = df_temperature_72_120['OC6NCEPR'] == 2
df_temperature_72_120.loc[is_discontinue_before_end_period, 'discontinuePhysicianWithdraw'] = is_discontinue_physician_withdraw

is_discontinue_adverse_event = df_temperature_72_120['OC6NCEPR'] == 3
df_temperature_72_120.loc[is_discontinue_before_end_period, 'discontinueAdverseEvent'] = is_discontinue_adverse_event

is_discontinue_ecmo = df_temperature_72_120['OC6NCEPR'] == 4
df_temperature_72_120.loc[is_discontinue_before_end_period, 'discontinueECMO'] = is_discontinue_ecmo

is_discontinue_dnr = df_temperature_72_120['OC6NCEPR'] == 5
df_temperature_72_120.loc[is_discontinue_before_end_period, 'discontinueDNR'] = is_discontinue_dnr

is_discontinue_wdraw_support = df_temperature_72_120['OC6NCEPR'] == 6
df_temperature_72_120.loc[is_discontinue_before_end_period, 'discontinueWdrawSupport'] = is_discontinue_wdraw_support

is_discontinue_death = df_temperature_72_120['OC6NCEPR'] == 7
df_temperature_72_120.loc[is_discontinue_before_end_period, 'discontinueDeath'] = is_discontinue_death

is_discontinue_other = df_temperature_72_120['OC6NCEPR'] == 9
df_temperature_72_120.loc[is_discontinue_before_end_period, 'discontinueOther'] = is_discontinue_other

# postprocess df_discontinue
df_discontinue = COMBINE_harmonizer.valid_columns(df_temperature_72_120, all_discontinue_columns, debug_df=False, debug_columns=True)
df_discontinue = COMBINE_harmonizer.postprocess(df_discontinue)

out_filename = os.sep.join([out_dir, '02-17-discontinue.csv'])
df_discontinue.to_csv(out_filename, index=False)

### 02-17-2. check empty cells

In [188]:
COMBINE_harmonizer.check_empty(df_discontinue)

(0/15) column: center (364 / 0)
(1/15) column: subjectID (364 / 0)
(2/15) column: uniqueID (364 / 0)
(3/15) column: discontinueBeforeEndPeriod (364 / 0)
(4/15) column: discontinueOtherText (9 / 355)
(5/15) column: discontinueDate (55 / 309)
(6/15) column: discontinueTime (55 / 309)
(7/15) column: discontinueParentsWithdraw (53 / 311)
(8/15) column: discontinuePhysicianWithdraw (53 / 311)
(9/15) column: discontinueAdverseEvent (53 / 311)
(10/15) column: discontinueECMO (53 / 311)
(11/15) column: discontinueDNR (53 / 311)
(12/15) column: discontinueWdrawSupport (53 / 311)
(13/15) column: discontinueDeath (53 / 311)
(14/15) column: discontinueOther (53 / 311)


In [189]:
COMBINE_harmonizer.column_info(df_discontinue)

(0/15) center: (364/0)
(1/15) subjectID: (364/0)
(2/15) uniqueID: (364/0)
(3/15) discontinueBeforeEndPeriod: (364/0)
(4/15) discontinueOtherText: (9/355)
(5/15) discontinueDate: (55/309)
(6/15) discontinueTime: (55/309)
(7/15) discontinueParentsWithdraw: (53/311)
(8/15) discontinuePhysicianWithdraw: (53/311)
(9/15) discontinueAdverseEvent: (53/311)
(10/15) discontinueECMO: (53/311)
(11/15) discontinueDNR: (53/311)
(12/15) discontinueWdrawSupport: (53/311)
(13/15) discontinueDeath: (53/311)
(14/15) discontinueOther: (53/311)


## 04-16. Wdraw Support

In [190]:
wdraw_support_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Withdrawal of Support')
all_wdraw_support_columns = id_columns + wdraw_support_columns
wdraw_support_columns

['wdrawSupport',
 'wdrawSupportDate',
 'wdrawSupportTime',
 'wdrawSupportDiscussedWithFamily',
 'wdrawSupportRecommendSolelyByClinicalTeam',
 'wdrawSupportNeurologicalExam',
 'wdrawSupportImagingStudy',
 'wdrawSupportEEGFinding',
 'wdrawSupportMultisystemOrganFailureOtherThanCNS',
 'wdrawSupportBrainBloodFlowScan',
 'wdrawSupportParentWish',
 'wdrawSupportOther',
 'wdrawSupportOtherText']

In [191]:
df_wdraw_support = COMBINE_harmonizer.valid_columns(df_main, all_wdraw_support_columns, debug_df=False, debug_columns=True)
df_wdraw_support = COMBINE_harmonizer.postprocess(df_wdraw_support)

out_filename = os.sep.join([out_dir, '04-16-wdraw-support.csv'])
df_wdraw_support.to_csv(out_filename, index=False)

### 04-16-1. check wdraw support

In [192]:
df_wdraw_support_groupby = df_wdraw_support.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_wdraw_support_groupby['_count'] > 1
df_wdraw_support_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 04-16-2. check empty cells

In [193]:
COMBINE_harmonizer.check_empty(df_wdraw_support)

(0/16) column: center (364 / 0)
(1/16) column: subjectID (364 / 0)
(2/16) column: uniqueID (364 / 0)
(3/16) column: wdrawSupportDiscussedWithFamily (364 / 0)
(4/16) column: wdrawSupportRecommendSolelyByClinicalTeam (59 / 305)
(5/16) column: wdrawSupportNeurologicalExam (59 / 305)
(6/16) column: wdrawSupportImagingStudy (59 / 305)
(7/16) column: wdrawSupportEEGFinding (59 / 305)
(8/16) column: wdrawSupportMultisystemOrganFailureOtherThanCNS (59 / 305)
(9/16) column: wdrawSupportBrainBloodFlowScan (59 / 305)
(10/16) column: wdrawSupportParentWish (59 / 305)
(11/16) column: wdrawSupportOther (59 / 305)
(12/16) column: wdrawSupportOtherText (6 / 358)
(13/16) column: wdrawSupport (59 / 305)
(14/16) column: wdrawSupportDate (44 / 320)
(15/16) column: wdrawSupportTime (43 / 321)


In [194]:
COMBINE_harmonizer.column_info(df_wdraw_support)

(0/16) center: (364/0)
(1/16) subjectID: (364/0)
(2/16) uniqueID: (364/0)
(3/16) wdrawSupportDiscussedWithFamily: (364/0)
(4/16) wdrawSupportRecommendSolelyByClinicalTeam: (59/305)
(5/16) wdrawSupportNeurologicalExam: (59/305)
(6/16) wdrawSupportImagingStudy: (59/305)
(7/16) wdrawSupportEEGFinding: (59/305)
(8/16) wdrawSupportMultisystemOrganFailureOtherThanCNS: (59/305)
(9/16) wdrawSupportBrainBloodFlowScan: (59/305)
(10/16) wdrawSupportParentWish: (59/305)
(11/16) wdrawSupportOther: (59/305)
(12/16) wdrawSupportOtherText: (6/358)
(13/16) wdrawSupport: (59/305)
(14/16) wdrawSupportDate: (44/320)
(15/16) wdrawSupportTime: (43/321)


## 04-17. Limitation of Care

In [195]:
limit_care_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Limitation of Care')
all_limit_care_columns = id_columns + limit_care_columns
limit_care_columns

['limitCareDiscussedWithFamily',
 'limitCareRecommendSolelyByClinicalTeam',
 'limitCareAgreedByFamilyAndCareTeam',
 'limitCareNoFurtherMechanicalVentilationAndIntubation',
 'limitCareNoFurtherVentilationWithBagAndMask',
 'limitCareNoFurtherMedicationsToSupportBP',
 'limitCareNoFurtherChestCompression',
 'limitCareNoFurtherEmergencyMedication',
 'limitCareDNR',
 'limitCareDNRDate',
 'limitCareDNRTime']

In [196]:
df_limit_care = COMBINE_harmonizer.valid_columns(df_main, all_limit_care_columns, debug_df=False, debug_columns=True)
df_limit_care = COMBINE_harmonizer.postprocess(df_limit_care)

out_filename = os.sep.join([out_dir, '04-17-limit-care.csv'])
df_limit_care.to_csv(out_filename, index=False)

### 04-17-1. check limit care

In [197]:
df_limit_care_groupby = df_limit_care.groupby(['uniqueID']).agg(_count=('uniqueID', 'count'))

is_invalid = df_limit_care_groupby['_count'] > 1
df_limit_care_groupby[is_invalid]

Unnamed: 0_level_0,_count
uniqueID,Unnamed: 1_level_1


### 04-17-2. check empty cells

In [198]:
COMBINE_harmonizer.check_empty(df_limit_care)

(0/14) column: center (364 / 0)
(1/14) column: subjectID (364 / 0)
(2/14) column: uniqueID (364 / 0)
(3/14) column: limitCareDiscussedWithFamily (364 / 0)
(4/14) column: limitCareRecommendSolelyByClinicalTeam (45 / 319)
(5/14) column: limitCareAgreedByFamilyAndCareTeam (45 / 319)
(6/14) column: limitCareNoFurtherMechanicalVentilationAndIntubation (40 / 324)
(7/14) column: limitCareNoFurtherVentilationWithBagAndMask (40 / 324)
(8/14) column: limitCareNoFurtherMedicationsToSupportBP (40 / 324)
(9/14) column: limitCareNoFurtherChestCompression (40 / 324)
(10/14) column: limitCareNoFurtherEmergencyMedication (40 / 324)
(11/14) column: limitCareDNR (45 / 319)
(12/14) column: limitCareDNRDate (27 / 337)
(13/14) column: limitCareDNRTime (22 / 342)


In [199]:
COMBINE_harmonizer.column_info(df_limit_care)

(0/14) center: (364/0)
(1/14) subjectID: (364/0)
(2/14) uniqueID: (364/0)
(3/14) limitCareDiscussedWithFamily: (364/0)
(4/14) limitCareRecommendSolelyByClinicalTeam: (45/319)
(5/14) limitCareAgreedByFamilyAndCareTeam: (45/319)
(6/14) limitCareNoFurtherMechanicalVentilationAndIntubation: (40/324)
(7/14) limitCareNoFurtherVentilationWithBagAndMask: (40/324)
(8/14) limitCareNoFurtherMedicationsToSupportBP: (40/324)
(9/14) limitCareNoFurtherChestCompression: (40/324)
(10/14) limitCareNoFurtherEmergencyMedication: (40/324)
(11/14) limitCareDNR: (45/319)
(12/14) limitCareDNRDate: (27/337)
(13/14) limitCareDNRTime: (22/342)


## 04-01. Status

In [200]:
status_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Status')
all_status_columns = id_columns + status_columns
status_columns

['status',
 'statusDate',
 'dischargeStatus',
 'dischargeDate',
 'dischargeWeight_g',
 'dischargeLength_cm',
 'dischargeHeadCircumference_cm',
 'transferReason',
 'transferDate',
 'transferWeight_g',
 'transferLength_cm',
 'transferHeadCircumference_cm',
 'transferOutcome',
 'homeTherapyStatus',
 'homeTherapyVentilator',
 'homeTherapyOxygen',
 'homeTherapyGavageTubeFeed',
 'homeTherapyGastrostomyTubeFeed',
 'homeTherapyTemperatureBlanket',
 'homeTherapyAnticonvulsantMedication',
 'homeTherapyOther',
 'homeTherapyOtherText',
 'deathDate',
 'deathTime',
 'deathAutopsy',
 'deathCause',
 'deathCauseText',
 'deathSrc']

In [201]:
df_status = df_main.copy()
status_rename_map = {
    'dischargeStatus': 'status',
    'dischargeHomeTherapy': 'homeTherapyStatus',
    'dischargeHomeTherapyVentilator': 'homeTherapyVentilator',
    'dischargeHomeTherapyOxygen': 'homeTherapyOxygen',
    'dischargeHomeTherapyGavageTubeFeed': 'homeTherapyGavageTubeFeed',
    'dischargeHomeTherapyGastrostomyTubeFeed': 'homeTherapyGastrostomyTubeFeed',
    'dischargeHomeTherapyTemperatureBlanket': 'homeTherapyTemperatureBlanket',
    'dischargeHomeTherapyAnticonvulsantMedication': 'homeTherapyAnticonvulsantMedication',
    'dischargeHomeTherapyOther': 'homeTherapyOther',
    'dischargeHomeTherapyOtherText': 'homeTherapyOtherText',
}
df_status = df_status.rename(columns=status_rename_map)

# transfer
is_transfer = df_status['status'].isin(['2', '2.0'])
transfer_map = {
    'dischargeDate': 'transferDate',
    'dischargeWeight_g': 'transferWeight_g',
    'dischargeLength_cm': 'transferLength_cm',
    'dischargeHeadCircumference_cm': 'transferHeadCircumference_cm',
}
for map_from, map_to in transfer_map.items():
    df_status.loc[is_transfer, map_to] = df_status.loc[is_transfer, map_from]

    df_status.loc[is_transfer, map_from] = float('nan')

# XXX refine discharge
is_discharge = df_status['status'].isin(['1', '1.0'])
is_no_discharge_date = df_status['dischargeDate'].isnull()
is_invalid = is_discharge & is_no_discharge_date
print(f'no dischargeDate: {is_invalid.sum()}')
df_status.loc[is_invalid, 'dischargeDate'] = df_status.loc[is_invalid, 'statusDate']

df_status = COMBINE_harmonizer.valid_columns(df_status, all_status_columns, debug_df=False, debug_columns=True)
df_status = COMBINE_harmonizer.postprocess(df_status)

out_filename = os.sep.join([out_dir, '04-01-status.csv'])
df_status.to_csv(out_filename, index=False)

no dischargeDate: 1
(4/30) dischargeStatus not in df


### 04-01-2. check empty cells

In [202]:
COMBINE_harmonizer.check_empty(df_status)

(0/30) column: center (364 / 0)
(1/30) column: subjectID (364 / 0)
(2/30) column: uniqueID (364 / 0)
(3/30) column: homeTherapyStatus (314 / 50)
(4/30) column: homeTherapyVentilator (118 / 246)
(5/30) column: homeTherapyOxygen (118 / 246)
(6/30) column: homeTherapyGavageTubeFeed (118 / 246)
(7/30) column: homeTherapyGastrostomyTubeFeed (118 / 246)
(8/30) column: homeTherapyTemperatureBlanket (118 / 246)
(9/30) column: homeTherapyAnticonvulsantMedication (118 / 246)
(10/30) column: homeTherapyOther (118 / 246)
(11/30) column: homeTherapyOtherText (36 / 328)
(12/30) column: deathAutopsy (49 / 315)
(13/30) column: deathCauseText (8 / 356)
(14/30) column: status (364 / 0)
(15/30) column: statusDate (364 / 0)
(16/30) column: dischargeWeight_g (335 / 29)
(17/30) column: dischargeLength_cm (316 / 48)
(18/30) column: dischargeHeadCircumference_cm (318 / 46)
(19/30) column: transferReason (26 / 338)
(20/30) column: transferOutcome (26 / 338)
(21/30) column: dischargeDate (290 / 74)
(22/30) colu

In [203]:
COMBINE_harmonizer.column_info(df_status)

(0/30) center: (364/0)
(1/30) subjectID: (364/0)
(2/30) uniqueID: (364/0)
(3/30) homeTherapyStatus: (314/50)
(4/30) homeTherapyVentilator: (118/246)
(5/30) homeTherapyOxygen: (118/246)
(6/30) homeTherapyGavageTubeFeed: (118/246)
(7/30) homeTherapyGastrostomyTubeFeed: (118/246)
(8/30) homeTherapyTemperatureBlanket: (118/246)
(9/30) homeTherapyAnticonvulsantMedication: (118/246)
(10/30) homeTherapyOther: (118/246)
(11/30) homeTherapyOtherText: (36/328)
(12/30) deathAutopsy: (49/315)
(13/30) deathCauseText: (8/356)
(14/30) status: (364/0)
(15/30) statusDate: (364/0)
(16/30) dischargeWeight_g: (335/29)
(17/30) dischargeLength_cm: (316/48)
(18/30) dischargeHeadCircumference_cm: (318/46)
(19/30) transferReason: (26/338)
(20/30) transferOutcome: (26/338)
(21/30) dischargeDate: (290/74)
(22/30) deathDate: (49/315)
(23/30) deathTime: (49/315)
(24/30) deathCause: (49/315)
(25/30) deathSrc: (49/315)
(26/30) transferDate: (23/341)
(27/30) transferWeight_g: (24/340)
(28/30) transferLength_cm: (21/3

## 04-02. Discharge Neuro Exam

In [204]:
discharge_neuro_exam_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Neuro Exam')
all_discharge_neuro_exam_columns = id_columns + discharge_neuro_exam_columns
discharge_neuro_exam_columns

['dischargeNeuroExam',
 'dischargeNeuroExamStatus',
 'dischargeNeuroExamDate',
 'dischargeNeuroExamTime',
 'dischargeNeuroExamLevelConsciousness',
 'dischargeNeuroExamSpontaneousActivity',
 'dischargeNeuroExamPosture',
 'dischargeNeuroExamTone',
 'dischargeNeuroExamSuck',
 'dischargeNeuroExamMoro',
 'dischargeNeuroExamPupils',
 'dischargeNeuroExamHeartRate',
 'dischargeNeuroExamRespiration',
 'dischargeNeuroExamSeizure',
 'dischargeNeuroExamClonusSustained',
 'dischargeNeuroExamFistedHand',
 'dischargeNeuroExamAbnormalMovement',
 'dischargeNeuroExamGagReflexAbsent',
 'dischargeNeuroExamSedate',
 'dischargeNeuroExamHypertonia',
 'dischargeNeuroExamAsymTonicNeckReflex']

In [205]:
df_discharge_neuro_exam = COMBINE_harmonizer.valid_columns(df_main, all_discharge_neuro_exam_columns, debug_df=False, debug_columns=True)
df_discharge_neuro_exam = COMBINE_harmonizer.postprocess(df_discharge_neuro_exam)

out_filename = os.sep.join([out_dir, '04-02-neuro-exam.csv'])
df_discharge_neuro_exam.to_csv(out_filename, index=False)

(2/23) dischargeNeuroExam not in df
(21/23) dischargeNeuroExamHypertonia not in df


### 04-02-2. check empty cells

In [206]:
COMBINE_harmonizer.check_empty(df_discharge_neuro_exam)

(0/22) column: center (364 / 0)
(1/22) column: subjectID (364 / 0)
(2/22) column: uniqueID (364 / 0)
(3/22) column: dischargeNeuroExamTone (333 / 31)
(4/22) column: dischargeNeuroExamRespiration (333 / 31)
(5/22) column: dischargeNeuroExamSeizure (333 / 31)
(6/22) column: dischargeNeuroExamSedate (333 / 31)
(7/22) column: dischargeNeuroExamClonusSustained (333 / 31)
(8/22) column: dischargeNeuroExamFistedHand (333 / 31)
(9/22) column: dischargeNeuroExamAbnormalMovement (333 / 31)
(10/22) column: dischargeNeuroExamGagReflexAbsent (333 / 31)
(11/22) column: dischargeNeuroExamAsymTonicNeckReflex (328 / 36)
(12/22) column: dischargeNeuroExamStatus (331 / 33)
(13/22) column: dischargeNeuroExamDate (333 / 31)
(14/22) column: dischargeNeuroExamTime (326 / 38)
(15/22) column: dischargeNeuroExamLevelConsciousness (333 / 31)
(16/22) column: dischargeNeuroExamSpontaneousActivity (332 / 32)
(17/22) column: dischargeNeuroExamPosture (332 / 32)
(18/22) column: dischargeNeuroExamSuck (332 / 32)
(19/2

In [207]:
COMBINE_harmonizer.column_info(df_discharge_neuro_exam)

(0/22) center: (364/0)
(1/22) subjectID: (364/0)
(2/22) uniqueID: (364/0)
(3/22) dischargeNeuroExamTone: (333/31)
(4/22) dischargeNeuroExamRespiration: (333/31)
(5/22) dischargeNeuroExamSeizure: (333/31)
(6/22) dischargeNeuroExamSedate: (333/31)
(7/22) dischargeNeuroExamClonusSustained: (333/31)
(8/22) dischargeNeuroExamFistedHand: (333/31)
(9/22) dischargeNeuroExamAbnormalMovement: (333/31)
(10/22) dischargeNeuroExamGagReflexAbsent: (333/31)
(11/22) dischargeNeuroExamAsymTonicNeckReflex: (328/36)
(12/22) dischargeNeuroExamStatus: (331/33)
(13/22) dischargeNeuroExamDate: (333/31)
(14/22) dischargeNeuroExamTime: (326/38)
(15/22) dischargeNeuroExamLevelConsciousness: (333/31)
(16/22) dischargeNeuroExamSpontaneousActivity: (332/32)
(17/22) dischargeNeuroExamPosture: (332/32)
(18/22) dischargeNeuroExamSuck: (332/32)
(19/22) dischargeNeuroExamMoro: (330/34)
(20/22) dischargeNeuroExamPupils: (328/36)
(21/22) dischargeNeuroExamHeartRate: (332/32)


## 04-03. Discharge Cardiovascular

In [208]:
discharge_cardiovascular_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Cardiovascular')
all_discharge_cardiovascular_columns = id_columns + discharge_cardiovascular_columns
discharge_cardiovascular_columns

['dischargeCardiomegaly',
 'dischargeCardiacFailure',
 'dischargeCardiacDysfunctionByEcho',
 'dischargeCardiacIschemiaByEKG',
 'dischargeHypotension',
 'dischargeArrhythmia']

In [209]:
df_discharge_cardiovascular = COMBINE_harmonizer.valid_columns(df_main, all_discharge_cardiovascular_columns, debug_df=False, debug_columns=True)
df_discharge_cardiovascular = COMBINE_harmonizer.postprocess(df_discharge_cardiovascular)

out_filename = os.sep.join([out_dir, '04-03-cardiovascular.csv'])
df_discharge_cardiovascular.to_csv(out_filename, index=False)

### 04-03-2. check empty cells

In [210]:
COMBINE_harmonizer.check_empty(df_discharge_cardiovascular)

(0/9) column: center (364 / 0)
(1/9) column: subjectID (364 / 0)
(2/9) column: uniqueID (364 / 0)
(3/9) column: dischargeCardiomegaly (364 / 0)
(4/9) column: dischargeCardiacFailure (364 / 0)
(5/9) column: dischargeCardiacDysfunctionByEcho (364 / 0)
(6/9) column: dischargeCardiacIschemiaByEKG (364 / 0)
(7/9) column: dischargeHypotension (364 / 0)
(8/9) column: dischargeArrhythmia (364 / 0)


In [211]:
COMBINE_harmonizer.column_info(df_discharge_cardiovascular)

(0/9) center: (364/0)
(1/9) subjectID: (364/0)
(2/9) uniqueID: (364/0)
(3/9) dischargeCardiomegaly: (364/0)
(4/9) dischargeCardiacFailure: (364/0)
(5/9) dischargeCardiacDysfunctionByEcho: (364/0)
(6/9) dischargeCardiacIschemiaByEKG: (364/0)
(7/9) dischargeHypotension: (364/0)
(8/9) dischargeArrhythmia: (364/0)


## 04-04. Discharge Respiratory

In [212]:
discharge_respiratory_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Respiratory')
all_discharge_respiratory_columns = id_columns + discharge_respiratory_columns
discharge_respiratory_columns

['dischargeMeconiumAspirationSyndrome',
 'dischargePPHN',
 'dischargePulmonaryHemorrhage',
 'dischargePenumonia',
 'dischargeChronicLungDisease',
 'dischargeECMO',
 'dischargeINO',
 'dischargeVentilator_day',
 'dischargeOxygen_day',
 'dischargeCPAP_day',
 'dischargePulmonaryStartDate1',
 'dischargePulmonaryStartTime1',
 'dischargePulmonaryEndDate1',
 'dischargePulmonaryEndTime1',
 'dischargePulmonaryStartDate2',
 'dischargePulmonaryStartTime2',
 'dischargePulmonaryEndDate2',
 'dischargePulmonaryEndTime2',
 'dischargePulmonaryStartDate3',
 'dischargePulmonaryStartTime3',
 'dischargePulmonaryEndDate3',
 'dischargePulmonaryEndTime3']

In [213]:
df_discharge_respiratory = COMBINE_harmonizer.valid_columns(df_main, all_discharge_respiratory_columns, debug_df=False, debug_columns=True)
df_discharge_respiratory = COMBINE_harmonizer.postprocess(df_discharge_respiratory)

out_filename = os.sep.join([out_dir, '04-04-respiratory.csv'])
df_discharge_respiratory.to_csv(out_filename, index=False)

(12/24) dischargePulmonaryStartDate1 not in df
(13/24) dischargePulmonaryStartTime1 not in df
(14/24) dischargePulmonaryEndDate1 not in df
(15/24) dischargePulmonaryEndTime1 not in df
(16/24) dischargePulmonaryStartDate2 not in df
(17/24) dischargePulmonaryStartTime2 not in df
(18/24) dischargePulmonaryEndDate2 not in df
(19/24) dischargePulmonaryEndTime2 not in df
(20/24) dischargePulmonaryStartDate3 not in df
(21/24) dischargePulmonaryStartTime3 not in df
(22/24) dischargePulmonaryEndDate3 not in df
(23/24) dischargePulmonaryEndTime3 not in df


### 04-04-2. check empty cells

In [214]:
COMBINE_harmonizer.check_empty(df_discharge_respiratory)

(0/13) column: center (364 / 0)
(1/13) column: subjectID (364 / 0)
(2/13) column: uniqueID (364 / 0)
(3/13) column: dischargeMeconiumAspirationSyndrome (364 / 0)
(4/13) column: dischargePPHN (364 / 0)
(5/13) column: dischargePulmonaryHemorrhage (364 / 0)
(6/13) column: dischargePenumonia (364 / 0)
(7/13) column: dischargeChronicLungDisease (364 / 0)
(8/13) column: dischargeECMO (364 / 0)
(9/13) column: dischargeINO (364 / 0)
(10/13) column: dischargeVentilator_day (364 / 0)
(11/13) column: dischargeOxygen_day (364 / 0)
(12/13) column: dischargeCPAP_day (364 / 0)


In [215]:
COMBINE_harmonizer.column_info(df_discharge_respiratory)

(0/13) center: (364/0)
(1/13) subjectID: (364/0)
(2/13) uniqueID: (364/0)
(3/13) dischargeMeconiumAspirationSyndrome: (364/0)
(4/13) dischargePPHN: (364/0)
(5/13) dischargePulmonaryHemorrhage: (364/0)
(6/13) dischargePenumonia: (364/0)
(7/13) dischargeChronicLungDisease: (364/0)
(8/13) dischargeECMO: (364/0)
(9/13) dischargeINO: (364/0)
(10/13) dischargeVentilator_day: (364/0)
(11/13) dischargeOxygen_day: (364/0)
(12/13) dischargeCPAP_day: (364/0)


## 04-05. Discharge Hematology

In [216]:
discharge_hematology_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Hematology')
all_discharge_hematology_columns = id_columns + discharge_hematology_columns
discharge_hematology_columns

['dischargeDIC']

In [217]:
df_discharge_hematology = COMBINE_harmonizer.valid_columns(df_main, all_discharge_hematology_columns, debug_df=False, debug_columns=True)
df_discharge_hematology = COMBINE_harmonizer.postprocess(df_discharge_hematology)

out_filename = os.sep.join([out_dir, '04-05-hematology.csv'])
df_discharge_hematology.to_csv(out_filename, index=False)

### 04-05-2. check empty cells

In [218]:
COMBINE_harmonizer.check_empty(df_discharge_hematology)

(0/4) column: center (364 / 0)
(1/4) column: subjectID (364 / 0)
(2/4) column: uniqueID (364 / 0)
(3/4) column: dischargeDIC (364 / 0)


In [219]:
COMBINE_harmonizer.column_info(df_discharge_hematology)

(0/4) center: (364/0)
(1/4) subjectID: (364/0)
(2/4) uniqueID: (364/0)
(3/4) dischargeDIC: (364/0)


## 04-06. Discharge Metabolic

In [220]:
discharge_metabolic_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Metabolic')
all_discharge_metabolic_columns = id_columns + discharge_metabolic_columns
discharge_metabolic_columns

['dischargeHypoglycemia', 'dischargeHypocalcemia', 'dischargeHypomagnesemia']

In [221]:
df_discharge_metabolic = COMBINE_harmonizer.valid_columns(df_main, all_discharge_metabolic_columns, debug_df=False, debug_columns=True)
df_discharge_metabolic = COMBINE_harmonizer.postprocess(df_discharge_metabolic)

out_filename = os.sep.join([out_dir, '04-06-metabolic.csv'])
df_discharge_metabolic.to_csv(out_filename, index=False)

### 04-06-2. check empty cells

In [222]:
COMBINE_harmonizer.check_empty(df_discharge_metabolic)

(0/6) column: center (364 / 0)
(1/6) column: subjectID (364 / 0)
(2/6) column: uniqueID (364 / 0)
(3/6) column: dischargeHypoglycemia (364 / 0)
(4/6) column: dischargeHypocalcemia (364 / 0)
(5/6) column: dischargeHypomagnesemia (364 / 0)


In [223]:
COMBINE_harmonizer.column_info(df_discharge_metabolic)

(0/6) center: (364/0)
(1/6) subjectID: (364/0)
(2/6) uniqueID: (364/0)
(3/6) dischargeHypoglycemia: (364/0)
(4/6) dischargeHypocalcemia: (364/0)
(5/6) dischargeHypomagnesemia: (364/0)


## 04-07. Discharge Renal

In [224]:
discharge_renal_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Renal')
all_discharge_renal_columns = id_columns + discharge_renal_columns
discharge_renal_columns

['dischargeOliguria', 'dischargeAnuria', 'dischargeDialysis']

In [225]:
df_discharge_renal = COMBINE_harmonizer.valid_columns(df_main, all_discharge_renal_columns, debug_df=False, debug_columns=True)
df_discharge_renal = COMBINE_harmonizer.postprocess(df_discharge_renal)

out_filename = os.sep.join([out_dir, '04-07-renal.csv'])
df_discharge_renal.to_csv(out_filename, index=False)

### 04-07-2. check empty cells

In [226]:
COMBINE_harmonizer.check_empty(df_discharge_renal)

(0/6) column: center (364 / 0)
(1/6) column: subjectID (364 / 0)
(2/6) column: uniqueID (364 / 0)
(3/6) column: dischargeOliguria (364 / 0)
(4/6) column: dischargeAnuria (364 / 0)
(5/6) column: dischargeDialysis (364 / 0)


In [227]:
COMBINE_harmonizer.column_info(df_discharge_renal)

(0/6) center: (364/0)
(1/6) subjectID: (364/0)
(2/6) uniqueID: (364/0)
(3/6) dischargeOliguria: (364/0)
(4/6) dischargeAnuria: (364/0)
(5/6) dischargeDialysis: (364/0)


## 04-08. Discharge Gastrointestinal

In [228]:
discharge_gastrointestinal_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Gastrointestinal')
all_discharge_gastrointestinal_columns = id_columns + discharge_gastrointestinal_columns
discharge_gastrointestinal_columns

['dischargeEnteralFeedStart_day',
 'dischargeTubeFeedingDuration_day',
 'dischargeFullNippleFeed',
 'dischargeFullNippleFeed_day',
 'dischargeNEC',
 'dischargeHepaticDysfunction']

In [229]:
df_discharge_gastrointestinal = COMBINE_harmonizer.valid_columns(df_main, all_discharge_gastrointestinal_columns, debug_df=False, debug_columns=True)
df_discharge_gastrointestinal = COMBINE_harmonizer.postprocess(df_discharge_gastrointestinal)

out_filename = os.sep.join([out_dir, '04-08-gastrointestinal.csv'])
df_discharge_gastrointestinal.to_csv(out_filename, index=False)

### 04-08-2. check empty cells

In [230]:
COMBINE_harmonizer.check_empty(df_discharge_gastrointestinal)

(0/9) column: center (364 / 0)
(1/9) column: subjectID (364 / 0)
(2/9) column: uniqueID (364 / 0)
(3/9) column: dischargeFullNippleFeed (364 / 0)
(4/9) column: dischargeNEC (364 / 0)
(5/9) column: dischargeHepaticDysfunction (364 / 0)
(6/9) column: dischargeEnteralFeedStart_day (337 / 27)
(7/9) column: dischargeTubeFeedingDuration_day (343 / 21)
(8/9) column: dischargeFullNippleFeed_day (237 / 127)


In [231]:
COMBINE_harmonizer.column_info(df_discharge_gastrointestinal)

(0/9) center: (364/0)
(1/9) subjectID: (364/0)
(2/9) uniqueID: (364/0)
(3/9) dischargeFullNippleFeed: (364/0)
(4/9) dischargeNEC: (364/0)
(5/9) dischargeHepaticDysfunction: (364/0)
(6/9) dischargeEnteralFeedStart_day: (337/27)
(7/9) dischargeTubeFeedingDuration_day: (343/21)
(8/9) dischargeFullNippleFeed_day: (237/127)


## 04-09. Skin

In [232]:
discharge_skin_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Skin')
all_discharge_skin_columns = id_columns + discharge_skin_columns
discharge_skin_columns

['dischargeAlteredSkinItegrityPostIntervention',
 'dischargeErythema',
 'dischargeErythemaOnsetDate',
 'dischargeErythemaResolveDate',
 'dischargeSclerema',
 'dischargeScleremaOnsetDate',
 'dischargeScleremaResolveDate',
 'dischargeCyanosis',
 'dischargeCyanosisOnsetDate',
 'dischargeCyanosisResolveDate',
 'dischargeSubFatNecrosis',
 'dischargeSubFatNecrosisOnsetDate',
 'dischargeSubFatNecrosisResolveDate']

In [233]:
df_discharge_skin = COMBINE_harmonizer.valid_columns(df_main, all_discharge_skin_columns, debug_df=False, debug_columns=True)
df_discharge_skin = COMBINE_harmonizer.postprocess(df_discharge_skin)

out_filename = os.sep.join([out_dir, '04-09-skin.csv'])
df_discharge_skin.to_csv(out_filename, index=False)

### 04-09-2. check empty cells

In [234]:
COMBINE_harmonizer.check_empty(df_discharge_skin)

(0/16) column: center (364 / 0)
(1/16) column: subjectID (364 / 0)
(2/16) column: uniqueID (364 / 0)
(3/16) column: dischargeAlteredSkinItegrityPostIntervention (364 / 0)
(4/16) column: dischargeErythema (28 / 336)
(5/16) column: dischargeSclerema (28 / 336)
(6/16) column: dischargeCyanosis (28 / 336)
(7/16) column: dischargeSubFatNecrosis (28 / 336)
(8/16) column: dischargeErythemaOnsetDate (13 / 351)
(9/16) column: dischargeErythemaResolveDate (10 / 354)
(10/16) column: dischargeScleremaOnsetDate (0 / 364)
(11/16) column: dischargeScleremaResolveDate (0 / 364)
(12/16) column: dischargeCyanosisOnsetDate (1 / 363)
(13/16) column: dischargeCyanosisResolveDate (1 / 363)
(14/16) column: dischargeSubFatNecrosisOnsetDate (15 / 349)
(15/16) column: dischargeSubFatNecrosisResolveDate (8 / 356)


In [235]:
COMBINE_harmonizer.column_info(df_discharge_skin)

(0/16) center: (364/0)
(1/16) subjectID: (364/0)
(2/16) uniqueID: (364/0)
(3/16) dischargeAlteredSkinItegrityPostIntervention: (364/0)
(4/16) dischargeErythema: (28/336)
(5/16) dischargeSclerema: (28/336)
(6/16) dischargeCyanosis: (28/336)
(7/16) dischargeSubFatNecrosis: (28/336)
(8/16) dischargeErythemaOnsetDate: (13/351)
(9/16) dischargeErythemaResolveDate: (10/354)
(10/16) dischargeScleremaOnsetDate: (0/364)
(11/16) dischargeScleremaResolveDate: (0/364)
(12/16) dischargeCyanosisOnsetDate: (1/363)
(13/16) dischargeCyanosisResolveDate: (1/363)
(14/16) dischargeSubFatNecrosisOnsetDate: (15/349)
(15/16) dischargeSubFatNecrosisResolveDate: (8/356)


## 04-10. Auditory

In [236]:
discharge_auditory_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Auditory')
all_discharge_auditory_columns = id_columns + discharge_auditory_columns
discharge_auditory_columns

['dischargeHearingTest', 'dischargeHearingTestNormal']

In [237]:
df_discharge_auditory = COMBINE_harmonizer.valid_columns(df_main, all_discharge_auditory_columns, debug_df=False, debug_columns=True)
df_discharge_auditory = COMBINE_harmonizer.postprocess(df_discharge_auditory)

out_filename = os.sep.join([out_dir, '04-10-auditory.csv'])
df_discharge_auditory.to_csv(out_filename, index=False)

### 04-10-2. check empty cells

In [238]:
COMBINE_harmonizer.check_empty(df_discharge_auditory)

(0/5) column: center (364 / 0)
(1/5) column: subjectID (364 / 0)
(2/5) column: uniqueID (364 / 0)
(3/5) column: dischargeHearingTest (364 / 0)
(4/5) column: dischargeHearingTestNormal (290 / 74)


In [239]:
COMBINE_harmonizer.column_info(df_discharge_auditory)

(0/5) center: (364/0)
(1/5) subjectID: (364/0)
(2/5) uniqueID: (364/0)
(3/5) dischargeHearingTest: (364/0)
(4/5) dischargeHearingTestNormal: (290/74)


## 04-11. Surgery

In [240]:
discharge_surgery_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Surgery')
all_discharge_surgery_columns = id_columns + discharge_surgery_columns
discharge_surgery_columns

['dischargeMajorSurgery',
 'dischargeSurgeryCode1',
 'dischargeSurgeryCode2',
 'dischargeSurgeryCode3']

In [241]:
df_discharge_surgery = COMBINE_harmonizer.valid_columns(df_main, all_discharge_surgery_columns, debug_df=False, debug_columns=True)
df_discharge_surgery = COMBINE_harmonizer.postprocess(df_discharge_surgery)

out_filename = os.sep.join([out_dir, '04-11-surgery.csv'])
df_discharge_surgery.to_csv(out_filename, index=False)

### 04-11-2. check empty cells

In [242]:
COMBINE_harmonizer.check_empty(df_discharge_surgery)

(0/7) column: center (364 / 0)
(1/7) column: subjectID (364 / 0)
(2/7) column: uniqueID (364 / 0)
(3/7) column: dischargeMajorSurgery (364 / 0)
(4/7) column: dischargeSurgeryCode1 (42 / 322)
(5/7) column: dischargeSurgeryCode2 (12 / 352)
(6/7) column: dischargeSurgeryCode3 (5 / 359)


In [243]:
COMBINE_harmonizer.column_info(df_discharge_surgery)

(0/7) center: (364/0)
(1/7) subjectID: (364/0)
(2/7) uniqueID: (364/0)
(3/7) dischargeMajorSurgery: (364/0)
(4/7) dischargeSurgeryCode1: (42/322)
(5/7) dischargeSurgeryCode2: (12/352)
(6/7) dischargeSurgeryCode3: (5/359)


## 04-12. Infection

In [244]:
discharge_infection_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Infection')
all_discharge_infection_columns = id_columns + discharge_infection_columns
discharge_infection_columns

['dischargeSepticemia',
 'dischargeSepticemiaOrganismCode1',
 'dischargeSepticemiaOrganismCode2',
 'dischargeSepticemiaOrganismCode3',
 'dischargeMeningitisEncephalitis',
 'dischargeMeningitisOrganismCode1',
 'dischargeMeningitisOrganismCode2',
 'dischargeMeningitisOrganismCode3']

In [245]:
df_discharge_infection = COMBINE_harmonizer.valid_columns(df_main, all_discharge_infection_columns, debug_df=False, debug_columns=True)
df_discharge_infection = COMBINE_harmonizer.postprocess(df_discharge_infection)

out_filename = os.sep.join([out_dir, '04-12-infection.csv'])
df_discharge_infection.to_csv(out_filename, index=False)

### 04-12-2. check empty cells

In [246]:
COMBINE_harmonizer.check_empty(df_discharge_infection)

(0/11) column: center (364 / 0)
(1/11) column: subjectID (364 / 0)
(2/11) column: uniqueID (364 / 0)
(3/11) column: dischargeSepticemia (364 / 0)
(4/11) column: dischargeMeningitisEncephalitis (364 / 0)
(5/11) column: dischargeSepticemiaOrganismCode1 (13 / 351)
(6/11) column: dischargeSepticemiaOrganismCode2 (1 / 363)
(7/11) column: dischargeSepticemiaOrganismCode3 (0 / 364)
(8/11) column: dischargeMeningitisOrganismCode1 (0 / 364)
(9/11) column: dischargeMeningitisOrganismCode2 (0 / 364)
(10/11) column: dischargeMeningitisOrganismCode3 (0 / 364)


In [247]:
COMBINE_harmonizer.column_info(df_discharge_infection)

(0/11) center: (364/0)
(1/11) subjectID: (364/0)
(2/11) uniqueID: (364/0)
(3/11) dischargeSepticemia: (364/0)
(4/11) dischargeMeningitisEncephalitis: (364/0)
(5/11) dischargeSepticemiaOrganismCode1: (13/351)
(6/11) dischargeSepticemiaOrganismCode2: (1/363)
(7/11) dischargeSepticemiaOrganismCode3: (0/364)
(8/11) dischargeMeningitisOrganismCode1: (0/364)
(9/11) dischargeMeningitisOrganismCode2: (0/364)
(10/11) dischargeMeningitisOrganismCode3: (0/364)


## 04-13. Seizure

In [248]:
discharge_neuro_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Seizure')
all_discharge_neuro_columns = id_columns + discharge_neuro_columns
discharge_neuro_columns

['dischargeSeizure',
 'dischargeSeizurePreIntervention',
 'dischargeSeizureAfterBaseline',
 'dischargeSeizureMaintenance',
 'dischargeSeizureRewarming',
 'dischargeSeizurePostIntervention',
 'dischargeEEG',
 'dischargeEEGFindingConsistentWithSeizure',
 'dischargeEEGFindingConsistentWithSeizureDate',
 'dischargeEEGFindingConsistentWithSeizureTime',
 'dischargeEEGAbnormalBackgroundActivity',
 'dischargeEEGAbnormalBackgroundActivityDate',
 'dischargeEEGAbnormalBackgroundActivityTime',
 'dischargeAnticonvulsantsOver72H']

In [249]:
df_discharge_neuro = COMBINE_harmonizer.valid_columns(df_main, all_discharge_neuro_columns, debug_df=False, debug_columns=True)
df_discharge_neuro = COMBINE_harmonizer.postprocess(df_discharge_neuro)

out_filename = os.sep.join([out_dir, '04-13-seizure.csv'])
df_discharge_neuro.to_csv(out_filename, index=False)

### 04-13-2. check empty cells

In [250]:
COMBINE_harmonizer.check_empty(df_discharge_neuro)

(0/17) column: center (364 / 0)
(1/17) column: subjectID (364 / 0)
(2/17) column: uniqueID (364 / 0)
(3/17) column: dischargeSeizure (364 / 0)
(4/17) column: dischargeSeizurePreIntervention (177 / 187)
(5/17) column: dischargeSeizureAfterBaseline (177 / 187)
(6/17) column: dischargeSeizureMaintenance (177 / 187)
(7/17) column: dischargeSeizureRewarming (177 / 187)
(8/17) column: dischargeSeizurePostIntervention (177 / 187)
(9/17) column: dischargeEEG (364 / 0)
(10/17) column: dischargeEEGFindingConsistentWithSeizure (263 / 101)
(11/17) column: dischargeEEGAbnormalBackgroundActivity (285 / 79)
(12/17) column: dischargeAnticonvulsantsOver72H (364 / 0)
(13/17) column: dischargeEEGFindingConsistentWithSeizureDate (73 / 291)
(14/17) column: dischargeEEGFindingConsistentWithSeizureTime (62 / 302)
(15/17) column: dischargeEEGAbnormalBackgroundActivityDate (186 / 178)
(16/17) column: dischargeEEGAbnormalBackgroundActivityTime (161 / 203)


In [251]:
COMBINE_harmonizer.column_info(df_discharge_neuro)

(0/17) center: (364/0)
(1/17) subjectID: (364/0)
(2/17) uniqueID: (364/0)
(3/17) dischargeSeizure: (364/0)
(4/17) dischargeSeizurePreIntervention: (177/187)
(5/17) dischargeSeizureAfterBaseline: (177/187)
(6/17) dischargeSeizureMaintenance: (177/187)
(7/17) dischargeSeizureRewarming: (177/187)
(8/17) dischargeSeizurePostIntervention: (177/187)
(9/17) dischargeEEG: (364/0)
(10/17) dischargeEEGFindingConsistentWithSeizure: (263/101)
(11/17) dischargeEEGAbnormalBackgroundActivity: (285/79)
(12/17) dischargeAnticonvulsantsOver72H: (364/0)
(13/17) dischargeEEGFindingConsistentWithSeizureDate: (73/291)
(14/17) dischargeEEGFindingConsistentWithSeizureTime: (62/302)
(15/17) dischargeEEGAbnormalBackgroundActivityDate: (186/178)
(16/17) dischargeEEGAbnormalBackgroundActivityTime: (161/203)


## 04-14. Birth Defect

In [252]:
discharge_birth_defect_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Birth Defect')
all_discharge_birth_defect_columns = id_columns + discharge_birth_defect_columns
discharge_birth_defect_columns

['dischargeSyndromeMalformation',
 'dischargeBirthDefectCode1',
 'dischargeBirthDefectCode2',
 'dischargeBirthDefectCode3']

In [253]:
df_discharge_birth_defect = COMBINE_harmonizer.valid_columns(df_main, all_discharge_birth_defect_columns, debug_df=False, debug_columns=True)
df_discharge_birth_defect = COMBINE_harmonizer.postprocess(df_discharge_birth_defect)

out_filename = os.sep.join([out_dir, '04-14-birth-defect.csv'])
df_discharge_birth_defect.to_csv(out_filename, index=False)

### 04-14-2. check empty cells

In [254]:
COMBINE_harmonizer.check_empty(df_discharge_birth_defect)

(0/7) column: center (364 / 0)
(1/7) column: subjectID (364 / 0)
(2/7) column: uniqueID (364 / 0)
(3/7) column: dischargeSyndromeMalformation (364 / 0)
(4/7) column: dischargeBirthDefectCode1 (11 / 353)
(5/7) column: dischargeBirthDefectCode2 (1 / 363)
(6/7) column: dischargeBirthDefectCode3 (0 / 364)


In [255]:
COMBINE_harmonizer.column_info(df_discharge_birth_defect)

(0/7) center: (364/0)
(1/7) subjectID: (364/0)
(2/7) uniqueID: (364/0)
(3/7) dischargeSyndromeMalformation: (364/0)
(4/7) dischargeBirthDefectCode1: (11/353)
(5/7) dischargeBirthDefectCode2: (1/363)
(6/7) dischargeBirthDefectCode3: (0/364)


## 04-15. Home Therapy

In [256]:
discharge_home_therapy_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'NICU Discharge', 'Home Therapy')
all_discharge_home_therapy_columns = id_columns + discharge_home_therapy_columns
discharge_home_therapy_columns

['dischargeHomeTherapy',
 'dischargeHomeTherapyVentilator',
 'dischargeHomeTherapyOxygen',
 'dischargeHomeTherapyGavageTubeFeed',
 'dischargeHomeTherapyGastrostomyTubeFeed',
 'dischargeHomeTherapyTemperatureBlanket',
 'dischargeHomeTherapyAnticonvulsantMedication',
 'dischargeHomeTherapyOther',
 'dischargeHomeTherapyOtherText']

In [257]:
df_discharge_home_therapy = COMBINE_harmonizer.valid_columns(df_main, all_discharge_home_therapy_columns, debug_df=False, debug_columns=True)
df_discharge_home_therapy = COMBINE_harmonizer.postprocess(df_discharge_home_therapy)

out_filename = os.sep.join([out_dir, '04-15-home-therapy.csv'])
df_discharge_home_therapy.to_csv(out_filename, index=False)

### 04-15-2. check empty cells

In [258]:
COMBINE_harmonizer.check_empty(df_discharge_home_therapy)

(0/12) column: center (364 / 0)
(1/12) column: subjectID (364 / 0)
(2/12) column: uniqueID (364 / 0)
(3/12) column: dischargeHomeTherapy (314 / 50)
(4/12) column: dischargeHomeTherapyVentilator (118 / 246)
(5/12) column: dischargeHomeTherapyOxygen (118 / 246)
(6/12) column: dischargeHomeTherapyGavageTubeFeed (118 / 246)
(7/12) column: dischargeHomeTherapyGastrostomyTubeFeed (118 / 246)
(8/12) column: dischargeHomeTherapyTemperatureBlanket (118 / 246)
(9/12) column: dischargeHomeTherapyAnticonvulsantMedication (118 / 246)
(10/12) column: dischargeHomeTherapyOther (118 / 246)
(11/12) column: dischargeHomeTherapyOtherText (36 / 328)


In [259]:
COMBINE_harmonizer.column_info(df_discharge_home_therapy)

(0/12) center: (364/0)
(1/12) subjectID: (364/0)
(2/12) uniqueID: (364/0)
(3/12) dischargeHomeTherapy: (314/50)
(4/12) dischargeHomeTherapyVentilator: (118/246)
(5/12) dischargeHomeTherapyOxygen: (118/246)
(6/12) dischargeHomeTherapyGavageTubeFeed: (118/246)
(7/12) dischargeHomeTherapyGastrostomyTubeFeed: (118/246)
(8/12) dischargeHomeTherapyTemperatureBlanket: (118/246)
(9/12) dischargeHomeTherapyAnticonvulsantMedication: (118/246)
(10/12) dischargeHomeTherapyOther: (118/246)
(11/12) dischargeHomeTherapyOtherText: (36/328)
