In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import os
import re
import pydoc

from datetime import datetime, timedelta

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 00-0. Variables

In [2]:
# Study key: used below to pick this study's config entry, its output
# directory name and its column in the data dictionary.
study_name = COMBINE_harmonizer.STUDY_LH
# Data-dictionary sheet name handed to load_data_dict.
sheet_name = COMBINE_harmonizer.SHEET_DERIVED_DATA

# Base directory used to locate config.yaml and the data dictionary file.
root_dir = '..'

In [3]:
# Initialise the harmonizer package from the project config file.
# NOTE(review): cfg.config is read in later cells and is presumably populated
# by this call - confirm.
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

In [4]:
# Input CSV files, relative to input_dir, that are loaded into df_dict.
_FILENAMES = [
    'analysis.csv',
]

In [5]:
# Directory holding this study's input CSV files, taken from the loaded config.
input_dir = cfg.config[f'{study_name}_analysis_dir']
# Path of the data dictionary file, resolved relative to root_dir.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"
# Per-study output directory underneath the configured out_dir.
out_dir = f"{cfg.config['out_dir']}/out-{study_name}"

# Create the output directory up front; exist_ok keeps the cell re-runnable.
os.makedirs(out_dir, exist_ok=True)

## 00-1. Column Map

In [6]:
# Load the data dictionary sheet that describes the harmonized variables.
df_data_dict = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=sheet_name)
# Every harmonized variable name defined in the sheet.
all_valid_columns = list(df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME])
# Map this study's source column names to the harmonized variable names.
# The two columns are zipped directly instead of using iterrows(), and rows
# that have no source name for this study are skipped so that NaN never ends
# up as a key of the rename mapping.
column_map = {
    source_name: harmonized_name
    for source_name, harmonized_name in zip(
        df_data_dict[study_name],
        df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME],
    )
    if pd.notna(source_name)
}

## 00-2. df-dict from _FILENAMES

In [7]:
# Read each input file with every column kept as object dtype, then rename
# the source columns to their harmonized names. Keyed by file name.
df_dict = {
    filename: (
        pd.read_csv(os.sep.join([input_dir, filename]), dtype='O')
        .rename(columns=column_map)
    )
    for filename in _FILENAMES
}

## 00-4. Identifier columns

In [8]:
# Columns selected by get_columns for ('Derived Data', 'Identity'); they are
# prepended to each category's column list in the sections below.
id_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Derived Data', 'Identity')
# Bare expression: shows the list as the cell output.
id_columns

['center', 'followupCenter', 'subjectID', 'followupID']

## 30. Derived data (analysis.csv)

In [9]:
# Pass the renamed analysis frame through valid_columns (checked against every
# data-dictionary variable) and then through the shared postprocess step.
df_analysis = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_dict['analysis.csv'],
        all_valid_columns,
        debug_df=True,
        debug_columns=False,
    )
)

(25/48) MRI_RDATE_PB not in columns
(26/48) MRI_RDATE_NR not in columns
(27/48) MRI_RDATE_WG not in columns
(28/48) MRI_RDATE_LC not in columns
(30/48) MRI_PATTERN_PB not in columns
(31/48) MRI_PATTERN_NR not in columns
(32/48) MRI_PATTERN_WG not in columns
(33/48) MRI_PATTERN_LC not in columns
(35/48) MRI_PCLASS_PB not in columns
(36/48) MRI_PCLASS_NR not in columns
(37/48) MRI_PCLASS_WG not in columns
(38/48) MRI_PCLASS_LC not in columns
(40/48) MRI_INFARL_PB not in columns
(41/48) MRI_INFARL_NR not in columns
(42/48) MRI_INFARL_WG not in columns
(44/48) MRI_INFARR_PB not in columns
(45/48) MRI_INFARR_NR not in columns
(46/48) MRI_INFARR_WG not in columns


## 30-01. Secondary

In [10]:
# Columns selected by get_columns for ('Derived Data', 'Secondary').
secondary_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Derived Data', 'Secondary')
# Identifier columns first, followed by the category's own columns.
all_secondary_columns = id_columns + secondary_columns
# Bare expression: shows the category's columns as the cell output.
secondary_columns

['acidosis',
 'ageDeath_day',
 'ageRand_hr',
 'baselineAnticonvulsants',
 'dischargeAnticonvulsants',
 'inotropicAgent',
 'perinatalSentinelEvent',
 'dischargeSeizure',
 'Apgar10minLt5',
 'Apgar10minLte5',
 'Apgar5minLte5',
 'bloodGasBaseDeficit_mEqPerL',
 'bloodGasPH',
 'emergencyCSection',
 'encephalopathyLevel',
 'inotropicAgentBaseline',
 'maleSex',
 'maternalEducation',
 'motherInsurancePublic',
 'motherRace',
 'treatmentAssignmentDuration_hr',
 'treatmentAssignmentTemperature',
 'bloodGasBaseDeficit_mEqPerLSrc',
 'bloodGasPHSrc',
 'usualCoolingTreatmentGroup',
 'blindness',
 'moderateSevereCerebralPalsy',
 'cerebralPalsy',
 'gastrostomyTube',
 'grossMotorFunctionLevel',
 'hearingImpairedWithAid',
 'hearingImpairedLevel',
 'multipleImpairment',
 'afterDischargeSeizure',
 'lengthOfStay_day']

In [11]:
# Build the secondary table from a copy of df_analysis: valid_columns against
# the identifier + secondary column list, then the shared postprocess step.
df_secondary = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_analysis.copy(),
        all_secondary_columns,
        debug_df=False,
        debug_columns=True,
    )
)

# Write the table without the index column.
out_filename = os.sep.join([out_dir, '30-01-secondary.csv'])
df_secondary.to_csv(out_filename, index=False)

(4/39) acidosis not in df
(5/39) ageDeath_day not in df
(6/39) ageRand_hr not in df
(7/39) baselineAnticonvulsants not in df
(9/39) inotropicAgent not in df
(10/39) perinatalSentinelEvent not in df
(12/39) Apgar10minLt5 not in df
(13/39) Apgar10minLte5 not in df
(14/39) Apgar5minLte5 not in df
(15/39) bloodGasBaseDeficit_mEqPerL not in df
(16/39) bloodGasPH not in df
(17/39) emergencyCSection not in df
(18/39) encephalopathyLevel not in df
(19/39) inotropicAgentBaseline not in df
(20/39) maleSex not in df
(21/39) maternalEducation not in df
(22/39) motherInsurancePublic not in df
(23/39) motherRace not in df
(24/39) treatmentAssignmentDuration_hr not in df
(25/39) treatmentAssignmentTemperature not in df
(26/39) bloodGasBaseDeficit_mEqPerLSrc not in df
(27/39) bloodGasPHSrc not in df
(28/39) usualCoolingTreatmentGroup not in df
(32/39) gastrostomyTube not in df
(36/39) multipleImpairment not in df
(38/39) lengthOfStay_day not in df


### 30-01-1. check empty

In [12]:
# Per-column report for df_secondary. In the recorded output each column is
# followed by a pair of counts that sums to the 168 rows
# (presumably non-empty / empty - TODO confirm).
COMBINE_harmonizer.check_empty(df_secondary)

(0/14) column: center (168 / 0)
(1/14) column: subjectID (168 / 0)
(2/14) column: uniqueID (168 / 0)
(3/14) column: followupCenter (151 / 17)
(4/14) column: blindness (141 / 27)
(5/14) column: hearingImpairedLevel (141 / 27)
(6/14) column: hearingImpairedWithAid (141 / 27)
(7/14) column: grossMotorFunctionLevel (141 / 27)
(8/14) column: cerebralPalsy (141 / 27)
(9/14) column: moderateSevereCerebralPalsy (141 / 27)
(10/14) column: dischargeAnticonvulsants (164 / 4)
(11/14) column: followupID (133 / 35)
(12/14) column: dischargeSeizure (168 / 0)
(13/14) column: afterDischargeSeizure (143 / 25)


In [13]:
# Per-column summary of df_secondary; in the recorded run it prints the same
# count pairs as the check_empty report.
COMBINE_harmonizer.column_info(df_secondary)

(0/14) center: (168/0)
(1/14) subjectID: (168/0)
(2/14) uniqueID: (168/0)
(3/14) followupCenter: (151/17)
(4/14) blindness: (141/27)
(5/14) hearingImpairedLevel: (141/27)
(6/14) hearingImpairedWithAid: (141/27)
(7/14) grossMotorFunctionLevel: (141/27)
(8/14) cerebralPalsy: (141/27)
(9/14) moderateSevereCerebralPalsy: (141/27)
(10/14) dischargeAnticonvulsants: (164/4)
(11/14) followupID: (133/35)
(12/14) dischargeSeizure: (168/0)
(13/14) afterDischargeSeizure: (143/25)


## 30-02. Outcome

In [14]:
# Columns selected by get_columns for ('Derived Data', 'Outcome').
outcome_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Derived Data', 'Outcome')
# Identifier columns first, followed by the category's own columns.
all_outcome_columns = id_columns + outcome_columns
# Bare expression: shows the category's columns as the cell output.
outcome_columns

['flagAdjudicatedOutcome',
 'normalPrimaryOutcome',
 'BayleyIIILanguage',
 'BayleyIIIMotor',
 'BayleyIIICognitive',
 'deathBeforeFollowup',
 'deathBeforeDischarge',
 'disabilityLevelSurvivor',
 'disabilityLevel',
 'moderateSevereDisabilityOrDeath',
 'moderateSevereDisabilitySurvivor',
 'disabilityLevelDeath',
 'outcomeGroup']

In [15]:
# Build the outcome table from a copy of df_analysis: valid_columns against
# the identifier + outcome column list, then the shared postprocess step.
df_outcome = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_analysis.copy(),
        all_outcome_columns,
        debug_df=False,
        debug_columns=True,
    )
)

# Write the table without the index column.
out_filename = os.sep.join([out_dir, '30-02-outcome.csv'])
df_outcome.to_csv(out_filename, index=False)

(4/17) flagAdjudicatedOutcome not in df
(6/17) BayleyIIILanguage not in df
(7/17) BayleyIIIMotor not in df
(15/17) disabilityLevelDeath not in df
(16/17) outcomeGroup not in df


### 30-02-1. check empty

In [16]:
# Per-column report for df_outcome. In the recorded output each column is
# followed by a pair of counts that sums to the 168 rows
# (presumably non-empty / empty - TODO confirm).
COMBINE_harmonizer.check_empty(df_outcome)

(0/13) column: center (168 / 0)
(1/13) column: subjectID (168 / 0)
(2/13) column: uniqueID (168 / 0)
(3/13) column: followupCenter (151 / 17)
(4/13) column: BayleyIIICognitive (138 / 30)
(5/13) column: deathBeforeFollowup (165 / 3)
(6/13) column: normalPrimaryOutcome (139 / 29)
(7/13) column: deathBeforeDischarge (167 / 1)
(8/13) column: followupID (133 / 35)
(9/13) column: moderateSevereDisabilitySurvivor (139 / 29)
(10/13) column: disabilityLevelSurvivor (139 / 29)
(11/13) column: moderateSevereDisabilityOrDeath (157 / 11)
(12/13) column: disabilityLevel (157 / 11)


In [17]:
# Per-column summary of df_outcome; in the recorded run it prints the same
# count pairs as the check_empty report.
COMBINE_harmonizer.column_info(df_outcome)

(0/13) center: (168/0)
(1/13) subjectID: (168/0)
(2/13) uniqueID: (168/0)
(3/13) followupCenter: (151/17)
(4/13) BayleyIIICognitive: (138/30)
(5/13) deathBeforeFollowup: (165/3)
(6/13) normalPrimaryOutcome: (139/29)
(7/13) deathBeforeDischarge: (167/1)
(8/13) followupID: (133/35)
(9/13) moderateSevereDisabilitySurvivor: (139/29)
(10/13) disabilityLevelSurvivor: (139/29)
(11/13) moderateSevereDisabilityOrDeath: (157/11)
(12/13) disabilityLevel: (157/11)


## 30-03. MRI

In [18]:
# Columns selected by get_columns for ('Derived Data', 'MRI Derived').
mri_columns = COMBINE_harmonizer.get_columns(df_data_dict, 'Derived Data', 'MRI Derived')
# Identifier columns first, followed by the category's own columns.
all_mri_columns = id_columns + mri_columns
# Bare expression: shows the category's columns as the cell output.
mri_columns

['MRINRNPatternOfInjuryMerge',
 'MRINRNPatternOfInjuryAvg',
 'MRINRNPatternOfInjuryMax',
 'MRI2LevelPatternOfInjury',
 'MRIAge_day',
 'MRIOverallDiagnosis',
 'MRINRNPatternOfInjury',
 'MRIDate',
 'MRITime',
 'MRINotDone',
 'MRIUnread',
 'MRIAnalysis',
 'abnormalMRIResult',
 'MRINRNPatternOfInjuryWSvsBGTPLIC',
 'cerebralLesion',
 'cerebellarLesion',
 'basalGangliaLesion',
 'brainstemLesion',
 'corpusCallosumLesion',
 'cerebralLesionLobe',
 'coronaRadiataLesion',
 'edema',
 'extraAxialLesion',
 'extent',
 'frontalParietalLesion',
 'frontalLesion',
 'lateralHemisphericDevastation',
 'hippocampusLesion',
 'hypothalamusLesion',
 'insularLesion',
 'laterality',
 'BGT',
 'PLIC',
 'watershed',
 'whiteMatterInjury',
 'occipitalLesion',
 'opticChiasmLesion',
 'otherLesion',
 'otherCerebralLesion',
 'parasagittalLesion',
 'parietalLesion',
 'preirolandicLesion',
 'perisylvianLesion',
 'pituitaryLesion',
 'parietalOccipitalLesion',
 'parietalTemporalLesion',
 'scalpLesion',
 'thalamusLesion',
 'te

In [19]:
# Run valid_columns on a copy of df_analysis against the identifier + MRI
# column list; columns it cannot find are reported in the cell output.
df_mri = COMBINE_harmonizer.valid_columns(
    df_analysis.copy(),
    all_mri_columns,
    debug_df=False,
    debug_columns=True,
)

# XXX MRI_DATE
def _convert_mri_date(the_date_number):
    if pd.isnull(the_date_number) or the_date_number == '':
        return ''

    try:
        the_date_number = int(float(the_date_number))
    except Exception as e:
        return the_date_number

    the_datetime = datetime(1960, 1, 1) + timedelta(days=the_date_number)
    return the_datetime.strftime('%Y-%m-%d')

# Convert every MRIDate cell with _convert_mri_date; the function is passed
# to apply directly rather than through a lambda wrapper.
df_mri['MRIDate'] = df_mri['MRIDate'].apply(_convert_mri_date)

# Shared postprocess step, then write the table without the index column.
df_mri = COMBINE_harmonizer.postprocess(df_mri)
out_filename = os.sep.join([out_dir, '30-03-mri.csv'])
df_mri.to_csv(out_filename, index=False)

(4/64) MRINRNPatternOfInjuryMerge not in df
(5/64) MRINRNPatternOfInjuryAvg not in df
(6/64) MRINRNPatternOfInjuryMax not in df
(9/64) MRIOverallDiagnosis not in df
(15/64) MRIAnalysis not in df
(16/64) abnormalMRIResult not in df
(17/64) MRINRNPatternOfInjuryWSvsBGTPLIC not in df
(18/64) cerebralLesion not in df
(19/64) cerebellarLesion not in df
(20/64) basalGangliaLesion not in df
(21/64) brainstemLesion not in df
(22/64) corpusCallosumLesion not in df
(23/64) cerebralLesionLobe not in df
(24/64) coronaRadiataLesion not in df
(25/64) edema not in df
(26/64) extraAxialLesion not in df
(27/64) extent not in df
(28/64) frontalParietalLesion not in df
(29/64) frontalLesion not in df
(30/64) lateralHemisphericDevastation not in df
(31/64) hippocampusLesion not in df
(32/64) hypothalamusLesion not in df
(33/64) insularLesion not in df
(34/64) laterality not in df
(35/64) BGT not in df
(36/64) PLIC not in df
(37/64) watershed not in df
(38/64) whiteMatterInjury not in df
(39/64) occipitalL

### 30-03-1. check empty

In [20]:
# Per-column report for df_mri. In the recorded output each column is
# followed by a pair of counts that sums to the 168 rows
# (presumably non-empty / empty - TODO confirm).
COMBINE_harmonizer.check_empty(df_mri)

(0/14) column: center (168 / 0)
(1/14) column: subjectID (168 / 0)
(2/14) column: uniqueID (168 / 0)
(3/14) column: followupCenter (151 / 17)
(4/14) column: followupID (133 / 35)
(5/14) column: MRIDate (168 / 0)
(6/14) column: MRITime (147 / 21)
(7/14) column: MRIAge_day (128 / 40)
(8/14) column: MRIUnread (19 / 149)
(9/14) column: MRINRNPatternOfInjury (128 / 40)
(10/14) column: MRI2LevelPatternOfInjury (128 / 40)
(11/14) column: vascularTerritoryInfarctionLeft (128 / 40)
(12/14) column: vascularTerritoryInfarctionRight (128 / 40)
(13/14) column: MRINotDone (21 / 147)


In [21]:
# Per-column summary of df_mri; in the recorded run it prints the same
# count pairs as the check_empty report.
COMBINE_harmonizer.column_info(df_mri)

(0/14) center: (168/0)
(1/14) subjectID: (168/0)
(2/14) uniqueID: (168/0)
(3/14) followupCenter: (151/17)
(4/14) followupID: (133/35)
(5/14) MRIDate: (168/0)
(6/14) MRITime: (147/21)
(7/14) MRIAge_day: (128/40)
(8/14) MRIUnread: (19/149)
(9/14) MRINRNPatternOfInjury: (128/40)
(10/14) MRI2LevelPatternOfInjury: (128/40)
(11/14) vascularTerritoryInfarctionLeft: (128/40)
(12/14) vascularTerritoryInfarctionRight: (128/40)
(13/14) MRINotDone: (21/147)
