In [1]:
import pandas as pd
# Opt in to pandas Copy-on-Write mode for the whole session.
pd.options.mode.copy_on_write = True
import warnings
# Suppress pandas PerformanceWarning messages; other warning categories are unaffected.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import os
import re
import pydoc

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 00-0. Variables

In [2]:
# Project root, relative to the directory the notebook is executed from.
root_dir = '..'

# Study key and data-dictionary sheet this notebook works on.
study_name, sheet_name = (
    COMBINE_harmonizer.STUDY_OC,
    COMBINE_harmonizer.SHEET_DERIVED_DATA,
)

In [3]:
# Initialise the harmonizer from the config.yaml under root_dir.
# NOTE(review): presumably this is what populates cfg.config used below — confirm.
COMBINE_harmonizer.init('{}/config.yaml'.format(root_dir))

In [4]:
# CSV files (relative to input_dir) that this notebook loads.
_FILENAMES = ['analysis.csv']

In [5]:
# Input directory for this study, looked up under the "<study_name>_analysis_dir" config key.
input_dir = cfg.config['{}_analysis_dir'.format(study_name)]

# Data-dictionary workbook, addressed relative to the project root.
data_dict_filename = '{}/{}'.format(root_dir, COMBINE_harmonizer.DATA_DICTIONARY_EXCEL)

# Per-study output directory; created here so the later to_csv calls have somewhere to write.
out_dir = '{}/out-{}'.format(cfg.config['out_dir'], study_name)
os.makedirs(out_dir, exist_ok=True)

## 00-1. Column Map

In [6]:
# Load the data-dictionary sheet for this notebook.
df_data_dict = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=sheet_name)

# Every variable name listed in the data dictionary, in sheet order.
all_valid_columns = list(df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME])

# Study-specific column name -> data-dictionary variable name.
# Built column-wise with zip instead of DataFrame.iterrows(), which constructs a
# full Series per row only to read two cells. Later duplicates still win, as before.
# NOTE(review): rows with no source column for this study presumably contribute a
# NaN key; that never matches a CSV header in rename(), so it is kept as-is — confirm.
column_map = dict(zip(
    df_data_dict[study_name],
    df_data_dict[COMBINE_harmonizer.DATA_DICT_VAR_NAME],
))

## 00-2. Build df_dict from _FILENAMES

In [7]:
# Read each configured CSV with every column as object dtype (no type inference),
# then rename the study-specific headers to data-dictionary variable names.
df_dict = {
    csv_name: (
        pd.read_csv(os.sep.join([input_dir, csv_name]), dtype='O')
        .rename(columns=column_map)
    )
    for csv_name in _FILENAMES
}

## 00-4. Identifier columns

In [8]:
# Columns filed under 'Derived Data' / 'Identity' in the data dictionary;
# these are prepended to every per-topic column list below.
id_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    'Derived Data',
    'Identity',
)
id_columns

['center', 'followupCenter', 'subjectID', 'followupID']

## 30. Derived Data (analysis.csv)

In [9]:
# Base frame for all sections below: the renamed analysis.csv, restricted to
# data-dictionary variables and then passed through the shared postprocess step.
df_analysis = (
    df_dict['analysis.csv']
    .pipe(
        COMBINE_harmonizer.valid_columns,
        all_valid_columns,
        debug_df=True,
        debug_columns=False,
    )
    .pipe(COMBINE_harmonizer.postprocess)
)

## 30-01. Secondary

In [10]:
# Variables filed under 'Derived Data' / 'Secondary' in the data dictionary.
secondary_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    'Derived Data',
    'Secondary',
)
# Identifier columns first, then the secondary variables.
all_secondary_columns = [*id_columns, *secondary_columns]
secondary_columns

['acidosis',
 'ageDeath_day',
 'ageRand_hr',
 'baselineAnticonvulsants',
 'dischargeAnticonvulsants',
 'inotropicAgent',
 'perinatalSentinelEvent',
 'dischargeSeizure',
 'Apgar10minLt5',
 'Apgar10minLte5',
 'Apgar5minLte5',
 'bloodGasBaseDeficit_mEqPerL',
 'bloodGasPH',
 'emergencyCSection',
 'encephalopathyLevel',
 'inotropicAgentBaseline',
 'maleSex',
 'maternalEducation',
 'motherInsurancePublic',
 'motherRace',
 'treatmentAssignmentDuration_hr',
 'treatmentAssignmentTemperature',
 'bloodGasBaseDeficit_mEqPerLSrc',
 'bloodGasPHSrc',
 'usualCoolingTreatmentGroup',
 'blindness',
 'moderateSevereCerebralPalsy',
 'cerebralPalsy',
 'gastrostomyTube',
 'grossMotorFunctionLevel',
 'hearingImpairedWithAid',
 'hearingImpairedLevel',
 'multipleImpairment',
 'afterDischargeSeizure',
 'lengthOfStay_day']

In [11]:
# Restrict a copy of the analysis frame to identifier + secondary columns;
# debug_columns=True reports requested columns that are absent from the frame.
df_secondary = COMBINE_harmonizer.valid_columns(df_analysis.copy(), all_secondary_columns, debug_df=False, debug_columns=True)

# XXX treatmentAssignmentDuration_hr
# Remove the literal 'H' from the duration values (presumably a unit suffix — confirm).
# The vectorised .str.replace leaves missing values untouched, whereas the previous
# per-element re.sub raised TypeError as soon as one value was NaN.
df_secondary.loc[:, 'treatmentAssignmentDuration_hr'] = (
    df_secondary.loc[:, 'treatmentAssignmentDuration_hr'].str.replace('H', '', regex=False)
)

df_secondary = COMBINE_harmonizer.postprocess(df_secondary)

# Write the harmonized secondary table without the index column.
out_filename = os.sep.join([out_dir, '30-01-secondary.csv'])
df_secondary.to_csv(out_filename, index=False)

(38/39) lengthOfStay_day not in df


### 30-01-1. check empty

In [12]:
# Print one line per column of df_secondary with two counts.
# NOTE(review): the pair is presumably (non-empty / empty) rows — confirm in check_empty.
COMBINE_harmonizer.check_empty(df_secondary)

(0/39) column: center (364 / 0)
(1/39) column: subjectID (364 / 0)
(2/39) column: uniqueID (364 / 0)
(3/39) column: followupID (315 / 49)
(4/39) column: followupCenter (315 / 49)
(5/39) column: treatmentAssignmentDuration_hr (364 / 0)
(6/39) column: treatmentAssignmentTemperature (364 / 0)
(7/39) column: encephalopathyLevel (364 / 0)
(8/39) column: blindness (294 / 70)
(9/39) column: hearingImpairedLevel (285 / 79)
(10/39) column: hearingImpairedWithAid (294 / 70)
(11/39) column: grossMotorFunctionLevel (285 / 79)
(12/39) column: cerebralPalsy (294 / 70)
(13/39) column: moderateSevereCerebralPalsy (294 / 70)
(14/39) column: multipleImpairment (294 / 70)
(15/39) column: gastrostomyTube (287 / 77)
(16/39) column: ageDeath_day (49 / 315)
(17/39) column: Apgar10minLt5 (324 / 40)
(18/39) column: maternalEducation (313 / 51)
(19/39) column: baselineAnticonvulsants (364 / 0)
(20/39) column: inotropicAgentBaseline (362 / 2)
(21/39) column: inotropicAgent (364 / 0)
(22/39) column: emergencyCSec

In [13]:
# Print the per-column summary that column_info produces for df_secondary.
COMBINE_harmonizer.column_info(df_secondary)

(0/39) center: (364/0)
(1/39) subjectID: (364/0)
(2/39) uniqueID: (364/0)
(3/39) followupID: (315/49)
(4/39) followupCenter: (315/49)
(5/39) treatmentAssignmentDuration_hr: (364/0)
(6/39) treatmentAssignmentTemperature: (364/0)
(7/39) encephalopathyLevel: (364/0)
(8/39) blindness: (294/70)
(9/39) hearingImpairedLevel: (285/79)
(10/39) hearingImpairedWithAid: (294/70)
(11/39) grossMotorFunctionLevel: (285/79)
(12/39) cerebralPalsy: (294/70)
(13/39) moderateSevereCerebralPalsy: (294/70)
(14/39) multipleImpairment: (294/70)
(15/39) gastrostomyTube: (287/77)
(16/39) ageDeath_day: (49/315)
(17/39) Apgar10minLt5: (324/40)
(18/39) maternalEducation: (313/51)
(19/39) baselineAnticonvulsants: (364/0)
(20/39) inotropicAgentBaseline: (362/2)
(21/39) inotropicAgent: (364/0)
(22/39) emergencyCSection: (364/0)
(23/39) maleSex: (364/0)
(24/39) Apgar5minLte5: (361/3)
(25/39) Apgar10minLte5: (324/40)
(26/39) ageRand_hr: (363/1)
(27/39) dischargeAnticonvulsants: (338/26)
(28/39) dischargeSeizure: (364/0

## 30-02. Outcome

In [14]:
# Variables filed under 'Derived Data' / 'Outcome' in the data dictionary.
outcome_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    'Derived Data',
    'Outcome',
)
# Identifier columns first, then the outcome variables.
all_outcome_columns = [*id_columns, *outcome_columns]
outcome_columns

['flagAdjudicatedOutcome',
 'normalPrimaryOutcome',
 'BayleyIIILanguage',
 'BayleyIIIMotor',
 'BayleyIIICognitive',
 'deathBeforeFollowup',
 'deathBeforeDischarge',
 'disabilityLevelSurvivor',
 'disabilityLevel',
 'moderateSevereDisabilityOrDeath',
 'moderateSevereDisabilitySurvivor',
 'disabilityLevelDeath',
 'outcomeGroup']

In [15]:
# Restrict a copy of the analysis frame to identifier + outcome columns
# (absent columns are reported), then apply the shared postprocess step.
df_outcome = (
    df_analysis.copy()
    .pipe(
        COMBINE_harmonizer.valid_columns,
        all_outcome_columns,
        debug_df=False,
        debug_columns=True,
    )
    .pipe(COMBINE_harmonizer.postprocess)
)

# Write the harmonized outcome table without the index column.
out_filename = os.sep.join((out_dir, '30-02-outcome.csv'))
df_outcome.to_csv(out_filename, index=False)

(15/17) disabilityLevelDeath not in df


### 30-02-1. check empty

In [16]:
# Print one line per column of df_outcome with two counts.
# NOTE(review): the pair is presumably (non-empty / empty) rows — confirm in check_empty.
COMBINE_harmonizer.check_empty(df_outcome)

(0/17) column: center (364 / 0)
(1/17) column: subjectID (364 / 0)
(2/17) column: uniqueID (364 / 0)
(3/17) column: followupID (315 / 49)
(4/17) column: followupCenter (315 / 49)
(5/17) column: BayleyIIICognitive (283 / 81)
(6/17) column: BayleyIIILanguage (275 / 89)
(7/17) column: BayleyIIIMotor (277 / 87)
(8/17) column: deathBeforeFollowup (354 / 10)
(9/17) column: normalPrimaryOutcome (285 / 79)
(10/17) column: flagAdjudicatedOutcome (9 / 355)
(11/17) column: deathBeforeDischarge (364 / 0)
(12/17) column: moderateSevereDisabilitySurvivor (291 / 73)
(13/17) column: disabilityLevelSurvivor (285 / 79)
(14/17) column: moderateSevereDisabilityOrDeath (347 / 17)
(15/17) column: disabilityLevel (341 / 23)
(16/17) column: outcomeGroup (347 / 17)


In [17]:
# Print the per-column summary that column_info produces for df_outcome.
COMBINE_harmonizer.column_info(df_outcome)

(0/17) center: (364/0)
(1/17) subjectID: (364/0)
(2/17) uniqueID: (364/0)
(3/17) followupID: (315/49)
(4/17) followupCenter: (315/49)
(5/17) BayleyIIICognitive: (283/81)
(6/17) BayleyIIILanguage: (275/89)
(7/17) BayleyIIIMotor: (277/87)
(8/17) deathBeforeFollowup: (354/10)
(9/17) normalPrimaryOutcome: (285/79)
(10/17) flagAdjudicatedOutcome: (9/355)
(11/17) deathBeforeDischarge: (364/0)
(12/17) moderateSevereDisabilitySurvivor: (291/73)
(13/17) disabilityLevelSurvivor: (285/79)
(14/17) moderateSevereDisabilityOrDeath: (347/17)
(15/17) disabilityLevel: (341/23)
(16/17) outcomeGroup: (347/17)


## 30-03. MRI

In [18]:
# Variables filed under 'Derived Data' / 'MRI Derived' in the data dictionary.
mri_columns = COMBINE_harmonizer.get_columns(
    df_data_dict,
    'Derived Data',
    'MRI Derived',
)
# Identifier columns first, then the MRI-derived variables.
all_mri_columns = [*id_columns, *mri_columns]
mri_columns

['MRINRNPatternOfInjuryMerge',
 'MRINRNPatternOfInjuryAvg',
 'MRINRNPatternOfInjuryMax',
 'MRI2LevelPatternOfInjury',
 'MRIAge_day',
 'MRIOverallDiagnosis',
 'MRINRNPatternOfInjury',
 'MRIDate',
 'MRITime',
 'MRINotDone',
 'MRIUnread',
 'MRIAnalysis',
 'abnormalMRIResult',
 'MRINRNPatternOfInjuryWSvsBGTPLIC',
 'cerebralLesion',
 'cerebellarLesion',
 'basalGangliaLesion',
 'brainstemLesion',
 'corpusCallosumLesion',
 'cerebralLesionLobe',
 'coronaRadiataLesion',
 'edema',
 'extraAxialLesion',
 'extent',
 'frontalParietalLesion',
 'frontalLesion',
 'lateralHemisphericDevastation',
 'hippocampusLesion',
 'hypothalamusLesion',
 'insularLesion',
 'laterality',
 'BGT',
 'PLIC',
 'watershed',
 'whiteMatterInjury',
 'occipitalLesion',
 'opticChiasmLesion',
 'otherLesion',
 'otherCerebralLesion',
 'parasagittalLesion',
 'parietalLesion',
 'preirolandicLesion',
 'perisylvianLesion',
 'pituitaryLesion',
 'parietalOccipitalLesion',
 'parietalTemporalLesion',
 'scalpLesion',
 'thalamusLesion',
 'te

In [19]:
# Restrict a copy of the analysis frame to identifier + MRI-derived columns
# (absent columns are reported), then apply the shared postprocess step.
df_mri = COMBINE_harmonizer.postprocess(
    COMBINE_harmonizer.valid_columns(
        df_analysis.copy(),
        all_mri_columns,
        debug_df=False,
        debug_columns=True,
    )
)

# Write the harmonized MRI table without the index column.
out_filename = os.sep.join((out_dir, '30-03-mri.csv'))
df_mri.to_csv(out_filename, index=False)

(4/64) MRINRNPatternOfInjuryMerge not in df
(5/64) MRINRNPatternOfInjuryAvg not in df
(6/64) MRINRNPatternOfInjuryMax not in df
(11/64) MRIDate not in df
(12/64) MRITime not in df
(13/64) MRINotDone not in df
(14/64) MRIUnread not in df
(58/64) vascularTerritoryInfarctionLeft not in df
(59/64) vascularTerritoryInfarctionRight not in df


### 30-03-1. check empty

In [20]:
# Print one line per column of df_mri with two counts.
# NOTE(review): the pair is presumably (non-empty / empty) rows — confirm in check_empty.
COMBINE_harmonizer.check_empty(df_mri)

(0/56) column: center (364 / 0)
(1/56) column: subjectID (364 / 0)
(2/56) column: uniqueID (364 / 0)
(3/56) column: followupID (315 / 49)
(4/56) column: followupCenter (315 / 49)
(5/56) column: otherCerebralLesion (298 / 66)
(6/56) column: edema (298 / 66)
(7/56) column: MRIOverallDiagnosis (298 / 66)
(8/56) column: MRINRNPatternOfInjury (298 / 66)
(9/56) column: MRIAge_day (298 / 66)
(10/56) column: hemisphericDevastation (298 / 66)
(11/56) column: frontalLesion (298 / 66)
(12/56) column: parietalLesion (298 / 66)
(13/56) column: temporalLesion (298 / 66)
(14/56) column: occipitalLesion (298 / 66)
(15/56) column: cerebralLesionLobe (298 / 66)
(16/56) column: frontalParietalLesion (298 / 66)
(17/56) column: parietalTemporalLesion (298 / 66)
(18/56) column: temporalOccipitalLesion (298 / 66)
(19/56) column: parasagittalLesion (298 / 66)
(20/56) column: preirolandicLesion (298 / 66)
(21/56) column: perisylvianLesion (298 / 66)
(22/56) column: insularLesion (298 / 66)
(23/56) column: intr

In [21]:
# Print the per-column summary that column_info produces for df_mri.
COMBINE_harmonizer.column_info(df_mri)

(0/56) center: (364/0)
(1/56) subjectID: (364/0)
(2/56) uniqueID: (364/0)
(3/56) followupID: (315/49)
(4/56) followupCenter: (315/49)
(5/56) otherCerebralLesion: (298/66)
(6/56) edema: (298/66)
(7/56) MRIOverallDiagnosis: (298/66)
(8/56) MRINRNPatternOfInjury: (298/66)
(9/56) MRIAge_day: (298/66)
(10/56) hemisphericDevastation: (298/66)
(11/56) frontalLesion: (298/66)
(12/56) parietalLesion: (298/66)
(13/56) temporalLesion: (298/66)
(14/56) occipitalLesion: (298/66)
(15/56) cerebralLesionLobe: (298/66)
(16/56) frontalParietalLesion: (298/66)
(17/56) parietalTemporalLesion: (298/66)
(18/56) temporalOccipitalLesion: (298/66)
(19/56) parasagittalLesion: (298/66)
(20/56) preirolandicLesion: (298/66)
(21/56) perisylvianLesion: (298/66)
(22/56) insularLesion: (298/66)
(23/56) intraventricularLesion: (298/66)
(24/56) cerebellarLesion: (298/66)
(25/56) coronaRadiataLesion: (298/66)
(26/56) hippocampusLesion: (298/66)
(27/56) brainstemLesion: (298/66)
(28/56) pituitaryLesion: (298/66)
(29/56) h