In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
import os
import re

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 01. Init

In [2]:
# Project root relative to this notebook; later cells build paths as f'{root_dir}/...'.
# NOTE(review): with the trailing slash those paths contain a double slash ('..//config.yaml').
root_dir = '../'

In [3]:
# Call the harmonizer's init with the project config.yaml.
# cfg.config is read in a later cell — presumably populated by this call; confirm.
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

## 02. Dictionary

In [4]:
# Data-dictionary workbook path, built from the harmonizer's DATA_DICTIONARY_EXCEL constant.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"
# Output root directory, read from the 'out_dir' entry of the configuration.
out_dir = cfg.config['out_dir']

In [5]:
# NOTE(review): this rebinds data_dict_filename to a hard-coded workbook name, discarding
# the value built from COMBINE_harmonizer.DATA_DICTIONARY_EXCEL in the previous cell.
# Confirm which of the two is intended and remove the other.
data_dict_filename = f'{root_dir}/Dictionary_HIE_clinical_variables.xlsx'

In [6]:
# Load the two dictionary sheets (main and follow-up) from the same workbook.
df_dict_main = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=COMBINE_harmonizer.SHEET_MAIN)
df_dict_followup = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=COMBINE_harmonizer.SHEET_FOLLOW_UP)

In [7]:
# For each dictionary sheet: keep the rows typed 'date' or 'time', keep the first
# row per variable name, and index the result by that variable name.
is_main_datetime = df_dict_main['type'].isin(['date', 'time'])
df_dict_main_datetime = (
    df_dict_main.loc[is_main_datetime]
    .drop_duplicates(subset=[COMBINE_harmonizer.DATA_DICT_VAR_NAME])
    .set_index(COMBINE_harmonizer.DATA_DICT_VAR_NAME)
)

is_followup_datetime = df_dict_followup['type'].isin(['date', 'time'])
df_dict_followup_datetime = (
    df_dict_followup.loc[is_followup_datetime]
    .drop_duplicates(subset=[COMBINE_harmonizer.DATA_DICT_VAR_NAME])
    .set_index(COMBINE_harmonizer.DATA_DICT_VAR_NAME)
)

In [8]:
# Display the date/time subset of the main dictionary as a visual check.
df_dict_main_datetime

Unnamed: 0_level_0,Category,Subcategory,type,Variable_Description,#studies w/ this var,redcap,comment,lower_var,var_eq_redcap,connect_redcap,LH,OC
Standardized_VariableNames_Dictionary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
birthDate,Pre-intervention,Screening,date,birth date,2,birth_date,,birthdate,True,birthdate,BIRTHDT,BIRTHDT
randomDate,Pre-intervention,Screening,date,Date of Randomization,2,random_date,,randomdate,True,randomdate,LH2RANDT,OC2RANDT
randomTime,Pre-intervention,Screening,time,Time of Randomization (24hr),2,random_time,,randomtime,True,randomtime,LH2RANTM,OC2RANTM
maternalAdmissionDate,Pre-intervention,Labor Delivery,date,Date of maternal admission,2,maternal_admission_date,,maternaladmissiondate,True,maternaladmissiondate,LH4ADMDT,OC4ADAT
maternalAdmissionTime,Pre-intervention,Labor Delivery,time,Time of maternal admission,2,maternal_admission_time,,maternaladmissiontime,True,maternaladmissiontime,LH4ADMTM,OC4ATIM
...,...,...,...,...,...,...,...,...,...,...,...,...
dischargeEEGAbnormalBackgroundActivityTime,NICU Discharge,Seizure,time,Abnormal Bkg activity: Time,2,discharge_eeg_abnormal_background_activity_time,,dischargeeegabnormalbackgroundactivitytime,True,dischargeeegabnormalbackgroundactivitytime,LH13ATIM,OC13ATIM
wdrawSupportDate,NICU Discharge,Withdrawal of Support,date,Date support withdrawn,2,wdraw_support_date,,wdrawsupportdate,True,wdrawsupportdate,LH12DWIT,OC13SWDD
wdrawSupportTime,NICU Discharge,Withdrawal of Support,time,Time support withdrawn,2,wdraw_support_time,,wdrawsupporttime,True,wdrawsupporttime,LH12TWIT,OC13SWDT
limitCareDNRDate,NICU Discharge,Limitation of Care,date,Date of DNR order,2,limit_care_dnr_date,,limitcarednrdate,True,limitcarednrdate,LH12DDNR,OC13DNRD


## 03. Load 01-06-birth.csv

In [9]:
# Birth reference table: one row per (_study, uniqueID) with the birth date and
# time that the date/time conversion below is measured against.
columns = ['_study', 'uniqueID', 'birthDate', 'birthTime']
filename = f'{out_dir}/out-merged-normalized/01-06-birth.csv'

df_birth = pd.read_csv(filename)[columns]

# NOTE(review): the rendered rows show raw birth dates/times next to subject IDs;
# clear this cell's output before sharing or committing the notebook.
df_birth

Unnamed: 0,_study,uniqueID,birthDate,birthTime
0,LH,11:LH006,2009-03-31,11:29:00
1,LH,11:LH011,2009-11-03,18:57:00
2,LH,11:LH023,2010-10-20,00:38:00
3,LH,11:LH030,2011-03-23,09:12:00
4,LH,11:LH033,2011-04-30,18:05:00
...,...,...,...,...
527,OC,09:OC0691,2013-04-23,16:02:00
528,OC,09:OC0701,2013-05-06,14:07:00
529,OC,09:OC0731,2013-06-13,02:41:00
530,OC,09:OC0771,2013-09-05,08:00:00


## 04. Update Date and Time

In [10]:
def _get_data_dict(data_dict_type: str):
    """Return the date/time dictionary frame matching a dictionary sheet name.

    Gives ``df_dict_main_datetime`` for ``COMBINE_harmonizer.SHEET_MAIN``,
    ``df_dict_followup_datetime`` for ``COMBINE_harmonizer.SHEET_FOLLOW_UP``,
    and ``None`` for any other value.
    """
    sheet_frames = (
        (COMBINE_harmonizer.SHEET_MAIN, df_dict_main_datetime),
        (COMBINE_harmonizer.SHEET_FOLLOW_UP, df_dict_followup_datetime),
    )
    for sheet_name, frame in sheet_frames:
        if data_dict_type == sheet_name:
            return frame
    return None


def _process_date_time(df: pd.DataFrame, df_dict_datetime: pd.DataFrame, df_birth: pd.DataFrame, filename_info: dict) -> pd.DataFrame:
    """Convert the date/time columns of ``df`` using each subject's birth date and time.

    Columns of ``df`` are classified with ``df_dict_datetime`` (indexed by
    variable name, with a ``type`` column); ``birthDate`` and ``birthTime`` are
    never classified:

    * time columns: dictionary ``type`` equals ``'time'``;
    * paired date columns: a time column's name with ``Time`` replaced by ``Date``;
    * date-only columns: every remaining dictionary column present in ``df``.

    ``df`` is left-joined to ``df_birth`` on ``_study`` and ``uniqueID``. Date-only
    columns are replaced by ``COMBINE_harmonizer.date_to_day(value, birthDate)``.
    Each (date, time) pair is replaced by elements 0 and 1 of
    ``COMBINE_harmonizer.datetime_to_day_hr(date, time, birthDate, birthTime)``.
    The joined birth columns are then dropped. If ``df`` itself carried
    ``birthDate`` / ``birthTime``, they are re-added after passing through
    ``COMBINE_harmonizer.anonymize_birth_date`` / ``anonymize_birth_time``.

    A one-line classification summary is printed, using ``filename_info['name']``.

    Returns the converted, merged frame.
    """
    birth_columns = ['birthDate', 'birthTime']

    # Columns of df known to the date/time dictionary, birth columns excluded.
    dict_columns = [
        column for column in df.columns
        if column not in birth_columns and column in df_dict_datetime.index
    ]
    time_columns = [column for column in dict_columns if df_dict_datetime.loc[column, 'type'] == 'time']
    date_with_time_columns = [re.sub(r'Time', 'Date', column) for column in time_columns]
    date_only_columns = [
        column for column in dict_columns
        if column not in time_columns and column not in date_with_time_columns
    ]

    # Report time columns whose paired date column is absent from df.
    # NOTE(review): this only warns; the conversion loop below still indexes the
    # missing column.
    missing_date_columns = [column for column in date_with_time_columns if column not in df.columns]
    for missing_column in missing_date_columns:
        print(f"[WARN] no date_with_time_column: date_with_time_column: {missing_column} filename: {filename_info['name']}")

    print(f"{filename_info['name']}: time_columns: {time_columns} time_date_columns: {date_with_time_columns} date_columns: {date_only_columns}")

    # Attach birthDate / birthTime; df's own copies (if any) get the '.df' suffix.
    df_merge = df.merge(
        df_birth,
        on=['_study', 'uniqueID'],
        how='left',
        suffixes=['.df', ''],
    )

    # Date-only columns.
    for date_column in date_only_columns:
        df_merge.loc[:, date_column] = df_merge.apply(
            lambda row: COMBINE_harmonizer.date_to_day(row[date_column], row['birthDate']),
            axis=1,
        )

    # (date, time) pairs: one helper call per row, then split the result.
    for time_column, paired_date_column in zip(time_columns, date_with_time_columns):
        converted = df_merge.apply(
            lambda row: COMBINE_harmonizer.datetime_to_day_hr(
                row[paired_date_column], row[time_column], row['birthDate'], row['birthTime']
            ),
            axis=1,
        )
        df_merge.loc[:, paired_date_column] = converted.apply(lambda pair: pair[0])
        df_merge.loc[:, time_column] = converted.apply(lambda pair: pair[1])

    # Drop the joined birth reference columns.
    df_merge = df_merge.drop(columns=birth_columns)

    # Re-add df's own birth columns, passed through the anonymizing helpers.
    restorers = (
        ('birthDate', lambda value: COMBINE_harmonizer.anonymize_birth_date(value)),
        ('birthTime', lambda value: COMBINE_harmonizer.anonymize_birth_time(value)),
    )
    for column, anonymize in restorers:
        original_column = f'{column}.df'
        if original_column in df_merge:
            df_merge.loc[:, column] = df_merge[original_column].apply(anonymize)
            del df_merge[original_column]

    return df_merge


## 05. Remove columns

In [11]:
def _process_remove_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the columns listed in ``COMBINE_harmonizer.REMOVE_COLUMNS``.

    The remaining columns keep their original order.
    """
    kept_columns = []
    for column in df.columns:
        if column in COMBINE_harmonizer.REMOVE_COLUMNS:
            continue
        kept_columns.append(column)

    return df[kept_columns]

## 06. Process file

In [12]:
def _process_filename_info(filename_info):
    """Convert one merged-normalized CSV and write it to the publish directory.

    Parameters
    ----------
    filename_info : dict
        Entry of ``COMBINE_harmonizer.FILENAME_INFOS``. Keys read here:
        ``'name'`` (CSV file name), ``'data_dict'`` (dictionary sheet name) and
        the optional ``'is_merge'`` flag (defaults to ``True``).

    Returns
    -------
    None
        The entry is skipped when ``'is_merge'`` is falsy or when the input file
        does not exist. Otherwise the result is written to
        ``{out_dir}/out-publish-normalized/{name}``.

    Raises
    ------
    ValueError
        If ``filename_info['data_dict']`` does not name a known dictionary sheet.
    """
    if not filename_info.get('is_merge', True):
        return

    # Double-quoted outer f-string: reusing the outer quote character inside a
    # replacement field is a SyntaxError before Python 3.12.
    filename = f"{out_dir}/out-merged-normalized/{filename_info['name']}"
    if not os.path.exists(filename):
        return

    # Read every column as object so values are not re-typed on load.
    df = pd.read_csv(filename, dtype='O')

    df_dict = _get_data_dict(filename_info['data_dict'])
    if df_dict is None:
        # Fail with a clear message instead of an AttributeError on None later,
        # and before anything is written to the publish directory.
        raise ValueError(
            f"unknown data_dict: {filename_info['data_dict']!r} filename: {filename_info['name']}"
        )

    df_after_date_time = _process_date_time(df, df_dict, df_birth, filename_info)

    df_after_remove_columns = _process_remove_columns(df_after_date_time)

    os.makedirs(f'{out_dir}/out-publish-normalized', exist_ok=True)
    out_filename = f"{out_dir}/out-publish-normalized/{filename_info['name']}"
    df_after_remove_columns.to_csv(out_filename, index=False)


# Process every file description in COMBINE_harmonizer.FILENAME_INFOS;
# one summary line per processed file is printed by _process_date_time.
for filename_info in COMBINE_harmonizer.FILENAME_INFOS:
    _process_filename_info(filename_info)

01-02-screening.csv: time_columns: ['randomTime'] time_date_columns: ['randomDate'] date_columns: []
01-03-maternal-demographics.csv: time_columns: [] time_date_columns: [] date_columns: []


01-04-pregnancy-history.csv: time_columns: [] time_date_columns: [] date_columns: []
01-05-labor-delivery.csv: time_columns: ['maternalAdmissionTime', 'ruptureTime', 'laborOnsetTime'] time_date_columns: ['maternalAdmissionDate', 'ruptureDate', 'laborOnsetDate'] date_columns: []


01-05_1-pse.csv: time_columns: [] time_date_columns: [] date_columns: []
01-05_2-emergency-csection.csv: time_columns: [] time_date_columns: [] date_columns: []


01-06-birth.csv: time_columns: ['neonateAdmissionTime', 'firstPostnatalBloodGasTime'] time_date_columns: ['neonateAdmissionDate', 'firstPostnatalBloodGasDate'] date_columns: []
01-07-pre-temperature.csv: time_columns: ['pre_CoolInitiateTime', 'pre_AfterOvershootReach33p5CTime', 'pre_TemperatureMinTime', 'pre_TemperatureMaxTime'] time_date_columns: ['pre_CoolInitiateDate', 'pre_AfterOvershootReach33p5CDate', 'pre_TemperatureMinDate', 'pre_TemperatureMaxDate'] date_columns: []


01-08-pre-cardiovascular.csv: time_columns: ['pre_CardioTime'] time_date_columns: ['pre_CardioDate'] date_columns: []
01-09-pre-infection.csv: time_columns: ['pre_PositiveCultureTime'] time_date_columns: ['pre_PositiveCultureDate'] date_columns: []


01-10-pre-other-med.csv: time_columns: ['pre_OtherMedTargetTime'] time_date_columns: ['pre_OtherMedTargetDate'] date_columns: []


01-11-pre-imaging.csv: time_columns: ['pre_HeadSonogramTime', 'pre_HeadCTTime', 'pre_BrainMRITime'] time_date_columns: ['pre_HeadSonogramDate', 'pre_HeadCTDate', 'pre_BrainMRIDate'] date_columns: []
01-12-neuro-exam.csv: time_columns: ['pre_NeuroExamTime'] time_date_columns: ['pre_NeuroExamDate'] date_columns: []


01-12_1-total-modified-sarnat.csv: time_columns: [] time_date_columns: [] date_columns: []
02-01-temperature.csv: time_columns: ['temperatureTime'] time_date_columns: ['temperatureDate'] date_columns: []


02-02-cardiovascular.csv: time_columns: ['cardioTime'] time_date_columns: ['cardioDate'] date_columns: []


02-03-respiratory.csv: time_columns: ['respiratoryTime'] time_date_columns: ['respiratoryDate'] date_columns: []
02-04-blood-gas.csv: time_columns: ['bloodGasTime'] time_date_columns: ['bloodGasDate'] date_columns: []


02-05-hematology.csv: time_columns: ['hematologyTime'] time_date_columns: ['hematologyDate'] date_columns: []
02-05_s-hematology.csv: time_columns: [] time_date_columns: [] date_columns: ['hematologyHematocritMinDate', 'hematologyPlateletMin_cPermuLDate']


02-06_s-blood-value.csv: time_columns: [] time_date_columns: [] date_columns: ['bloodValueBunBaseline_mgPerdLDate', 'bloodValueCreatinineBaseline_mgPerdLDate', 'bloodValueASTSGOTBaseline_UPerLDate', 'bloodValueALTSGPTBaseline_UPerLDate', 'bloodValueTotalBilirubinBaseline_mgPerdLDate', 'bloodValuePHMinDate', 'bloodValueHCO3Min_mEqPerLDate', 'bloodValueSerumNaMin_mEqPerLDate', 'bloodValueSerumKMin_mEqPerLDate', 'bloodValueClMin_mEqPerLDate', 'bloodValueGlucoseMin_mgPerdLDate', 'bloodValueTotalCaMin_mgPerdLDate', 'bloodValueIonCaMin_mgPerdLDate', 'bloodValueASTSGOTMin_UPerLDate', 'bloodValueALTSGPTMin_UPerLDate', 'bloodValueTotalBilirubinMin_mgPerdLDate', 'bloodValuePHMaxDate', 'bloodValueBaseDeficitMax_mEqPerLDate', 'bloodValueSerumNaMax_mEqPerLDate', 'bloodValueSerumKMax_mEqPerLDate', 'bloodValueClMax_mEqPerLDate', 'bloodValueBunMax_mgPerdLDate', 'bloodValueCreatinineMax_mgPerdLDate', 'bloodValueGlucoseMax_mgPerdLDate', 'bloodValueTotalCaMax_mgPerdLDate', 'bloodValueIonCaMax_mgPerdLDate

02-07-infection.csv: time_columns: ['positiveCultureTime'] time_date_columns: ['positiveCultureDate'] date_columns: []
02-08-other-med.csv: time_columns: ['otherMedTargetTime'] time_date_columns: ['otherMedTargetDate'] date_columns: []


02-09-imaging.csv: time_columns: ['headSonogramTime', 'headCTTime', 'brainMRITime'] time_date_columns: ['headSonogramDate', 'headCTDate', 'brainMRIDate'] date_columns: []
02-11-elevated-temperature.csv: time_columns: ['elevatedTempTime'] time_date_columns: ['elevatedTempDate'] date_columns: []


02-12-fluctuated-temperature.csv: time_columns: ['fluctuateTempTime'] time_date_columns: ['fluctuateTempDate'] date_columns: []
02-13-bradycardia.csv: time_columns: ['bradycardiaTime'] time_date_columns: ['bradycardiaDate'] date_columns: []


02-14-adverse-event.csv: time_columns: ['SAECardiacExperienceOnsetTime', 'SAECardiacExperienceResolveTime', 'SAEMetabolicAcidosisOnsetTime', 'SAEMetabolicAcidosisResolveTime', 'SAEThrombosisExperienceOnsetTime', 'SAEThrombosisExperienceResolveTime', 'SAEBleedingExperienceOnsetTime', 'SAEBleedingExperienceResolveTime', 'SAEDeathTime', 'SAEOtherOnsetTime', 'SAEOtherResolveTime'] time_date_columns: ['SAECardiacExperienceOnsetDate', 'SAECardiacExperienceResolveDate', 'SAEMetabolicAcidosisOnsetDate', 'SAEMetabolicAcidosisResolveDate', 'SAEThrombosisExperienceOnsetDate', 'SAEThrombosisExperienceResolveDate', 'SAEBleedingExperienceOnsetDate', 'SAEBleedingExperienceResolveDate', 'SAEDeathDate', 'SAEOtherOnsetDate', 'SAEOtherResolveDate'] date_columns: ['SAEAlterationSkinIntegrityOnsetDate', 'SAEAlterationSkinIntegrityResolveDate']
02-15-violation.csv: time_columns: [] time_date_columns: [] date_columns: ['violationDate']


02-16-interrupt.csv: time_columns: ['interruptTime', 'interruptRestartTime'] time_date_columns: ['interruptDate', 'interruptRestartDate'] date_columns: []
02-17-discontinue.csv: time_columns: ['discontinueTime'] time_date_columns: ['discontinueDate'] date_columns: []


03-01-post-temperature.csv: time_columns: ['post_TemperatureTime'] time_date_columns: ['post_TemperatureDate'] date_columns: []
03-01_s-post-temperature.csv: time_columns: ['normothermiaTime'] time_date_columns: ['normothermiaDate'] date_columns: []


03-02-post-blood-value.csv: time_columns: [] time_date_columns: [] date_columns: ['post_BloodValueASTSGOT_UPerLDate', 'post_BloodValueALTSGPT_UPerLDate', 'post_BloodValueTotalBilirubin_mgPerdLDate']
03-03-post-imaging.csv: time_columns: ['post_HeadSonogramTime', 'post_HeadCTTime', 'post_BrainMRITime'] time_date_columns: ['post_HeadSonogramDate', 'post_HeadCTDate', 'post_BrainMRIDate'] date_columns: []


03-04-post-neuro-exam.csv: time_columns: ['post_NeuroExamTime'] time_date_columns: ['post_NeuroExamDate'] date_columns: []
03-04_1-total-modified-sarnat.csv: time_columns: [] time_date_columns: [] date_columns: []


03-05-mri.csv: time_columns: [] time_date_columns: [] date_columns: ['MRIReadDate', 'MRIDate']
03-05_s-mri.csv: time_columns: ['MRITime'] time_date_columns: ['MRIDate'] date_columns: ['MRISendRTIDate', 'MRIReceiveRTIDate']
[WARN] datetime_to_day_hr: unable to strptime (with %S): the_date: 2010-11-01 the_time: 17:04
[WARN] datetime_to_day_hr: unable to strptime (with %S): the_date: 2010-11-11 the_time: 17:00
[WARN] datetime_to_day_hr: unable to strptime (with %S): the_date: 2010-11-21 the_time: 11:07
[WARN] datetime_to_day_hr: unable to strptime (with %S): the_date: 2011-01-03 the_time: 17:00
[WARN] datetime_to_day_hr: unable to strptime (with %S): the_date: 2011-02-07 the_time: 16:52
[WARN] datetime_to_day_hr: unable to strptime (with %S): the_date: 2011-03-11 the_time: 20:45
[WARN] datetime_to_day_hr: unable to strptime (with %S): the_date: 2011-04-14 the_time: 18:46
[WARN] datetime_to_day_hr: unable to strptime (with %S): the_date: 2011-08-04 the_time: 22:22
[WARN] datetime_to_day_hr

03-05_s1-mri.csv: time_columns: [] time_date_columns: [] date_columns: []
04-01-status.csv: time_columns: ['deathTime'] time_date_columns: ['deathDate'] date_columns: ['statusDate', 'dischargeDate', 'transferDate']


04-02-cardiovascular.csv: time_columns: [] time_date_columns: [] date_columns: []
04-03-respiratory.csv: time_columns: ['dischargePulmonaryStartTime1', 'dischargePulmonaryEndTime1', 'dischargePulmonaryStartTime2', 'dischargePulmonaryEndTime2', 'dischargePulmonaryStartTime3', 'dischargePulmonaryEndTime3'] time_date_columns: ['dischargePulmonaryStartDate1', 'dischargePulmonaryEndDate1', 'dischargePulmonaryStartDate2', 'dischargePulmonaryEndDate2', 'dischargePulmonaryStartDate3', 'dischargePulmonaryEndDate3'] date_columns: []


04-04-hematology.csv: time_columns: [] time_date_columns: [] date_columns: []
04-05-metabolic.csv: time_columns: [] time_date_columns: [] date_columns: []


04-06-renal.csv: time_columns: [] time_date_columns: [] date_columns: []
04-07-gastrointestinal.csv: time_columns: [] time_date_columns: [] date_columns: []


04-08-skin.csv: time_columns: [] time_date_columns: [] date_columns: ['dischargeErythemaOnsetDate', 'dischargeErythemaResolveDate', 'dischargeScleremaOnsetDate', 'dischargeScleremaResolveDate', 'dischargeCyanosisOnsetDate', 'dischargeCyanosisResolveDate', 'dischargeSubFatNecrosisOnsetDate', 'dischargeSubFatNecrosisResolveDate']
04-09-auditory.csv: time_columns: [] time_date_columns: [] date_columns: []


04-10-surgery.csv: time_columns: [] time_date_columns: [] date_columns: []
04-11-infection.csv: time_columns: [] time_date_columns: [] date_columns: []


04-12-neuro-exam.csv: time_columns: ['dischargeNeuroExamTime'] time_date_columns: ['dischargeNeuroExamDate'] date_columns: []
04-12_1-total-modified-sarnat.csv: time_columns: [] time_date_columns: [] date_columns: []


04-13-seizure.csv: time_columns: ['dischargeEEGFindingConsistentWithSeizureTime', 'dischargeEEGAbnormalBackgroundActivityTime'] time_date_columns: ['dischargeEEGFindingConsistentWithSeizureDate', 'dischargeEEGAbnormalBackgroundActivityDate'] date_columns: []
04-14-birth-defect.csv: time_columns: [] time_date_columns: [] date_columns: []


04-15-home-therapy.csv: time_columns: [] time_date_columns: [] date_columns: []
04-16-wdraw-support.csv: time_columns: ['wdrawSupportTime'] time_date_columns: ['wdrawSupportDate'] date_columns: []


04-17-limit-care.csv: time_columns: ['limitCareDNRTime'] time_date_columns: ['limitCareDNRDate'] date_columns: []
20-00-follow-up.csv: time_columns: [] time_date_columns: [] date_columns: ['visitDate']


20-01-ses.csv: time_columns: [] time_date_columns: [] date_columns: ['SESVisitDate', 'SESBirthDate', 'SESInterviewDate']
20-02-medical-history.csv: time_columns: [] time_date_columns: [] date_columns: []


20-03-medical-exam.csv: time_columns: [] time_date_columns: [] date_columns: ['examCompleteDate']
20-04-bayley-iii.csv: time_columns: [] time_date_columns: [] date_columns: ['BayleyIIIDate']


20-05-gmfcs.csv: time_columns: [] time_date_columns: [] date_columns: []
20-06-status.csv: time_columns: [] time_date_columns: [] date_columns: ['statusVisitDate', 'statusBirthDate', 'deathDate', 'firstVisitDate', 'finalVisitDate']


20-07-readmission.csv: time_columns: [] time_date_columns: [] date_columns: []
20-08-lost.csv: time_columns: [] time_date_columns: [] date_columns: ['lostFollowUpLastContactDate', 'lostFollowUpFormCompleteDate', 'lostFollowUpDeathDate', 'lostFollowUpInterviewDate', 'lostFollowUpChartReviewDate']


20-09-secondary.csv: time_columns: [] time_date_columns: [] date_columns: []
20-10-outcome.csv: time_columns: [] time_date_columns: [] date_columns: []


20-10_1-disability-level-death.csv: time_columns: [] time_date_columns: [] date_columns: []
