In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
import os

import itertools

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 01. init

In [2]:
root_dir = '../'

In [3]:
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

## 02. Dictionary

In [4]:
data_dict_filename = f'{root_dir}/Dictionary_HIE_clinical_variables.xlsx'

In [5]:
df_dict_main = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=COMBINE_harmonizer.SHEET_MAIN)
df_dict_followup = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name=COMBINE_harmonizer.SHEET_FOLLOW_UP)


## 03. Mapping

In [6]:
_MAIN_ORDER_MAP = COMBINE_harmonizer.build_variable_order_map(data_dict_filename, sheet_name=COMBINE_harmonizer.SHEET_MAIN)

_FOLLOWUP_ORDER_MAP = COMBINE_harmonizer.build_variable_order_map(data_dict_filename, sheet_name=COMBINE_harmonizer.SHEET_FOLLOW_UP)

In [7]:
_MAIN_ORDER_MAP

{'_study': 0,
 'center': 1,
 'subjectID': 2,
 'uniqueID': 3,
 'MRI_ID': 4,
 'followupCenter': 5,
 'followupID': 6,
 'uniqueFollowupID': 7,
 '_flatten_index': 8,
 'siteID': 11,
 'birthDate': 176,
 'birthNumber': 13,
 'screenComment': 14,
 'coreTempLess32p5CGreaterEq2Hr_e': 15,
 'coreTempLess33p5CGreater1Hr_e': 16,
 'coreTempLess34CGreater1Hr_e': 17,
 'first6HrCoolByClinicalProtocol_e': 18,
 'chromosomalAbnormality_e': 19,
 'majorCongenitalAnomaly_e': 20,
 'birthWeightLessEq1800g_e': 21,
 'infantUnlikelySurvive_e': 22,
 'first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e': 23,
 'postnatalAgeLess6HrOrGreater24Hr_e': 24,
 'enrolledConflictingTrial_e': 25,
 'first60MinAnyBloodGasPHLessEq7_i': 26,
 'first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i': 27,
 'historyPerinatalEvent_i': 28,
 'at10MinApgarLessEq5OrVent_i': 29,
 'randomEligible': 30,
 'consentStatus': 31,
 'noConsentReason': 32,
 'noInStudyReason': 33,
 'random': 34,
 'noRandomReason': 35,
 'noRandomReasonText': 36,
 '

In [8]:
_FOLLOWUP_ORDER_MAP

{'_study': 0,
 'center': 1,
 'subjectID': 2,
 'uniqueID': 3,
 'MRI_ID': 4,
 'followupCenter': 5,
 'followupID': 6,
 'uniqueFollowupID': 7,
 '_flatten_index': 8,
 'siteID': 1118,
 'birthDate': 1119,
 'visitDate': 1120,
 'birthNumber': 1121,
 'center_orig': 1125,
 'SESVisitDate': 1126,
 'SESBirthDate': 1127,
 'chronologicalAge_mo': 1128,
 'correctedAge_mo': 1129,
 'underStateSupervision': 1130,
 'primaryCaretaker': 1131,
 'otherCaretaker': 1132,
 'maritalStatusPrimaryCaretaker': 1133,
 'livingArrangementChild': 1134,
 'numberPeopleInChildHousehold': 1135,
 'otherContributeMoneyToChildHousehold': 1136,
 'educationPrimaryCaretaker': 1137,
 'educationOtherCaretaker': 1138,
 'workPrimaryCaretaker': 1139,
 'workOtherCaretaker': 1140,
 'inSchoolPrimaryCaretaker': 1141,
 'inSchoolOtherCaretaker': 1142,
 'totalIncomeChildHousehold': 1143,
 'medicalInsuranceChild': 1144,
 'primaryLanguageChild': 1145,
 'primaryLanguageChildOtherText': 1146,
 'isSecondaryLanguageChild': 1147,
 'secondaryLanguageCh

## 04. Filenames and Merge

In [9]:
out_dir = cfg.config['out_dir']
_DIRS = {
    'LH': f'{out_dir}/out-LH-normalized',
    'OC': f'{out_dir}/out-OC-normalized',
}

_FILENAMES = [each['name'] for each in COMBINE_harmonizer.FILENAME_INFOS]

_FILENAME_INFO_MAP = {each['name']: each for each in COMBINE_harmonizer.FILENAME_INFOS}

_OUT_DIR = f'{out_dir}/out-merged-normalized'
os.makedirs(_OUT_DIR, exist_ok=True)

In [10]:
def _get_order(x, order_map):
    if x not in order_map:
        print(f'[WARN] not in order_map: {x}')
        return COMBINE_harmonizer.MAX_INT

    return order_map[x]


def _column_and_orig(column, columns_with_orig):
    ret = [column]

    # XXX remove .orig in merge
    # if column + '.orig' in columns_with_orig:
    #     ret += [column + '.orig']

    return ret


def _reorder_columns(df, filename):
    columns_without_orig = list(filter(lambda x: not x.endswith('.orig'), df.columns))
    columns_with_orig = list(filter(lambda x: x.endswith('.orig'), df.columns))

    each_filename_info = _FILENAME_INFO_MAP[filename]
    order_map = _MAIN_ORDER_MAP
    if each_filename_info['data_dict'] == COMBINE_harmonizer.SHEET_FOLLOW_UP:
        order_map = _FOLLOWUP_ORDER_MAP

    columns_without_orig.sort(key=lambda x: _get_order(x, order_map))
    columns_with_orig.sort(key=lambda x: _get_order(x[:-5], order_map))

    reserved_columns = list(filter(lambda x: x in columns_without_orig, COMBINE_harmonizer.RESERVED_COLUMNS))
    other_columns = list(filter(lambda x: x not in COMBINE_harmonizer.RESERVED_COLUMNS, columns_without_orig))

    columns_and_origs = list(itertools.chain.from_iterable([_column_and_orig(each, columns_with_orig) for each in other_columns]))

    columns = reserved_columns + columns_and_origs

    return df[columns]

In [11]:
for idx, filename in enumerate(_FILENAMES):
    df = None
    for study, each_dir in _DIRS.items():
        each_filename = os.sep.join([each_dir, filename])
        if not os.path.exists(each_filename):
            continue

        each_df = pd.read_csv(each_filename, dtype='O')
        each_df['_study'] = study

        if df is None:
            df = each_df
        else:
            df = pd.concat([df, each_df])

    if df is None:
        print(f'[WARN] ({idx}/{len(_FILENAMES)}) not df: filename: {filename}')
        continue

    df = _reorder_columns(df, filename)

    out_filename = os.sep.join([_OUT_DIR, filename])

    df.to_csv(out_filename, index=False)

    print(f'[INFO] ({idx}/{len(_FILENAMES)}) done: filename: {filename} out_filename: {out_filename}')

[INFO] (0/73) done: filename: 00-02-screening.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/00-02-screening.csv


[INFO] (1/73) done: filename: 00-12-neuro-exam.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/00-12-neuro-exam.csv


[INFO] (2/73) done: filename: 01-02-screening.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-02-screening.csv


[INFO] (3/73) done: filename: 01-03-maternal-demographics.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-03-maternal-demographics.csv
[INFO] (4/73) done: filename: 01-04-pregnancy-history.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-04-pregnancy-history.csv


[INFO] (5/73) done: filename: 01-05-labor-delivery.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-05-labor-delivery.csv
[WARN] (6/73) not df: filename: 01-05_1-pse.csv
[WARN] (7/73) not df: filename: 01-05_2-emergency-csection.csv


[INFO] (8/73) done: filename: 01-06-birth.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-06-birth.csv


[INFO] (9/73) done: filename: 01-07-pre-temperature.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-07-pre-temperature.csv
[INFO] (10/73) done: filename: 01-08-pre-cardiovascular.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-08-pre-cardiovascular.csv


[INFO] (11/73) done: filename: 01-09-pre-infection.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-09-pre-infection.csv


[INFO] (12/73) done: filename: 01-10-pre-other-med.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-10-pre-other-med.csv


[INFO] (13/73) done: filename: 01-11-pre-imaging.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-11-pre-imaging.csv
[INFO] (14/73) done: filename: 01-12-neuro-exam.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/01-12-neuro-exam.csv


[WARN] (15/73) not df: filename: 01-12_1-total-modified-sarnat.csv


[INFO] (16/73) done: filename: 02-01-temperature.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-01-temperature.csv


[INFO] (17/73) done: filename: 02-02-cardiovascular.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-02-cardiovascular.csv


[INFO] (18/73) done: filename: 02-03-respiratory.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-03-respiratory.csv


[INFO] (19/73) done: filename: 02-04-blood-gas.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-04-blood-gas.csv


[INFO] (20/73) done: filename: 02-05-hematology.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-05-hematology.csv


[INFO] (21/73) done: filename: 02-05_s-hematology.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-05_s-hematology.csv


[INFO] (22/73) done: filename: 02-06_s-blood-value.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-06_s-blood-value.csv


[INFO] (23/73) done: filename: 02-07-infection.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-07-infection.csv


[INFO] (24/73) done: filename: 02-08-other-med.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-08-other-med.csv


[INFO] (25/73) done: filename: 02-09-imaging.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-09-imaging.csv
[INFO] (26/73) done: filename: 02-11-elevated-temperature.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-11-elevated-temperature.csv


[INFO] (27/73) done: filename: 02-12-fluctuated-temperature.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-12-fluctuated-temperature.csv
[INFO] (28/73) done: filename: 02-13-bradycardia.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-13-bradycardia.csv


[INFO] (29/73) done: filename: 02-14-adverse-event.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-14-adverse-event.csv


[INFO] (30/73) done: filename: 02-15-violation.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-15-violation.csv
[INFO] (31/73) done: filename: 02-16-interrupt.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-16-interrupt.csv


[INFO] (32/73) done: filename: 02-17-discontinue.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/02-17-discontinue.csv


[INFO] (33/73) done: filename: 03-01-post-temperature.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/03-01-post-temperature.csv


[INFO] (34/73) done: filename: 03-01_s-post-temperature.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/03-01_s-post-temperature.csv


[INFO] (35/73) done: filename: 03-02-post-blood-value.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/03-02-post-blood-value.csv


[INFO] (36/73) done: filename: 03-03-post-imaging.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/03-03-post-imaging.csv
[INFO] (37/73) done: filename: 03-04-post-neuro-exam.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/03-04-post-neuro-exam.csv


[WARN] (38/73) not df: filename: 03-04_1-total-modified-sarnat.csv
[WARN] not in order_map: subjectID_with_postfix
[WARN] not in order_map: subjectID_postfix
[WARN] not in order_map: subjectID_with_postfix
[WARN] not in order_map: subjectID_postfix


[INFO] (39/73) done: filename: 03-05-mri.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/03-05-mri.csv


[INFO] (40/73) done: filename: 03-05_s-mri.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/03-05_s-mri.csv
[WARN] (41/73) not df: filename: 03-05_s1-mri.csv


[INFO] (42/73) done: filename: 04-01-status.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-01-status.csv
[WARN] (43/73) not df: filename: 04-01_1-length-of-stay.csv


[INFO] (44/73) done: filename: 04-02-cardiovascular.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-02-cardiovascular.csv


[INFO] (45/73) done: filename: 04-03-respiratory.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-03-respiratory.csv
[INFO] (46/73) done: filename: 04-04-hematology.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-04-hematology.csv


[INFO] (47/73) done: filename: 04-05-metabolic.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-05-metabolic.csv


[INFO] (48/73) done: filename: 04-06-renal.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-06-renal.csv


[INFO] (49/73) done: filename: 04-07-gastrointestinal.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-07-gastrointestinal.csv


[INFO] (50/73) done: filename: 04-08-skin.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-08-skin.csv


[INFO] (51/73) done: filename: 04-09-auditory.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-09-auditory.csv


[INFO] (52/73) done: filename: 04-10-surgery.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-10-surgery.csv


[INFO] (53/73) done: filename: 04-11-infection.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-11-infection.csv


[INFO] (54/73) done: filename: 04-12-neuro-exam.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-12-neuro-exam.csv
[WARN] (55/73) not df: filename: 04-12_1-total-modified-sarnat.csv


[INFO] (56/73) done: filename: 04-13-seizure.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-13-seizure.csv


[INFO] (57/73) done: filename: 04-14-birth-defect.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-14-birth-defect.csv


[INFO] (58/73) done: filename: 04-15-home-therapy.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-15-home-therapy.csv


[INFO] (59/73) done: filename: 04-16-wdraw-support.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-16-wdraw-support.csv
[INFO] (60/73) done: filename: 04-17-limit-care.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/04-17-limit-care.csv


[INFO] (61/73) done: filename: 20-00-follow-up.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-00-follow-up.csv


[INFO] (62/73) done: filename: 20-01-ses.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-01-ses.csv


[INFO] (63/73) done: filename: 20-02-medical-history.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-02-medical-history.csv


[INFO] (64/73) done: filename: 20-03-medical-exam.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-03-medical-exam.csv


[INFO] (65/73) done: filename: 20-04-bayley-iii.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-04-bayley-iii.csv


[INFO] (66/73) done: filename: 20-05-gmfcs.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-05-gmfcs.csv


[INFO] (67/73) done: filename: 20-06-status.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-06-status.csv


[INFO] (68/73) done: filename: 20-07-readmission.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-07-readmission.csv


[INFO] (69/73) done: filename: 20-08-lost.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-08-lost.csv
[INFO] (70/73) done: filename: 20-09-secondary.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-09-secondary.csv


[INFO] (71/73) done: filename: 20-10-outcome.csv out_filename: /Volumes/neuro/labs/grantlab/research/chuanheng.hsiao/HIE-out/out-merged-normalized/20-10-outcome.csv
[WARN] (72/73) not df: filename: 20-10_1-disability-level-death.csv
