In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
import os

from tqdm import tqdm

import itertools

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 01. init

In [2]:
# Base directory that config.yaml and the data-dictionary workbook are resolved
# against in the cells below.
root_dir = '../'

In [3]:
# Initialise the harmonizer from the project's config.yaml.
# NOTE(review): presumably this is what populates cfg.config, which is read in
# the next cell - confirm against COMBINE_harmonizer.init.
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

In [4]:
# Normalized per-file CSVs are read from input_dir; every stats CSV produced by
# this notebook is written to out_dir.
input_dir = f"{cfg.config['out_dir']}/out-publish-normalized"
out_dir = f"{cfg.config['out_dir']}/stats"

# DataFrame.to_csv does not create missing directories, so make sure the stats
# folder exists before the first export (no-op when it is already there).
os.makedirs(out_dir, exist_ok=True)

### 01-1. data-dict

In [5]:
# Load both sheets of the data dictionary workbook.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"
df_data_dict_main = COMBINE_harmonizer.load_data_dict(data_dict_filename, COMBINE_harmonizer.SHEET_MAIN)
df_data_dict_follow_up = COMBINE_harmonizer.load_data_dict(data_dict_filename, COMBINE_harmonizer.SHEET_FOLLOW_UP)

# Tag every row with the sheet it came from so the two sheets can be told apart
# after they are concatenated.
df_data_dict_main['sheet'] = COMBINE_harmonizer.SHEET_MAIN
df_data_dict_follow_up['sheet'] = COMBINE_harmonizer.SHEET_FOLLOW_UP

# Normalise the per-study source-variable columns to object dtype with '' for
# missing entries. Plain column assignment *replaces* the column; writing the
# result through `.loc[:, col] = ...` sets values into the existing array, which
# keeps the old dtype where it can and forces an incompatible-dtype upcast when
# '' is written into a numeric (e.g. all-NaN) column.
for idx in ['LH', 'OC']:
    df_data_dict_main[idx] = df_data_dict_main[idx].astype('O').fillna('')
    df_data_dict_follow_up[idx] = df_data_dict_follow_up[idx].astype('O').fillna('')

# One combined dictionary, keeping the first row per (sheet, variable name).
df_data_dict = pd.concat([df_data_dict_main, df_data_dict_follow_up])
df_data_dict = df_data_dict.drop_duplicates(['sheet', COMBINE_harmonizer.DATA_DICT_VAR_NAME])

### 01-1-1. var-name => LH / OC map

In [6]:
def _var_name_map(df):
    LH_map = {each['Standardized_VariableNames_Dictionary']: each['LH'] for idx, each in df.iterrows()}
    OC_map = {each['Standardized_VariableNames_Dictionary']: each['OC'] for idx, each in df.iterrows()}
    the_type_map = {each['Standardized_VariableNames_Dictionary']: each['type'] for idx, each in df.iterrows()}

    return {'LH': LH_map, 'OC': OC_map, 'the_type': the_type_map}

# sheet name -> {'LH': ..., 'OC': ..., 'the_type': ...} lookups built from that
# sheet's data-dictionary frame.
_DATA_DICT_MAP = {
    sheet_name: _var_name_map(sheet_frame)
    for sheet_name, sheet_frame in (
        (COMBINE_harmonizer.SHEET_MAIN, df_data_dict_main),
        (COMBINE_harmonizer.SHEET_FOLLOW_UP, df_data_dict_follow_up),
    )
}

#### 01-1-1-1. XXX hack: hand-written LH / OC overrides for `_DATA_DICT_MAP` (main sheet)

In [7]:
# Hand-maintained standardized-variable -> LH source-variable entries for the
# main sheet. They are merged into _DATA_DICT_MAP[SHEET_MAIN]['LH'] with
# dict.update() below, so an entry here replaces any entry the data dictionary
# produced for the same variable.
_data_dict_map_main_LH = {
    '_study': 'LH',
    # Pre-intervention temperature: the Min and Max variables share the same
    # source names.
    'pre_TemperatureMinDate': 'L6ARDDT',
    'pre_TemperatureMinTime': 'L6ARDTM',
    'pre_SkinTemperatureMin_C': 'L6ASKINT',
    'pre_AxillaryTemperatureMin_C': 'L6AAXILT',
    'pre_EsophagealTemperatureMin_C': 'L6AESPHT',
    'pre_ServoSetMin_C': 'L6ASVOSP',
    'pre_TemperatureMaxDate': 'L6ARDDT',
    'pre_TemperatureMaxTime': 'L6ARDTM',
    'pre_SkinTemperatureMax_C': 'L6ASKINT',
    'pre_AxillaryTemperatureMax_C': 'L6AAXILT',
    'pre_EsophagealTemperatureMax_C': 'L6AESPHT',
    'pre_ServoSetMax_C': 'L6ASVOSP',

    # Pre-intervention cardiovascular variables.
    'pre_CardioDate': 'LH6CVRDT',
    'pre_CardioTime': 'LH6CVRTM',
    'pre_CardioSystolicBloodPressure_mmHg': 'LH6CVBPS',
    'pre_CardioDiastolicBloodPressure_mmHg': 'LH6CVBPD',
    'pre_CardioHeartRate_BPM': 'LH6CVHR',
    'pre_CardioVolumeExpand': 'LH6CVVE',
    'pre_CardioInotropicAgent': 'LH6CVIA',
    'pre_CardioBloodTransfusion': 'LH6CVBT',
    'pre_CardioPlatelets': 'LH6CVPLT',

    # Pre-intervention other-medication variables.
    'pre_OtherMedTargetDate': 'LH6OMTDT',
    'pre_OtherMedTargetTime': 'LH6OMTTM',
    'pre_Anticonvulsants': 'LH6OMACV',
    'pre_Anticonvulsants1': 'LH6OMAC1',
    'pre_Anticonvulsants2': 'LH6OMAC2',
    'pre_Anticonvulsants3': 'LH6OMAC3',
    'pre_Analgesics': 'LH6OMAGS',
    'pre_AnalgesicsSedatives1': 'LH6OMAG1',
    'pre_AnalgesicsSedatives2': 'LH6OMAG2',
    'pre_AnalgesicsSedatives3': 'LH6OMAG3',
    'pre_Antipyretics': 'LH6OMAPY',
    'pre_Antipyretics1': 'LH6OMAP1',
    'pre_Antipyretics2': 'LH6OMAP2',
    'pre_Antipyretics3': 'LH6OMAP3',
    'pre_Paralytics': 'LH6OMNBA',
    'pre_Paralytics1': 'LH6OMNB1',
    'pre_Paralytics2': 'LH6OMNB2',
    'pre_Paralytics3': 'LH6OMNB3',
    'pre_OtherMedFluidIntake_ccPerKg': 'LH6OMFI',
    'pre_OtherMedUrineOutput_ccPerKg': 'LH6OMUO',

    # Pre-intervention imaging (head sonogram / head CT / brain MRI).
    'pre_HeadSonogram': 'LH9HSONO',
    'pre_HeadSonogramDate': 'LH9HSDAT',
    'pre_HeadSonogramTime': 'LH9HSTIM',
    'pre_HeadSonogramResult1': 'LH9HSREA',
    'pre_HeadSonogramResult2': 'LH9HSREB',
    'pre_HeadSonogramResult3': 'LH9HSREC',
    'pre_HeadSonogramResult4': 'LH9HSRED',
    'pre_HeadSonogramResult5': 'LH9HSREE',
    'pre_HeadSonogramResult6': 'LH9HSREF',
    'pre_HeadSonogramResult7': 'LH9HSREG',
    'pre_HeadSonogramResult8': 'LH9HSREH',
    'pre_HeadSonogramResultText': 'LH9HSRES',
    'pre_HeadCT': 'LH9HCT',
    'pre_HeadCTDate': 'LH9HCDAT',
    'pre_HeadCTTime': 'LH9HCTIM',
    'pre_HeadCTResult1': 'LH9HCREA',
    'pre_HeadCTResult2': 'LH9HCREB',
    'pre_HeadCTResult3': 'LH9HCREC',
    'pre_HeadCTResult4': 'LH9HCRED',
    'pre_HeadCTResult5': 'LH9HCREE',
    'pre_HeadCTResult6': 'LH9HCREF',
    'pre_HeadCTResult7': 'LH9HCREG',
    'pre_HeadCTResult8': 'LH9HCREH',
    'pre_HeadCTResultText': 'LH9HCRES',
    'pre_BrainMRI': 'LH9MRI',
    'pre_BrainMRIDate': 'LH9BMDAT',
    'pre_BrainMRITime': 'LH9BMTIM',
    'pre_BrainMRIResult1': 'LH9BMREA',
    'pre_BrainMRIResult2': 'LH9BMREB',
    'pre_BrainMRIResult3': 'LH9BMREC',
    'pre_BrainMRIResult4': 'LH9BMRED',
    'pre_BrainMRIResult5': 'LH9BMREE',
    'pre_BrainMRIResult6': 'LH9BMREF',
    'pre_BrainMRIResult7': 'LH9BMREG',
    'pre_BrainMRIResult8': 'LH9BMREH',
    'pre_BrainMRIResultText': 'LH9BMRES',

    # NOTE(review): mapped to the literal '1' rather than a source-variable
    # name - confirm this is intended.
    'positiveCultureNumber': '1',

    # Post-intervention imaging: same source names as the pre_* imaging entries.
    'post_HeadSonogram': 'LH9HSONO',
    'post_HeadSonogramDate': 'LH9HSDAT',
    'post_HeadSonogramTime': 'LH9HSTIM',
    'post_HeadSonogramResult1': 'LH9HSREA',
    'post_HeadSonogramResult2': 'LH9HSREB',
    'post_HeadSonogramResult3': 'LH9HSREC',
    'post_HeadSonogramResult4': 'LH9HSRED',
    'post_HeadSonogramResult5': 'LH9HSREE',
    'post_HeadSonogramResult6': 'LH9HSREF',
    'post_HeadSonogramResult7': 'LH9HSREG',
    'post_HeadSonogramResult8': 'LH9HSREH',
    'post_HeadSonogramResultText': 'LH9HSRES',
    'post_HeadCT': 'LH9HCT',
    'post_HeadCTDate': 'LH9HCDAT',
    'post_HeadCTTime': 'LH9HCTIM',
    'post_HeadCTResult1': 'LH9HCREA',
    'post_HeadCTResult2': 'LH9HCREB',
    'post_HeadCTResult3': 'LH9HCREC',
    'post_HeadCTResult4': 'LH9HCRED',
    'post_HeadCTResult5': 'LH9HCREE',
    'post_HeadCTResult6': 'LH9HCREF',
    'post_HeadCTResult7': 'LH9HCREG',
    'post_HeadCTResult8': 'LH9HCREH',
    'post_HeadCTResultText': 'LH9HCRES',
    'post_BrainMRI': 'LH9MRI',
    'post_BrainMRIDate': 'LH9BMDAT',
    'post_BrainMRITime': 'LH9BMTIM',
    'post_BrainMRIResult1': 'LH9BMREA',
    'post_BrainMRIResult2': 'LH9BMREB',
    'post_BrainMRIResult3': 'LH9BMREC',
    'post_BrainMRIResult4': 'LH9BMRED',
    'post_BrainMRIResult5': 'LH9BMREE',
    'post_BrainMRIResult6': 'LH9BMREF',
    'post_BrainMRIResult7': 'LH9BMREG',
    'post_BrainMRIResult8': 'LH9BMREH',
    'post_BrainMRIResultText': 'LH9BMRES',

    # One standardized variable listing three source names, newline-separated.
    'statusDate': 'LH12DCDT\nLH12TRDT\nLH12DTDT',
}

# Hand-maintained standardized-variable -> OC source-variable entries for the
# main sheet. They are merged into _DATA_DICT_MAP[SHEET_MAIN]['OC'] with
# dict.update() below, so an entry here replaces any entry the data dictionary
# produced for the same variable.
_data_dict_map_main_OC = {
    '_study': 'OC',
    # Pre-intervention cardiovascular variables.
    'pre_CardioDate': 'OC7CVDT',
    'pre_CardioTime': 'OC7CVTM',
    'pre_CardioSystolicBloodPressure_mmHg': 'OC7CBPS',
    'pre_CardioDiastolicBloodPressure_mmHg': 'OC7CBPD',
    'pre_CardioHeartRate_BPM': 'OC7CVHR',
    'pre_CardioVolumeExpand': 'OC7CVVE',
    'pre_CardioInotropicAgent': 'OC7CVIA',
    'pre_CardioBloodTransfusion': 'OC7CVBT',
    'pre_CardioPlatelets': 'OC7CPLT',

    # Pre-intervention other-medication variables.
    'pre_Anticonvulsants1': 'OC8OMAC1',
    'pre_Anticonvulsants2': 'OC8OMAC2',
    'pre_Anticonvulsants3': 'OC8OMAC3',
    'pre_AnalgesicsSedatives1': 'OC8OMAG1',
    'pre_AnalgesicsSedatives2': 'OC8OMAG2',
    'pre_AnalgesicsSedatives3': 'OC8OMAG3',
    'pre_Antipyretics1': 'OC8OMAP1',
    'pre_Antipyretics2': 'OC8OMAP2',
    'pre_Antipyretics3': 'OC8OMAP3',
    'pre_Paralytics1': 'OC8OMNB1',
    'pre_Paralytics2': 'OC8OMNB2',
    'pre_Paralytics3': 'OC8OMNB3',
    'pre_OtherMedFluidIntake_ccPerKg': 'OC8OMFI',
    'pre_OtherMedUrineOutput_ccPerKg': 'OC8OMUO',

    # Pre-intervention imaging (head sonogram / head CT / brain MRI).
    'pre_HeadSonogram': 'OC12HSON',
    'pre_HeadSonogramDate': 'OC12HSDA',
    'pre_HeadSonogramTime': 'OC12HSTM',
    'pre_HeadSonogramResult1': 'OC12HSRA',
    'pre_HeadSonogramResult2': 'OC12HSRB',
    'pre_HeadSonogramResult3': 'OC12HSRC',
    'pre_HeadSonogramResult4': 'OC12HSRD',
    'pre_HeadSonogramResult5': 'OC12HSRE',
    'pre_HeadSonogramResult6': 'OC12HSRF',
    'pre_HeadSonogramResult7': 'OC12HSRG',
    'pre_HeadSonogramResult8': 'OC12HSRH',
    'pre_HeadSonogramResultText': 'OC12HSRS',
    'pre_HeadCT': 'OC12HCT',
    'pre_HeadCTDate': 'OC12HCDA',
    'pre_HeadCTTime': 'OC12HCTM',
    'pre_HeadCTResult1': 'OC12HCRA',
    'pre_HeadCTResult2': 'OC12HCRB',
    'pre_HeadCTResult3': 'OC12HCRC',
    'pre_HeadCTResult4': 'OC12HCRD',
    'pre_HeadCTResult5': 'OC12HCRE',
    'pre_HeadCTResult6': 'OC12HCRF',
    'pre_HeadCTResult7': 'OC12HCRG',
    'pre_HeadCTResult8': 'OC12HCRH',
    'pre_HeadCTResultText': 'OC12HCRS',
    'pre_BrainMRI': 'OC12MRI',
    'pre_BrainMRIDate': 'OC12BMDA',
    'pre_BrainMRITime': 'OC12BMTM',
    'pre_BrainMRIResult1': 'OC12BMRA',
    'pre_BrainMRIResult2': 'OC12BMRB',
    'pre_BrainMRIResult3': 'OC12BMRC',
    'pre_BrainMRIResult4': 'OC12BMRD',
    'pre_BrainMRIResult5': 'OC12BMRE',
    'pre_BrainMRIResult6': 'OC12BMRF',
    'pre_BrainMRIResult7': 'OC12BMRG',
    'pre_BrainMRIResult8': 'OC12BMRH',
    'pre_BrainMRIResultText': 'OC12BMRS',

    # Time-slot / hematology variables (hematologyTimeSlot and otherMedTimeSlot
    # share one source name).
    'bloodGasTimeSlot': 'OC7INTV',
    'hematologyTimeSlot': 'OC8INTV',
    'hematologyDate': 'OC8RSRDT',
    'hematologyTime': 'OC8RSRTM',
    'otherMedTimeSlot': 'OC8INTV',

    # Post-intervention imaging: same source names as the pre_* imaging entries.
    'post_HeadSonogram': 'OC12HSON',
    'post_HeadSonogramDate': 'OC12HSDA',
    'post_HeadSonogramTime': 'OC12HSTM',
    'post_HeadSonogramResult1': 'OC12HSRA',
    'post_HeadSonogramResult2': 'OC12HSRB',
    'post_HeadSonogramResult3': 'OC12HSRC',
    'post_HeadSonogramResult4': 'OC12HSRD',
    'post_HeadSonogramResult5': 'OC12HSRE',
    'post_HeadSonogramResult6': 'OC12HSRF',
    'post_HeadSonogramResult7': 'OC12HSRG',
    'post_HeadSonogramResult8': 'OC12HSRH',
    'post_HeadSonogramResultText': 'OC12HSRS',
    'post_HeadCT': 'OC12HCT',
    'post_HeadCTDate': 'OC12HCDA',
    'post_HeadCTTime': 'OC12HCTM',
    'post_HeadCTResult1': 'OC12HCRA',
    'post_HeadCTResult2': 'OC12HCRB',
    'post_HeadCTResult3': 'OC12HCRC',
    'post_HeadCTResult4': 'OC12HCRD',
    'post_HeadCTResult5': 'OC12HCRE',
    'post_HeadCTResult6': 'OC12HCRF',
    'post_HeadCTResult7': 'OC12HCRG',
    'post_HeadCTResult8': 'OC12HCRH',
    'post_HeadCTResultText': 'OC12HCRS',
    'post_BrainMRI': 'OC12MRI',
    'post_BrainMRIDate': 'OC12BMDA',
    'post_BrainMRITime': 'OC12BMTM',
    'post_BrainMRIResult1': 'OC12BMRA',
    'post_BrainMRIResult2': 'OC12BMRB',
    'post_BrainMRIResult3': 'OC12BMRC',
    'post_BrainMRIResult4': 'OC12BMRD',
    'post_BrainMRIResult5': 'OC12BMRE',
    'post_BrainMRIResult6': 'OC12BMRF',
    'post_BrainMRIResult7': 'OC12BMRG',
    'post_BrainMRIResult8': 'OC12BMRH',
    'post_BrainMRIResultText': 'OC12BMRS',

    # All discontinue* variables point at the same single source variable.
    'discontinueParentsWithdraw': 'OC6NCEPR',
    'discontinuePhysicianWithdraw': 'OC6NCEPR',
    'discontinueAdverseEvent': 'OC6NCEPR',
    'discontinueECMO': 'OC6NCEPR',
    'discontinueDNR': 'OC6NCEPR',
    'discontinueWdrawSupport': 'OC6NCEPR',
    'discontinueDeath': 'OC6NCEPR',
    'discontinueOther': 'OC6NCEPR',

    # Status / transfer variables.
    'status': 'OC13STAT',
    'transferDate': 'OC13DDAT',
    'transferWeight_g': 'OC13WGHT',
    'transferLength_cm': 'OC13LGTH',
    'transferHeadCircumference_cm': 'OC13CIRC',
}

# Overlay the hand-written entries on the main-sheet maps in place; on a key
# clash the hand-written value replaces the one derived from the data dictionary.
_DATA_DICT_MAP[COMBINE_harmonizer.SHEET_MAIN]['LH'].update(_data_dict_map_main_LH)
_DATA_DICT_MAP[COMBINE_harmonizer.SHEET_MAIN]['OC'].update(_data_dict_map_main_OC)

### 01-1-2. type map

In [8]:
# Distinct data-dictionary types, skipping missing values, 'list[...]' types and
# types whose name starts with '_'.
the_types = [
    each_type
    for each_type in df_data_dict['type'].unique()
    if not (pd.isnull(each_type) or each_type.startswith('list[') or each_type.startswith('_'))
]
the_types

['center',
 'text',
 'date',
 'int',
 'bool',
 'consentStatus',
 'time',
 'float',
 'treatmentAssign',
 'blanketType',
 'race',
 'race2',
 'ethnicity',
 'maritalStatus',
 'education',
 'education2',
 'insurance',
 'deliveryMode',
 'antibiotics',
 'encephalopathyLevel',
 'infantAge',
 'infantSex',
 'spontaneousRespirationTime',
 'cordBloodGasSrc',
 'bloodGasSrc',
 'initBloodGasSrc',
 'targetTreatmentTemperature',
 'respiratorySupportType',
 'positiveCultureSrc',
 'positiveCultureOrganism',
 'anticonvulsants',
 'analgesics',
 'antipyretics',
 'paralytics',
 'imaging',
 'noNeuroExamReason',
 'signOfHIELvlOfCons',
 'signOfHIESpontaneousActivity',
 'signOfHIEPosture',
 'signOfHIETone',
 'signOfHIESuck',
 'signOfHIEMoro',
 'signOfHIEPupils',
 'signOfHIEHeartRate',
 'signOfHIERespiratory',
 'elevatedTempDevice',
 'elevatedTempDeviceMode',
 'elevatedTempNoBathReason',
 'bradycardiaEKGResult',
 'bradycardiaDuration',
 'bradycardiaHeartRateMin',
 'SAEAttributable',
 'SAEAction',
 'SAEOutcome',
 

In [9]:
# Open the data-dictionary workbook once; _type_map reads one sheet per custom
# type from this handle.
excel = pd.ExcelFile(data_dict_filename)

def _type_map(the_type):
    if the_type == 'center':
        return 'text'
    elif the_type == 'text':
        return 'text'
    elif the_type == 'date':
        return 'date'
    elif the_type == 'int':
        return 'number'
    elif the_type == 'bool':
        return 'boolean'
    elif the_type == 'time':
        return 'time'
    elif the_type == 'float':
        return 'number'
    else:
        df_by_type = pd.read_excel(excel, the_type)
        if COMBINE_harmonizer.DEFAULT_ORDINAL_COLUMN in df_by_type:
            return 'ordinal'
        else:
            return 'nomial'

# data-dictionary type -> display type, resolved once for every known type.
# The workbook handle is only needed while this map is built, so release it
# afterwards (also when a lookup fails) instead of leaving the file open.
try:
    _TYPE_MAP = {each: _type_map(each) for each in the_types}
finally:
    excel.close()

In [10]:
df_data_dict_main

Unnamed: 0,Category,Subcategory,Standardized_VariableNames_Dictionary,type,Variable_Description,#studies w/ this var,redcap,comment,lower_var,var_eq_redcap,connect_redcap,LH,OC,sheet
0,Pre-intervention,Identity,center,center,Center:,2,center,,center,True,center,CENTER,CENTER,main
1,Pre-intervention,Identity,subjectID,text,Hypothermia ID (study id),2,subject_id,,subjectid,True,subjectid,HTHRM_ID,HTHRM_ID,main
2,Pre-intervention,Screening,siteID,text,Site number,2,site_id,,siteid,True,siteid,SITENM,SITENM,main
3,Pre-intervention,Screening,birthDate,date,birth date,2,birth_date,,birthdate,True,birthdate,BIRTHDT,BIRTHDT,main
4,Pre-intervention,Screening,birthNumber,int,birth number,2,birth_number,,birthnumber,True,birthnumber,BIRTHNM,BIRTHNM,main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103,NICU Discharge,Limitation of Care,limitCareNoFurtherChestCompression,bool,No further chest compressions,2,limit_care_no_further_chest_compression,,limitcarenofurtherchestcompression,True,limitcarenofurtherchestcompression,LH12LCC,OC13NFCC,main
1104,NICU Discharge,Limitation of Care,limitCareNoFurtherEmergencyMedication,bool,"No further use of emergency medications (epi, ...",2,limit_care_no_further_emergency_medication,,limitcarenofurtheremergencymedication,True,limitcarenofurtheremergencymedication,LH12LEM,OC13NFEM,main
1105,NICU Discharge,Limitation of Care,limitCareDNR,bool,Was there a DNR Order?,2,limit_care_dnr,,limitcarednr,True,limitcarednr,LH12DNR,OC13DNR,main
1106,NICU Discharge,Limitation of Care,limitCareDNRDate,date,Date of DNR order,2,limit_care_dnr_date,,limitcarednrdate,True,limitcarednrdate,LH12DDNR,OC13DNRD,main


## 02-01. check columns

In [11]:
def _basic_info(filename_info):
    filename = filename_info['name']
    full_filename = f'{input_dir}/{filename}'
    df = pd.read_csv(full_filename)

    valid_columns = list(filter(lambda x: x not in COMBINE_harmonizer.RESERVED_COLUMNS and x.endswith('.orig') == False, df.columns))
    exclude_columns = filename_info.get('exclude_columns', None)
    if exclude_columns is None:
        exclude_columns = []
    valid_columns = list(filter(lambda x: x not in exclude_columns, valid_columns))

    return {
        'filename': filename,
        'len-columns': len(valid_columns),
        'columns': valid_columns,
    }

# Every file description takes part unless it explicitly sets is_merge to a
# falsy value.
valid_filename_infos = [info for info in COMBINE_harmonizer.FILENAME_INFOS if info.get('is_merge', True)]
df = pd.DataFrame([_basic_info(info) for info in tqdm(valid_filename_infos)])

# One row per file: name, number of reportable columns, and the column list.
out_filename = f'{out_dir}/20-01-basic-info-len-columns.csv'
df.to_csv(out_filename, index=False)

  0%|          | 0/71 [00:00<?, ?it/s]

  6%|▌         | 4/71 [00:00<00:01, 38.89it/s]

 13%|█▎        | 9/71 [00:00<00:01, 42.91it/s]

 20%|█▉        | 14/71 [00:00<00:01, 39.54it/s]

 25%|██▌       | 18/71 [00:00<00:01, 35.18it/s]

 39%|███▉      | 28/71 [00:00<00:00, 55.01it/s]

 49%|████▉     | 35/71 [00:00<00:00, 59.30it/s]

 62%|██████▏   | 44/71 [00:00<00:00, 67.17it/s]

 73%|███████▎  | 52/71 [00:00<00:00, 70.11it/s]

 85%|████████▍ | 60/71 [00:01<00:00, 72.22it/s]

 96%|█████████▌| 68/71 [00:01<00:00, 72.52it/s]

100%|██████████| 71/71 [00:01<00:00, 61.52it/s]




In [12]:
df

Unnamed: 0,filename,len-columns,columns
0,01-02-screening.csv,36,"[birthNumber, screenComment, coreTempLess32p5C..."
1,01-03-maternal-demographics.csv,6,"[motherAge_year, motherRace, motherEthnicity, ..."
2,01-04-pregnancy-history.csv,9,"[gravida, parity, multipleBirth, numFetus, pre..."
3,01-05-labor-delivery.csv,27,"[maternalAdmissionDate, maternalAdmissionTime,..."
4,01-05_1-pse.csv,0,[]
...,...,...,...
66,20-07-readmission.csv,6,"[readmissionNumber, readmissionTimePeriod, rea..."
67,20-08-lost.csv,57,"[lostFollowUpInformationAvailableIndirectSrc, ..."
68,20-09-secondary.csv,9,"[blindness, moderateSevereCerebralPalsy, cereb..."
69,20-10-outcome.csv,12,"[flagAdjudicatedOutcome, normalPrimaryOutcome,..."


## 02-02. variable info

In [13]:
def _variable_infos(filename_info):
    filename = filename_info['name']
    full_filename = f'{input_dir}/{filename}'
    df = pd.read_csv(full_filename, dtype='O')

    valid_columns = list(filter(lambda x: x not in COMBINE_harmonizer.RESERVED_COLUMNS and x.endswith('.orig') == False, df.columns))

    exclude_columns = filename_info.get('exclude_columns', None)
    if exclude_columns is None:
        exclude_columns = []
    valid_columns = list(filter(lambda x: x not in exclude_columns, valid_columns))

    data_dict_type = filename_info['data_dict']

    return [_variable_info_by_column(each, df[each], filename, data_dict_type) for each in valid_columns]

def _variable_info_by_column(column: str, the_series: pd.Series, filename: str, data_dict_type: str)-> dict:
    the_len = len(the_series)
    len_empty = the_series.isnull().sum()
    var_name_map = _DATA_DICT_MAP[data_dict_type]
    LH_name = var_name_map['LH'][column]
    OC_name = var_name_map['OC'][column]
    the_type = var_name_map['the_type'][column]

    display_type = _TYPE_MAP[the_type]

    return {'filename': filename, 'variable': column, 'with-value': the_len - len_empty, 'empty': len_empty, 'total': the_len, 'LH': LH_name, 'OC': OC_name, 'display_type': display_type}

# Same file selection as for the basic info: skip entries with a falsy is_merge.
valid_filename_infos = [info for info in COMBINE_harmonizer.FILENAME_INFOS if info.get('is_merge', True)]
# Flatten the per-file lists into one frame with a row per (file, variable).
df = pd.DataFrame([row for info in tqdm(valid_filename_infos) for row in _variable_infos(info)])


  0%|          | 0/71 [00:00<?, ?it/s]

 11%|█▏        | 8/71 [00:00<00:00, 75.80it/s]

 23%|██▎       | 16/71 [00:00<00:00, 64.17it/s]

 34%|███▍      | 24/71 [00:00<00:00, 68.35it/s]

 49%|████▉     | 35/71 [00:00<00:00, 77.15it/s]

 61%|██████    | 43/71 [00:00<00:00, 75.55it/s]

 75%|███████▍  | 53/71 [00:00<00:00, 81.37it/s]

 87%|████████▋ | 62/71 [00:00<00:00, 75.06it/s]

 99%|█████████▊| 70/71 [00:00<00:00, 71.80it/s]

100%|██████████| 71/71 [00:00<00:00, 73.72it/s]




In [14]:
# File-level metadata, renamed so it lines up with the per-variable frame.
df_filename_info = (
    pd.DataFrame(COMBINE_harmonizer.FILENAME_INFOS)
    .rename(columns={'name': 'filename', 'data_dict': 'sheet'})
    # Processing-only keys. Both are optional in FILENAME_INFOS, so the column
    # may not exist at all; errors='ignore' keeps that from raising.
    .drop(columns=['is_merge', 'exclude_columns'], errors='ignore')
)
# Left join: keep every variable row even if its file has no metadata entry.
df_merge = df.merge(df_filename_info, on=['filename'], how='left')

df_merge.columns, df_filename_info.columns

(Index(['filename', 'variable', 'with-value', 'empty', 'total', 'LH', 'OC',
        'display_type', 'sheet', 'subcategory', 'category', 'summary'],
       dtype='object'),
 Index(['filename', 'sheet', 'subcategory', 'category', 'summary'], dtype='object'))

In [15]:
# Attach the data-dictionary type to every variable row.
data_dict_columns = ['sheet', COMBINE_harmonizer.DATA_DICT_VAR_NAME, COMBINE_harmonizer.DATA_DICT_VAR_TYPE]
df_data_dict_columns = (
    df_data_dict
    .loc[:, data_dict_columns]
    .rename(columns={COMBINE_harmonizer.DATA_DICT_VAR_NAME: 'variable'})
)

# XXX inner join on purpose: only variables present in the data dictionary are kept.
df_merge2 = df_merge.merge(df_data_dict_columns, how='inner', on=['sheet', 'variable'])

out_filename = f'{out_dir}/20-02-basic-info-variables.csv'
df_merge2.to_csv(out_filename, index=False)


In [16]:
df

Unnamed: 0,filename,variable,with-value,empty,total,LH,OC,display_type
0,01-02-screening.csv,birthNumber,532,0,532,BIRTHNM,BIRTHNM,number
1,01-02-screening.csv,screenComment,48,484,532,,COMMENT,text
2,01-02-screening.csv,coreTempLess32p5CGreaterEq2Hr_e,95,437,532,,OC2TMP2,boolean
3,01-02-screening.csv,coreTempLess33p5CGreater1Hr_e,269,263,532,,OC2TEMP,boolean
4,01-02-screening.csv,coreTempLess34CGreater1Hr_e,168,364,532,LH2TEMP,,boolean
...,...,...,...,...,...,...,...,...
1171,20-10-outcome.csv,disabilityLevelDeath4Category,498,34,532,disab_die4,disab_die4,ordinal
1172,20-10-outcome.csv,moderateSevereDisabilityOrDeath,504,28,532,disab_die,disab_die,boolean
1173,20-10-outcome.csv,moderateSevereDisabilitySurvivor,430,102,532,disab_ms,disab_ms,boolean
1174,20-10-outcome.csv,outcomeGroup,347,185,532,,out_grp,ordinal


## 03. Check Death and Moderate Severe Disability Distribution

In [17]:
def _load_df(filename):
    full_filename = f'{input_dir}/{filename}'
    df = pd.read_csv(full_filename)

    return df

### 03-01. status

In [18]:
# Columns kept for the outcome checks below. Names in the merged file carry a
# '<file prefix>:' in front of the variable name, except the identity columns.
columns = [
    # identity
    '_study',
    'center',
    'subjectID',
    'uniqueID',
    '20-01:followupID',

    '04-01:status',

    '04-01:dischargeStatus',

    # inputs to the is_severe / is_moderate / is_mild rules
    '20-09:blindness',
    '20-09:grossMotorFunctionLevelSeverity',
    '20-09:hearingImpairedWithAid',
    '20-09:hearingImpairedLevel',
    '20-09:afterDischargeSeizure',

    # outcome columns the derived flags are compared against
    '20-10:normalPrimaryOutcome',
    '20-10:deathBeforeDischarge',
    '20-10:deathBeforeFollowup',
    '20-10:moderateSevereDisabilityOrDeath',
    '20-10:moderateSevereDisabilitySurvivor',
    '20-10:disabilityLevelDeath4Category',
    '20-10:BayleyIIICognitive',

    '20-10_1:disabilityLevelDeath',
]
# Load the full merged file; it is narrowed to `columns` in a later cell.
df = _load_df('zz-merged-flatten.csv')


  df = pd.read_csv(full_filename)


In [19]:
df

Unnamed: 0,_study,center,subjectID,uniqueID,01-02:siteID,01-02:birthNumber,01-02:screenComment,01-02:coreTempLess32p5CGreaterEq2Hr_e,01-02:coreTempLess33p5CGreater1Hr_e,01-02:coreTempLess34CGreater1Hr_e,...,20-10:BayleyIIICognitive,20-10:deathBeforeFollowup,20-10:deathBeforeDischarge,20-10:disabilityLevelSurvivor,20-10:disabilityLevelDeath4Category,20-10:moderateSevereDisabilityOrDeath,20-10:moderateSevereDisabilitySurvivor,20-10:outcomeGroup,20-10_1:followupCenter,20-10_1:disabilityLevelDeath
0,LH,3,LH087,03:LH087,3,1,,,,False,...,,True,True,,severe,True,,,,death
1,LH,3,LH088,03:LH088,3,1,,,,False,...,severe,False,False,severe,severe,True,True,,3.0,severe
2,LH,3,LH092,03:LH092,3,1,,,,False,...,normal,False,False,normal,normal,False,False,,3.0,normal
3,LH,3,LH094,03:LH094,3,1,,,,False,...,moderate,False,False,mild,mild,False,False,,3.0,mild
4,LH,3,LH098,03:LH098,3,1,,,,False,...,normal,False,False,normal,normal,False,False,,3.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,OC,31,OC3411,31:OC3411,A,1,,,False,,...,,False,False,,,,,,31.0,
528,OC,31,OC3441,31:OC3441,A,1,,,False,,...,normal,False,False,normal,normal,False,False,primary,31.0,normal
529,OC,31,OC3471,31:OC3471,A,1,,False,,,...,,True,True,,severe,True,,death,,death
530,OC,31,OC3551,31:OC3551,A,1,,False,,,...,normal,False,False,normal,normal,False,False,primary,31.0,normal


In [20]:
# Sanity check: list any merged columns that carry the '30-01' file prefix.
valid_columns = [column for column in df.columns if column.startswith('30-01')]
valid_columns

[]

In [21]:
# Narrow the merged frame to the columns listed in `columns`; from here on `df`
# no longer holds the other merged columns.
df = df[columns]

In [22]:
df

Unnamed: 0,_study,center,subjectID,uniqueID,20-01:followupID,04-01:status,04-01:dischargeStatus,20-09:blindness,20-09:grossMotorFunctionLevelSeverity,20-09:hearingImpairedWithAid,20-09:hearingImpairedLevel,20-09:afterDischargeSeizure,20-10:normalPrimaryOutcome,20-10:deathBeforeDischarge,20-10:deathBeforeFollowup,20-10:moderateSevereDisabilityOrDeath,20-10:moderateSevereDisabilitySurvivor,20-10:disabilityLevelDeath4Category,20-10:BayleyIIICognitive,20-10_1:disabilityLevelDeath
0,LH,3,LH087,03:LH087,,died,died,,,,,,,True,True,True,,severe,,death
1,LH,3,LH088,03:LH088,LHF01,discharge home,discharge home,False,severe,False,normal,False,False,False,False,True,True,severe,severe,severe
2,LH,3,LH092,03:LH092,LHF02,discharge home,discharge home,False,normal,False,normal,False,True,False,False,False,False,normal,normal,normal
3,LH,3,LH094,03:LH094,LHF03,discharge home,discharge home,False,normal,False,normal,False,False,False,False,False,False,mild,moderate,mild
4,LH,3,LH098,03:LH098,LHF04,discharge home,discharge home,False,normal,False,normal,True,True,False,False,False,False,normal,normal,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,OC,31,OC3411,31:OC3411,OCF3411,discharge home,,,,,,,,False,False,,,,,
528,OC,31,OC3441,31:OC3441,OCF3441,discharge home,,False,normal,False,normal,False,True,False,False,False,False,normal,normal,normal
529,OC,31,OC3471,31:OC3471,,transfer,,,,,,,,True,True,True,,severe,,death
530,OC,31,OC3551,31:OC3551,OCF3551,discharge home,,False,normal,False,normal,False,True,False,False,False,False,normal,normal,normal


### 03-02. severe disability

In [23]:
def _is_severe(x):
    if _is_severe_bayleyIII_cognitive(x):
        return True

    if _is_severe_GMFCS(x):
        return True

    if _is_severe_blindness(x):
        return True

    if _is_severe_hearing_loss(x):
        return True

    if _is_severe_death(x):
        return True

    return False


def _is_severe_bayleyIII_cognitive(x):
    if pd.isnull(x['20-10:BayleyIIICognitive']):
        return False

    return x['20-10:BayleyIIICognitive'] == 'severe'


def _is_severe_GMFCS(x):
    if pd.isnull(x['20-09:grossMotorFunctionLevelSeverity']):
        return False

    return x['20-09:grossMotorFunctionLevelSeverity'] == 'severe'


def _is_severe_blindness(x):
    if pd.isnull(x['20-09:blindness']):
        return False

    return x['20-09:blindness']


def _is_severe_hearing_loss(x):
    if pd.isnull(x['20-09:hearingImpairedLevel']):
        return False

    return x['20-09:hearingImpairedLevel'] == 'severe'


def _is_severe_death(x):
    if pd.isnull(x['20-10:deathBeforeFollowup']):
        return False

    return x['20-10:deathBeforeFollowup']

# Row-wise evaluation of the severe rule; the result is stored next to the
# published outcome columns for comparison.
df['is_severe'] = df.apply(_is_severe, axis=1)

In [24]:
# A row is inconsistent when the derived flag and the published 4-level
# category disagree about 'severe' (a missing category counts as not severe).
is_invalid_severe_true = df['is_severe'].eq(True) & df['20-10:disabilityLevelDeath4Category'].ne('severe')
is_invalid_severe_false = df['is_severe'].eq(False) & df['20-10:disabilityLevelDeath4Category'].eq('severe')
is_invalid = is_invalid_severe_true | is_invalid_severe_false
columns = [
    'uniqueID',
    'is_severe',
    '20-10:disabilityLevelDeath4Category',
    '20-10:deathBeforeFollowup',
    '20-10:BayleyIIICognitive',
    '20-09:grossMotorFunctionLevelSeverity',
    '20-09:blindness',
    '20-09:hearingImpairedLevel',
]
df.loc[is_invalid].reset_index().loc[:, columns]

Unnamed: 0,uniqueID,is_severe,20-10:disabilityLevelDeath4Category,20-10:deathBeforeFollowup,20-10:BayleyIIICognitive,20-09:grossMotorFunctionLevelSeverity,20-09:blindness,20-09:hearingImpairedLevel
0,18:LH095,True,mild,False,normal,normal,False,severe
1,05:OC0871,False,severe,False,,,False,


### 03-03. moderate disability

In [25]:
def _is_moderate(x):
    if _is_moderate_bayleyIII_cognitive(x) and _is_moderate_GMFCS(x):
        return True

    '''
    if _is_moderate_seizure(x):
        return True
    '''

    if _is_moderate_hearing_loss(x):
        return True

    return False


def _is_moderate_bayleyIII_cognitive(x):
    if pd.isnull(x['20-10:BayleyIIICognitive']):
        return False

    return x['20-10:BayleyIIICognitive'] == 'moderate'


def _is_moderate_GMFCS(x):
    if pd.isnull(x['20-09:grossMotorFunctionLevelSeverity']):
        return False

    return x['20-09:grossMotorFunctionLevelSeverity'] == 'moderate'


def _is_moderate_seizure(x):
    if pd.isnull(x['20-09:afterDischargeSeizure']):
        return False

    return x['20-09:afterDischargeSeizure']


def _is_moderate_hearing_loss(x):
    if pd.isnull(x['20-09:hearingImpairedLevel']):
        return False

    return x['20-09:hearingImpairedLevel'] == 'moderate'


# Row-wise evaluation of the moderate rule.
df['is_moderate'] = df.apply(_is_moderate, axis=1)

In [26]:
# A row is inconsistent when the derived flag and the published 4-level
# category disagree about 'moderate' (a missing category counts as not moderate).
is_invalid_moderate_true = df['is_moderate'].eq(True) & df['20-10:disabilityLevelDeath4Category'].ne('moderate')
is_invalid_moderate_false = df['is_moderate'].eq(False) & df['20-10:disabilityLevelDeath4Category'].eq('moderate')
is_invalid = is_invalid_moderate_true | is_invalid_moderate_false
columns = [
    'uniqueID',
    'is_moderate',
    '20-10:disabilityLevelDeath4Category',
    '20-10:BayleyIIICognitive',
    '20-09:grossMotorFunctionLevelSeverity',
    '20-09:afterDischargeSeizure',
    '20-09:hearingImpairedLevel',
]
df.loc[is_invalid].reset_index().loc[:, columns]

Unnamed: 0,uniqueID,is_moderate,20-10:disabilityLevelDeath4Category,20-10:BayleyIIICognitive,20-09:grossMotorFunctionLevelSeverity,20-09:afterDischargeSeizure,20-09:hearingImpairedLevel
0,18:LH105,False,moderate,moderate,normal,True,normal
1,05:OC0301,True,,,normal,False,moderate
2,09:OC0651,True,mild,normal,normal,False,moderate
3,12:OC0351,True,mild,normal,normal,False,moderate
4,14:OC0211,True,mild,normal,normal,False,moderate
5,15:OC0181,True,mild,normal,normal,False,moderate
6,15:OC0251,True,mild,normal,normal,False,moderate


### 03-04. mild disability

In [27]:
def _is_mild(x):
    if _is_moderate_severe(x):
        return False

    if _is_mild_bayleyIII_cognitive(x):
        return True

    if _is_normal_bayleyIII_cognitive(x):
        if _is_mild_GMFCS(x):
            return True

        if _is_moderate_seizure(x):
            return True

        if _is_moderate_hearing_loss(x):
            return True

    return False


def _is_moderate_severe(x):
    return x['is_moderate'] or x['is_severe']


def _is_mild_bayleyIII_cognitive(x):
    if pd.isnull(x['20-10:BayleyIIICognitive']):
        return False

    return x['20-10:BayleyIIICognitive'] == 'mild'


def _is_normal_bayleyIII_cognitive(x):
    if pd.isnull(x['20-10:BayleyIIICognitive']):
        return False

    return x['20-10:BayleyIIICognitive'] == 'normal'


def _is_mild_GMFCS(x):
    if pd.isnull(x['20-09:grossMotorFunctionLevelSeverity']):
        return False

    return x['20-09:grossMotorFunctionLevelSeverity'] == 'mild'


def _is_moderate_seizure(x):
    if pd.isnull(x['20-09:afterDischargeSeizure']):
        return False

    return x['20-09:afterDischargeSeizure']


def _is_moderate_hearing_loss(x):
    if pd.isnull(x['20-09:hearingImpairedLevel']):
        return False

    return x['20-09:hearingImpairedLevel'] in ['mild', 'moderate']


# Derive the mild-disability flag row by row.
df['is_mild'] = df.apply(_is_mild, axis=1)

In [28]:
# Rows where the derived `is_mild` flag disagrees with the recorded 4-category level:
# flagged mild but not recorded as 'mild', or recorded as 'mild' but not flagged.
is_invalid_mild_true = df['is_mild'].eq(True) & df['20-10:disabilityLevelDeath4Category'].ne('mild')
is_invalid_mild_false = df['is_mild'].eq(False) & df['20-10:disabilityLevelDeath4Category'].eq('mild')
is_invalid = is_invalid_mild_true | is_invalid_mild_false
columns = [
    'uniqueID',
    'is_mild',
    '20-10:disabilityLevelDeath4Category',
    '20-10:BayleyIIICognitive',
    '20-09:grossMotorFunctionLevelSeverity',
    '20-09:afterDischargeSeizure',
    '20-09:hearingImpairedLevel',
]
df.loc[is_invalid, columns].reset_index(drop=True)

Unnamed: 0,uniqueID,is_mild,20-10:disabilityLevelDeath4Category,20-10:BayleyIIICognitive,20-09:grossMotorFunctionLevelSeverity,20-09:afterDischargeSeizure,20-09:hearingImpairedLevel
0,03:LH094,False,mild,moderate,normal,False,normal
1,03:LH098,True,normal,normal,normal,True,normal
2,04:LH063,False,mild,moderate,normal,False,normal
3,04:LH068,False,mild,moderate,normal,False,normal
4,04:LH070,True,normal,normal,normal,True,normal
...,...,...,...,...,...,...,...
76,27:OC8561,False,mild,moderate,normal,False,normal
77,28:OC0031,False,mild,normal,normal,False,normal
78,28:OC0101,False,mild,normal,normal,False,normal
79,28:OC3051,False,mild,moderate,normal,False,normal


In [29]:
# Flag rows whose discharge status disagrees with the recorded status.
# In pandas `NaN != NaN` evaluates to True, so rows where BOTH columns are missing
# would otherwise be reported as mismatches; exclude them explicitly.
# Rows where only one of the two columns is missing are still flagged.
is_both_status_null = df['04-01:dischargeStatus'].isnull() & df['04-01:status'].isnull()
is_invalid = (df['04-01:dischargeStatus'] != df['04-01:status']) & ~is_both_status_null
df[is_invalid]

Unnamed: 0,_study,center,subjectID,uniqueID,20-01:followupID,04-01:status,04-01:dischargeStatus,20-09:blindness,20-09:grossMotorFunctionLevelSeverity,20-09:hearingImpairedWithAid,...,20-10:deathBeforeDischarge,20-10:deathBeforeFollowup,20-10:moderateSevereDisabilityOrDeath,20-10:moderateSevereDisabilitySurvivor,20-10:disabilityLevelDeath4Category,20-10:BayleyIIICognitive,20-10_1:disabilityLevelDeath,is_severe,is_moderate,is_mild
84,LH,15,LH002,15:LH002,,died,remain in hospital at 6 months,,,,...,True,True,True,,severe,,death,True,False,False
168,OC,3,OC3261,03:OC3261,OCF0003,discharge home,,False,severe,False,...,False,False,True,True,severe,severe,severe,True,False,False
169,OC,3,OC3281,03:OC3281,OCF0002,discharge home,,False,normal,False,...,False,False,False,False,normal,normal,normal,False,False,False
170,OC,3,OC3291,03:OC3291,OCF0001,discharge home,,False,normal,False,...,False,False,False,False,mild,moderate,mild,False,False,False
171,OC,3,OC3331,03:OC3331,OCF0004,discharge home,,False,normal,False,...,False,False,True,True,severe,severe,severe,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,OC,31,OC3411,31:OC3411,OCF3411,discharge home,,,,,...,False,False,,,,,,False,False,False
528,OC,31,OC3441,31:OC3441,OCF3441,discharge home,,False,normal,False,...,False,False,False,False,normal,normal,normal,False,False,False
529,OC,31,OC3471,31:OC3471,,transfer,,,,,...,True,True,True,,severe,,death,True,False,False
530,OC,31,OC3551,31:OC3551,OCF3551,discharge home,,False,normal,False,...,False,False,False,False,normal,normal,normal,False,False,False


### 03-05. death or disability distribution

In [30]:
# Start from the 4-category disability level and overwrite it with 'death'
# for every row marked as having died before follow-up.
df['deathDisabilityLevel'] = df['20-10:disabilityLevelDeath4Category'].copy()
is_death = df['20-10:deathBeforeFollowup'].eq(True)
df.loc[is_death, 'deathDisabilityLevel'] = 'death'

# Count rows per (study, level); rows with a missing level are not counted.
df_groupby = (
    df.groupby(['_study', 'deathDisabilityLevel'])
    .agg(_count=('deathDisabilityLevel', 'count'))
    .reset_index()
)

out_filename = f'{out_dir}/20-03-basic-info-death-disability-distribution.csv'
df_groupby.to_csv(out_filename, index=False)

df_groupby

Unnamed: 0,_study,deathDisabilityLevel,_count
0,LH,death,18
1,LH,mild,28
2,LH,moderate,2
3,LH,normal,88
4,LH,severe,21
5,OC,death,56
6,OC,mild,71
7,OC,moderate,4
8,OC,normal,160
9,OC,severe,50


In [31]:
# Show the columns currently on `df`, including the derived flags added above.
df.columns

Index(['_study', 'center', 'subjectID', 'uniqueID', '20-01:followupID',
       '04-01:status', '04-01:dischargeStatus', '20-09:blindness',
       '20-09:grossMotorFunctionLevelSeverity', '20-09:hearingImpairedWithAid',
       '20-09:hearingImpairedLevel', '20-09:afterDischargeSeizure',
       '20-10:normalPrimaryOutcome', '20-10:deathBeforeDischarge',
       '20-10:deathBeforeFollowup', '20-10:moderateSevereDisabilityOrDeath',
       '20-10:moderateSevereDisabilitySurvivor',
       '20-10:disabilityLevelDeath4Category', '20-10:BayleyIIICognitive',
       '20-10_1:disabilityLevelDeath', 'is_severe', 'is_moderate', 'is_mild',
       'deathDisabilityLevel'],
      dtype='object')

In [32]:
# Compare the derived level against the recorded `20-10_1:disabilityLevelDeath`.
# Rows where both values are missing are not mismatches (NaN != NaN is True),
# so they are tracked separately and excluded from `is_invalid`.
is_both_null = df[['deathDisabilityLevel', '20-10_1:disabilityLevelDeath']].isnull().all(axis=1)
is_ne_disability_level_death = df['deathDisabilityLevel'].ne(df['20-10_1:disabilityLevelDeath'])
is_invalid = ~is_both_null & is_ne_disability_level_death

In [33]:
# Columns needed to inspect a derived-vs-recorded level mismatch.
columns = [
    '_study',
    'uniqueID',
    'deathDisabilityLevel',
    '20-10_1:disabilityLevelDeath',
    '20-10:deathBeforeFollowup',
    '20-10:disabilityLevelDeath4Category',
]
df.loc[is_invalid, columns]

Unnamed: 0,_study,uniqueID,deathDisabilityLevel,20-10_1:disabilityLevelDeath,20-10:deathBeforeFollowup,20-10:disabilityLevelDeath4Category


In [34]:
# Rows with neither a derived nor a recorded death/disability level.
df.loc[is_both_null, columns]

Unnamed: 0,_study,uniqueID,deathDisabilityLevel,20-10_1:disabilityLevelDeath,20-10:deathBeforeFollowup,20-10:disabilityLevelDeath4Category
23,LH,04:LH598,,,False,
33,LH,05:LH167,,,False,
42,LH,09:LH110,,,False,
57,LH,12:LH038,,,False,
81,LH,14:LH115,,,,
94,LH,15:LH071,,,,
109,LH,18:LH078,,,False,
116,LH,18:LH139,,,,
162,LH,30:LH009,,,False,
163,LH,30:LH010,,,False,
