In [1]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import re

import os
import matplotlib.pyplot as plt
import scienceplots

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

# Global matplotlib look: the 'nature' style sheet, then frameless legends and
# inward-pointing ticks on both axes.
plt.style.use('nature')
plt.rcParams.update({
    'legend.frameon': False,
    'xtick.direction': 'in',
    'ytick.direction': 'in',
})

## 01. Init

In [2]:
# Repository root relative to this notebook. No trailing slash: every consumer
# builds paths as f'{root_dir}/...', so a trailing slash would yield '..//...'.
root_dir = '..'

In [3]:
# Initialize COMBINE_harmonizer from the YAML config located under root_dir.
# NOTE(review): cfg.config is read in the next cell, so init() presumably
# populates it -- confirm against the package.
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

In [4]:
# All inputs/outputs of this notebook hang off the configured output directory.
base_out_dir = cfg.config['out_dir']

# Data dictionary workbook, looked up relative to the repository root.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"

# Normalized, publishable tables are the input; statistics go to 'stats'.
input_dir = f"{base_out_dir}/out-publish-normalized"
out_dir = f"{base_out_dir}/stats"

# Figures are written to a sub-directory that is created on demand.
out_figure_dir = f'{out_dir}/figure'
os.makedirs(out_figure_dir, exist_ok=True)

In [6]:
# Load the rank mapping from the data dictionary first, then build one numeric
# value map per data-dictionary sheet (main sheet first, follow-up second).
COMBINE_harmonizer.init_rank_mapping(data_dict_filename)

_NUMERIC_VALUE_MAP_MAIN, _NUMERIC_VALUE_MAP_FOLLOWUP = (
    COMBINE_harmonizer.build_numeric_value_map(data_dict_filename, sheet_name)
    for sheet_name in (COMBINE_harmonizer.SHEET_MAIN, COMBINE_harmonizer.SHEET_FOLLOW_UP)
)


build_numeric_value_map: (0/1366) variable: center type: center
[INFO] _build_numeric_value_map: to inv-text: type: center
build_numeric_value_map: (1/1366) variable: subjectID type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (2/1366) variable: siteID type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (3/1366) variable: birthDate type: date
[INFO] _build_numeric_value_map: to inv-text: type: date
build_numeric_value_map: (4/1366) variable: birthNumber type: int
build_numeric_value_map: (5/1366) variable: screenComment type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (6/1366) variable: coreTempLess32p5COverEq2Hr_e type: bool
build_numeric_value_map: (7/1366) variable: coreTempLess33p5COver1Hr_e type: bool
build_numeric_value_map: (8/1366) variable: coreTempLess34COver1Hr_e type: bool
build_numeric_value_map: (9/1366) variable: first6HrCoolByClinicalProtocol_e

In [14]:
# Sheet name -> numeric value map built for that data-dictionary sheet.
_NUMERIC_VALUE_MAP = dict([
    (COMBINE_harmonizer.SHEET_MAIN, _NUMERIC_VALUE_MAP_MAIN),
    (COMBINE_harmonizer.SHEET_FOLLOW_UP, _NUMERIC_VALUE_MAP_FOLLOWUP),
])

# File name -> numeric value map of the data-dictionary sheet that file uses.
_FILENAME_INFO_NUMERIC_VALUE_MAP = {
    file_info['name']: _NUMERIC_VALUE_MAP[file_info['data_dict']]
    for file_info in COMBINE_harmonizer.FILENAME_INFOS
}

# Flattened column prefix -> numeric value map; this is what _get_order reads.
_PREFIX_NUMERIC_VALUE_MAP = {
    COMBINE_harmonizer.flatten_filename_prefix(name): value_map
    for name, value_map in _FILENAME_INFO_NUMERIC_VALUE_MAP.items()
}


## Load flattened data

In [9]:
# Path of the merged, flattened CSV inside input_dir.
filename = f'{input_dir}/zz-merged-flatten.csv'

In [10]:
# Read every column with object dtype (dtype='O'); the plotting helpers below
# cast to float/int explicitly where a numeric type is needed.
df = pd.read_csv(filename, dtype='O')

## 02. Numerical distribution

In [11]:
# Show every flattened column name ('<prefix>:<variable>') available in df.
df.columns.tolist()

['_study',
 'center',
 'subjectID',
 'uniqueID',
 '01-02:siteID',
 '01-02:birthNumber',
 '01-02:screenComment',
 '01-02:coreTempLess32p5COverEq2Hr_e',
 '01-02:coreTempLess33p5COver1Hr_e',
 '01-02:coreTempLess34COver1Hr_e',
 '01-02:first6HrCoolByClinicalProtocol_e',
 '01-02:chromosomalAbnormality_e',
 '01-02:majorCongenitalAnomaly_e',
 '01-02:birthWeightLessEq1800g_e',
 '01-02:infantUnlikelySurvive_e',
 '01-02:first60MinAllBloodGasPHGreater7p15BaseDeficitLess10mEqPerL_e',
 '01-02:postnatalAgeLess6HrOrGreater24Hr_e',
 '01-02:enrolledConflictingTrial_e',
 '01-02:first60MinAnyBloodGasPHLessEq7_i',
 '01-02:first60MinAnyBloodGasBaseDeficitGreaterEq16mEqPerL_i',
 '01-02:historyPerinatalEvent_i',
 '01-02:at10MinApgarLessEq5OrVent_i',
 '01-02:randomEligible',
 '01-02:consentStatus',
 '01-02:noConsentReason',
 '01-02:noInStudyReason',
 '01-02:random',
 '01-02:noRandomReason',
 '01-02:noRandomReasonText',
 '01-02:randomDate',
 '01-02:randomTime',
 '01-02:ageRand_hr',
 '01-02:randomNumber',
 '01-0

In [12]:
# Case-insensitive search for the disability-related columns.
columns = [name for name in df.columns if re.search('disability', name, re.I)]
columns

['20-10:disabilityLevelSurvivor',
 '20-10:disabilityLevel4',
 '20-10:moderateSevereDisabilityOrDeath',
 '20-10:moderateSevereDisabilitySurvivor',
 '20-10_1:disabilityLevelDeath']

In [16]:
# Display metadata for every column that may be plotted:
#   flattened column name -> {'title': panel title, 'unit': unit label or ''}.
x_columns_map = {
    # Blood gas pH
    '01-06:cordBloodGasPH': {'title': 'Cord Blood Gas pH', 'unit': ''},
    '01-06:initBloodGasPH': {'title': 'Initial Blood Gas pH', 'unit': ''},
    '01-06:firstPostnatalBloodGasPH': {'title': 'First Postnatal Blood Gas pH', 'unit': ''},

    # Apgar scores
    '01-06:Apgar1min': {'title': 'Apgar Score at 1 min', 'unit': ''},
    '01-06:Apgar5min': {'title': 'Apgar Score at 5 min', 'unit': ''},
    '01-06:Apgar10min': {'title': 'Apgar Score at 10 min', 'unit': ''},

    # MRI
    '03-05_s1:MRINRNPatternOfInjuryMerge': {'title': 'NRN Pattern of Injury', 'unit': ''},
    '03-05_s1:MRINRNPatternOfInjuryAvg': {'title': 'NRN Pattern of Injury - Mean', 'unit': ''},

    # Birth / maternal characteristics with a unit
    '01-06:birthGestationalAge_week': {'title': 'Birth Gestational Age', 'unit': 'Week'},
    '01-03:motherAge_year': {'title': 'Maternal Age', 'unit': 'Year'},
    '01-06:birthWeight_g': {'title': 'Birth Weight', 'unit': 'g'},

    # Total modified Sarnat score at the three exam time points
    '01-12_1:pre_TotalModifiedSarnatScore': {'title': 'Total Modified Sarnat Score - Screening', 'unit': ''},
    '03-04_1:post_TotalModifiedSarnatScore': {'title': 'Total Modified Sarnat Score - Post-treatment', 'unit': ''},
    '04-12_1:dischargeTotalModifiedSarnatScore': {'title': 'Total Modified Sarnat Score - Discharge', 'unit': ''},

    # Demographics
    '01-06:infantSex': {'title': 'Infant Sex', 'unit': ''},
    '01-03:motherEducation': {'title': 'Maternal Education', 'unit': ''},
    '01-03:motherRace': {'title': 'Maternal Race', 'unit': ''},

    # Seizures
    '01-05:maternalSeizure': {'title': 'Maternal Seizure', 'unit': ''},
    '01-12:pre_NeuroExamSeizure': {'title': 'Seizure - Screening', 'unit': ''},
    '03-04:post_NeuroExamSeizure': {'title': 'Seizure - Post-treatment Neuro Exam', 'unit': ''},
    '04-12:dischargeNeuroExamSeizure': {'title': 'Seizure - Discharge Neuro Exam', 'unit': ''},
    '04-13:dischargeSeizure': {'title': 'Seizure - Discharge', 'unit': ''},

    # Follow-up outcomes
    '20-04:BayleyIIICognitiveComposite': {'title': 'Bayley-III Cognitive', 'unit': ''},
    # Title typo fixed: "Servere" -> "Severe" (it is rendered on the figure).
    '20-10:moderateSevereDisabilityOrDeath': {'title': 'Moderate/Severe Disability or Death', 'unit': ''},
    '20-10:disabilityLevel4': {'title': 'Disability Level', 'unit': ''},
    '20-10_1:disabilityLevelDeath': {'title': 'Disability Level or Death', 'unit': ''},
}

In [18]:
def _get_order(column, value):
    """Look up the ordering key of `value` for a flattened `column`.

    The column name is split with `COMBINE_harmonizer.flatten_column_tuple`;
    its prefix selects a per-file map from `_PREFIX_NUMERIC_VALUE_MAP`, and the
    variable name selects a callable from that map.

    Returns:
        The result of calling the mapped callable with `value`, or None (after
        printing a warning) when the prefix or the variable name is unknown.
    """
    prefix, var_name, _postfix = COMBINE_harmonizer.flatten_column_tuple(column)

    if prefix in _PREFIX_NUMERIC_VALUE_MAP:
        per_prefix_map = _PREFIX_NUMERIC_VALUE_MAP[prefix]
        if var_name in per_prefix_map:
            to_order = per_prefix_map[var_name]
            return to_order(value)

        print(f'[WARN] not in _NUMERIC_VALUE_MAP: {var_name}')
        return None

    print(f'[WARN] not in _PREFIX_NUMERIC_VALUE_MAP: column: {column} prefix: {prefix}')
    return None


In [19]:
def _hist(df, column, ax, row, col, info, bins=20, color='blue'):
    is_valid = df[column].isnull() == False
    value = df[column][is_valid].astype('float64')

    ax[row, col].hist(value, bins=bins, color=color)

    title = info['title']
    ax[row, col].set_title(title)


In [20]:
def _bar(df, column, ax, row, col, info, color='blue', dtype=None):
    """Draw a bar chart of per-value counts of `column` on ``ax[row, col]``.

    Null entries are dropped, the remaining values are optionally cast to
    `dtype`, counted per distinct value, and the bars are ordered by the key
    that `_get_order` returns for each value.

    Args:
        df: DataFrame holding `column`; it is not modified.
        column: flattened column name to summarize.
        ax: collection of axes indexed as ``ax[row, col]``.
        row, col: position of the target axes.
        info: dict whose 'title' entry becomes the panel title.
        color: bar color.
        dtype: optional dtype the non-null values are cast to before counting.
    """
    values = df.loc[df[column].notna(), column]
    if dtype is not None:
        # Cast a standalone Series. Writing the cast back through `.loc` on a
        # boolean-filtered frame raises SettingWithCopyWarning and leaves the
        # resulting dtype dependent on the pandas version.
        values = values.astype(dtype)
    df_valid = values.to_frame()

    df_groupby = df_valid.groupby([column]).agg({column: 'count'}).rename(columns={column: 'count'}).reset_index(drop=False)
    df_groupby['_order'] = df_groupby[column].apply(lambda x: _get_order(column, x))
    df_groupby = df_groupby.sort_values(by=['_order'], ascending=[True])

    x_value = df_groupby[column]
    y_value = df_groupby['count']

    ax[row, col].bar(x_value, y_value, color=color)

    title = info['title']
    ax[row, col].set_title(title)

In [21]:
def _pie(df, column, ax, row, col, info, color='blue', dtype=None):
    """Draw a pie chart of per-value shares of `column` on ``ax[row, col]``.

    Null entries are dropped, the remaining values are optionally cast to
    `dtype`, counted per distinct value and ordered by the key that
    `_get_order` returns. Each wedge is labelled "<value>\\n(<count>)" and
    annotated with its percentage.

    Args:
        df: DataFrame holding `column`; it is not modified.
        column: flattened column name to summarize.
        ax: collection of axes indexed as ``ax[row, col]``.
        row, col: position of the target axes.
        info: dict whose 'title' entry becomes the panel title.
        color: accepted for signature parity with `_bar`/`_hist`; not used.
        dtype: optional dtype the non-null values are cast to before counting.
    """
    is_valid = df[column].notna()
    values = df.loc[is_valid, column]

    # Diagnostic output. `values` was just filtered on not-null, so the
    # is-null / is-valid-invalid figures are always 0 and the printed Series is
    # always empty; only the is-invalid count carries information.
    is_valid_invalid = values.isnull()
    values_invalid = values[is_valid_invalid]
    print(f'df_valid: column: {column} is-invalid: {len(df) - is_valid.sum()} is-null: {values.isnull().sum()} is-valid-invalid: {is_valid_invalid.sum()} df_valid_invalid: {values_invalid}')

    title = info['title']
    if values.empty:
        # Nothing to draw: keep the titled (empty) panel rather than failing on
        # the first-wedge lookup below.
        ax[row, col].set_title(title)
        return

    if dtype is not None:
        # Cast a standalone Series. Writing the cast back through `.loc` on a
        # boolean-filtered frame raises SettingWithCopyWarning and leaves the
        # resulting dtype dependent on the pandas version.
        values = values.astype(dtype)
    df_valid = values.to_frame()

    df_groupby = df_valid.groupby([column]).agg({column: 'count'}).rename(columns={column: 'count'}).reset_index(drop=False)
    df_groupby['_order'] = df_groupby[column].apply(lambda x: _get_order(column, x))
    df_groupby = df_groupby.sort_values(by=['_order'], ascending=[True])

    count_sum = df_groupby['count'].sum()
    y_value = list(df_groupby['count'] / count_sum)

    # Wedge labels: the value with its absolute count on a second line.
    x_value = [f"{value}\n({count})" for value, count in zip(df_groupby[column], df_groupby['count'])]

    # Wedges are laid out counter-clockwise from `startangle`; starting half a
    # wedge below 0 degrees centers the first wedge on the positive x-axis.
    angle = -180 * y_value[0]
    ax[row, col].pie(y_value, autopct='%1.1f%%', startangle=angle,
                     labels=x_value)

    ax[row, col].set_title(title)


In [None]:
# Overview figure: one panel per variable on an 8 x 3 grid, filled row by row.
n_rows, n_cols = 8, 3
fig, ax = plt.subplots(dpi=600, nrows=n_rows, ncols=n_cols, figsize=(12, 24))

# (column, plotting helper, extra keyword arguments), in display order.
# The grid position is derived from the list index, so two panels can never be
# drawn onto the same axes (previously '20-10:disabilityLevel4' and
# '20-10_1:disabilityLevelDeath' were both drawn at row 6, col 2).
# NOTE: '01-03:motherRace' is listed in x_columns_map but is not plotted; the
# previously commented-out call used a '02-01:' key that x_columns_map does not
# contain.
panels = [
    ('01-06:birthGestationalAge_week', _bar, {'dtype': 'int'}),
    ('01-06:birthWeight_g', _hist, {}),
    ('01-06:infantSex', _pie, {}),

    ('01-03:motherAge_year', _hist, {'bins': 18}),
    ('01-03:motherEducation', _pie, {}),
    ('01-05:maternalSeizure', _pie, {}),

    ('01-12:pre_NeuroExamSeizure', _pie, {}),
    ('03-04:post_NeuroExamSeizure', _pie, {}),
    ('04-12:dischargeNeuroExamSeizure', _pie, {}),

    ('01-12_1:pre_TotalModifiedSarnatScore', _bar, {'dtype': 'int'}),
    ('03-04_1:post_TotalModifiedSarnatScore', _bar, {'dtype': 'int'}),
    ('04-12_1:dischargeTotalModifiedSarnatScore', _bar, {'dtype': 'int'}),

    ('03-05_s1:MRINRNPatternOfInjuryMerge', _bar, {}),
    ('01-06:cordBloodGasPH', _hist, {}),
    ('01-06:firstPostnatalBloodGasPH', _hist, {}),

    ('01-06:Apgar1min', _bar, {'dtype': 'int'}),
    ('01-06:Apgar5min', _bar, {'dtype': 'int'}),
    ('01-06:Apgar10min', _bar, {'dtype': 'int'}),

    ('20-04:BayleyIIICognitiveComposite', _bar, {'dtype': 'int'}),
    ('20-10:moderateSevereDisabilityOrDeath', _pie, {}),
    ('20-10:disabilityLevel4', _bar, {}),

    ('20-10_1:disabilityLevelDeath', _pie, {}),
]
assert len(panels) <= n_rows * n_cols, 'more panels than grid cells'

for panel_idx, (column, plot_fn, plot_kwargs) in enumerate(panels):
    row, col = divmod(panel_idx, n_cols)
    plot_fn(df, column, ax, row, col, x_columns_map[column], **plot_kwargs)

# Hide the grid cells that received no panel instead of showing empty frames.
for unused_ax in ax.flat[len(panels):]:
    unused_ax.axis('off')

# Save before showing: some backends release the figure once it has been shown,
# which would make the saved files blank.
for postfix in ['eps', 'png', 'pdf']:
    out_filename = f'{out_figure_dir}/32-distribution.{postfix}'
    fig.savefig(out_filename)

plt.show()