In [1]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

from tqdm import tqdm
import os

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg


## 01. Init

In [2]:
# Relative path from this notebook to the directory that holds config.yaml
# and the data dictionary file (both are resolved against it in later cells).
root_dir = '../'

In [3]:
# Initialize COMBINE_harmonizer with the config.yaml located under root_dir.
# NOTE(review): presumably this is what populates cfg.config, which the next cell reads — confirm.
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

In [4]:
# Data dictionary file; its name comes from COMBINE_harmonizer.DATA_DICTIONARY_EXCEL
# and it is looked up under root_dir.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"

# Both working directories sit under the configured 'out_dir':
# input_dir is where the merged flattened CSV is read from,
# out_dir is where this notebook writes its result.
input_dir = f"{cfg.config['out_dir']}/out-publish-normalized"
out_dir = f"{cfg.config['out_dir']}/stats"

# Make sure the output directory exists; an already existing one is not an error.
os.makedirs(out_dir, exist_ok=True)

In [5]:
# Prepare the harmonizer's inverse mapping from the data dictionary before
# building the per-sheet maps.
COMBINE_harmonizer.init_inv_mapping(data_dict_filename)

# One inverse value map per data-dictionary sheet, built in the order
# main -> follow-up -> derived data (the generator is consumed left to right,
# so the calls and their log output happen in exactly that order).
_INV_VALUE_MAP_MAIN, _INV_VALUE_MAP_FOLLOWUP, _INV_VALUE_MAP_ANALYSIS = (
    COMBINE_harmonizer.build_inv_value_map(data_dict_filename, sheet_name)
    for sheet_name in (
        COMBINE_harmonizer.SHEET_MAIN,
        COMBINE_harmonizer.SHEET_FOLLOW_UP,
        COMBINE_harmonizer.SHEET_DERIVED_DATA,
    )
)

build_inv_value_map: (0/979) variable: center type: center
[INFO] _build_inv_value_map: to inv-text: type: center
build_inv_value_map: (1/979) variable: subjectID type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (2/979) variable: siteID type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (3/979) variable: birthDate type: date
[INFO] _build_inv_value_map: to inv-text: type: date
build_inv_value_map: (4/979) variable: birthNumber type: int
build_inv_value_map: (5/979) variable: screenComment type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (6/979) variable: coreTempLess32p5COverEq2Hr_e type: bool
build_inv_value_map: (7/979) variable: coreTempLess33p5COver1Hr_e type: bool
build_inv_value_map: (8/979) variable: coreTempLess34COver1Hr_e type: bool
build_inv_value_map: (9/979) variable: first6HrCoolByClinicalProtocol_e type: bool
build_inv_value_map: (10/979) variable: chromosomalAbnorma

build_inv_value_map: (0/355) variable: followupCenter type: center
[INFO] _build_inv_value_map: to inv-text: type: center
build_inv_value_map: (1/355) variable: siteID type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (2/355) variable: birthDate type: date
[INFO] _build_inv_value_map: to inv-text: type: date
build_inv_value_map: (3/355) variable: visitDate type: date
[INFO] _build_inv_value_map: to inv-text: type: date
build_inv_value_map: (4/355) variable: birthNumber type: int
build_inv_value_map: (5/355) variable: center type: center
[INFO] _build_inv_value_map: to inv-text: type: center
build_inv_value_map: (6/355) variable: subjectID type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (7/355) variable: followupID type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (8/355) variable: center_orig type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (9/3

build_inv_value_map: (0/148) variable: center type: center
[INFO] _build_inv_value_map: to inv-text: type: center
build_inv_value_map: (1/148) variable: followupCenter type: center
[INFO] _build_inv_value_map: to inv-text: type: center
build_inv_value_map: (2/148) variable: subjectID type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (3/148) variable: followupID type: text
[INFO] _build_inv_value_map: to inv-text: type: text
build_inv_value_map: (4/148) variable: acidosis type: bool
build_inv_value_map: (5/148) variable: ageDeath_day type: int
build_inv_value_map: (6/148) variable: ageRand_hr type: float
build_inv_value_map: (7/148) variable: baselineAnticonvulsants type: bool
build_inv_value_map: (8/148) variable: dischargeAnticonvulsants type: bool
build_inv_value_map: (9/148) variable: inotropicAgent type: bool
build_inv_value_map: (10/148) variable: perinatalSentinelEvent type: bool
build_inv_value_map: (11/148) variable: dischargeSeizure type: boo

In [6]:
# Inverse value map keyed by data-dictionary sheet.
_INV_VALUE_MAP = {}
_INV_VALUE_MAP[COMBINE_harmonizer.SHEET_MAIN] = _INV_VALUE_MAP_MAIN
_INV_VALUE_MAP[COMBINE_harmonizer.SHEET_FOLLOW_UP] = _INV_VALUE_MAP_FOLLOWUP
_INV_VALUE_MAP[COMBINE_harmonizer.SHEET_DERIVED_DATA] = _INV_VALUE_MAP_ANALYSIS

# Same maps keyed by file name: every entry of FILENAME_INFOS names the sheet
# ('data_dict') whose inverse value map applies to that file ('name').
_FILENAME_INFO_INV_VALUE_MAP = {
    info['name']: _INV_VALUE_MAP[info['data_dict']]
    for info in COMBINE_harmonizer.FILENAME_INFOS
}

# Same maps keyed by the flattened-column prefix derived from each file name.
_PREFIX_INV_VALUE_MAP = {
    COMBINE_harmonizer.flatten_filename_prefix(name): inv_map
    for name, inv_map in _FILENAME_INFO_INV_VALUE_MAP.items()
}


## Load the flattened data

In [7]:
# Merged, flattened CSV located in input_dir.
filename = f'{input_dir}/zz-merged-flatten.csv'

In [8]:
# dtype='O' loads every column as object dtype, so pandas does no numeric
# type inference on the stored values.
df = pd.read_csv(filename, dtype='O')

In [9]:
# Plain Python list of the flattened column labels, in file order.
columns = list(df.columns)

In [10]:
# One row per flattened column label; further per-column attributes are added
# to this frame in the next cell.
df_columns = pd.DataFrame({'columns': columns})

In [11]:
# Break every flattened column label into its tuple form once, then pull out
# the two pieces inspected below: element 1 is stored as the column name and
# element 0 as the column prefix.
df_columns['column_tuple'] = df_columns['columns'].map(COMBINE_harmonizer.flatten_column_tuple)
df_columns['column_name'] = df_columns['column_tuple'].map(lambda parts: parts[1])
df_columns['column_prefix'] = df_columns['column_tuple'].map(lambda parts: parts[0])

In [12]:
# Show the distinct prefixes that occur among the flattened column labels.
df_columns['column_prefix'].unique()

array(['', '01-02', '01-03', '01-04', '01-05', '01-06', '01-07', '01-08',
       '01-09', '01-10', '01-11', '01-12', '02-01', '02-02', '02-03',
       '02-04', '02-05', '02-05_s', '02-06_s', '02-07', '02-08', '02-09',
       '02-11', '02-12', '02-13', '02-14', '02-15', '02-16', '02-17',
       '03-01', '03-01_s', '03-02', '03-03', '03-04', '03-05', '03-05_s',
       '04-01', '04-03', '04-04', '04-05', '04-06', '04-07', '04-08',
       '04-09', '04-10', '04-11', '04-12', '04-02', '04-13', '04-14',
       '04-15', '04-16', '04-17', '20-00', '20-01', '20-02', '20-03',
       '20-04', '20-05', '20-06', '20-07', '20-08', '30-01', '30-02',
       '30-03', '31-02', '31-03', '31-04', '31-05', '31-06', '31-07'],
      dtype=object)

In [13]:
# Number of flattened columns that share each column name.
df_columns_groupby = (
    df_columns
    .groupby(['column_name'])
    .agg(count=('column_name', 'count'))
)

# Keep only the names that appear in more than one flattened column and
# display them with the most frequent first.
is_invalid = df_columns_groupby['count'].gt(1)
df_columns_groupby.loc[is_invalid].sort_values(by='count', ascending=False)

Unnamed: 0_level_0,count
column_name,Unnamed: 1_level_1
temperatureTimeSlot,87
axillaryTemperature_C,87
blanketTemperature_C,87
servoSetTemperature_C,87
shiver,87
...,...
maternalHemorrhage,2
maternalSeizure,2
maternalTrauma,2
motherRace,2


In [14]:
def _inv_df(df: pd.DataFrame)-> pd.DataFrame:
    """Return a copy of ``df`` with each column passed through its inverse value map.

    Every column label is split by ``COMBINE_harmonizer.flatten_column_tuple``
    into ``(prefix, var_name, postfix)``. The prefix selects an inverse value
    map from ``_PREFIX_INV_VALUE_MAP``; ``var_name`` selects the converter in
    that map, which is applied element-wise to the column.

    A column is left as it is when:
      * its prefix is not in ``_PREFIX_INV_VALUE_MAP`` (a ``[WARN]`` line is printed);
      * its variable name is not in the selected map (a ``[WARN]`` line is printed);
      * its converter is ``COMBINE_harmonizer.to_inv_text`` (skipped silently).

    Args:
        df: flattened DataFrame. It is not modified; the work is done on a copy.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    df_inv = df.copy()

    columns = list(df_inv.columns)
    n_columns = len(columns)

    # ``enumerate`` has no length, so tqdm needs ``total`` to render a real
    # progress bar (percentage / ETA) instead of a bare iteration counter.
    for idx, column in tqdm(enumerate(columns), total=n_columns):
        # The postfix part of the label is not needed to pick a converter.
        prefix, var_name, _postfix = COMBINE_harmonizer.flatten_column_tuple(column)

        if prefix not in _PREFIX_INV_VALUE_MAP:
            print(f'[WARN] ({idx}/{n_columns}) not in _PREFIX_INV_VALUE_MAP: column: {column} prefix: {prefix}')
            continue

        inv_value_map = _PREFIX_INV_VALUE_MAP[prefix]
        if var_name not in inv_value_map:
            print(f'[WARN] ({idx}/{n_columns}) not in _INV_VALUE_MAP: {var_name}')
            continue

        converter = inv_value_map[var_name]
        if converter == COMBINE_harmonizer.to_inv_text:
            # NOTE(review): presumably to_inv_text leaves values unchanged, which
            # is why these columns are skipped rather than converted — confirm.
            continue

        df_inv[column] = df_inv[column].apply(converter)

    return df_inv

In [15]:
# Build the inverse-mapped copy of the flattened frame; df itself is left untouched.
df_inv = _inv_df(df)

0it [00:00, ?it/s]

670it [00:00, 6699.13it/s]

1494it [00:00, 7600.21it/s]

[WARN] (0/6131) not in _PREFIX_INV_VALUE_MAP: column: _study prefix: 
[WARN] (1/6131) not in _PREFIX_INV_VALUE_MAP: column: center prefix: 
[WARN] (2/6131) not in _PREFIX_INV_VALUE_MAP: column: subjectID prefix: 
[WARN] (3/6131) not in _PREFIX_INV_VALUE_MAP: column: uniqueID prefix: 


2254it [00:00, 7321.77it/s]

2988it [00:00, 7004.17it/s]

3947it [00:00, 7892.88it/s]

4784it [00:00, 8048.44it/s]

5593it [00:00, 7844.84it/s]

6131it [00:00, 7544.10it/s]

[WARN] (5307/6131) not in _INV_VALUE_MAP: MRI_ID
[WARN] (5308/6131) not in _INV_VALUE_MAP: MRI_ID
[WARN] (5309/6131) not in _INV_VALUE_MAP: MRI_ID
[WARN] (5499/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (5507/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (5584/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (5651/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (5749/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (5793/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (5798/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (5809/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (5910/6131) not in _INV_VALUE_MAP: uniqueFollowupID
[WARN] (6117/6131) not in _INV_VALUE_MAP: cordMishap
[WARN] (6118/6131) not in _INV_VALUE_MAP: uterineRupture
[WARN] (6119/6131) not in _INV_VALUE_MAP: placentalProblem
[WARN] (6120/6131) not in _INV_VALUE_MAP: shoulderDystocia
[WARN] (6121/6131) not in _INV_VALUE_MAP: maternalHemorrhage
[WARN] (6122/6131) not in _INV_VA




In [16]:
# Write the inverse-mapped frame to out_dir as CSV; index=False keeps the
# DataFrame index out of the file.
out_filename = f'{out_dir}/zz-merged-flatten-inv.csv'
df_inv.to_csv(out_filename, index=False)