In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

from tqdm import tqdm
import os

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg


## 01. Init

In [2]:
# Base directory against which config.yaml and the data dictionary are resolved.
# No trailing slash: each use site adds its own separator (f'{root_dir}/...'),
# so '../' would produce doubled separators such as '..//config.yaml'.
root_dir = '..'

In [3]:
# Initialize the harmonizer package from the YAML config located under root_dir.
# NOTE(review): presumably this is what populates cfg.config read in the next cell — confirm.
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

In [4]:
# Folder this notebook reads its input CSV from, under the configured 'out_dir'.
input_dir = f"{cfg.config['out_dir']}/out-publish-normalized"
# Folder this notebook writes its result to, also under the configured 'out_dir'.
out_dir = f"{cfg.config['out_dir']}/stats"

# Create the output folder up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(out_dir, exist_ok=True)

# Data-dictionary file, resolved against root_dir.
# NOTE(review): the constant name suggests an Excel workbook — confirm.
data_dict_filename = f"{root_dir}/{COMBINE_harmonizer.DATA_DICTIONARY_EXCEL}"

In [5]:
# Set up the harmonizer's rank mapping from the data dictionary.
# NOTE(review): what the rank mapping is used for is not visible in this notebook — confirm.
COMBINE_harmonizer.init_rank_mapping(data_dict_filename)

# Build one value map per data-dictionary sheet (main and follow-up). Each map is
# later indexed by variable name to obtain the converter applied to that column.
_NUMERIC_VALUE_MAP_MAIN = COMBINE_harmonizer.build_numeric_value_map(data_dict_filename, COMBINE_harmonizer.SHEET_MAIN)
_NUMERIC_VALUE_MAP_FOLLOWUP = COMBINE_harmonizer.build_numeric_value_map(data_dict_filename, COMBINE_harmonizer.SHEET_FOLLOW_UP)


build_numeric_value_map: (0/1108) variable: center type: center
[INFO] _build_numeric_value_map: to inv-text: type: center
build_numeric_value_map: (1/1108) variable: subjectID type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (2/1108) variable: siteID type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (3/1108) variable: birthDate type: date
[INFO] _build_numeric_value_map: to inv-text: type: date
build_numeric_value_map: (4/1108) variable: birthNumber type: int
build_numeric_value_map: (5/1108) variable: screenComment type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (6/1108) variable: coreTempLess32p5CGreaterEq2Hr_e type: bool
build_numeric_value_map: (7/1108) variable: coreTempLess33p5CGreater1Hr_e type: bool
build_numeric_value_map: (8/1108) variable: coreTempLess34CGreater1Hr_e type: bool
build_numeric_value_map: (9/1108) variable: first6HrCoolByClinicalP

build_numeric_value_map: (0/377) variable: followupCenter type: center
[INFO] _build_numeric_value_map: to inv-text: type: center
build_numeric_value_map: (1/377) variable: siteID type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (2/377) variable: birthDate type: date
[INFO] _build_numeric_value_map: to inv-text: type: date
build_numeric_value_map: (3/377) variable: visitDate type: date
[INFO] _build_numeric_value_map: to inv-text: type: date
build_numeric_value_map: (4/377) variable: birthNumber type: int
build_numeric_value_map: (5/377) variable: center type: center
[INFO] _build_numeric_value_map: to inv-text: type: center
build_numeric_value_map: (6/377) variable: subjectID type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (7/377) variable: followupID type: text
[INFO] _build_numeric_value_map: to inv-text: type: text
build_numeric_value_map: (8/377) variable: center_orig type: text
[INFO] _build_

In [6]:
# Value maps keyed by the data-dictionary sheet they were built from.
_NUMERIC_VALUE_MAP = dict([
    (COMBINE_harmonizer.SHEET_MAIN, _NUMERIC_VALUE_MAP_MAIN),
    (COMBINE_harmonizer.SHEET_FOLLOW_UP, _NUMERIC_VALUE_MAP_FOLLOWUP),
])

# Same value maps, keyed by the name of every file described in FILENAME_INFOS;
# each file entry names the sheet ('data_dict') whose map it uses.
_FILENAME_INFO_NUMERIC_VALUE_MAP = {
    file_info['name']: _NUMERIC_VALUE_MAP[file_info['data_dict']]
    for file_info in COMBINE_harmonizer.FILENAME_INFOS
}

# Same value maps again, keyed by the flattened-column prefix derived from each file name.
_PREFIX_NUMERIC_VALUE_MAP = {
    COMBINE_harmonizer.flatten_filename_prefix(info_name): value_map
    for info_name, value_map in _FILENAME_INFO_NUMERIC_VALUE_MAP.items()
}


## 02. Load flattened data

In [7]:
# Path of the CSV that the next cell loads.
filename = f'{input_dir}/zz-merged-flatten.csv'

In [8]:
# dtype='O' loads every column as Python objects, so pandas does not infer numeric
# dtypes for the raw values. Default NA parsing still applies (e.g. empty fields become NaN).
df = pd.read_csv(filename, dtype='O')

In [9]:
# Plain Python list of the column labels, in the order they appear in the table.
columns = list(df.columns)

In [10]:
# One row per column label; the following cells add parsed fields to this frame.
df_columns = pd.DataFrame({'columns': columns})

In [11]:
# Parse each flattened column label into its tuple form.
df_columns['column_tuple'] = df_columns['columns'].map(COMBINE_harmonizer.flatten_column_tuple)

# Expose the tuple fields used below: index 1 is the variable name, index 0 the prefix.
df_columns['column_name'] = df_columns['column_tuple'].map(lambda parts: parts[1])
df_columns['column_prefix'] = df_columns['column_tuple'].map(lambda parts: parts[0])

In [12]:
# Distinct prefixes found among the column labels.
df_columns['column_prefix'].unique()

array(['', '01-02', '01-03', '01-04', '01-05', '01-05_1', '01-05_2',
       '01-06', '01-06_1', '01-07', '01-08', '01-09', '01-10', '01-11',
       '01-12', '01-12_1', '02-01', '02-02', '02-03', '02-04', '02-05',
       '02-05_s', '02-06_s', '02-07', '02-08', '02-09', '02-11', '02-12',
       '02-13', '02-14', '02-15', '02-16', '02-17', '03-01', '03-01_s',
       '03-02', '03-03', '03-04', '03-04_1', '03-05', '03-05_s',
       '03-05_s1', '04-01', '04-01_1', '04-02', '04-03', '04-04', '04-05',
       '04-06', '04-07', '04-08', '04-09', '04-10', '04-11', '04-12',
       '04-12_1', '04-13', '04-14', '04-15', '04-16', '04-17', '20-00',
       '20-01', '20-02', '20-03', '20-04', '20-05', '20-06', '20-07',
       '20-08', '20-09', '20-10', '20-10_1'], dtype=object)

In [13]:
# Count how many flattened columns share each variable name, regardless of prefix.
df_columns_groupby = (
    df_columns
    .groupby(['column_name'])
    .agg(_count=('column_name', 'count'))
)

# Show the variable names that occur more than once, most frequent first.
is_invalid = df_columns_groupby['_count'].gt(1)
df_columns_groupby.loc[is_invalid].sort_values(by='_count', ascending=False)

Unnamed: 0_level_0,_count
column_name,Unnamed: 1_level_1
alterationSkinIntegrity,87
temperatureTimeSlot_min,87
blanketTemperature_C,87
servoSetTemperature_C,87
shiver,87
...,...
maternalCardioRespiratoryArrest,2
maternalHemorrhage,2
maternalSeizure,2
maternalTrauma,2


In [14]:
def _numeric_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with columns passed through their value-map converters.

    Each column label is split by ``COMBINE_harmonizer.flatten_column_tuple`` into
    ``(prefix, var_name, postfix)``. The prefix selects a value map from
    ``_PREFIX_NUMERIC_VALUE_MAP`` and ``var_name`` selects the converter in that
    map, which is then applied element-wise to the column.

    A column is left unchanged when:

    * its prefix has no entry in ``_PREFIX_NUMERIC_VALUE_MAP`` (a ``[WARN]`` line is printed);
    * its variable name has no entry in the selected value map (a ``[WARN]`` line is printed);
    * its converter is ``COMBINE_harmonizer.to_numeric_text`` (skipped silently).

    Args:
        df: Table whose column labels follow the flattened naming scheme. It is
            not modified; all work happens on a copy.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    df_numeric = df.copy()

    columns = list(df_numeric.columns)
    n_columns = len(columns)

    # Wrap the list itself rather than the enumerate() generator: tqdm can only
    # report a total, percentage and ETA when the wrapped iterable has a length.
    for idx, column in enumerate(tqdm(columns)):
        prefix, var_name, postfix = COMBINE_harmonizer.flatten_column_tuple(column)
        if prefix not in _PREFIX_NUMERIC_VALUE_MAP:
            print(f'[WARN] ({idx}/{n_columns}) not in _PREFIX_NUMERIC_VALUE_MAP: column: {column} prefix: {prefix}')
            continue

        numeric_value_map = _PREFIX_NUMERIC_VALUE_MAP[prefix]
        if var_name not in numeric_value_map:
            print(f'[WARN] ({idx}/{n_columns}) not in _NUMERIC_VALUE_MAP: {var_name}')
            continue

        converter = numeric_value_map[var_name]
        if converter == COMBINE_harmonizer.to_numeric_text:
            # This converter marks columns that are deliberately left as they are.
            continue

        # Element-wise conversion, written back under the same column label.
        df_numeric.loc[:, column] = df_numeric[column].apply(converter)

    return df_numeric

In [15]:
# Convert the loaded table; df itself is left untouched because _numeric_df works on a copy.
df_numeric = _numeric_df(df)

0it [00:00, ?it/s]

574it [00:00, 5738.64it/s]

1226it [00:00, 6192.32it/s]

1849it [00:00, 6208.83it/s]

[WARN] (0/6124) not in _PREFIX_NUMERIC_VALUE_MAP: column: _study prefix: 
[WARN] (1/6124) not in _PREFIX_NUMERIC_VALUE_MAP: column: center prefix: 
[WARN] (2/6124) not in _PREFIX_NUMERIC_VALUE_MAP: column: subjectID prefix: 
[WARN] (3/6124) not in _PREFIX_NUMERIC_VALUE_MAP: column: uniqueID prefix: 


2470it [00:00, 5992.97it/s]

3089it [00:00, 6060.44it/s]

4022it [00:00, 7149.86it/s]

4809it [00:00, 7382.70it/s]

5549it [00:00, 7109.69it/s]

6124it [00:00, 6732.36it/s]

[WARN] (5362/6124) not in _NUMERIC_VALUE_MAP: MRI_ID
[WARN] (5363/6124) not in _NUMERIC_VALUE_MAP: MRI_ID
[WARN] (5364/6124) not in _NUMERIC_VALUE_MAP: MRI_ID
[WARN] (5626/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (5634/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (5711/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (5778/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (5876/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (5920/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (5925/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (6036/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (6097/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID
[WARN] (6109/6124) not in _NUMERIC_VALUE_MAP: uniqueFollowupID





In [16]:
# Write the converted table to the stats folder; index=False omits the row index column.
out_filename = f'{out_dir}/zz-merged-flatten-numeric.csv'
df_numeric.to_csv(out_filename, index=False)