In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
import os

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 01. init

In [2]:
# Directory that later cells use as the base for config.yaml and the dictionary workbook.
root_dir = '..'

# Study and dictionary sheet this notebook works on.
sheet_name = COMBINE_harmonizer.SHEET_DERIVED_DATA
study_name = COMBINE_harmonizer.STUDY_LH

In [3]:
# Initialise COMBINE_harmonizer with the path of the YAML config file.
config_filename = f'{root_dir}/config.yaml'
COMBINE_harmonizer.init(config_filename)

## 02. Dictionary

In [4]:
# Path of the Excel workbook that holds the clinical-variable data dictionary.
data_dict_filename = '/'.join([root_dir, 'Dictionary_HIE_clinical_variables.xlsx'])

In [5]:
# Load the dictionary sheet named by sheet_name from the workbook. Later cells index the
# result by study_name, so it is presumably a DataFrame with one column per study - TODO confirm.
df_dict = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name)

In [6]:
# Distinct values of this study's dictionary column, in order of first appearance.
study_variable_names = df_dict[study_name].unique()
dict_columns = list(study_variable_names)

## 03. LH

In [7]:
# Directory with this study's analysis files, looked up in the loaded config.
analysis_dir_key = f'{study_name}_analysis_dir'
the_dir = cfg.config[analysis_dir_key]

In [8]:
# List the study's analysis directory in a deterministic (sorted) order, keeping
# only CSV files: every entry is later passed to pd.read_csv, so hidden files
# (e.g. .DS_Store) or sub-directories (e.g. .ipynb_checkpoints) would break loading.
filenames = sorted(
    each_filename
    for each_filename in os.listdir(the_dir)
    if each_filename.lower().endswith('.csv')
)

In [9]:
# Load each analysis file into its own DataFrame, in the same order as filenames.
# os.path.join builds the path portably (no doubled separator when the_dir
# already ends with one, and no accidental root path when the_dir is empty).
dfs = [pd.read_csv(os.path.join(the_dir, each_filename)) for each_filename in filenames]

### 03.1. check columns in each file

In [10]:
# Print the column names of every loaded file, labelled with its position and filename.
total_files = len(dfs)
for file_idx, (filename, file_df) in enumerate(zip(filenames, dfs)):
    print(f'({file_idx}/{total_files}) filename: {filename} columns: {list(file_df.columns)}')

(0/2) filename: analysis.csv columns: ['HTHRM_ID', 'CENTER', 'fcenter', 'blindness', 'hearing', 'hear_imp', 'gross', 'cp_out', 'modsevcp', 'bayley3', 'death18', 'all_norm', 'hosp_die', 'any_antic', 'LHFOLNUM', 'any_seiz', 'seiz_fu', 'disab_ms', 'disab_4', 'disab_die', 'disab_die4', 'MRI_DATE', 'MRI_TIME', 'MRI_AGED', 'MRI_UNREAD', 'MRI_RDATE_PB', 'MRI_RDATE_NR', 'MRI_RDATE_WG', 'MRI_RDATE_LC', 'MRI_PATTERN', 'MRI_PATTERN_PB', 'MRI_PATTERN_NR', 'MRI_PATTERN_WG', 'MRI_PATTERN_LC', 'MRI_PCLASS', 'MRI_PCLASS_PB', 'MRI_PCLASS_NR', 'MRI_PCLASS_WG', 'MRI_PCLASS_LC', 'MRI_INFARL', 'MRI_INFARL_PB', 'MRI_INFARL_NR', 'MRI_INFARL_WG', 'MRI_INFARR', 'MRI_INFARR_PB', 'MRI_INFARR_NR', 'MRI_INFARR_WG', 'MRI_NOTDONE']
(1/2) filename: mrilh.csv columns: ['HTHRM_ID', 'CENTER', 'fcenter', 'blindness', 'hearing', 'hear_imp', 'gross', 'cp_out', 'modsevcp', 'bayley3', 'death18', 'all_norm', 'hosp_die', 'any_antic', 'LHFOLNUM', 'any_seiz', 'seiz_fu', 'disab_ms', 'disab_4', 'disab_die', 'disab_die4', 'MRI_DATE

### 03.2. all columns

In [11]:
# One record per (file, column) pair so column names can be compared across files.
all_columns = [
    {'column': column, 'idx': idx, 'filename': filename}
    for idx, (filename, each_df) in enumerate(zip(filenames, dfs))
    for column in each_df.columns
]

# Explicit column names keep the frame well-formed even when no file was loaded.
df_columns = pd.DataFrame(all_columns, columns=['column', 'idx', 'filename'])

# Composite labels built with vectorized string concatenation instead of a
# row-wise apply; astype(str) mirrors the f-string formatting of each value.
column_and_file = df_columns['column'].astype(str) + ':' + df_columns['filename'].astype(str)
df_columns['column:filename'] = column_and_file
df_columns['column:filename:idx'] = column_and_file + ':' + df_columns['idx'].astype(str)

# Reassign rather than sort in place so the cell reads as a plain transformation.
df_columns = df_columns.sort_values(by=['filename', 'column'])

In [12]:
# Show the per-file column inventory built above for visual inspection.
df_columns

Unnamed: 0,column,idx,filename,column:filename,column:filename:idx
1,CENTER,0,analysis.csv,CENTER:analysis.csv,CENTER:analysis.csv:0
0,HTHRM_ID,0,analysis.csv,HTHRM_ID:analysis.csv,HTHRM_ID:analysis.csv:0
14,LHFOLNUM,0,analysis.csv,LHFOLNUM:analysis.csv,LHFOLNUM:analysis.csv:0
23,MRI_AGED,0,analysis.csv,MRI_AGED:analysis.csv,MRI_AGED:analysis.csv:0
21,MRI_DATE,0,analysis.csv,MRI_DATE:analysis.csv,MRI_DATE:analysis.csv:0
...,...,...,...,...,...
53,hear_imp,1,mrilh.csv,hear_imp:mrilh.csv,hear_imp:mrilh.csv:1
52,hearing,1,mrilh.csv,hearing:mrilh.csv,hearing:mrilh.csv:1
60,hosp_die,1,mrilh.csv,hosp_die:mrilh.csv,hosp_die:mrilh.csv:1
56,modsevcp,1,mrilh.csv,modsevcp:mrilh.csv,modsevcp:mrilh.csv:1


In [13]:
# Count how many times each column name occurs across the loaded files, highest count first.
columns_grouped = df_columns.groupby(['column'])
df_columns_count = (
    columns_grouped
    .agg(count=('column', 'count'))
    .sort_values(by=['count'], ascending=False)
)
df_columns_count.head(20)

Unnamed: 0_level_0,count
column,Unnamed: 1_level_1
CENTER,2
HTHRM_ID,2
MRI_RDATE_PB,2
MRI_RDATE_WG,2
MRI_TIME,2
MRI_UNREAD,2
all_norm,2
any_antic,2
any_seiz,2
bayley3,2


#### column not in dict

In [14]:
# Rows of the column inventory whose name is absent from the dictionary's study column.
is_in_dict = df_columns['column'].isin(dict_columns)
is_not_in_dict = ~is_in_dict
df_columns_not_in_dict = df_columns.loc[is_not_in_dict]
df_columns_not_in_dict

Unnamed: 0,column,idx,filename,column:filename,column:filename:idx
41,MRI_INFARL_NR,0,analysis.csv,MRI_INFARL_NR:analysis.csv,MRI_INFARL_NR:analysis.csv:0
40,MRI_INFARL_PB,0,analysis.csv,MRI_INFARL_PB:analysis.csv,MRI_INFARL_PB:analysis.csv:0
42,MRI_INFARL_WG,0,analysis.csv,MRI_INFARL_WG:analysis.csv,MRI_INFARL_WG:analysis.csv:0
45,MRI_INFARR_NR,0,analysis.csv,MRI_INFARR_NR:analysis.csv,MRI_INFARR_NR:analysis.csv:0
44,MRI_INFARR_PB,0,analysis.csv,MRI_INFARR_PB:analysis.csv,MRI_INFARR_PB:analysis.csv:0
46,MRI_INFARR_WG,0,analysis.csv,MRI_INFARR_WG:analysis.csv,MRI_INFARR_WG:analysis.csv:0
33,MRI_PATTERN_LC,0,analysis.csv,MRI_PATTERN_LC:analysis.csv,MRI_PATTERN_LC:analysis.csv:0
31,MRI_PATTERN_NR,0,analysis.csv,MRI_PATTERN_NR:analysis.csv,MRI_PATTERN_NR:analysis.csv:0
30,MRI_PATTERN_PB,0,analysis.csv,MRI_PATTERN_PB:analysis.csv,MRI_PATTERN_PB:analysis.csv:0
32,MRI_PATTERN_WG,0,analysis.csv,MRI_PATTERN_WG:analysis.csv,MRI_PATTERN_WG:analysis.csv:0


In [15]:
# Distinct 'column:filename:idx' labels of the undocumented columns, kept in the
# order they appear in df_columns_not_in_dict (deliberately not re-sorted).
labels_not_in_dict = df_columns_not_in_dict['column:filename:idx'].unique()
columns_not_in_dict = list(labels_not_in_dict)
columns_not_in_dict

['MRI_INFARL_NR:analysis.csv:0',
 'MRI_INFARL_PB:analysis.csv:0',
 'MRI_INFARL_WG:analysis.csv:0',
 'MRI_INFARR_NR:analysis.csv:0',
 'MRI_INFARR_PB:analysis.csv:0',
 'MRI_INFARR_WG:analysis.csv:0',
 'MRI_PATTERN_LC:analysis.csv:0',
 'MRI_PATTERN_NR:analysis.csv:0',
 'MRI_PATTERN_PB:analysis.csv:0',
 'MRI_PATTERN_WG:analysis.csv:0',
 'MRI_PCLASS_LC:analysis.csv:0',
 'MRI_PCLASS_NR:analysis.csv:0',
 'MRI_PCLASS_PB:analysis.csv:0',
 'MRI_PCLASS_WG:analysis.csv:0',
 'MRI_RDATE_LC:analysis.csv:0',
 'MRI_RDATE_NR:analysis.csv:0',
 'MRI_RDATE_PB:analysis.csv:0',
 'MRI_RDATE_WG:analysis.csv:0',
 'MRI_INFARL_NR:mrilh.csv:1',
 'MRI_INFARL_PB:mrilh.csv:1',
 'MRI_INFARL_WG:mrilh.csv:1',
 'MRI_INFARR_NR:mrilh.csv:1',
 'MRI_INFARR_PB:mrilh.csv:1',
 'MRI_INFARR_WG:mrilh.csv:1',
 'MRI_PATTERN_LC:mrilh.csv:1',
 'MRI_PATTERN_NR:mrilh.csv:1',
 'MRI_PATTERN_PB:mrilh.csv:1',
 'MRI_PATTERN_WG:mrilh.csv:1',
 'MRI_PCLASS_LC:mrilh.csv:1',
 'MRI_PCLASS_NR:mrilh.csv:1',
 'MRI_PCLASS_PB:mrilh.csv:1',
 'MRI_PCLASS

#### dict not in column

In [16]:
# Dictionary rows that name a variable for this study which none of the loaded files contains.
has_study_variable = df_dict[study_name].notnull()
is_found_in_files = df_dict[study_name].isin(df_columns['column'])
is_dict_not_in_columns = ~is_found_in_files & has_study_variable
df_dict_not_in_columns = df_dict.loc[is_dict_not_in_columns]
df_dict_not_in_columns

Unnamed: 0,Category,Subcategory,Standardized_VariableNames_Dictionary,type,Variable_Description,#studies w/ this var,redcap,comment,LH,OC


## 04. show column values

In [17]:
# Column values are not printed by default because of data privacy concerns.
# Set SHOW_COLUMN_VALUES to True only in a private session, and clear the cell
# output before sharing the notebook.
SHOW_COLUMN_VALUES = False

if SHOW_COLUMN_VALUES:
    for idx, each_df in enumerate(dfs):
        for idx2, column in enumerate(each_df.columns):
            # Label: (file position/number of files/filename) (column position/number of columns)
            print(f'({idx}/{len(dfs)}/{filenames[idx]}) ({idx2}/{len(each_df.columns)}) column: {column} value: {list(each_df[column].unique())}')
        print()

"\nfor idx, each_df in enumerate(dfs):\n    for idx2, column in enumerate(each_df.columns):\n        print(f'({idx}/{len(dfs)}/{filenames[idx]}) ({idx2}/{len(each_df.columns)} column: {column} value: {list(each_df[column].unique())}')\n    print()\n"