In [1]:
import pandas as pd
import os

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 01. init

In [2]:
study_name = COMBINE_harmonizer.STUDY_LH
sheet_name = COMBINE_harmonizer.SHEET_FOLLOW_UP

root_dir = '..'

In [3]:
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

## 02. Dictionary

In [4]:
data_dict_filename = f'{root_dir}/Dictionary_HIE_clinical_variables.xlsx'

In [5]:
df_dict = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name)

In [6]:
dict_columns = list(df_dict[study_name].unique())

## 03. LH

In [7]:
the_dir = cfg.config[f'{study_name}_follow_up_dir']

In [8]:
filenames = os.listdir(the_dir)
filenames.sort()

In [9]:
dfs = [pd.read_csv(os.sep.join([the_dir, each_filename])) for each_filename in filenames]

### 03.1. check columns in each file

In [10]:
for idx, each_df in enumerate(dfs):
    print(f'({idx}/{len(dfs)}) filename: {filenames[idx]} columns: {list(each_df.columns)}')

(0/10) filename: lf01.csv columns: ['LHFOLNUM', 'SITENM', 'LAST', 'FIRST', 'HTHRM_ID', 'PROTID02', 'PROTID03', 'PROTID04', 'PROTID05', 'PROTID06', 'PROTID07', 'PROTID08', 'PROTID09', 'PROTID10', 'PROTID11', 'PROTID12', 'PROTID13', 'PROTID14', 'PROTID15', 'PROTID16', 'PROTID17', 'PROTID18', 'PROTID19', 'PROTID20', 'REC_CMP', 'CENTER', 'BIRTHDT', 'VISITDT', 'BIRTHNM', 'PREVCNTR', 'CMP_DATE', 'CRT_DATE']
(1/10) filename: lf03.csv columns: ['LHFOLNUM', 'CFSUPER', 'NF3MONEY', 'CFWKPCAR', 'CFWKOCAR', 'CFSCCARE', 'CFSCADUL', 'CFOTHLAN', 'CFSECLAN', 'CFOTHSEC', 'CFNDNRSV', 'CFNDNRSH', 'CFNDOTPT', 'CFNDSPCH', 'CFNDEALY', 'CFNDSOCL', 'CFNDCLIN', 'CFNDPUL', 'CFNDOPH', 'CFNDGAS', 'CFNDAUD', 'CFNDNEU', 'CFNDOTH', 'CFCVOSP', 'CFNDEVAL', 'CFNDPFU', 'CFREGDR', 'CFCCFAC', 'CFCAROT', 'CFTRDYCR', 'CFSPDYCR', 'CFHMDYCR', 'CFBBYSIT', 'CFINITS', 'REC_CMP', 'CENTER', 'CFVISDT', 'CFBRTHDT', 'CFCHRAGE', 'CFADJAGE', 'CFCARE', 'CFOCAR', 'CFMARITL', 'CFLIVING', 'CFPEOPLE', 'CFGRPCAR', 'CFGROCAR', 'CFINCOME', 'CFI

### 03.2 all columns

In [11]:
all_columns = []
for idx, each_df in enumerate(dfs):
    all_columns += list(map(lambda x: {'column': x, 'idx': idx, 'filename': filenames[idx]}, list(each_df.columns)))
df_columns = pd.DataFrame(all_columns)
df_columns['column:filename'] = df_columns.apply(lambda x: f"{x['column']}:{x['filename']}", axis=1)
df_columns['column:filename:idx'] = df_columns.apply(lambda x: f"{x['column']}:{x['filename']}:{x['idx']}", axis=1)
df_columns.sort_values(by=['filename', 'column'], inplace=True)

In [12]:
df_columns

Unnamed: 0,column,idx,filename,column:filename,column:filename:idx
26,BIRTHDT,0,lf01.csv,BIRTHDT:lf01.csv,BIRTHDT:lf01.csv:0
28,BIRTHNM,0,lf01.csv,BIRTHNM:lf01.csv,BIRTHNM:lf01.csv:0
25,CENTER,0,lf01.csv,CENTER:lf01.csv,CENTER:lf01.csv:0
30,CMP_DATE,0,lf01.csv,CMP_DATE:lf01.csv,CMP_DATE:lf01.csv:0
31,CRT_DATE,0,lf01.csv,CRT_DATE:lf01.csv,CRT_DATE:lf01.csv:0
...,...,...,...,...,...
401,NF12MDPB,9,lf12.csv,NF12MDPB:lf12.csv,NF12MDPB:lf12.csv:9
422,NF12NMWD,9,lf12.csv,NF12NMWD:lf12.csv,NF12NMWD:lf12.csv:9
404,NF12NRDS,9,lf12.csv,NF12NRDS:lf12.csv,NF12NRDS:lf12.csv:9
403,NF12NRPB,9,lf12.csv,NF12NRPB:lf12.csv,NF12NRPB:lf12.csv:9


In [13]:
df_columns_count = df_columns.groupby(['column']).agg(count=('column', 'count')).sort_values(by=['count'], ascending=False)
df_columns_count.head(20)

Unnamed: 0_level_0,count
column,Unnamed: 1_level_1
CRT_DATE,10
CENTER,10
REC_CMP,10
LHFOLNUM,10
CMP_DATE,10
NF3MONEY,1
NF4OTHSP,1
NF4NUTRI,1
NF4LOCSP,1
NF4DIUR,1


### 03.3 check LHFOLNUM

In [14]:
is_LHFOLNUM = df_columns['column'] == 'LHFOLNUM'
df_columns[is_LHFOLNUM]

Unnamed: 0,column,idx,filename,column:filename,column:filename:idx
0,LHFOLNUM,0,lf01.csv,LHFOLNUM:lf01.csv,LHFOLNUM:lf01.csv:0
32,LHFOLNUM,1,lf03.csv,LHFOLNUM:lf03.csv,LHFOLNUM:lf03.csv:1
112,LHFOLNUM,2,lf04.csv,LHFOLNUM:lf04.csv,LHFOLNUM:lf04.csv:2
183,LHFOLNUM,3,lf04a.csv,LHFOLNUM:lf04a.csv,LHFOLNUM:lf04a.csv:3
193,LHFOLNUM,4,lf04ar.csv,LHFOLNUM:lf04ar.csv,LHFOLNUM:lf04ar.csv:4
202,LHFOLNUM,5,lf05.csv,LHFOLNUM:lf05.csv,LHFOLNUM:lf05.csv:5
300,LHFOLNUM,6,lf09a.csv,LHFOLNUM:lf09a.csv,LHFOLNUM:lf09a.csv:6
346,LHFOLNUM,7,lf10.csv,LHFOLNUM:lf10.csv,LHFOLNUM:lf10.csv:7
359,LHFOLNUM,8,lf11.csv,LHFOLNUM:lf11.csv,LHFOLNUM:lf11.csv:8
376,LHFOLNUM,9,lf12.csv,LHFOLNUM:lf12.csv,LHFOLNUM:lf12.csv:9


##### column not in dict

In [15]:
is_not_in_dict = df_columns['column'].isin(dict_columns) == False
df_columns_not_in_dict = df_columns[is_not_in_dict]
df_columns_not_in_dict

Unnamed: 0,column,idx,filename,column:filename,column:filename:idx
30,CMP_DATE,0,lf01.csv,CMP_DATE:lf01.csv,CMP_DATE:lf01.csv:0
31,CRT_DATE,0,lf01.csv,CRT_DATE:lf01.csv,CRT_DATE:lf01.csv:0
3,FIRST,0,lf01.csv,FIRST:lf01.csv,FIRST:lf01.csv:0
2,LAST,0,lf01.csv,LAST:lf01.csv,LAST:lf01.csv:0
5,PROTID02,0,lf01.csv,PROTID02:lf01.csv,PROTID02:lf01.csv:0
...,...,...,...,...,...
436,CMP_DATE,9,lf12.csv,CMP_DATE:lf12.csv,CMP_DATE:lf12.csv:9
437,CRT_DATE,9,lf12.csv,CRT_DATE:lf12.csv,CRT_DATE:lf12.csv:9
405,LFINITA,9,lf12.csv,LFINITA:lf12.csv,LFINITA:lf12.csv:9
409,NF12INIT,9,lf12.csv,NF12INIT:lf12.csv,NF12INIT:lf12.csv:9


In [16]:
columns_not_in_dict = list(df_columns_not_in_dict['column:filename:idx'].unique())
# columns_not_in_dict.sort()
columns_not_in_dict

['CMP_DATE:lf01.csv:0',
 'CRT_DATE:lf01.csv:0',
 'FIRST:lf01.csv:0',
 'LAST:lf01.csv:0',
 'PROTID02:lf01.csv:0',
 'PROTID03:lf01.csv:0',
 'PROTID04:lf01.csv:0',
 'PROTID05:lf01.csv:0',
 'PROTID06:lf01.csv:0',
 'PROTID07:lf01.csv:0',
 'PROTID08:lf01.csv:0',
 'PROTID09:lf01.csv:0',
 'PROTID10:lf01.csv:0',
 'PROTID11:lf01.csv:0',
 'PROTID12:lf01.csv:0',
 'PROTID13:lf01.csv:0',
 'PROTID14:lf01.csv:0',
 'PROTID15:lf01.csv:0',
 'PROTID16:lf01.csv:0',
 'PROTID17:lf01.csv:0',
 'PROTID18:lf01.csv:0',
 'PROTID19:lf01.csv:0',
 'PROTID20:lf01.csv:0',
 'REC_CMP:lf01.csv:0',
 'CFINITS:lf03.csv:1',
 'CFPRRESP:lf03.csv:1',
 'CMP_DATE:lf03.csv:1',
 'CRT_DATE:lf03.csv:1',
 'REC_CMP:lf03.csv:1',
 'CMP_DATE:lf04.csv:2',
 'CRT_DATE:lf04.csv:2',
 'DFCOMPDT:lf04.csv:2',
 'DFINITS:lf04.csv:2',
 'DFPLACE:lf04.csv:2',
 'NF4LOCSP:lf04.csv:2',
 'REC_CMP:lf04.csv:2',
 'CMP_DATE:lf04a.csv:3',
 'CRT_DATE:lf04a.csv:3',
 'DFINITSA:lf04a.csv:3',
 'REC_CMP:lf04a.csv:3',
 'CMP_DATE:lf04ar.csv:4',
 'CRT_DATE:lf04ar.csv:4'

#### dict not in column

In [17]:
is_dict_not_in_columns = (df_dict[study_name].isin(df_columns['column']) == False) & (df_dict[study_name].isnull() == False)
df_dict_not_in_columns = df_dict[is_dict_not_in_columns]
df_dict_not_in_columns

Unnamed: 0,Category,Subcategory,Standardized_VariableNames_Dictionary,type,Variable_Description,#studies w/ this var,redcap,comment,LH,OC


## 04. show column values

In [18]:
# XXX no show due to data privacy concern.

'''
for idx, each_df in enumerate(dfs):
    for idx2, column in enumerate(each_df.columns):
        print(f'({idx}/{len(dfs)}/{filenames[idx]}) ({idx2}/{len(each_df.columns)} column: {column} value: {list(each_df[column].unique())}')
'''

"\nfor idx, each_df in enumerate(dfs):\n    for idx2, column in enumerate(each_df.columns):\n        print(f'({idx}/{len(dfs)}/{filenames[idx]}) ({idx2}/{len(each_df.columns)} column: {column} value: {list(each_df[column].unique())}')\n"