In [1]:
import pandas as pd
import os

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## 01. init

In [2]:
study_name = COMBINE_harmonizer.STUDY_OC
sheet_name = COMBINE_harmonizer.SHEET_MAIN

root_dir = '..'

In [3]:
COMBINE_harmonizer.init(f'{root_dir}/config.yaml')

## 02. Dictionary

In [4]:
data_dict_filename = f'{root_dir}/Dictionary_HIE_clinical_variables.xlsx'

In [5]:
df_dict = COMBINE_harmonizer.load_data_dict(data_dict_filename, sheet_name)

In [6]:
dict_oc_columns = list(df_dict[study_name].unique())

## 03. OC

In [7]:
the_dir = cfg.config[f'{study_name}_dir']

In [8]:
filenames = os.listdir(the_dir)
filenames.sort()

In [9]:
dfs = [pd.read_csv(os.sep.join([the_dir, each_filename])) for each_filename in filenames]

### 03.1 check columns in each file

In [10]:
for idx, each_df in enumerate(dfs):
    print(f'({idx}/{len(dfs)}) filename: {filenames[idx]} columns: {list(each_df.columns)}')

(0/23) filename: oc01.csv columns: ['HTHRM_ID', 'SITENM', 'LAST', 'FIRST', 'COMMENT', 'REC_CMP', 'CENTER', 'BIRTHDT', 'BIRTHNM', 'CMP_DATE', 'CRT_DATE']
(1/23) filename: oc02.csv columns: ['HTHRM_ID', 'OC2TEMP', 'OC2NORAN', 'OC2CHROM', 'OC2CNGEN', 'OC2WGHT', 'OC2SURV', 'OC2ALLBG', 'OC2PH', 'OC2DEFIC', 'OC2HIST', 'OC2APGA', 'OC2SEIZ', 'OC2NEXDN', 'OC2TONE', 'OC2RESP', 'OC2SIGN', 'OC2SEDA', 'OC2NAME', 'OC2ELIG', 'OC2REAS', 'OC2RAND', 'OC2NRRES', 'OC2RTRTM', 'OC2INIT', 'REC_CMP', 'CENTER', 'OC2NENDR', 'OC2CON', 'OC2SPON', 'OC2POST', 'OC2SUCK', 'OC2MORO', 'OC2PUPL', 'OC2HR', 'OC2NDAT', 'OC2NTIM', 'OC2CONS', 'OC2NRREA', 'OC2RANDT', 'OC2RANTM', 'OC2LEVEL', 'OC2RANNM', 'OC2RANNA', 'OC2BYYPE', 'CMP_DATE', 'CRT_DATE', 'OC2TMP2']
(2/23) filename: oc04.csv columns: ['HTHRM_ID', 'OC4MBIR', 'OC4PCAR', 'OC4HYPE', 'OC4ANTE', 'OC4THYR', 'OC4DIAB', 'OC4LOSS', 'OC4CORD', 'OC4UTER', 'OC4DYST', 'OC4PLAC', 'OC4HEMO', 'OC4TRAU', 'OC4ARRE', 'OC4SEIZ', 'OC4PYRE', 'OC4CHOR', 'OC4PATH', 'OC4HIST', 'OC4ANTI', 'O

### 03.2 all columns

In [11]:
all_columns = []
for idx, each_df in enumerate(dfs):
    all_columns += list(map(lambda x: {'column': x, 'idx': idx, 'filename': filenames[idx]}, list(each_df.columns)))
df_columns = pd.DataFrame(all_columns)
df_columns['column:filename'] = df_columns.apply(lambda x: f"{x['column']}:{x['filename']}", axis=1)
df_columns['column:filename:idx'] = df_columns.apply(lambda x: f"{x['column']}:{x['filename']}:{x['idx']}", axis=1)
df_columns.sort_values(by=['filename', 'column'], inplace=True)

In [12]:
df_columns

Unnamed: 0,column,idx,filename,column:filename,column:filename:idx
7,BIRTHDT,0,oc01.csv,BIRTHDT:oc01.csv,BIRTHDT:oc01.csv:0
8,BIRTHNM,0,oc01.csv,BIRTHNM:oc01.csv,BIRTHNM:oc01.csv:0
6,CENTER,0,oc01.csv,CENTER:oc01.csv,CENTER:oc01.csv:0
9,CMP_DATE,0,oc01.csv,CMP_DATE:oc01.csv,CMP_DATE:oc01.csv:0
4,COMMENT,0,oc01.csv,COMMENT:oc01.csv,COMMENT:oc01.csv:0
...,...,...,...,...,...
845,OM3LESTY_code,22,ocmr03b.csv,OM3LESTY_code:ocmr03b.csv,OM3LESTY_code:ocmr03b.csv:22
831,OM3READR,22,ocmr03b.csv,OM3READR:ocmr03b.csv,OM3READR:ocmr03b.csv:22
832,OM3READR_code,22,ocmr03b.csv,OM3READR_code:ocmr03b.csv,OM3READR_code:ocmr03b.csv:22
829,OPTCID,22,ocmr03b.csv,OPTCID:ocmr03b.csv,OPTCID:ocmr03b.csv:22


In [13]:
df_columns_count = df_columns.groupby(['column']).agg(count=('column', 'count')).sort_values(by=['count'], ascending=False)
df_columns_count.head(20)

Unnamed: 0_level_0,count
column,Unnamed: 1_level_1
CENTER,19
HTHRM_ID,19
REC_CMP,12
CMP_DATE,12
CRT_DATE,12
Site,4
OPTCID,4
FormStatus,4
KeyedUser,4
DateCreated,4


### 03.3 check HTHRM_ID

In [14]:
is_HTHRM_ID = df_columns['column'] == 'HTHRM_ID'
df_columns[is_HTHRM_ID]

Unnamed: 0,column,idx,filename,column:filename,column:filename:idx
0,HTHRM_ID,0,oc01.csv,HTHRM_ID:oc01.csv,HTHRM_ID:oc01.csv:0
11,HTHRM_ID,1,oc02.csv,HTHRM_ID:oc02.csv,HTHRM_ID:oc02.csv:1
59,HTHRM_ID,2,oc04.csv,HTHRM_ID:oc04.csv,HTHRM_ID:oc04.csv:2
113,HTHRM_ID,3,oc05.csv,HTHRM_ID:oc05.csv,HTHRM_ID:oc05.csv:3
164,HTHRM_ID,4,oc06d.csv,HTHRM_ID:oc06d.csv,HTHRM_ID:oc06d.csv:4
173,HTHRM_ID,5,oc06t.csv,HTHRM_ID:oc06t.csv,HTHRM_ID:oc06t.csv:5
186,HTHRM_ID,6,oc06t120.csv,HTHRM_ID:oc06t120.csv,HTHRM_ID:oc06t120.csv:6
223,HTHRM_ID,7,oc06t72.csv,HTHRM_ID:oc06t72.csv,HTHRM_ID:oc06t72.csv:7
260,HTHRM_ID,8,oc07.csv,HTHRM_ID:oc07.csv,HTHRM_ID:oc07.csv:8
288,HTHRM_ID,9,oc08.csv,HTHRM_ID:oc08.csv,HTHRM_ID:oc08.csv:9


#### column not in dict

In [15]:
is_not_in_dict = df_columns['column'].isin(dict_oc_columns) == False
df_columns_not_in_dict = df_columns[is_not_in_dict]
df_columns_not_in_dict

Unnamed: 0,column,idx,filename,column:filename,column:filename:idx
9,CMP_DATE,0,oc01.csv,CMP_DATE:oc01.csv,CMP_DATE:oc01.csv:0
10,CRT_DATE,0,oc01.csv,CRT_DATE:oc01.csv,CRT_DATE:oc01.csv:0
3,FIRST,0,oc01.csv,FIRST:oc01.csv,FIRST:oc01.csv:0
2,LAST,0,oc01.csv,LAST:oc01.csv,LAST:oc01.csv:0
5,REC_CMP,0,oc01.csv,REC_CMP:oc01.csv,REC_CMP:oc01.csv:0
...,...,...,...,...,...
848,DateComplete,22,ocmr03b.csv,DateComplete:ocmr03b.csv,DateComplete:ocmr03b.csv:22
849,DateCreated,22,ocmr03b.csv,DateCreated:ocmr03b.csv,DateCreated:ocmr03b.csv:22
847,FormStatus,22,ocmr03b.csv,FormStatus:ocmr03b.csv,FormStatus:ocmr03b.csv:22
850,KeyedUser,22,ocmr03b.csv,KeyedUser:ocmr03b.csv,KeyedUser:ocmr03b.csv:22


In [16]:
columns_not_in_dict = list(df_columns_not_in_dict['column:filename:idx'].unique())
# columns_not_in_dict.sort()
columns_not_in_dict

['CMP_DATE:oc01.csv:0',
 'CRT_DATE:oc01.csv:0',
 'FIRST:oc01.csv:0',
 'LAST:oc01.csv:0',
 'REC_CMP:oc01.csv:0',
 'CMP_DATE:oc02.csv:1',
 'CRT_DATE:oc02.csv:1',
 'OC2INIT:oc02.csv:1',
 'OC2NAME:oc02.csv:1',
 'REC_CMP:oc02.csv:1',
 'CMP_DATE:oc04.csv:2',
 'CRT_DATE:oc04.csv:2',
 'OC4INIT:oc04.csv:2',
 'REC_CMP:oc04.csv:2',
 'CMP_DATE:oc05.csv:3',
 'CRT_DATE:oc05.csv:3',
 'OC5INIT:oc05.csv:3',
 'REC_CMP:oc05.csv:3',
 'OC6INIT:oc06t120.csv:6',
 'OC6NCEPR:oc06t120.csv:6',
 'OC6INIT:oc06t72.csv:7',
 'OC6NCEPR:oc06t72.csv:7',
 'CMP_DATE:oc07.csv:8',
 'CRT_DATE:oc07.csv:8',
 'REC_CMP:oc07.csv:8',
 'CMP_DATE:oc08.csv:9',
 'CRT_DATE:oc08.csv:9',
 'REC_CMP:oc08.csv:9',
 'CMP_DATE:oc09.csv:10',
 'CRT_DATE:oc09.csv:10',
 'REC_CMP:oc09.csv:10',
 'CMP_DATE:oc10.csv:12',
 'CRT_DATE:oc10.csv:12',
 'OC10INIT:oc10.csv:12',
 'REC_CMP:oc10.csv:12',
 'CMP_DATE:oc11.csv:13',
 'CRT_DATE:oc11.csv:13',
 'OC11ENAM:oc11.csv:13',
 'REC_CMP:oc11.csv:13',
 'CMP_DATE:oc12.csv:14',
 'CRT_DATE:oc12.csv:14',
 'REC_CMP:o

#### dict not in column

In [17]:
is_dict_not_in_columns = (df_dict['OC'].isin(df_columns['column']) == False) & (df_dict['OC'].isnull() == False)
df_dict_not_in_columns = df_dict[is_dict_not_in_columns]
df_dict_not_in_columns

Unnamed: 0,Category,Subcategory,Standardized_VariableNames_Dictionary,type,Variable_Description,#studies w/ this var,redcap,comment,LH,OC


## 04. show column values

In [18]:
# XXX no show due to data privacy concern.

'''
for idx, each_df in enumerate(dfs):
    for idx2, column in enumerate(each_df.columns):
        print(f'({idx}/{len(dfs)}/{filenames[idx]}) ({idx2}/{len(each_df.columns)} column: {column} value: {list(each_df[column].unique())}')
'''

"\nfor idx, each_df in enumerate(dfs):\n    for idx2, column in enumerate(each_df.columns):\n        print(f'({idx}/{len(dfs)}/{filenames[idx]}) ({idx2}/{len(each_df.columns)} column: {column} value: {list(each_df[column].unique())}')\n"