In [1]:
import pandas as pd
import numpy as np
import os
import json
import pydicom
from tqdm import tqdm

In [2]:
# Folders that are scanned for DICOM files, one entry per folder.
dirs = [
    '../../Data/cac_0',
    '../../Data/cac_1',
]

In [3]:
# Load the two score spreadsheets (cp949-encoded) and keep three columns from
# each, in the order: registration number, visit-date column, risk category.
df1 = pd.read_csv(
    '../../2016_2017_cacs.csv', encoding='cp949'
).loc[:, ['등록번호', 'dov', '관상동맥질환 위험도']]
df2 = pd.read_csv(
    '../../2018_2019_cacs.csv', encoding='cp949'
).loc[:, ['등록번호', '2018', '관상동맥질환 위험도']]

# The second file names its date column '2018'; reuse the first file's
# headers so both frames line up when they are stacked.
df2.columns = df1.columns

In [4]:
# Stack both yearly tables into one score table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own 0-based labels and the combined frame has repeated
# row labels.
cac_score_df = pd.concat((df1, df2), ignore_index=True)
# Short ASCII column names used by every later cell.
cac_score_df.columns = ['patid', 'dov', 'cac']

In [5]:
# Ordinal encoding of the risk-category labels found in the 'cac' column
# (0 = lowest risk label, 4 = highest).
cac_to_int = {
    '저위험군(Very low)': 0,
    '저위험군(Low)': 1,
    '중등 위험군(Moderate)': 2,
    '중등 고위험군(Moderately high)': 3,
    '고위험군(High)': 4,
}

# Direct dictionary lookup per element: a label that is missing from the
# table raises KeyError instead of being silently turned into NaN.
cac_score_df['cac'] = cac_score_df['cac'].map(cac_to_int.__getitem__)

In [6]:
def match_date_format(x):
    """Return ``str(x)`` with every hyphen and apostrophe removed."""
    # A translation table that maps both characters to "deleted".
    strip_table = str.maketrans('', '', "-'")
    return str(x).translate(strip_table)
# Strip hyphens/apostrophes from every entry of the 'dov' column.
cac_score_df['dov'] = cac_score_df['dov'].apply(match_date_format)

In [7]:
# Patient ids arrive as strings that may contain apostrophes; drop the
# apostrophes and convert each id to int.
cac_score_df['patid'] = cac_score_df['patid'].map(
    lambda raw_id: int(raw_id.replace("'", ''))
)

In [8]:
# Patient-id check: for every DICOM file, test whether its PatientID exists
# in the score table, and print per-folder counts of hits and misses.
known_patids = set(cac_score_df['patid'].values)

for dir_name in dirs:
    success = []
    failure = []
    for item in tqdm(os.listdir(dir_name)):
        dcm_path = os.path.join(dir_name, item)
        # Only a header tag is needed, so stop before the pixel data.
        # dcmread replaces the deprecated read_file alias.
        dcm = pydicom.dcmread(dcm_path, stop_before_pixels=True)
        if int(dcm.PatientID) in known_patids:
            success.append(dcm_path)
        else:
            failure.append(dcm_path)
    print(dir_name, len(success), len(failure))


100%|██████████| 1512/1512 [01:55<00:00, 13.07it/s]
  0%|          | 1/968 [00:00<01:53,  8.54it/s]

../../Data/cac_0 1512 0


100%|██████████| 968/968 [01:41<00:00,  9.56it/s]

../../Data/cac_1 968 0





In [19]:
def getScore(dcm_path , score_df):
    """Look up the score row that belongs to one DICOM file.

    The file's PatientID (converted to int) is matched against
    ``score_df['patid']`` and the first four characters of its StudyDate
    (the year) are matched against ``score_df['dov']``.

    Parameters
    ----------
    dcm_path : path of the DICOM file to inspect.
    score_df : DataFrame with the columns ``patid``, ``dov`` and ``cac``.

    Returns
    -------
    ``[pat_id, study_date, cac]`` taken from the first matching row, or
    ``False`` when no row matches both the patient and the year.

    NOTE(review): the year test is exact equality with a 4-character
    string, so a ``dov`` value holding a full date would never match.
    Confirm the format of the source spreadsheets.
    """
    # Only header tags are read, so stop before the pixel data.
    # dcmread replaces the deprecated read_file alias.
    dcm = pydicom.dcmread(dcm_path, stop_before_pixels=True)
    study_date = dcm.StudyDate
    pat_id = int(dcm.PatientID)
    study_year = study_date[:4]

    same_patient = score_df['patid'] == pat_id
    same_year = score_df['dov'] == study_year
    matches = score_df[same_patient & same_year]
    if len(matches) == 0:
        return False
    return [pat_id, study_date, matches['cac'].values[0]]

In [21]:
# Collect one row per DICOM file that has a matching score entry.
# Row layout: dcm_path (relative to the Data folder), pat_id, study_date, cac score
matched_files_list = []
for dir_name in dirs:
    for item in tqdm(os.listdir(dir_name)):
        dcm_path = os.path.join(dir_name, item)
        values = getScore(dcm_path, cac_score_df)
        if not values:
            # getScore found no score row for this file.
            continue
        # Keep only the part of the path after 'Data/'.
        relative_path = dcm_path.split('Data/')[1]
        matched_files_list.append([relative_path, *values])

100%|██████████| 1512/1512 [02:00<00:00, 12.59it/s]
100%|██████████| 968/968 [01:51<00:00,  8.70it/s]


In [23]:
# Build the table of matched files. Naming the columns here keeps the frame
# four columns wide even when matched_files_list is empty; a zero-column
# frame would make any later assignment of four column names fail.
df = pd.DataFrame(
    matched_files_list,
    columns=['dcm_path', 'pat_id', 'study_date', 'score'],
)

In [24]:
# Name the four columns in the order the rows were assembled.
df.columns = [
    'dcm_path',
    'pat_id',
    'study_date',
    'score',
]

In [38]:
# Preview the matched-file table.
# NOTE(review): the rendered rows include patient ids and study dates;
# consider clearing this output before the notebook is shared.
df

Unnamed: 0,dcm_path,pat_id,study_date,score
0,cac_0/I0005842.dcm,554258,20171208,0
1,cac_0/I0006148.dcm,1519894,20170126,0
2,cac_0/I0006454.dcm,1782239,20170406,0
3,cac_0/I0005537.dcm,773862,20170616,0
4,cac_0/I0005538.dcm,1217194,20171027,0
...,...,...,...,...
2475,cac_1/I0008013.dcm,230019,20170504,0
2476,cac_1/I0008014.dcm,82603494,20170801,4
2477,cac_1/I0008015.dcm,82244917,20170308,4
2478,cac_1/I0008016.dcm,1197040,20170418,4


In [39]:
# Shuffle all rows before splitting. A fixed random_state makes the
# shuffle, and therefore the train/val/test split, identical on every
# run of the notebook.
SPLIT_SEED = 42
df_shuffled = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

In [41]:
# 70 % / 10 % / remainder split of the shuffled rows.
# The sizes are derived from whole tenths of the table, so the test part
# also receives any rows left over by the integer division.
# NOTE(review): rows are split per file, not per patient. If one patient
# owns several files they can land in different splits; verify whether
# that matters for the downstream evaluation.
tenth = len(df) // 10
n_train = tenth * 7
n_val = tenth * 1
val_end = n_train + n_val

df_train = df_shuffled.iloc[:n_train]
df_val = df_shuffled.iloc[n_train:val_end]
df_test = df_shuffled.iloc[val_end:]

In [42]:
# Print, for train, val and test in that order, the number of rows in each
# of the five score classes, followed by a separator line.
for split_frame in (df_train, df_val, df_test):
    for i in range(5):
        print((split_frame.score == i).sum())

    print('======')

1062
1
9
522
142
147
0
1
77
23
302
0
2
154
38


In [45]:
# Row counts of the full table and of each split, in that order.
tuple(map(len, (df, df_train, df_val, df_test)))

(2480, 1736, 248, 496)

In [43]:
# Write each split to its own parquet file using the pyarrow engine.
for split_frame, parquet_path in (
    (df_train, '../../Data/train_dataset.parquet'),
    (df_val, '../../Data/val_dataset.parquet'),
    (df_test, '../../Data/test_dataset.parquet'),
):
    split_frame.to_parquet(parquet_path, engine='pyarrow')

In [44]:
# Fraction of validation rows whose score is 0. It is computed from df_val
# so the number follows the actual split instead of counts copied by hand
# from an earlier output.
# NOTE(review): presumably used as the majority-class baseline; confirm.
float((df_val['score'] == 0).mean())

0.592741935483871