# 데이터 읽기

In [4]:
import pandas as pd
from tqdm import tqdm
import os

# Location of the stage-1 training label/metadata table.
path = './dataset/stage_1_train_cls.csv'

# Load the table and preview its first five rows (head() defaults to 5).
df = pd.read_csv(filepath_or_buffer=path)
df.head()



Unnamed: 0,filename,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,any,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope
0,ID_63eb1e259,0,0,0,0,0,0,ID_a449357f,ID_62d125e5b2,ID_0be5c0d1b3,"['-125', '-8', '180.199951']",1,"['0.48828125', '0.48828125']",0,"['00036', '00036']","['00080', '00080']",-1024.0,1.0
1,ID_2669954a7,0,0,0,0,0,0,ID_363d5865,ID_a20b80c7bf,ID_3564d584db,"['-156', '45.5728491', '922.530821']",1,"['0.48828125', '0.48828125']",0,"['00047', '00047']","['00080', '00080']",-1024.0,1.0
2,ID_52c9913b1,0,0,0,0,0,0,ID_9c2b4bd7,ID_3e3634f8cf,ID_973274ffc9,"['-125.000', '-115.063', '4.455']",1,"['0.488281', '0.488281']",1,40,150,-1024.0,1.0
3,ID_4e6ff6126,0,0,0,0,0,0,ID_3ae81c2d,ID_a1390c15c2,ID_e5ccad8244,"['-99.5', '28.5', '100']",1,"['0.388671875', '0.388671875']",0,"['00036', '00036']","['00080', '00080']",-1024.0,1.0
4,ID_7858edd88,0,0,0,0,0,0,ID_c1867feb,ID_c73e81ed3a,ID_28e0531b3a,"['-125.000', '-132.190', '145.793']",1,"['0.488281', '0.488281']",1,40,100,-1024.0,1.0


# 600명의 환자 ID 추출
#### normal:100, hemorrhage:500

In [10]:
# Distinct study UIDs, in order of first appearance in df.
pids = df['study_instance_uid'].unique()

In [20]:
# Label columns summed per study: the five subtype flags plus the aggregate
# `any` flag. Named explicitly instead of relying on column positions 1..6.
label_cols = ['epidural', 'intraparenchymal', 'intraventricular',
              'subarachnoid', 'subdural', 'any']

# One grouped pass over df gives the label total of every study, instead of
# re-filtering the whole frame once per study inside the loop.
label_totals = df.groupby('study_instance_uid', sort=False)[label_cols].sum().sum(axis=1)

hm_pids = []
nm_pids = []

# Classify each study as hemorrhage or normal and stop once enough of both are found
for pid in pids:
    # hemorrhage: same threshold as before (total > 1); presumably a positive
    # slice sets both a subtype flag and `any` -- TODO confirm against the data.
    # A uid missing from the grouped totals counts as 0, i.e. normal.
    if label_totals.get(pid, 0) > 1:
        hm_pids.append(pid)

    # normal
    else:
        nm_pids.append(pid)

    # only get 500 (hm), 100 (nm)
    if len(hm_pids) >= 500 and len(nm_pids) >= 100:
        break

# Either list may have overshot its quota while the other was still filling.
hm_pids = hm_pids[:500]
nm_pids = nm_pids[:100]
uids = hm_pids + nm_pids

# 600명 환자로 Dataframe 재구성 (slice id, position2 값 추가)

In [28]:
from tqdm.notebook import tqdm

import ast

header=['filename', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any', 'patient_id', 'study_instance_uid', 'series_instance_uid', 'image_position', 'samples_per_pixel', 'pixel_spacing', 'pixel_representation', 'window_center', 'window_width', 'rescale_intercept', 'rescale_slope', 'Position2', 'slice_id']

# Group once so each study's rows can be fetched without rescanning df.
rows_by_study = df.groupby('study_instance_uid', sort=False)

# Collect the per-study frames and concatenate a single time after the loop;
# concatenating inside the loop re-copies the accumulated frame every iteration.
# The leading empty frame fixes the output column order and keeps the result
# defined when `uids` is empty.
frames = [pd.DataFrame(columns=header)]

for uid in tqdm(uids):

    # Rows belonging to this study
    tmp_df = rows_by_study.get_group(uid).copy()

    # image_position holds the text of a 3-item list; parse it as a literal and
    # take the third item rather than slicing characters out of the string.
    position2 = tmp_df.image_position.apply(lambda x: float(ast.literal_eval(x)[2]))
    tmp_df['Position2'] = position2
    tmp_df = tmp_df.sort_values(by='Position2')

    # Slice index within the study, following the Position2 order
    tmp_df['slice_id'] = range(len(tmp_df))

    frames.append(tmp_df)

train_df = pd.concat(frames)

# Save
train_df.to_csv('./dataset/stage_1_train_cls_only600.csv', index=None)


HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))




# 위 600명의 환자만 이미지 파일로 저장

In [2]:
# dicom 이미지 변환 및 확인
import pydicom
import matplotlib.pyplot as plt
import numpy as np
import PIL

def save_img(img, save_dir):
    """Write an image array to disk as an 8-bit image.

    Args:
        img: array of pixel values; a 2-D array is written as a single-channel
            image, a (H, W, 3) array as a three-channel image.
        save_dir: destination file path (despite the name, a file, not a folder).
    """
    # `import PIL` by itself does not guarantee the Image submodule is loaded.
    from PIL import Image

    # uint8 is the unsigned 8-bit type the 0..255 range needs; the previous int8
    # cast wrapped every value above 127 to a negative number. Clipping keeps
    # out-of-range values from wrapping around during the cast.
    pixels = np.clip(np.asarray(img), 0, 255).astype(np.uint8)

    # With uint8 data the image mode is inferred from the array shape.
    Image.fromarray(pixels).save(save_dir)

def get_image_8bits(img, window_center, window_width):
    """Clip an image to one intensity window and rescale it to integers.

    Args:
        img: array of intensities; it is not modified.
        window_center: center of the window.
        window_width: width of the window; half of it (floor division) is taken
            on each side of the center.

    Returns:
        Integer array produced by normalize_minmax on the clipped image.
    """
    img_min = window_center - window_width//2
    img_max = window_center + window_width//2

    # np.clip returns a new array, so the caller's array is left untouched
    # (the in-place masking used before overwrote the argument).
    windowed = np.clip(img, img_min, img_max)

    # Builtin int instead of the np.int alias, which NumPy 1.24 removed.
    return normalize_minmax(windowed).astype(int)

def get_image_24bits(img, window_center, window_width):
    """Render one windowed image per (center, width) pair and stack them as channels.

    Each pair is applied to its own copy of `img`; the per-window results are
    stacked on a leading axis and then moved to the trailing axis.
    """
    channels = [
        get_image_8bits(img.copy(), center, window_width[idx])
        for idx, center in enumerate(window_center)
    ]

    # (window, row, col) -> (row, col, window)
    return np.array(channels).transpose(1,2,0)

def window_image(img, window_center, window_width, intercept, slope):
    """Apply the linear rescale and then one or three intensity windows.

    Args:
        img: raw pixel array.
        window_center: a scalar, or a sequence of 1 or 3 window centers.
        window_width: a scalar, or a sequence matching `window_center`.
        intercept: value added after scaling.
        slope: factor the raw pixels are multiplied by.

    Returns:
        The get_image_8bits result for a single window, or the
        get_image_24bits result (one channel per window) for three windows.

    Raises:
        ValueError: if the window sequences are not both of length 1 or both
            of length 3.
    """
    # Rescale in float64 so the arithmetic cannot overflow or fail on the
    # narrow integer dtype of the raw pixel array (e.g. unsigned pixels plus a
    # negative intercept).
    img = np.asarray(img, dtype=np.float64) * slope + intercept

    # Any scalar center (builtin or numpy, int or float) selects the single
    # window path; the previous exact `int` type check rejected the others.
    if np.ndim(window_center) == 0:
        return get_image_8bits(img, window_center, window_width)

    if len(window_center)==1 and len(window_width)==1:
        return get_image_8bits(img, window_center[0], window_width[0])

    if len(window_center)==3 and len(window_width)==3:
        return get_image_24bits(img, window_center, window_width)

    raise ValueError('Window Center, Width is not supported value [{},{}]'.format(window_center, window_width))

def get_first_of_dicom_field_as_int(x):
    """Convert a DICOM field value to int, using its first element when it is a MultiValue."""
    first = x[0] if type(x) == pydicom.multival.MultiValue else x
    return int(first)

def get_meta_from_dicom(data):
    """Read four windowing/rescale fields from a DICOM dataset as ints.

    Returns:
        [window center, window width, intercept, slope], each converted with
        get_first_of_dicom_field_as_int.
    """
    tags = (
        ('0028','1050'),  # window center
        ('0028','1051'),  # window width
        ('0028','1052'),  # intercept
        ('0028','1053'),  # slope
    )
    # Read every field first, then convert, so lookups happen before conversions.
    raw_values = [data[tag].value for tag in tags]
    return list(map(get_first_of_dicom_field_as_int, raw_values))

def normalize_minmax(img):
    """Linearly rescale an array so its minimum maps to 0 and its maximum to 255.

    Args:
        img: array-like of numbers.

    Returns:
        float64 array of the same shape. A constant input (max == min) yields
        all zeros instead of dividing by zero.
    """
    # Work in float64: multiplying a narrow integer array by 255 can overflow
    # its dtype (e.g. int16 with a value range wider than 128).
    img = np.asarray(img, dtype=np.float64)
    mi, ma = img.min(), img.max()
    span = ma - mi

    # A flat image has no range to stretch.
    if span == 0:
        return np.zeros_like(img)

    return 255 * (img - mi) / span

def get_img_from_dicom_dir(dicom_dir, window=False):
    """Read a DICOM file and return its windowed pixel data.

    Args:
        dicom_dir: path of the DICOM file (despite the name, a file path).
        window: optional (centers, widths) pair. When it is falsy or its first
            element is empty, the window center/width stored in the file are
            used instead.

    Returns:
        The array produced by window_image.
    """
    # dcmread is the current name of the reader; read_file is a deprecated alias.
    dicom = pydicom.dcmread(dicom_dir)
    center, width, intercept, slope = get_meta_from_dicom(dicom)

    # An explicit, non-empty window overrides the values stored in the file;
    # intercept and slope always come from the file.
    if window and window[0]:
        center, width = window[0], window[1]

    return window_image(dicom.pixel_array, center, width, intercept, slope)

# Window presets as (centers, widths) pairs:
windows = [
    ([], []),                          # 0: use the window center/width stored in the DICOM file
    ([40], [80]),                      # 1: one channel, center 40 / width 80
    ([40, 80, 600], [80, 200, 2800]),  # 2: three channels: (40, 80), (80, 200), (600, 2800)
]

# Preset used by the conversion below.
window_index = 2
window = windows[window_index]

In [7]:
# Slice table written by the subset-building cell.
train_df = pd.read_csv('./dataset/stage_1_train_cls_only600.csv')

img_save_dir = os.path.expanduser('~/dataset/kaggle_rsna(only600)/imgs/')
train_dicom_dir = os.path.expanduser('~/Downloads/아주대 백업/dataset/rsna_hemorrhage_kaggle/rsna-intracranial-hemorrhage-detection/stage_1_train_images/')

# Create the output folder up front; saving into a missing folder fails on the
# first image.
os.makedirs(img_save_dir, exist_ok=True)

# Convert every listed slice from <filename>.dcm to <filename>.png using the
# selected window preset.
for fn in tqdm(train_df.filename):
    fn_dcm = fn+'.dcm'
    fn_png = fn+'.png'
    img = get_img_from_dicom_dir(os.path.join(train_dicom_dir, fn_dcm), window)
    save_img(img, os.path.join(img_save_dir, fn_png))

100%|██████████| 20787/20787 [38:16<00:00,  9.05it/s]    


# Train Test Valid split
* 학습 80명, test/valid 각각 10명씩

In [1]:
import os
import pandas as pd

In [2]:
# Image folder and slice table of the 100-study subset.
# NOTE(review): the subset-building cells in this notebook write an `only600`
# CSV; confirm the `only100` files read here are the intended inputs.
data_dir = '../dataset/kaggle_rsna(only100)/imgs'
df = pd.read_csv(filepath_or_buffer='stage_1_train_cls_only100.csv')

In [3]:
# Preview the first five rows of the loaded table.
df.head(5)

Unnamed: 0,filename,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,any,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope,Position2,slice_id
0,ID_d3eaf5452,0,0,0,0,0,0,ID_a449357f,ID_62d125e5b2,ID_0be5c0d1b3,"['-125', '-8', '5.19995117']",1,"['0.48828125', '0.48828125']",0,"['00036', '00036']","['00080', '00080']",-1024.0,1.0,5.199951,0
1,ID_84cd9c956,0,0,0,0,0,0,ID_a449357f,ID_62d125e5b2,ID_0be5c0d1b3,"['-125', '-8', '10.1999512']",1,"['0.48828125', '0.48828125']",0,"['00036', '00036']","['00080', '00080']",-1024.0,1.0,10.199951,1
2,ID_53ec1c229,0,0,0,0,0,0,ID_a449357f,ID_62d125e5b2,ID_0be5c0d1b3,"['-125', '-8', '15.1999512']",1,"['0.48828125', '0.48828125']",0,"['00036', '00036']","['00080', '00080']",-1024.0,1.0,15.199951,2
3,ID_6a0764005,0,0,0,0,0,0,ID_a449357f,ID_62d125e5b2,ID_0be5c0d1b3,"['-125', '-8', '20.1999512']",1,"['0.48828125', '0.48828125']",0,"['00036', '00036']","['00080', '00080']",-1024.0,1.0,20.199951,3
4,ID_272504a24,0,0,0,0,0,0,ID_a449357f,ID_62d125e5b2,ID_0be5c0d1b3,"['-125', '-8', '25.1999512']",1,"['0.48828125', '0.48828125']",0,"['00036', '00036']","['00080', '00080']",-1024.0,1.0,25.199951,4


In [4]:
# Study UIDs in first-appearance order, computed once and sliced into splits:
# first 10 -> test, next 10 -> valid, the rest -> train.
# NOTE(review): the split key is study_instance_uid, not patient_id; if one
# patient can have several studies, that patient may land in more than one
# split -- confirm.
study_uids = df.study_instance_uid.unique()
test_fns = study_uids[:10]
valid_fns = study_uids[10:20]
train_fns = study_uids[20:]

# Rows of each split, selected by study membership.
study_col = df.study_instance_uid
train_df = df[study_col.isin(train_fns)]
valid_df = df[study_col.isin(valid_fns)]
test_df  = df[study_col.isin(test_fns)]

# Persist each split next to the notebook.
for split_frame, csv_path in ((train_df, './train.csv'),
                              (valid_df, './valid.csv'),
                              (test_df, './test.csv')):
    split_frame.to_csv(csv_path, index=None)

train_df.shape, valid_df.shape, test_df.shape

((2760, 20), (336, 20), (333, 20))

In [17]:
# Preview the first five rows of the training split.
train_df.head(5)

Unnamed: 0,filename,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,any,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope,Position2,slice_id
669,ID_7e11ea124,0,0,0,0,0,0,ID_6b257d60,ID_b574dc579f,ID_d5bea7a164,"['-125.000000', '-85.797981', '-60.335361']",1,"['0.488281', '0.488281']",1,30,80,-1024.0,1.0,-60.335361,0
670,ID_fd64b3539,0,0,0,0,0,0,ID_6b257d60,ID_b574dc579f,ID_d5bea7a164,"['-125.000000', '-85.797981', '-54.942684']",1,"['0.488281', '0.488281']",1,30,80,-1024.0,1.0,-54.942684,1
671,ID_7f15b0c37,0,0,0,0,0,0,ID_6b257d60,ID_b574dc579f,ID_d5bea7a164,"['-125.000000', '-85.797981', '-49.550014']",1,"['0.488281', '0.488281']",1,30,80,-1024.0,1.0,-49.550014,2
672,ID_60820ff67,0,0,0,0,0,0,ID_6b257d60,ID_b574dc579f,ID_d5bea7a164,"['-125.000000', '-85.797981', '-44.157337']",1,"['0.488281', '0.488281']",1,30,80,-1024.0,1.0,-44.157337,3
673,ID_b2ba0706d,0,0,0,0,0,0,ID_6b257d60,ID_b574dc579f,ID_d5bea7a164,"['-125.000000', '-85.797981', '-38.765362']",1,"['0.488281', '0.488281']",1,30,80,-1024.0,1.0,-38.765362,4


# 모든 이미지를 train/test/valid 폴더에 각각 저장(안 해도 됨)

In [8]:
# Root folder of the 100-study dataset.
# NOTE(review): copy_data builds its own paths and assigns a local `new_dir`,
# so this module-level value is not read by the cells visible here -- confirm
# whether it is still needed.
new_dir = '../dataset/kaggle_rsna(only100)/'

In [25]:
from shutil import copy
from tqdm import tqdm

def copy_data(phase, filenames,
              origin_data_path='../dataset/kaggle_rsna(only100)/imgs/',
              new_data_path='../dataset/kaggle_rsna(only100)/'):
    """Copy the PNG of every listed file into the folder of one split.

    Args:
        phase: split name, one of 'train', 'test', 'valid', with or without a
            trailing slash ('train/' as before, or 'train').
        filenames: iterable of file names without the '.png' extension.
        origin_data_path: folder holding the source PNG files.
        new_data_path: folder under which the per-split folder is created.

    Raises:
        ValueError: if `phase` is not a supported split name.
    """
    # Accept the split name with or without its trailing slash.
    phase_name = phase[:-1] if isinstance(phase, str) and phase.endswith('/') else phase
    if phase_name not in ('train', 'test', 'valid'):
        raise ValueError('not supported phase [{}]'.format(phase))

    # makedirs with exist_ok creates missing parents and is a no-op when the
    # folder already exists, so no separate existence check is needed.
    phase_dir = os.path.join(new_data_path, phase_name)
    os.makedirs(phase_dir, exist_ok=True)

    for filename in tqdm(filenames):
        png_name = filename + '.png'
        copy(os.path.join(origin_data_path, png_name),
             os.path.join(phase_dir, png_name))

# Copy each split's images into its own folder: train, then test, then valid.
for split_phase, split_frame in (('train/', train_df),
                                 ('test/', test_df),
                                 ('valid/', valid_df)):
    copy_data(split_phase, split_frame.filename)

100%|██████████| 2760/2760 [00:02<00:00, 1130.52it/s]
100%|██████████| 333/333 [00:00<00:00, 813.03it/s]
100%|██████████| 336/336 [00:00<00:00, 852.26it/s]
