# stage_1_train 데이터 프레임 생성
* 각 slice의 label값과 dicom파일의 meta정보를 이용하여 Dataframe 생성

### 데이터 읽기

In [2]:
import pandas as pd
import os
os.chdir(os.path.expanduser('~/jmjeon/kaggle-rsna'))

def get_path(path):
    if path[0]=='~':
        return os.path.expanduser(path)
    else:
        return path
    
path = './dataset/'
# csv 확인
df = pd.read_csv(get_path(path+'stage_1_train.csv'))

In [3]:
df.head(6)

Unnamed: 0,ID,Label
0,ID_63eb1e259_epidural,0
1,ID_63eb1e259_intraparenchymal,0
2,ID_63eb1e259_intraventricular,0
3,ID_63eb1e259_subarachnoid,0
4,ID_63eb1e259_subdural,0
5,ID_63eb1e259_any,0


### dicom 정보를 포함하는 dataFrame 생성

In [3]:
import pydicom
from tqdm.notebook import tqdm

train_df = []
row = []

tbar = tqdm(df.iterrows())
for rows in tbar:

    filename = rows[1][0][:12]
    hm_type = rows[1][0][13:]
    label = rows[1][1]
    
    if hm_type=='epidural':
        row.append(filename)
    
    # epidural, intraparenchymal, intraventricular, subarachnoid, subdural, any 순서
    row.append(label)
    
    if hm_type=='any':
        dcm = pydicom.read_file(get_path(path+'stage_1_train_images/'+filename+'.dcm'))
        row.append(dcm.PatientID)
        row.append(dcm.StudyInstanceUID)
        row.append(dcm.SeriesInstanceUID)
        row.append(dcm.ImagePositionPatient)
        row.append(dcm.SamplesPerPixel)
        row.append(dcm.PixelSpacing)
        row.append(dcm.PixelRepresentation)
        row.append(dcm.WindowCenter)
        row.append(dcm.WindowWidth)
        row.append(dcm.RescaleIntercept)
        row.append(dcm.RescaleSlope)
    
        train_df.append(row)

        row = []

    tbar.set_description(f'[{rows[0]}/{len(df)}]')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
# dataFrame 만들고 저장
header=['filename', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any', 'patient_id', 'study_instance_uid', 'series_instance_uid', 'image_position', 'samples_per_pixel', 'pixel_spacing', 'pixel_representation', 'window_center', 'window_width', 'rescale_intercept', 'rescale_slope']
train_df = pd.DataFrame(train_df, columns=header)
train_df.to_csv('stage_1_train_cls.csv', index=None)

In [5]:
train_df.head()

Unnamed: 0,filename,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,any,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope
0,ID_63eb1e259,0,0,0,0,0,0,ID_a449357f,ID_62d125e5b2,ID_0be5c0d1b3,"[-125, -8, 180.199951]",1,"[0.48828125, 0.48828125]",0,"[00036, 00036]","[00080, 00080]",-1024.0,1.0
1,ID_2669954a7,0,0,0,0,0,0,ID_363d5865,ID_a20b80c7bf,ID_3564d584db,"[-156, 45.5728491, 922.530821]",1,"[0.48828125, 0.48828125]",0,"[00047, 00047]","[00080, 00080]",-1024.0,1.0
2,ID_52c9913b1,0,0,0,0,0,0,ID_9c2b4bd7,ID_3e3634f8cf,ID_973274ffc9,"[-125.000, -115.063, 4.455]",1,"[0.488281, 0.488281]",1,40,150,-1024.0,1.0
3,ID_4e6ff6126,0,0,0,0,0,0,ID_3ae81c2d,ID_a1390c15c2,ID_e5ccad8244,"[-99.5, 28.5, 100]",1,"[0.388671875, 0.388671875]",0,"[00036, 00036]","[00080, 00080]",-1024.0,1.0
4,ID_7858edd88,0,0,0,0,0,0,ID_c1867feb,ID_c73e81ed3a,ID_28e0531b3a,"[-125.000, -132.190, 145.793]",1,"[0.488281, 0.488281]",1,40,100,-1024.0,1.0


In [6]:
train_df['patient_id'].unique().shape, train_df['study_instance_uid'].unique().shape

((17079,), (19530,))

# stage_2_test 데이터 프레임 생성
* dicom파일의 meta정보를 이용하여 dataframe생성

### 데이터셋 읽기

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import pydicom

path = '/home/ubuntu/jmjeon/rsna-intracranial-hemorrhage-detection/stage_2_test'
dicom_files = os.listdir(path)

### dicom 정보를 포함하는 dataFrame 생성

In [2]:
test_df = []
for dicom in tqdm(dicom_files):
    
    row = []
    
    filename = dicom

    dcm = pydicom.read_file(os.path.join(path, filename))
    
    row.append(filename.replace('.dcm',''))
    # label just for format
    for i in range(6):
        row.append(0)        
    row.append(dcm.PatientID)
    row.append(dcm.StudyInstanceUID)
    row.append(dcm.SeriesInstanceUID)
    row.append(dcm.ImagePositionPatient)
    row.append(dcm.SamplesPerPixel)
    row.append(dcm.PixelSpacing)
    row.append(dcm.PixelRepresentation)
    row.append(dcm.WindowCenter)
    row.append(dcm.WindowWidth)
    row.append(dcm.RescaleIntercept)
    row.append(dcm.RescaleSlope)
    
    test_df.append(row)
    

100%|██████████| 121232/121232 [11:22<00:00, 177.69it/s]


In [4]:
# dataFrame 만들고 저장
header=['filename', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any', 'patient_id', 'study_instance_uid', 'series_instance_uid', 'image_position', 'samples_per_pixel', 'pixel_spacing', 'pixel_representation', 'window_center', 'window_width', 'rescale_intercept', 'rescale_slope']
test_df = pd.DataFrame(test_df, columns=header)
test_df.to_csv('../dataset/stage_2_test_cls.csv', index=None)

In [5]:
test_df.head()

Unnamed: 0,filename,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,any,patient_id,study_instance_uid,series_instance_uid,image_position,samples_per_pixel,pixel_spacing,pixel_representation,window_center,window_width,rescale_intercept,rescale_slope
0,ID_57c3c0e68,0,0,0,0,0,0,ID_6be49c67,ID_73f1f42302,ID_9c277b7ad1,"[-126.408875, -126.408875, -231.713654]",1,"[0.494750976563, 0.494750976563]",1,35,135,-1024.0,1.0
1,ID_a10185368,0,0,0,0,0,0,ID_13a98073,ID_52b738ab7b,ID_c7595b5b3f,"[-125.000, -108.000, 62.500]",1,"[0.488281, 0.488281]",1,40,150,-1024.0,1.0
2,ID_15c931500,0,0,0,0,0,0,ID_98b1a444,ID_46850e15b0,ID_16e0e1d2a1,"[-125.000, -141.384, 80.664]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
3,ID_a3bac3e6b,0,0,0,0,0,0,ID_007e7be5,ID_154e0d735f,ID_091f4b8f2c,"[-125.000, -129.423, 150.908]",1,"[0.488281, 0.488281]",1,30,80,-1024.0,1.0
4,ID_28feed104,0,0,0,0,0,0,ID_ac5d1815,ID_7dd2a00ba1,ID_01f06c7cb6,"[-116.5, 2.5, 157.900024]",1,"[0.455078125, 0.455078125]",0,"[00036, 00036]","[00080, 00080]",-1024.0,1.0


In [6]:
test_df['patient_id'].unique().shape, test_df['study_instance_uid'].unique().shape

((3518,), (3518,))