In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import pydicom
from glob import glob
from tqdm.notebook import tqdm
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
from skimage import exposure
from PIL import Image
import cv2
import warnings
warnings.filterwarnings('ignore')

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [2]:
def dicom2array(root_dir, file_name, out_pth, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file, rescale it to 8-bit, save it as a PNG and return the pixels.

    Args:
        root_dir: Directory that contains the DICOM file.
        file_name: Name of the DICOM file inside ``root_dir``.
        out_pth: Directory the PNG is written to. It is not created here.
        voi_lut: When true, pass the pixel data through ``apply_voi_lut``;
            otherwise use ``pixel_array`` as-is.
        fix_monochrome: When true, invert images whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``.

    Returns:
        The ``np.uint8`` array that was written to disk, min-max scaled to 0..255.
    """
    # dcmread is the current name of the reader; read_file is its deprecated alias.
    dicom = pydicom.dcmread(os.path.join(root_dir, file_name))
    # VOI LUT (if available by DICOM device) is used to
    # transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max scale to 0..255.
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; dividing would give 0/0 = NaN
    # and an undefined uint8 cast, so leave it as all zeros.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    im = Image.fromarray(data)

    # Swap whatever extension the input has for ".png" so the saved file always
    # carries an extension the PNG writer is selected by.
    out_file_name = os.path.splitext(file_name)[0] + ".png"
    new_path = os.path.join(out_pth, out_file_name)
    im.save(new_path)
    return data

In [3]:
# Load the training annotation table from the competition input folder.
train_csv_path = "/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train.csv"
data = pd.read_csv(train_csv_path)

In [4]:
# Preview the annotation table; a bare last expression is rendered as the cell output.
data

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,
...,...,...,...,...,...,...,...,...
67909,936fd5cff1c058d39817a08f58b72cae,No finding,14,R1,,,,
67910,ca7e72954550eeb610fe22bf0244b7fa,No finding,14,R1,,,,
67911,aa17d5312a0fb4a2939436abca7f9579,No finding,14,R8,,,,
67912,4b56bc6d22b192f075f13231419dfcc8,Cardiomegaly,3,R8,771.0,979.0,1680.0,1311.0


In [5]:
# Sorted array of the distinct class ids found in the annotation table.
uni_classes = np.unique(data["class_id"])

In [6]:
# Show the distinct class ids; a bare last expression is rendered as the cell output.
uni_classes

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [7]:
# Root folder for the exported PNGs.
output_dir = "/kaggle/working/vinbigdata_meta"
# exist_ok=True makes re-running the cell a no-op and avoids the
# check-then-create race; missing parent folders are created as well.
os.makedirs(output_dir, exist_ok=True)

In [8]:
# Export a sample of training images per class as PNGs, one folder per class
# name under `output_dir`.
train_dir = "/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train"
samples_per_class = 50
sample_seed = 42  # fixed so that re-running the cell exports the same images

for cls in tqdm(uni_classes):
    # image_id may repeat within a class (presumably one row per annotation -
    # confirm against the dataset); keep one row per image so the sample is
    # made of distinct files and no image is converted twice.
    data_cls = data[data.class_id == cls].drop_duplicates(subset="image_id")
    # Cap the sample size: sampling without replacement raises when more rows
    # are requested than the class has.
    samp = data_cls.sample(
        min(samples_per_class, len(data_cls)),
        replace=False,
        random_state=sample_seed,
    )

    for row in samp.itertuples(index=False):
        # NOTE: a class name containing "/" ends up as nested folders here.
        folder_pth = f"{output_dir}/{row.class_name}/"
        os.makedirs(folder_pth, exist_ok=True)
        dicom2array(train_dir, row.image_id + ".dicom", folder_pth)
        
        

  0%|          | 0/15 [00:00<?, ?it/s]