# Convert DICOM (`*.dcm`) files to JPEG (`*.jpg`)

In [1]:
# Install gdcm from the conda-forge channel (-y skips the confirmation prompt).
# NOTE(review): presumably required so compressed DICOM pixel data can be decoded — confirm.
!conda install gdcm -c conda-forge -y

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
import os

from PIL import Image
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt

In [3]:

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Load a DICOM X-ray and return it as an 8-bit grayscale array.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Parameters
    ----------
    path : str or path-like
        Location of the DICOM file; handed to ``pydicom.dcmread``.
    voi_lut : bool, default True
        When True the raw pixel array is passed through ``apply_voi_lut``
        (the VOI LUT, if the device provided one, maps raw DICOM data to a
        "human-friendly" view). When False the raw pixel array is used.
    fix_monochrome : bool, default True
        When True and the file's ``PhotometricInterpretation`` is
        ``"MONOCHROME1"``, the intensities are inverted so the X-ray does
        not look like a negative.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with the same shape as the pixel data, min-max
        scaled to the range 0..255. An image whose pixels are all equal
        is returned as all zeros.
    """
    # dcmread is the supported reader; read_file is only a deprecated alias of it.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # Work in float64: the subtractions below would silently wrap around in
    # narrow integer dtypes (e.g. int16 pixels spanning -32768..32767).
    data = np.asarray(data, dtype=np.float64)

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: there is no range to stretch, and dividing by zero
        # would produce NaNs that cast to garbage in uint8.
        return np.zeros(data.shape, dtype=np.uint8)

    return (data / peak * 255).astype(np.uint8)

In [4]:
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image scaled to ``size`` pixels.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Parameters
    ----------
    array : array-like
        Pixel data accepted by ``Image.fromarray``.
    size : int
        Edge length of the target box; the target is ``(size, size)``.
    keep_ratio : bool, default False
        False resizes to exactly ``(size, size)``. True uses
        ``Image.thumbnail`` with the same box, which modifies the image
        in place.
    resample : int, default Image.LANCZOS
        Resampling filter forwarded to PIL.

    Returns
    -------
    PIL.Image.Image
        The resized image.
    """
    image = Image.fromarray(array)
    target = (size, size)

    if not keep_ratio:
        return image.resize(target, resample)

    # thumbnail() works in place and returns None, so hand back the same object
    image.thumbnail(target, resample)
    return image

In [5]:
# Root folder of the extracted dataset; change this one line to relocate the data.
DATA_DIR = '/home/linh/Downloads/siim-covid19-detection'

# CSV shipped with the dataset (train_image_level.csv)
train = pd.read_csv(os.path.join(DATA_DIR, 'train_image_level.csv'))
# A single DICOM file used below to try out the helper functions
path = os.path.join(DATA_DIR, 'train', 'ae3e63d94c13', '288554eb6182', 'e00f9fe0cce5.dcm')
# dcmread is the supported pydicom reader; read_file is only a deprecated alias of it.
dicom = pydicom.dcmread(path)
def plot_img(path, plot_size=(6,6)):
    """Read the X-ray at ``path`` with ``read_xray`` and show it in grayscale.

    Parameters
    ----------
    path : str or path-like
        DICOM file to display.
    plot_size : tuple, default (6, 6)
        Figure size passed to matplotlib as ``figsize``.

    Returns
    -------
    None
        The figure is rendered by ``plt.show()``; nothing is returned.
    """
    pixels = read_xray(path)
    plt.figure(figsize=plot_size)
    plt.imshow(pixels, cmap='gray')
    plt.show()
    
# Try the helpers out on the sample file. plot_img renders the figure itself
# and returns None, so it is called directly rather than wrapped in display()
# (which would only print a stray "None" under the plot).
plot_img(path)

In [6]:
# Edge length in pixels of the exported (square) JPEG images
IMAGE_SIZE = 2000

# Per-image metadata, filled in parallel while converting (one entry per image)
image_id = []
dim0 = []
dim1 = []
splits = []

for split in ['test', 'train']:
    save_dir = f'/home/linh/Downloads/{split}/'

    os.makedirs(save_dir, exist_ok=True)

    for dirname, _, filenames in tqdm(os.walk(f'/home/linh/Downloads/siim-covid19-detection/{split}')):
        for file in filenames:
            # Split on the real extension instead of str.replace, which would
            # also rewrite a "dcm" occurring elsewhere in the file name.
            stem, ext = os.path.splitext(file)
            if ext.lower() != '.dcm':
                # Skip anything that is not a DICOM file rather than handing
                # it to the DICOM reader.
                continue

            # set keep_ratio=True to have original aspect ratio
            xray = read_xray(os.path.join(dirname, file))
            im = resize(xray, size=IMAGE_SIZE)
            im.save(os.path.join(save_dir, stem + '.jpg'))

            # dim0/dim1 record the shape of the array before resizing
            image_id.append(stem)
            dim0.append(xray.shape[0])
            dim1.append(xray.shape[1])
            splits.append(split)
            


0it [00:00, ?it/s]

0it [00:00, ?it/s]



In [7]:
%%time
# Pack each output folder into a gzip-compressed tarball (-z gzip, -c create, -f archive name).
# -C changes into the folder first, so the archive members are stored relative to it (".").
# The archives are given as bare file names, i.e. relative to the directory the command runs in.
!tar -zcf train_dcm2jpg_2000px.tar.gz -C "/home/linh/Downloads/train/" .
!tar -zcf test_dcm2jpg_2000px.tar.gz -C "/home/linh/Downloads/test/" .

CPU times: user 1.76 s, sys: 309 ms, total: 2.07 s
Wall time: 1min 26s


In [8]:
# Assemble the metadata lists collected during conversion into one table
# (one row per converted image) and store it as CSV without the index column.
df = pd.DataFrame(
    {
        'image_id': image_id,
        'dim0': dim0,
        'dim1': dim1,
        'split': splits,
    }
)
df.to_csv('/home/linh/Downloads/meta.csv', index=False)