In [None]:
import pydicom
from pathlib import Path
import numpy as np
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
import multiprocessing as mp
from IPython.display import clear_output
from pydicom.pixel_data_handlers.util import apply_voi_lut
import cv2

import time

!pip install -q /kaggle/input/rsna-bcd-whl-ds/python_gdcm-3.0.20-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q /kaggle/input/rsna-bcd-whl-ds/pylibjpeg-1.4.0-py3-none-any.whl
!pip install -q /kaggle/input/rsna-bcd-whl-ds/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl


import dicomsdl


### Define preprocessing functions

In [None]:
import matplotlib.pyplot as plt

# Which split to process; selects the input folder and prefixes the output folder name.
test_or_train = "train"

# Target size handed to cv2.resize; its first element is also part of the output folder name.
RESIZE_TO = (1024, 1024)
# Start from an empty output folder: remove any previous results, then recreate it.
!rm -rf {test_or_train}_images_processed_{RESIZE_TO[0]}
!mkdir {test_or_train}_images_processed_{RESIZE_TO[0]}

# https://www.kaggle.com/code/tanlikesmath/brain-tumor-radiogenomic-classification-eda/notebook

# Loads a DICOM file and performs the preprocessing steps
def dicom_file_to_ary(path):
    """Load one DICOM file and turn it into a fixed-size 8-bit image array.

    Steps, in order: read the pixel data with dicomsdl, min-max scale it to
    [0, 1], invert MONOCHROME1 images, mirror with ``align_left``, trim the
    borders with ``inner_crop``, resize to ``RESIZE_TO`` and convert to uint8.

    Args:
        path: Location of the DICOM file (``str`` or ``pathlib.Path``).

    Returns:
        ``np.uint8`` array with values in 0..255, resized by ``cv2.resize``
        to ``RESIZE_TO``.
    """
    dicom = dicomsdl.open(str(path))
    # Work in float64 so that subtracting the minimum cannot wrap around in a
    # narrow integer dtype (the division below produced floats anyway).
    data = np.asarray(dicom.pixelData(storedvalue=False), dtype=np.float64)

    lowest = data.min()
    value_range = data.max() - lowest
    if value_range > 0:
        data = (data - lowest) / value_range
    else:
        # Constant image: dividing by a zero range would fill the array with
        # NaN, which then turns into arbitrary values in the uint8 cast.
        data = np.zeros_like(data)

    # MONOCHROME1 stores the lowest value as white, so flip the intensities to
    # give every image the same polarity.
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        data = 1 - data

    data = align_left(data)
    # Trim 5% from every side, plus a further 7.5% top and bottom and 17.5%
    # on the right-hand side.
    data = inner_crop(data, .05,addl_vert = .075,addl_right = .175)
    #data = zero_crop(data)
    data = cv2.resize(data, RESIZE_TO)
    data = (data * 255).astype(np.uint8)

    return data



# Every entry found directly under the raw image folder of the selected split.
directories = [
    entry
    for entry in Path(f'/kaggle/input/rsna-breast-cancer-detection/{test_or_train}_images/').iterdir()
]


def process_directory(directory_path):
    """Convert every file in one input directory to a processed PNG.

    The PNGs are written to
    ``{test_or_train}_images_processed_{RESIZE_TO[0]}/<directory name>/<file stem>.png``
    relative to the current working directory; the output directory is
    created if it does not exist yet.

    Args:
        directory_path: Directory whose entries are each passed to
            ``dicom_file_to_ary`` (``pathlib.Path`` or ``str``).

    Raises:
        OSError: If OpenCV reports that a PNG could not be written.
    """
    directory_path = Path(directory_path)
    parent_directory = directory_path.name

    # Create the folder from Python rather than via a `!mkdir -p` shell call:
    # no subprocess per directory and no dependency on the IPython shell.
    output_directory = Path(f'{test_or_train}_images_processed_{RESIZE_TO[0]}') / parent_directory
    output_directory.mkdir(parents=True, exist_ok=True)

    for image_path in directory_path.iterdir():
        processed_ary = dicom_file_to_ary(image_path)

        output_path = output_directory / f'{image_path.stem}.png'
        # cv2.imwrite signals failure through its return value; fail loudly
        # instead of silently leaving an image out of the processed set.
        if not cv2.imwrite(str(output_path), processed_ary):
            raise OSError(f'Could not write {output_path}')
# Crop out a defined border area from the image    
def inner_crop(ary, pct, addl_vert = 0,addl_right = 0):
    """Trim a border, given as fractions of the image size, off a 2-D array.

    Args:
        ary: Array indexed as ``[row, column]``.
        pct: Fraction of the height removed from top and bottom, and fraction
            of the width removed from left and right.
        addl_vert: Extra fraction of the height removed from top and bottom.
        addl_right: Extra fraction of the width removed from the right only.

    Returns:
        The cropped slice of ``ary``.
    """
    n_rows, n_cols = ary.shape[0], ary.shape[1]

    # Fractions are truncated to whole pixels.
    trim_rows = int((pct + addl_vert) * n_rows)
    trim_cols = int(pct * n_cols)
    right_edge = n_cols - (trim_cols + int(addl_right * n_cols))

    return ary[trim_rows:n_rows - trim_rows, trim_cols:right_edge]
# Crop to the bounding box of the non-zero pixel values
def zero_crop(ary):
    """Crop a 2-D array to the bounding box of its non-zero entries.

    Args:
        ary: 2-D array.

    Returns:
        The smallest rectangular slice of ``ary`` that contains every
        non-zero entry. If ``ary`` has no non-zero entries there is no
        bounding box, and ``ary`` is returned unchanged.
    """
    # np.nonzero yields the row indices first, then the column indices.
    rows, cols = np.nonzero(ary)
    if rows.size == 0:
        # min()/max() of an empty index array would raise ValueError.
        return ary

    top, bottom = rows.min(), rows.max()
    left, right = cols.min(), cols.max()
    return ary[top:bottom + 1, left:right + 1]

# Aligns images so they are all flipped the same way
def align_left(ary):
    """Mirror the image horizontally when its right half is the brighter one.

    The mean of the columns left of the midpoint is compared with the mean of
    the remaining columns.

    Args:
        ary: Array indexed as ``[row, column]``.

    Returns:
        ``np.fliplr(ary)`` if the right half has the strictly larger mean,
        otherwise ``ary`` itself.
    """
    midpoint = ary.shape[1] // 2
    left_mean = ary[:, :midpoint].mean()
    right_mean = ary[:, midpoint:].mean()

    return np.fliplr(ary) if right_mean > left_mean else ary
    
    

# Smoke test: run the preprocessing on one training image and look at the result.
path = "/kaggle/input/rsna-breast-cancer-detection/train_images/10144/1260861056.dcm"

data = dicom_file_to_ary(path)

# The result is a single-channel uint8 image; without an explicit colormap
# imshow renders it in false colour. The trailing semicolon keeps the
# AxesImage repr out of the cell output.
plt.imshow(data, cmap="gray");

### Process in parallel

In [None]:
# Spread the directories over two worker processes; map() returns only once
# every directory has been handled.
with mp.Pool(processes=2) as p:
    p.map(process_directory, directories)