In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import tensorflow as tf
from tensorflow.keras import layers, losses, optimizers
import tensorflow_addons as tfa

### Load trained model

In [3]:
# Side length, in pixels, of the square single-channel input declared in build_model().
img_size = 1024

# Random rotation / flip / zoom layers wrapped in a Sequential model so the
# whole augmentation stage can be called like a single layer.
img_augmentation = tf.keras.models.Sequential(
    [
        layers.RandomRotation(0.15),
        layers.RandomFlip(),
        layers.RandomZoom(height_factor=0.1),
    ],
    name='img_augmentation',
)


def build_model():
    """Assemble the EfficientNetB0-based binary classifier (uncompiled).

    The network takes a single-channel ``img_size`` x ``img_size`` image,
    passes it through ``img_augmentation``, repeats the channel three times,
    and feeds the result to an EfficientNetB0 backbone created without its
    top and with ``weights=None``. Global average pooling, dropout (0.4) and
    a one-unit sigmoid layer produce the output.

    Returns:
        A ``tf.keras.Model`` mapping the image input to a single sigmoid score.
    """
    backbone = tf.keras.applications.EfficientNetB0(include_top=False, weights=None)
    backbone.trainable = True

    image_in = layers.Input(shape=(img_size, img_size, 1))
    augmented = img_augmentation(image_in)
    # Repeat the single channel three times before handing it to the backbone.
    three_channel = layers.concatenate([augmented, augmented, augmented])
    features = backbone(three_channel)
    pooled = layers.GlobalAveragePooling2D()(features)
    regularised = layers.Dropout(0.4)(pooled)
    score = layers.Dense(1, activation='sigmoid')(regularised)

    return tf.keras.Model(inputs=image_in, outputs=score)

In [4]:
# Directory (a Kaggle input dataset) that holds the saved model / weight files.
cp_dir = "/kaggle/input/model-weights/"

In [5]:
# Use a TPU when the runtime provides one; the ValueError raised when no TPU
# can be reached is treated as "fall back to MirroredStrategy".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    print("Device:", tpu.master())
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    print("Not connected to a TPU runtime. Using CPU/GPU strategy")
    strategy = tf.distribute.MirroredStrategy()

# Load the model inside the strategy scope, starting from a clean Keras session.
tf.keras.backend.clear_session()
with strategy.scope():
    # Resolve the file against cp_dir instead of repeating the directory literal;
    # this yields the same path, /kaggle/input/model-weights/trained_model_v2.h5.
    model = tf.keras.models.load_model(os.path.join(cp_dir, 'trained_model_v2.h5'))
    # Alternative kept for reference: rebuild the architecture and load weights only.
    #     model = build_model()
    #     model.load_weights("/kaggle/input/model-weights/SavedWeights_ep0004.h5")
    #     model.compile(
    #         loss = 'binary_crossentropy',
    #         optimizer = optimizers.Adam(),
    #         metrics = [
    #             tf.keras.metrics.Recall(),
    #             tf.keras.metrics.Precision(),
    #             tfa.metrics.F1Score(num_classes=1, threshold=0.5, name='F1')
    #         ]
    #     )

model.summary()

Not connected to a TPU runtime. Using CPU/GPU strategy
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 1024, 1024,  0           []                               
                                 1)]                                                              
                                                                                                  
 img_augmentation (Sequential)  (None, 1024, 1024,   0           ['input_2[0][0]']                
                                1)                                                                
                                                                                                  
 concatenate (Concatenate)      (None, 1024, 1024,   0           ['img_augmentation[0][0]',       
                                3)     

In [6]:
# tf.keras.backend.clear_session()
# model = build_model()
# model.load_weights(cp_dir + "cp-0010.ckpt")

In [7]:
# thresholds = 0.5
# model.compile(
#     loss='binary_crossentropy',
#     optimizer=optimizers.Adam(),
#     metrics=[tf.keras.metrics.Recall(thresholds=thresholds), 
#              tf.keras.metrics.Precision(thresholds=thresholds), 
#              tfa.metrics.F1Score(num_classes=1, threshold=thresholds, name='F1'),
#             #  pFBeta(beta=1, name='pF1')
#              ]
# )

### Prepare test dataset

In [8]:
!pip install --no-index /kaggle/input/dicom-packages/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
!pip install --no-index /kaggle/input/dicom-packages/python_gdcm-3.0.21-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --no-index /kaggle/input/dicom-packages/pylibjpeg-1.4.0-py3-none-any.whl
!pip install --no-index /kaggle/input/dicom-packages/pylibjpeg_libjpeg-1.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --no-index /kaggle/input/dicom-packages/pylibjpeg_openjpeg-1.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --no-index /kaggle/input/dicom-packages/pylibjpeg_rle-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/dicom-packages/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: dicomsdl
Successfully installed dicomsdl-0.109.1
[0mProcessing /kaggle/input/dicom-packages/python_gdcm-3.0.21-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: python-gdcm
Successfully installed python-gdcm-3.0.21
[0mProcessing /kaggle/input/dicom-packages/pylibjpeg-1.4.0-py3-none-any.whl
Installing collected packages: pylibjpeg
Successfully installed pylibjpeg-1.4.0
[0mProcessing /kaggle/input/dicom-packages/pylibjpeg_libjpeg-1.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: pylibjpeg-libjpeg
Successfully installed pylibjpeg-libjpeg-1.3.3
[0mProcessing /kaggle/input/dicom-packages/pylibjpeg_openjpeg-1.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: pylibjpeg-openjpeg
Successfully installed pylibj

In [9]:
import dicomsdl
import cv2

In [10]:
# Shuffle the training metadata with a fixed seed so the first-N subset taken
# later in the notebook (and the class counts reported for it) is the same on
# every Restart & Run All.
train_df = pd.read_csv("/kaggle/input/rsna-breast-cancer-detection/train.csv").sample(frac=1, random_state=42)
test_df = pd.read_csv("/kaggle/input/rsna-breast-cancer-detection/test.csv")

In [11]:
train_image_dir = "/kaggle/input/rsna-breast-cancer-detection/train_images/"
test_image_dir = "/kaggle/input/rsna-breast-cancer-detection/test_images/"

# Each DICOM path is built as <image_dir>/<patient_id>/<image_id>.dcm, one per metadata row.
train_df['file_path'] = [
    os.path.join(train_image_dir, str(patient), str(image) + ".dcm")
    for patient, image in zip(train_df['patient_id'], train_df['image_id'])
]
test_df['file_path'] = [
    os.path.join(test_image_dir, str(patient), str(image) + ".dcm")
    for patient, image in zip(test_df['patient_id'], test_df['image_id'])
]

In [12]:
def get_path(patient_id, image_id):
    """Return the training DICOM path ``<train_image_dir>/<patient_id>/<image_id>.dcm``."""
    patient_folder = str(patient_id)
    dicom_name = str(image_id) + ".dcm"
    return os.path.join(train_image_dir, patient_folder, dicom_name)

# Helper used to locate the region of an image to keep when cropping.
# arr is a sequence of 0s and 1s; the function returns the (start, end) bounds, end exclusive, of its longest run of consecutive ones.
def longest_block(arr):
    """Return the ``(start, end)`` bounds of the longest run of ones in ``arr``.

    ``end`` is exclusive, so the run is ``arr[start:end]``. When several runs
    share the maximum length, the one that occurs last is returned.

    Raises:
        IndexError: if ``arr`` contains no element equal to 1.
    """
    runs = []
    run_start = None
    for position, value in enumerate(arr):
        if value == 1:
            if run_start is None:
                run_start = position
        elif run_start is not None:
            # A non-one value closes the run that was open.
            runs.append((run_start, position))
            run_start = None
    if run_start is not None:
        # The sequence ended while a run was still open.
        runs.append((run_start, len(arr)))

    # The sort is stable, so among equally long runs the latest one ends up last.
    runs.sort(key=lambda span: span[1] - span[0])
    return runs[-1]

def crop_image(img_arr, crop_threshold = 0.05, margin=0):
    """Crop a 2-D image to its main band of non-background rows and columns.

    A column (row) counts as valid when its pixel sum exceeds
    ``crop_threshold`` times the largest column (row) sum. The crop keeps the
    longest contiguous run of valid columns and of valid rows, as found by
    ``longest_block``, widened by ``margin`` pixels on each side and clamped
    to the image bounds.

    Args:
        img_arr: 2-D array-like of pixel values.
        crop_threshold: fraction of the maximum column/row sum a column/row
            must exceed to be kept.
        margin: number of extra pixels to keep around the detected region.

    Returns:
        The cropped image as a ``uint8`` array. If no column or no row passes
        the threshold (for example an all-zero image), the whole image is
        returned uncropped instead of failing.
    """
    img_arr = np.asarray(img_arr)
    h, w = img_arr.shape

    col_sum = img_arr.sum(axis=0)
    row_sum = img_arr.sum(axis=1)
    col_valid = (col_sum > col_sum.max() * crop_threshold).astype(int)
    row_valid = (row_sum > row_sum.max() * crop_threshold).astype(int)

    if col_valid.any() and row_valid.any():
        col_lo, col_hi = longest_block(col_valid)
        row_lo, row_hi = longest_block(row_valid)
        col_lo = max(0, col_lo - margin)
        col_hi = min(w, col_hi + margin)
        row_lo = max(0, row_lo - margin)
        row_hi = min(h, row_hi + margin)
    else:
        # Nothing stands out from the background, so there is no run for
        # longest_block to return (it would raise IndexError). Keep the full frame.
        row_lo, row_hi, col_lo, col_hi = 0, h, 0, w

    # Go through float64 before the uint8 cast, as the original buffer copy did.
    cropped_img = img_arr[row_lo:row_hi, col_lo:col_hi].astype(np.float64)
    return np.uint8(cropped_img)

def apply_voi_lut(ori_img_arr, window_center, window_width, voilut_func='SIGMOID'):
    """Map pixel values into the range [0, 255] using a VOI window.

    Args:
        ori_img_arr: pixel values (array-like or scalar).
        window_center: centre of the window.
        window_width: width of the window.
        voilut_func: ``'SIGMOID'`` for the sigmoid mapping, ``'LINEAR_EXACT'``
            for the exact linear mapping; any other value selects the linear
            mapping that uses ``window_center - 0.5`` and ``window_width - 1``.

    Returns:
        The mapped values, clamped to ``[0, 255]``. Array input gives an array
        of the same shape; scalar input gives a NumPy scalar.
    """
    output_range = 255
    pixels = np.asarray(ori_img_arr)

    if voilut_func == 'SIGMOID':
        output_arr = output_range / (1 + np.exp(-4 * (pixels - window_center) / window_width))
    elif voilut_func == 'LINEAR_EXACT':
        output_arr = ((pixels - window_center) / window_width + 0.5) * output_range
    else:
        output_arr = ((pixels - (window_center - 0.5)) / (window_width - 1) + 0.5) * output_range

    # np.clip also works on the NumPy scalar produced by scalar/0-d input,
    # where boolean-mask assignment is not available.
    return np.clip(output_arr, 0, output_range)

def process_image(file_path, img_size=512):
    """Load one DICOM file and return it as a square single-channel float image.

    Steps: read the pixel data with dicomsdl, apply sigmoid VOI windowing when
    the file provides a window centre, a window width and the 'SIGMOID' VOI LUT
    function, min-max rescale to 0-255, invert MONOCHROME1 images, crop with
    ``crop_image`` and resize to ``img_size`` x ``img_size``.

    Args:
        file_path: path to the ``.dcm`` file as UTF-8 encoded ``bytes`` (the
            form in which ``tf.numpy_function`` passes string tensors).
        img_size: side length of the square output image.

    Returns:
        A ``tf.float32`` tensor with a trailing channel axis of size 1.
    """
    # NOTE(review): the dataset pipeline in this notebook calls this function
    # without img_size, so images come out 512x512 while build_model() declares
    # a 1024x1024 input — confirm which size the saved model was trained with.
    file_path = file_path.decode('utf-8')
    dcm_img = dicomsdl.open(file_path)
    img_arr = dcm_img.pixelData()

    # Window centre: take the first entry when several are stored; any failure
    # while reading it leaves the value as None.
    try:
        wc = np.float32(dcm_img.WindowCenter)
        if isinstance(wc, np.ndarray):
            wc = wc[0]
    except Exception:
        wc = None

    # Window width: same handling as the centre.
    try:
        ww = np.float32(dcm_img.WindowWidth)
        if isinstance(ww, np.ndarray):
            ww = ww[0]
    except Exception:
        # Fall back to None so the windowing step below is skipped.
        ww = None

    try:
        voilut_func = dcm_img.VOILUTFunction
    except Exception:
        voilut_func = 'LINEAR'

    if wc and ww and voilut_func == 'SIGMOID':
        img_arr = apply_voi_lut(img_arr, wc, ww, 'SIGMOID')

    # rescale to 0-255
    img_arr = (img_arr - img_arr.min()) / (img_arr.max() - img_arr.min()) * 255

    # inverse image
    if dcm_img.PhotometricInterpretation == 'MONOCHROME1':
        img_arr = 255 - img_arr

    # crop image
    img_arr = crop_image(img_arr)

    # resize images
    img_arr = cv2.resize(img_arr, (img_size, img_size))
    return tf.cast(tf.expand_dims(img_arr, axis=-1), tf.float32)
    

The model was already trained in another notebook. Only a small subset of the training data is used here, as a sanity check of the loaded model.

In [13]:
# Only the first 2000 rows of the (already shuffled) training metadata are used
# for the sanity check; the test set is used in full.
train_ds = tf.data.Dataset.from_tensor_slices(
    (
        train_df["file_path"].iloc[:2000].to_numpy(),
        train_df["cancer"].iloc[:2000].to_numpy(),
    )
)
test_ds = tf.data.Dataset.from_tensor_slices(test_df["file_path"].to_numpy())

In [14]:
# for d in test_ds.take(5):
#     print(d.numpy().decode('utf-8'))

In [15]:
# process_image runs as Python code, so it is wrapped in tf.numpy_function to
# be usable inside Dataset.map. It is called with its default img_size here.
test_ds = test_ds.map(
    lambda path: tf.numpy_function(func=process_image, inp=[path], Tout=tf.float32)
)

train_ds = train_ds.map(
    lambda path, label: (
        tf.numpy_function(func=process_image, inp=[path], Tout=tf.float32),
        label,
    )
)

In [16]:
# The same value is used for the batch size and for the prefetch buffer size.
batch_size = 8

train_ds = train_ds.batch(batch_size)
train_ds = train_ds.prefetch(batch_size)

test_ds = test_ds.batch(batch_size)
test_ds = test_ds.prefetch(batch_size)

### Make inference

First, evaluate the model on the training subset.

In [17]:
# Class balance of the 2000-row training subset used for the sanity check.
train_df['cancer'].iloc[:2000].value_counts()

0    1943
1      57
Name: cancer, dtype: int64

In [18]:
# model.evaluate(train_ds)

In [19]:
# Preview the first rows of the test metadata, including the derived file_path column.
test_df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,implant,machine_id,prediction_id,file_path
0,2,10008,736471439,L,MLO,81,0,21,10008_L,/kaggle/input/rsna-breast-cancer-detection/tes...
1,2,10008,1591370361,L,CC,81,0,21,10008_L,/kaggle/input/rsna-breast-cancer-detection/tes...
2,2,10008,68070693,R,MLO,81,0,21,10008_R,/kaggle/input/rsna-breast-cancer-detection/tes...
3,2,10008,361203119,R,CC,81,0,21,10008_R,/kaggle/input/rsna-breast-cancer-detection/tes...


In [20]:
# One score per test image; test_ds was built from test_df without shuffling,
# so the flattened predictions line up with the rows of test_df.
test_df['cancer'] = np.ravel(model.predict(test_ds))



In [21]:
# Average the image-level scores within each prediction_id, then label a
# prediction_id positive (1) when its mean score is above 0.25.
results = (
    test_df
    .groupby("prediction_id", as_index=False)['cancer']
    .mean()
    .assign(cancer=lambda scores: scores['cancer'].gt(0.25).astype(int))
)
results.to_csv("submission.csv", index=False)

In [22]:
# Display the per-prediction_id labels that were written to submission.csv.
results

Unnamed: 0,prediction_id,cancer
0,10008_L,0
1,10008_R,0
