### Import Libraries

In [1]:
from tensorflow import keras
from keras import layers
import tensorflow as tf
import numpy as np
import pandas as pd
import os, shutil, pathlib
from sklearn.utils import shuffle
from keras import layers
import random
from IPython.display import clear_output
from PIL import Image
from skimage import util 
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings("ignore")

In [2]:
!pip install keras-tuner --upgrade
clear_output()
import keras_tuner

### Set seed

In [3]:
SEED = 12

np.random.seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
tf.random.set_seed(SEED)
    

### Enable GPUs

In [4]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
strategy = tf.distribute.MirroredStrategy()
print("num replicas", strategy.num_replicas_in_sync)

Num GPUs Available:  2
num replicas 2


### Perform the split and define the target  
Cases with cancer, biopsy, or difficult_negative_case are treated a positive label.  
The validation split is performed at the patient level.

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit
base_dir = "/kaggle/input/rsna-extracted-data/train_images_extracted_1024/"
# read in csv data
df = pd.read_csv("/kaggle/input/rsna-breast-cancer-detection/train.csv")
df["path"] = base_dir + df.patient_id.astype("str") +"/"+ df.image_id.astype("str") +".png"
df = df.sort_values("path",ascending=True)

# create new label column
df["label"] = 0
df.loc[(df.cancer==1) | (df.biopsy ==1)|(df.difficult_negative_case ==True),"label"] = 1

split_df = df[["patient_id","age","cancer","biopsy","invasive","BIRADS"
              ,"site_id","machine_id"]]

# perform a straified split for balance
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=12)
split_df = split_df.groupby("patient_id").max()
X = split_df
y = split_df.pop("cancer")


for (train_index, test_index) in sss.split(X, y):
    train_patients = pd.DataFrame({"patient_id": X.iloc[train_index].index})
    val_patients = pd.DataFrame({"patient_id":X.iloc[test_index].index})
    
# merge the image level info onto the patient level splits
val_df = val_patients.merge(df, on ="patient_id",how = "left")
train_df = train_patients.merge(df, on ="patient_id",how = "left")

val_df.head()

Unnamed: 0,patient_id,site_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,path,label
0,19630,2,1121595436,R,CC,67.0,0,0,0,,0,,48,False,/kaggle/input/rsna-extracted-data/train_images...,0
1,19630,2,1405960470,L,MLO,67.0,0,0,0,,0,,48,False,/kaggle/input/rsna-extracted-data/train_images...,0
2,19630,2,1655504802,R,MLO,67.0,0,0,0,,0,,48,False,/kaggle/input/rsna-extracted-data/train_images...,0
3,19630,2,2105722026,L,CC,67.0,0,0,0,,0,,48,False,/kaggle/input/rsna-extracted-data/train_images...,0
4,19630,2,462258563,R,CC,67.0,0,0,0,,0,,48,False,/kaggle/input/rsna-extracted-data/train_images...,0


### Fix paths which failed to save during extraction

In [6]:
def check_and_correct_paths(df):
    corrected_df = df.copy()
    replaced_count = 0

    for index, row in corrected_df.iterrows():
        path = row['path']
        
        if not os.path.exists(path):
            patient, image = path.split('/')[-2:]
            alternative_path = f'/kaggle/input/rsna-custom-pipeline4/train_images_processed_1024/{patient}/{image}'

            if os.path.exists(alternative_path):
                corrected_df.loc[index, 'path'] = alternative_path
                replaced_count += 1
            else:
                print(f"Both paths not found for patient {patient} and image {image}")

    print(f"Replaced {replaced_count} paths")
    return corrected_df
train_df=check_and_correct_paths(train_df)
val_df = check_and_correct_paths(val_df)

Replaced 2 paths
Replaced 3 paths


### Build the dataset  
Cases with cancer are upsampled (but not with biopsy or difficult_negative_case)

In [7]:
def decode(path, img_size = (1024,1024)):
    file_bytes = tf.io.read_file(path)
    img = tf.image.decode_png(file_bytes, channels=3)
    img = tf.image.resize(img, img_size)
    return img

def decode_with_labels(path, label):
    return decode(path), tf.cast(label, tf.float32)

def build_dataset(paths,decode_fn, labels=None,batch_size=32, img_size = (1024,1024)):
    slices = paths if labels is None else (paths, labels)    
    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.map(decode_fn, num_parallel_calls=2)
    ds = ds.batch(batch_size, drop_remainder=False)
    return ds

def upsample(train_paths, train_labels,cancer_paths_train,cancer_labels_train, upsamples=8):

    for _ in range(upsamples):
        train_paths += cancer_paths_train
        train_labels += cancer_labels_train
        
    random.Random(SEED).shuffle(train_paths)
    random.Random(SEED).shuffle(train_labels)
    return train_paths, train_labels
    
    

img_size = (1024,1024)

val_paths = list(val_df.path)
val_labels = list(val_df.cancer)

cancer_instances = train_df.loc[train_df.cancer==1]

cancer_paths_train = list(cancer_instances.path)
cancer_labels_train = list(cancer_instances.cancer)

train_paths = list(train_df.path)
train_labels = list(train_df.label)

train_paths, train_labels = upsample(train_paths, train_labels,
                                     cancer_paths_train,cancer_labels_train)

val_ds = build_dataset(val_paths,decode_with_labels ,labels = val_labels, batch_size=32)
train_ds = build_dataset(train_paths,decode_with_labels, labels=train_labels, batch_size=32)

### Model building functions

In [8]:
def get_data_augmentation():
    data_augmentation = keras.Sequential(
        [
        layers.RandomRotation(0.1),

        ]
    )
    return data_augmentation

from keras import backend as K

def pfbeta_tf(labels, preds, beta=1):
    preds = tf.clip_by_value(preds, 0, 1)
    y_true_count = tf.reduce_sum(labels)
    ctp = tf.reduce_sum(preds[labels==1])
    cfp = tf.reduce_sum(preds[labels==0])
    beta_squared = beta * beta
    c_precision = ctp / (ctp + cfp)
    c_recall = ctp / y_true_count
    if (c_precision > 0 and c_recall > 0):
        result = (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)
        return result
    else:
        return 0.0


def get_conv_base():
    conv_base = keras.applications.ResNet50V2(
    include_top=False,
    weights='imagenet',
    input_shape=(*img_size,3),
    #drop_connect_rate=0.4,
    )

    trainable_layers = 45

    for layer in conv_base.layers[:-trainable_layers]:
        layer.trainable = False
    
    for layer in conv_base.layers[-trainable_layers:]:
        if not isinstance(layer, layers.BatchNormalization):
            layer.trainable = True
    return conv_base

def get_metrics():
    auc= tf.keras.metrics.AUC(
    num_thresholds=200,
    curve='ROC',
    summation_method='interpolation',
    name=None,
    dtype=None,
    thresholds=None,
    multi_label=False,
    num_labels=None,
    label_weights=None,
    from_logits=False
    )

    recall = .5

    precision = tf.keras.metrics.PrecisionAtRecall(
        recall, num_thresholds=200, class_id=None, name=None, dtype=None
    )
    metrics = [auc,precision, pfbeta_tf]
    return metrics

def get_optimizer(lr=1e-4):
    optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr,
    beta_1=0.9,
    beta_2=0.999,
    )
    return optimizer

def get_loss():
    loss_fn = tf.keras.losses.BinaryCrossentropy(
    label_smoothing=0.0,)
    return loss_fn

### Build and tune the model

In [9]:
def build_model(layer_size=32, dropout=0.3, lr=1e-4):
    inputs = keras.Input(shape=(*img_size, 3))

    x = keras.applications.resnet_v2.preprocess_input(inputs)

    x = get_conv_base()(x)

    x = layers.GlobalAveragePooling2D()(x)

    x = layers.Dense(layer_size)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.Dropout(dropout)(x)


    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(loss="binary_crossentropy",
              optimizer=get_optimizer(lr),
              metrics=get_metrics())
    
    return model 

def build_model_w_tuning(hp):
    with strategy.scope():
        model = build_model(layer_size=hp.Choice('train_layers', [16,32,128] ), 
                            dropout=hp.Choice('dropout', [0.0,0.3,0.5] ), 
                            lr=hp.Float("learning_rate",
                                        min_value=1e-5,
                                        max_value=1e-3,
                                        sampling="log")
                                                        )
    return model

This is the maximum amount of tuning trials and epochs that can be run within the 12 hour cap on Kaggle notebooks. Empirically, we have found that model reaches convergence after a few epochs, so the small number of epochs works fine.

In [10]:
tuner = keras_tuner.RandomSearch(
    build_model_w_tuning,
    objective=keras_tuner.Objective("val_auc", direction="max"),
    max_trials=4)

NUM_EPOCHS = 3
tuner.search(train_ds, epochs=NUM_EPOCHS, validation_data=val_ds)

print(tuner.results_summary())

Trial 4 Complete [01h 58m 52s]
val_auc: 0.7052977085113525

Best val_auc So Far: 0.7485799193382263
Total elapsed time: 07h 49m 45s
Results summary
Results in ./untitled_project
Showing 10 best trials
Objective(name="val_auc", direction="max")

Trial 1 summary
Hyperparameters:
train_layers: 16
dropout: 0.3
learning_rate: 0.00016449982036188414
Score: 0.7485799193382263

Trial 0 summary
Hyperparameters:
train_layers: 128
dropout: 0.5
learning_rate: 1.3236614364006188e-05
Score: 0.7367491722106934

Trial 2 summary
Hyperparameters:
train_layers: 128
dropout: 0.0
learning_rate: 8.531941855305935e-05
Score: 0.7082393169403076

Trial 3 summary
Hyperparameters:
train_layers: 128
dropout: 0.3
learning_rate: 1.3381452948593074e-05
Score: 0.7052977085113525
None
