In [2]:
# import required libraries

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import os
import gc
import shutil
import time
import warnings

import cv2
import tensorflow.keras
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.layers import BatchNormalization, Conv2D, Dense, Dropout, Flatten, MaxPool2D, ReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [3]:
# Load the training labels and derive the image file name for every id.

id_label = pd.read_csv("../input/histopathologic-cancer-detection/train_labels.csv")
# The file name is the id with a ".tif" suffix (vectorised string concatenation).
id_label["file_name"] = id_label["id"] + ".tif"


In [4]:
# Examples of positive and negative samples (five random images per class).
#
# cv2.imread returns pixels in BGR channel order while matplotlib's imshow
# interprets a 3-channel array as RGB, so every image is converted before it is
# plotted; without the conversion red and blue are swapped in the figure.
sample_dir = "../input/histopathologic-cancer-detection/train"


def _read_rgb(file_name):
    """Read one image from ``sample_dir`` and return it as an RGB array.

    Raises:
        FileNotFoundError: if OpenCV could not read the file.
    """
    path = os.path.join(sample_dir, file_name)
    bgr = cv2.imread(path)
    if bgr is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError("Could not read image: {}".format(path))
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


positive_samples = id_label.loc[id_label["label"] == 1].sample(5)
negative_samples = id_label.loc[id_label["label"] == 0].sample(5)
positive_images = [_read_rgb(name) for name in positive_samples["file_name"]]
negative_images = [_read_rgb(name) for name in negative_samples["file_name"]]

# Row 0 shows the positive class, row 1 the negative class.
fig, axis = plt.subplots(2, 5, figsize=(20, 8))
rows = (("Positive samples", positive_images), ("Negative samples", negative_images))
for row, (row_label, images) in enumerate(rows):
    for col, img in enumerate(images):
        axis[row, col].imshow(img)
    axis[row, 0].set_ylabel(row_label)
    

In [5]:
# Number of training images per label.
label_counts = id_label["label"].value_counts()
label_counts

### The training classes are imbalanced, so I randomly select 89117 of the 130908 benign (label 0) images and keep all of the malignant (label 1) images.

In [6]:
# Balance the classes by down-sampling the negative (label 0) images.
#
# The number of negatives to keep is taken from the size of the positive class
# rather than the previously hard-coded 89117, so the cell keeps working if the
# label file changes.
# NOTE(review): assumes 89117 was the positive count -- confirm against the
# value_counts() output above.
# Labels are compared as strings so the selection also works once "label" has
# already been cast to str (e.g. when this cell is run a second time).

label_as_str = id_label["label"].astype(str)
id_label_pos = id_label.loc[label_as_str == "1", :]
id_label_neg_all = id_label.loc[label_as_str == "0", :]

n_neg_keep = min(len(id_label_pos), len(id_label_neg_all))
# Fixed random_state: the same subset is drawn on every run.
id_label_neg = id_label_neg_all.sample(n=n_neg_keep, random_state=42)

id_label = pd.concat([id_label_neg, id_label_pos], ignore_index=True)
# The label column is cast to str before being used as y_col of the image flows.
id_label = id_label.astype({"label": "str"})

del id_label_neg, id_label_neg_all, id_label_pos, label_as_str


In [7]:
# Split into training (90%) and validation (10%) sets, stratified by label.
# random_state pins the shuffle so the split, and therefore the validation
# score, is the same on every run.

id_label_train, id_label_val = train_test_split(
    id_label,
    test_size=0.1,
    stratify=id_label["label"],
    random_state=42,
)
# Re-number the rows from 0; assignment is used instead of inplace=True so the
# frames returned by train_test_split are not mutated.
id_label_train = id_label_train.reset_index(drop=True)
id_label_val = id_label_val.reset_index(drop=True)

len_train = id_label_train.shape[0]
len_val = id_label_val.shape[0]
print("# of training data: {}.".format(len_train))
print("# of validation data: {}.".format(len_val))
# The combined frame is no longer needed once both splits exist.
del id_label

gc.collect()

In [8]:
# Image data generator: pixel rescaling, per-sample centering and std
# normalisation, plus random horizontal/vertical flips.

img_gen_params = dict(
    rescale=1.0 / 255,
    samplewise_center=True,
    samplewise_std_normalization=True,
    horizontal_flip=True,
    vertical_flip=True,
)
img_gen = ImageDataGenerator(**img_gen_params)

In [9]:
# Training image flow: batches of images read from the training directory,
# driven by the file_name / label columns of id_label_train.

IMAGE_SHAPE = (96, 96, 3)
path_train = "../input/histopathologic-cancer-detection/train"
batch_size = 32

img_flow_params_train = dict(
    dataframe=id_label_train,
    directory=path_train,
    x_col="file_name",
    y_col="label",
    has_ext=True,
    target_size=IMAGE_SHAPE[:2],  # (height, width) only; channels are dropped
    batch_size=batch_size,
)
img_flow_train = img_gen.flow_from_dataframe(**img_flow_params_train)


In [10]:
# Validation image flow.
#
# img_gen applies random horizontal/vertical flips (see img_gen_params). Those
# are a training-time augmentation; feeding randomly flipped images to the
# validation pass makes the reported metrics and predictions noisy. A separate
# generator with the same normalisation but the flips switched off is used here.
img_gen_val = ImageDataGenerator(
    **dict(img_gen_params, horizontal_flip=False, vertical_flip=False)
)

img_flow_params_val = {
    "dataframe": id_label_val,
    "directory": path_train,
    "x_col": "file_name",
    "y_col": "label",
    "has_ext": True,
    "target_size": IMAGE_SHAPE[:2],
    # One image per batch and no shuffling, so predictions line up with
    # img_flow_val.classes when the flow is consumed for len_val steps.
    "batch_size": 1,
    "shuffle": False
}
img_flow_val = img_gen_val.flow_from_dataframe(**img_flow_params_val)

In [11]:
# Construct the CNN model.
#
# Feature extractor: one stage per entry of `filters`, each stage being
#   [Conv2D -> BatchNormalization -> ReLU] x 2 -> MaxPool2D -> Dropout
# Head: Flatten -> BN -> ReLU -> Dropout -> Dense(256) -> BN -> ReLU -> Dropout
#       -> Dense(2, softmax)

kernel_size = (5, 5)
filters = (32, 64, 128)
drop_prob_conv = 0.25
drop_prob_dense = 0.25
convs_per_stage = 2

model = tensorflow.keras.models.Sequential()

for stage_idx, n_filters in enumerate(filters):
    for conv_idx in range(convs_per_stage):
        conv_kwargs = {"padding": "same", "kernel_initializer": he_normal()}
        if stage_idx == 0 and conv_idx == 0:
            # Only the very first layer declares the input shape.
            conv_kwargs["input_shape"] = IMAGE_SHAPE
        model.add(Conv2D(n_filters, kernel_size, **conv_kwargs))
        model.add(BatchNormalization())
        model.add(ReLU())
    model.add(MaxPool2D())
    model.add(Dropout(drop_prob_conv))

model.add(Flatten())
model.add(BatchNormalization())
model.add(ReLU())
model.add(Dropout(drop_prob_dense))
model.add(Dense(256))
model.add(BatchNormalization())
model.add(ReLU())
model.add(Dropout(drop_prob_dense))
model.add(Dense(2, activation="softmax"))

# The training cell's callbacks monitor "val_acc". The metric is requested as
# "acc" (an alias of "accuracy") so the logged key is "acc"/"val_acc"; with
# "accuracy", TF 2.x logs "val_accuracy" and the callbacks would never see it.
# NOTE(review): confirm the logged metric name on the TF version in use.
model.compile(Adam(0.01), loss="categorical_crossentropy", metrics=["acc"])


In [12]:
# Training.

# Halve the learning rate (never below 1e-5) after one epoch without
# improvement of the monitored validation accuracy.
lr_decay_params = dict(
    monitor="val_acc",
    factor=0.5,
    patience=1,
    min_lr=1e-5,
)
lr_decay = ReduceLROnPlateau(**lr_decay_params)

# Stop training after three epochs without improvement.
early_stopping = EarlyStopping(monitor="val_acc", patience=3, verbose=1)

training_callbacks = [lr_decay, early_stopping]
train_steps = len_train // batch_size  # full batches per epoch

fit_params = dict(
    generator=img_flow_train,
    steps_per_epoch=train_steps,
    epochs=5,
    verbose=1,
    validation_data=img_flow_val,
    validation_steps=len_val,  # the validation flow yields one image per step
    callbacks=training_callbacks,
)

model.fit_generator(**fit_params)


In [13]:
# Validation accuracy, computed by thresholding column 1 of the softmax output at 0.5.
# NOTE(review): column 1 is presumably the label "1" class -- confirm with
# img_flow_val.class_indices.

val_outputs = model.predict_generator(img_flow_val, steps=len_val)
y_val_pred = val_outputs[:, 1]
y_val_true = img_flow_val.classes

val_decisions = (y_val_pred > 0.5).astype("int")
n_correct = np.equal(val_decisions, y_val_true).sum()
acc_val = n_correct / y_val_pred.shape[0]
print("Validation accuracy: {:.3f}.".format(acc_val))

In [14]:
# Drop the flow parameter dicts that are no longer needed, then run the garbage collector.
del img_flow_params_train
del img_flow_params_val

gc.collect()

In [15]:
# Prepare the test dataset.

path_test = "../input/histopathologic-cancer-detection/test"
# os.listdir returns entries in arbitrary order; sorting makes the order stable.
test_file_names = sorted(os.listdir(path_test))
test_files = pd.DataFrame({"file_name": test_file_names})
len_test = len(test_files)

# img_gen applies random horizontal/vertical flips (see img_gen_params). At
# inference time that would make the submitted predictions depend on chance,
# so a generator with the same normalisation but no flips is used instead.
img_gen_test = ImageDataGenerator(
    **dict(img_gen_params, horizontal_flip=False, vertical_flip=False)
)

img_flow_params_test = {
    "dataframe": test_files,
    "directory": path_test,
    "x_col": "file_name",
    "has_ext": True,
    "class_mode": None,  # no labels: the flow yields images only
    "target_size": IMAGE_SHAPE[:2],
    # One image per batch and no shuffling, so predictions line up with
    # img_flow_test.filenames.
    "batch_size": 1,
    "shuffle": False
}
img_flow_test = img_gen_test.flow_from_dataframe(**img_flow_params_test)

In [16]:
# Prediction on the test set.

pred_params = dict(
    generator=img_flow_test,
    steps=len_test,
    verbose=1,
)
test_outputs = model.predict_generator(**pred_params)
preds = test_outputs[:, 1]  # column 1 of the softmax output

# The id is the part of the file name before the first dot.
test_ids = [file_name.split(".")[0] for file_name in img_flow_test.filenames]
y_pre_int = pd.DataFrame({"id": test_ids, "label2": preds})


In [17]:
# Submission format: one row per id of the sample submission, with the
# predicted value in the "label" column.

y_pred = pd.read_csv("../input/histopathologic-cancer-detection/sample_submission.csv")
# Left merge keeps every id of the sample submission; an inner merge would
# silently drop ids that have no prediction and yield an incomplete file.
y_pred = y_pred.drop(["label"], axis=1).merge(y_pre_int, on="id", how="left")

n_missing = int(y_pred["label2"].isna().sum())
if n_missing:
    raise ValueError("{} submission ids have no prediction.".format(n_missing))

# The predicted probability is written as-is instead of a 0/1 decision:
# the competition is scored by ROC AUC, which depends on the ranking of the
# scores, and thresholding at 0.5 throws that ranking away.
# NOTE(review): confirm the metric on the competition's evaluation page.
y_pred = y_pred.rename(columns={"label2": "label"})

y_pred.to_csv("submission.csv", index=False)

y_pred.head(10)