In [None]:
# Source code
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
from PIL import Image
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Problem Statement

>### The project aims to detect cancer in small pathology images. It uses a CNN approach to train on and predict the data.
>### There are 220025 images in the training data, using about 3.4MB. The images are labeled either 0 or 1, referring to their category (negative or positive result). 130908 are negative and 89117 are positive, and there seem to be no null entries. Since the test set is not labeled, 20% of the training set will be selected for validation.
>### Each image file is around 28kB. Each one is a TIFF image of 96x96 pixels in RGB.

In [None]:
# Locations of the competition data inside the read-only Kaggle input directory.
dir_images_test = '/kaggle/input/histopathologic-cancer-detection/test/'
dir_images_train = '/kaggle/input/histopathologic-cancer-detection/train/'
labels_train = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')

# Derive the frame used by the image generators as a NEW object instead of
# aliasing `labels_train`, so the raw labels are not silently mutated:
#   - `label` is cast to str (flow_from_dataframe with class_mode='binary'
#     expects string class values),
#   - `id` gets the '.tif' extension so it matches the file names on disk.
images_train = labels_train.assign(
    label=labels_train['label'].astype(str),
    id=labels_train['id'] + '.tif',
)

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
images_train.info()

# Class balance of the training labels.
plt.hist(images_train['label'])
plt.title('Training label distribution')
plt.xlabel('Label')
plt.ylabel('Number of images')
plt.show()

# Only the last expression of a cell is echoed, so show the counts explicitly.
display(images_train['label'].value_counts())

# Open one sample image to inspect its format/mode/size; the context manager
# releases the file handle afterwards.
with Image.open(dir_images_train + images_train['id'].iloc[0]) as imageData:
    print("Image Meta Data:", imageData)

# 2. Exploratory Data Analysis (EDA) 
> ### The images show various patterns. We can find points, edges and empty areas in the data. The majority contain dense points. Some of them have a large empty area. A few of them mainly contain edges and big points.

In [None]:
from tifffile import imread
# Preview the first rows * cols training images in a grid.
rows, cols = 10, 10

fig, axes = plt.subplots(rows, cols, figsize=(10, 10))

# axes.flat walks the grid row by row, so the i-th axis is the one at
# (i // cols, i % cols).
for i, ax in enumerate(axes.flat):
    ax.imshow(imread(dir_images_train + images_train['id'][i]))
    ax.axis('off')

plt.subplots_adjust(wspace=0.1, hspace=0.3)
plt.show()

# 3. Model Architecture 
> ### A VGG-style model is going to be used.
> ### The first model has 12 layers. The architecture is as follows:

> ### Conv2D -> Conv2D -> MaxPool2D -> Conv2D -> Conv2D -> MaxPool2D -> Conv2D -> Conv2D -> MaxPool2D -> Flatten -> Dense -> Output (Dense)
> ### Since we found 3 patterns in the training images, the above structure is used to try to pick out those patterns.

> ### Activation Functions
> ### ReLU is used for the hidden layers, since it converges better and needs less computation. Sigmoid is used for the output layer, which suits binary detection.

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D , Flatten, BatchNormalization, Activation, Dropout
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
# Hold out 20% of the labelled training images for validation.
# NOTE(review): no `rescale` is configured, so the network is fed unscaled pixel
# values (presumably 0-255 - confirm). If scaling is added here, the test
# generator used for the submission must apply the same scaling.
generator = ImageDataGenerator(validation_split=0.2)
batchSize = 1000

# Arguments shared by both iterators; only `subset` differs between them.
# A fixed seed makes the shuffling order reproducible across runs.
flow_kwargs = dict(
    dataframe=images_train,
    x_col='id',      # filenames
    y_col='label',   # labels
    directory=dir_images_train,
    class_mode='binary',
    batch_size=batchSize,
    target_size=(96, 96),
    seed=42,
)

data_train = generator.flow_from_dataframe(subset='training', **flow_kwargs)
data_validate = generator.flow_from_dataframe(subset='validation', **flow_kwargs)

In [None]:
# Model 1: three (Conv2D, Conv2D, MaxPool2D) stages with 8, 16 and 32 filters,
# followed by a 256-unit dense layer and a single sigmoid output unit.
model1 = keras.models.Sequential([
    # Stage 1 - 8 filters
    Conv2D(filters=8, kernel_size=(3, 3), activation='relu', input_shape=(96, 96, 3)),
    Conv2D(filters=8, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    # Stage 2 - 16 filters
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    # Stage 3 - 32 filters
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    # Classifier head
    Flatten(),
    Dense(units=256, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model1.build(input_shape=(batchSize, 96, 96, 3))
model1.summary()

In [None]:
# Train model 1: Adam (learning rate 0.01) on binary cross-entropy, tracking
# accuracy on the training and validation iterators for 10 epochs.
opt = Adam(learning_rate=0.01)
model1.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
hist1 = model1.fit(
    data_train,
    validation_data=data_validate,
    epochs=10,
)


> ### The second model has 16 layers. The architecture is as follows:

> ### Conv2D -> Conv2D -> MaxPool2D -> Batch Normalization -> Conv2D -> Conv2D -> MaxPool2D -> Batch Normalization -> Conv2D -> Conv2D -> MaxPool2D -> Batch Normalization -> Flatten -> Dense -> Dropout -> Output (Dense)
> ### The loss and accuracy of the first model stay almost constant across epochs. Therefore batch normalization is added after each pooling layer and a dropout layer is included, in order to increase the learning effectiveness and efficiency.

> ### Activation Functions
> ### They are unchanged from the first model.

In [None]:
# Model 2: same convolutional stages as model 1, with batch normalization after
# every pooling layer and 50% dropout before the sigmoid output unit.
model2 = keras.models.Sequential([
    # Stage 1 - 8 filters
    Conv2D(filters=8, kernel_size=(3, 3), activation='relu', input_shape=(96, 96, 3)),
    Conv2D(filters=8, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    BatchNormalization(),
    # Stage 2 - 16 filters
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    BatchNormalization(),
    # Stage 3 - 32 filters
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    BatchNormalization(),
    # Classifier head
    Flatten(),
    Dense(units=256, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
model2.build(input_shape=(batchSize, 96, 96, 3))
model2.summary()


In [None]:
# Train model 2 with the same optimizer settings, loss, metric and epoch count
# as model 1; a fresh Adam instance is created for this model.
opt = Adam(learning_rate=0.01)
model2.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
hist2 = model2.fit(
    data_train,
    validation_data=data_validate,
    epochs=10,
)

# 4. Results and Analysis 

> ### In the first model,
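The execution time is fair. The final training accuracy was 0.5947 (`accuracy`) and the final validation accuracy was 0.5961 (`val_accuracy`). It is a fair model. We could not see underfitting or overfitting. However, the accuracy stays almost constant: there is no significant improvement, even among the early epochs. Note that 0.595 is also the share of negative images in the training data (130908 / 220025 ≈ 0.595), so the model may simply be predicting the majority class.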


> ### In the second model,
The execution time is fair. The final training accuracy was 0.8953 (`accuracy`) and the final validation accuracy was 0.8266 (`val_accuracy`). It is a good-fit model with a nice training curve. It might be overfitting, but there is not enough evidence.
        


In [None]:
def plot_accuracy(history, title):
    """Plot training and validation accuracy per epoch for one fitted model.

    history: object returned by ``model.fit``; its ``history`` dict is read
        for the "accuracy" and "val_accuracy" series.
    title: text shown above the figure.
    """
    for metric in ("accuracy", 'val_accuracy'):
        plt.plot(history.history[metric])
    plt.title(title)
    plt.ylabel("Accuracy")
    plt.xlabel("Epoch")
    plt.legend(["Training Accuracy", "Validation Accuracy"])

    plt.show()


# One figure per model, in the order the models were trained.
for fitted, heading in ((hist1, "Model 1 Evaluation"), (hist2, "Model 2 Evaluation")):
    plot_accuracy(fitted, heading)


# 5. Conclusion 
> ### Comparing two models,
The first model performs fairly in terms of accuracy. However, its accuracy stays almost constant. After adding batch normalization and dropout, the second model solved the issue, and the accuracy improved considerably.

> ### Future Improvement
It is suggested to apply L2 regularization to further improve the training efficiency. If enough resources are available, the number of epochs could also be increased and the learning rate reduced to get better training performance. SGD or RMSProp could be tried as alternative optimization methods.

In [None]:
# Submission
# Sort the directory listing: os.listdir returns names in arbitrary order, and a
# sorted list makes the row order of the submission file deterministic.
images_test = pd.DataFrame({'id': sorted(os.listdir(dir_images_test))})

generator_test = ImageDataGenerator()
# Use a dedicated name so the training `batchSize` (read by the model-building
# cells) is not overwritten; the batch size only affects prediction speed here,
# since shuffle=False keeps the output order equal to the row order.
batchSizeTest = 500
data_test = generator_test.flow_from_dataframe(
    dataframe=images_test,
    x_col='id',  # filenames
    directory=dir_images_test,
    class_mode=None,
    batch_size=batchSizeTest,
    target_size=(96, 96),
    shuffle=False)

predictions = model2.predict(data_test, verbose=1)
pred = predictions[:, 0]  # the single sigmoid output per image, as a 1-D array

# Guard against the iterator having dropped unreadable/invalid files, which
# would silently misalign predictions and ids.
assert len(pred) == len(images_test), "number of predictions does not match number of test files"

print(pred[:10])

# NOTE(review): labels are hard-thresholded at 0.5. If the competition metric is
# ranking-based (e.g. ROC AUC), submitting `pred` directly would likely score
# better - confirm against the competition rules.
submission_df = pd.DataFrame({
    'id': images_test['id'].map(lambda name: os.path.splitext(name)[0]),
    'label': np.where(pred < 0.5, 0, 1),
})
submission_df.to_csv('/kaggle/working/submission.csv', index=False)
submission_df.head()