## Dataset
The dataset is taken from the ISIC (International Skin Imaging Collaboration) Archive. It consists of 1800 pictures of benign moles and 1497 pictures of moles classified as malignant. All pictures have been resized to a low resolution of 224x224 pixels with 3 RGB channels. The task of this kernel is to create a model that can visually classify a mole as benign or malignant.

As the dataset is fairly balanced, the model is evaluated using the accuracy score, i.e. (TP + TN) / (all samples).

The dataset has 2 different classes of skin lesions, which are listed below:<br>
**1. Benign**<br>
**2. Malignant**<br>

Step 1 : Imports

In [2]:
import os
import tqdm
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
from PIL import Image
np.random.seed(11) 
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
import itertools

import keras
import tensorflow as tf
from keras.utils.np_utils import to_categorical # used for converting labels to one-hot-encoding
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras import backend as K
from keras.layers.normalization import BatchNormalization
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.optimizers import Adam, RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau
from keras.wrappers.scikit_learn import KerasClassifier
from keras.applications.resnet50 import ResNet50
from keras import backend as K 
import gc

Using TensorFlow backend.


Step 2 : Loading Data and Creating Dataset
Load the pictures and turn them into NumPy arrays of their RGB values. The image size is 224x224.




In [3]:
# Collect the file path of each image, grouped by split and by class.
#
# Labels:
# 0 -> benign
# 1 -> malignant
DATA_ROOT = '../input/data'
CLASS_LABELS = (('benign', 0), ('malignant', 1))


def _list_split(split):
    """Return the image paths and labels found in one split folder.

    Args:
        split: Sub-directory of DATA_ROOT to scan ('train' or 'test').

    Returns:
        A pair of 1-D numpy arrays: the image paths and the matching
        integer labels (see CLASS_LABELS).
    """
    paths, labels = [], []
    for class_name, label in CLASS_LABELS:
        class_dir = DATA_ROOT + '/' + split + '/' + class_name
        # os.listdir returns names in arbitrary order; sorting keeps the
        # sample order, and therefore the seeded splits below, reproducible.
        for img_name in sorted(os.listdir(class_dir)):
            paths.append(class_dir + '/' + img_name)
            labels.append(label)
    return np.array(paths), np.array(labels)


def _load_images(paths):
    """Decode every image in *paths* and stack them into one numpy array."""
    images = []
    for filename in tqdm.tqdm(paths):
        # The context manager releases the file as soon as the pixels
        # have been copied into the array.
        with Image.open(filename) as img:
            images.append(np.array(img))
    return np.array(images)


train_imgs, train_labels = _list_split('train')
test_imgs, test_labels = _list_split('test')

# minlength=2 keeps both counters available even if one class is empty.
class_distribution = np.bincount(np.concatenate([train_labels, test_labels]),
                                 minlength=2)

print('Size of train set:', len(train_imgs))
print('Size of test set:', len(test_imgs))
print(class_distribution[0], 'benign labeled samples and', class_distribution[1], 'malignant')

# Load the images to memory
xtrain, xtest = _load_images(train_imgs), _load_images(test_imgs)
ytrain, ytest = train_labels, test_labels

del train_imgs, test_imgs, train_labels, test_labels

# Merge and split train and test set to have more train data
data = np.concatenate([xtrain, xtest])
labels = np.concatenate([ytrain, ytest])

# One-hot encode the labels to match the two-unit softmax output of the models
labels = to_categorical(labels, num_classes=2)

# Splitting data into train, validation and test sets.
# The first split shuffles the merged data; the second one only cuts off the
# tail of the already shuffled training part, hence shuffle=False.
xtrain, xtest, ytrain, ytest = train_test_split(data, labels, test_size=.2, random_state=0)
xtra, xval, ytra, yval = train_test_split(xtrain, ytrain, test_size=.2, random_state=0, shuffle=False)

gc.collect()
print('Shape of the new train set:', xtra.shape)
print('Shape of the new test set:', xtest.shape)
print('Shape of the validation set:', xval.shape)

  1%|          | 21/2637 [00:00<00:12, 205.02it/s]

Size of train set: 2637
Size of test set: 660
1800 benign labeled samples and 1497 malignant


100%|██████████| 2637/2637 [00:10<00:00, 249.96it/s]
100%|██████████| 660/660 [00:03<00:00, 220.53it/s]


Shape of the new train set: (2109, 224, 224, 3)
Shape of the new test set: (660, 224, 224, 3)
Shape of the validation set: (528, 224, 224, 3)


In [4]:
# Random augmentation: rotations, shifts, flips and brightness changes.
data_generator = ImageDataGenerator(
    fill_mode='nearest',
    brightness_range=[0.8, 1.1],
    vertical_flip=True,
    horizontal_flip=True,
    height_shift_range=0.15,
    width_shift_range=0.15,
    rotation_range=90,
)

# Request one batch as large as the training set and append it, which
# doubles the number of training samples.
aug_images, aug_labels = next(
    data_generator.flow(xtra, ytra, batch_size=len(xtra))
)
xtra = np.concatenate([xtra, aug_images])
ytra = np.concatenate([ytra, aug_labels])

# Drop the temporary references to the augmented batch.
del aug_images, aug_labels
print('New number of training samples:', len(xtra))

New number of training samples: 4218


In [5]:
def _to_unit_range(pixels):
    """Cast *pixels* to float32 and divide every value by 255."""
    return pixels.astype('float32') / 255.


# Rescale the train, test and validation images the same way.
xtra = _to_unit_range(xtra)
xtest = _to_unit_range(xtest)
xval = _to_unit_range(xval)

print('Training data shape:', xtra.shape)
print('Min value:', xtra.min())
print('Max value:', xtra.max())

Training data shape: (4218, 224, 224, 3)
Min value: 0.0
Max value: 1.0


## CNN model

In [6]:
def build_model(input_shape= (224,224,3), lr = 1e-3, num_classes= 2,
          init= 'normal', activ= 'relu', optim= 'adam'):
    """Build and compile a small two-block convolutional classifier.

    The network is two Conv2D(64) -> MaxPool2D -> Dropout(0.25) blocks,
    followed by Flatten, Dense(128) and a softmax output layer. The model
    summary is printed as a side effect.

    Args:
        input_shape: Shape of one input image, e.g. (224, 224, 3).
        lr: Learning rate passed to the optimizer.
        num_classes: Number of units in the softmax output layer.
        init: Kernel initializer of the hidden Dense layer only; the
            convolutional layers always use 'glorot_uniform'.
        activ: Activation of the convolutional layers only; the hidden
            Dense layer always uses 'relu'.
        optim: 'rmsprop' selects RMSprop; any other value selects Adam.

    Returns:
        The compiled Keras Sequential model.
    """
    model = Sequential()
    model.add(Conv2D(64, kernel_size=(3, 3), padding='Same', input_shape=input_shape,
                     activation=activ, kernel_initializer='glorot_uniform'))
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Conv2D(64, kernel_size=(3, 3), padding='Same',
                     activation=activ, kernel_initializer='glorot_uniform'))
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_initializer=init))
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()

    if optim == 'rmsprop':
        optimizer = RMSprop(lr=lr)
    else:
        optimizer = Adam(lr=lr)

    # The output is a softmax over one-hot encoded classes, so the matching
    # loss is categorical cross-entropy. Binary cross-entropy only coincides
    # with it for exactly two classes and yields a wrong loss and an inflated
    # accuracy metric for any other num_classes.
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
    return model

# Learning-rate annealer: halve the learning rate when the validation
# accuracy has not improved for 5 epochs, but never go below 1e-7.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=1,
)



In [8]:
# Hyper-parameters of this training run
epochs, batch_size = 50, 64
lr = 1e-5
optim = 'adam'
activ = 'relu'
init = 'normal'
input_shape = (224, 224, 3)

model = build_model(input_shape=input_shape, lr=lr, init=init,
                    activ=activ, optim=optim)

# Train on the augmented training set and validate on the validation split
history = model.fit(
    xtra, ytra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(xval, yval),
    shuffle=True,
    callbacks=[learning_rate_reduction],
    verbose=1,
)

# Show which metrics were recorded during training
print(history.history.keys())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 224, 224, 64)      1792      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 112, 112, 64)      0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 112, 112, 64)      0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 112, 112, 64)      36928     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 56, 56, 64)        0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 56, 56, 64)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 200704)            0         
__________

Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Epoch 00050: ReduceLROnPlateau reducing learning rate to 1.56249996052793e-07.
dict_keys(['val_loss', 'val_acc', 'loss', 'acc', 'lr'])


In [None]:
# Evaluate the trained model on the held-out test data.
# `Sequential.predict_classes` has been removed from recent TensorFlow/Keras
# releases; taking the argmax over the predicted class probabilities yields
# the same labels and works on both old and new versions.
y_pred = np.argmax(model.predict(xtest), axis=1)

# ytest is one-hot encoded, so convert it back to class indices first
print(accuracy_score(np.argmax(ytest, axis=1), y_pred))

The CNN above is not a very sophisticated model, so a ResNet50 is tried as well.