In [7]:
import matplotlib.pyplot as plt
#from PIL import Image
import matplotlib.image as mpimg
import numpy as np
#from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras import layers, models
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply
from tensorflow.keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy
import tensorflow.metrics
import pandas as pd
from sklearn.model_selection import train_test_split
import os, shutil, random
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

### Define preprocessing function for demographic data

### Note: age was rounded in steps of 5

In [8]:
def preprocess_dem(train,test):
    """Impute, scale and encode the demographic columns ``age`` and ``sex``.

    Every statistic (age median, age min/max range, most frequent sex) is
    derived from ``train`` only and then applied to both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a numeric ``age`` column (may contain NaN) and a ``sex``
        column of strings, where "unknown" marks a missing value.

    Returns
    -------
    tuple of (train, test)
        Copies of the inputs in which ``age`` is min-max scaled with a scaler
        fitted on ``train`` and ``sex`` is 1 for "female" and 0 otherwise.
        The frames passed in are not modified.
    """
    # Work on copies so the caller's frames stay untouched and re-running the
    # cell on the same raw frames gives the same result.
    train = train.copy()
    test = test.copy()

    # Replace missing age values by the median age of the training data.
    age_median = train['age'].median()
    train['age'] = train['age'].fillna(age_median)
    test['age'] = test['age'].fillna(age_median)

    # Min-max scale age; the range is learned on the training data only.
    cs = MinMaxScaler()
    train["age"] = cs.fit_transform(train["age"].values.reshape(-1, 1)).ravel()
    test["age"] = cs.transform(test["age"].values.reshape(-1, 1)).ravel()

    # The most frequent *known* sex has to be determined before the column is
    # encoded; taking it afterwards would yield 0/1 instead of a label and the
    # imputed rows would always end up as 0.
    known_sex = train.loc[train['sex'] != "unknown", 'sex']
    sex_counts = known_sex.value_counts()
    # Without any known label there is nothing to impute with; "unknown" is
    # then encoded as 0 like every other non-"female" value.
    most_common_sex = sex_counts.index[0] if len(sex_counts) else "unknown"

    def _encode_sex(column):
        # Impute "unknown" first, then binarise.
        imputed = column.mask(column == "unknown", most_common_sex)
        return np.where(imputed == "female", 1, 0)

    train["sex"] = _encode_sex(train["sex"])
    test["sex"] = _encode_sex(test["sex"])

    return (train, test)

### Load demographic data for all images (incl. augmented images)

In [9]:
# The seven diagnosis labels.
diagnose = ['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec']

# Root folder that holds the "train" and "validation" image folders. Set the
# environment variable ISIC_DATA_DIR to run the notebook on another machine;
# without it the original location is used.
_default_data_root = "/Users/leona/PythonWork/Github_Projects/Final_Pro/data/ISIC2018_Task3_Training_Input"
data_root = (os.environ.get("ISIC_DATA_DIR") or _default_data_root).rstrip("/")

# Both paths keep their trailing slash: file paths are later built by plain
# string concatenation.
train_dir = data_root + "/train/"
validation_dir = data_root + "/validation/"

### Load main, train and test dataframe

In [116]:
# Load the main metadata table from a local pickle. `meta` is not referenced
# again in the cells visible here.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
meta = pd.read_pickle("./meta.pkl")

In [150]:
# Training rows. The file name suggests they include the augmented images -
# TODO confirm.
train = pd.read_pickle("./aug_train.pkl")

In [151]:
# Hold-out rows: read from "aug_test.pkl" but used as the validation set in
# the cells below.
validation = pd.read_pickle("./aug_test.pkl")

In [152]:
# Apply the demographic preprocessing (age imputation/scaling, sex encoding)
# to both frames; the training frame is passed first.
train_pr, validation_pr = preprocess_dem(train,validation)

### Define custom image generator

In [158]:
# One-hot encoder for the diagnosis labels, fitted on the training labels so
# that every generator below shares the same class order.
lb = LabelBinarizer().fit(train_pr.dx)

def get_input(path, target_size=(224, 224)):
    """Load one image file as a batch that contains a single image.

    Arguments:
    - path: location of the image file.
    - target_size: size passed to ``load_img`` for resizing; defaults to the
      224 x 224 size used throughout this notebook.

    Returns the image array with an extra leading axis of length 1, so that
    several results can be stacked into one batch.
    """
    img = load_img(path, target_size=target_size)
    return np.expand_dims(img_to_array(img), axis=0)

def preprocess_input(img):
    """Rescale pixel values from [0, 255] to [-1, 1].

    Same arithmetic as the original in-place version (divide by 255,
    subtract 0.5, multiply by 2), but carried out on a copy so the caller's
    array is left untouched. Floating-point input keeps its dtype; any other
    dtype (e.g. uint8 images) is converted to float32 first, because in-place
    true division is not possible on integer arrays.

    Arguments:
    - img: array-like of pixel values.

    Returns a new floating-point array of the same shape.
    """
    img = np.asarray(img)
    out_dtype = img.dtype if np.issubdtype(img.dtype, np.floating) else np.float32
    img = img.astype(out_dtype, copy=True)
    img /= 255.
    img -= 0.5
    img *= 2.
    return img

def multi_input_generator(df, batch_size, source_dir,shuffle=True):
    """Yield ``([images, demographics], labels)`` batches without end.

    Arguments:
    - df: frame with the columns ``dx`` (class label, also used as sub-folder
      name), ``aug_id`` (image file name), ``age`` and ``sex``.
    - batch_size: number of rows per batch.
    - source_dir: either the train or the validation directory; it must end
      with a path separator because the file path is built by concatenation.
    - shuffle: if True, every batch is an independent random sample of ``df``
      (drawn without replacement within the batch). If False, rows are served
      in frame order, ``batch_size`` rows at a time, and the generator starts
      again at the first row after the last one. The final batch of a pass
      may be shorter than ``batch_size``.

    Labels are one-hot encoded with the module-level ``lb``.

    -> important: for test batches set shuffle=False so that the batch order
       follows the row order of ``df``.

    Raises ValueError (on the first ``next``) if ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("df must contain at least one row")

    columns = ["dx", "aug_id", "age", "sex"]
    start = 0  # position of the next row in sequential mode

    # The loop never terminates: Keras expects generators to loop over their
    # data indefinitely and reads ahead of the requested number of steps.
    while True:
        if shuffle:
            batch = df.sample(n=batch_size, replace=False)
        else:
            # Positional slicing works for any batch size and any index.
            batch = df.iloc[start:start + batch_size]
            start += batch_size
            if start >= len(df):
                start = 0

        images, demographics, labels = [], [], []

        # Read in each input, perform preprocessing and get labels.
        # itertuples is used so that duplicate index labels cannot turn a
        # row lookup into a multi-row result.
        for row in batch[columns].itertuples(index=False):
            full_path = source_dir + str(row.dx) + "/" + str(row.aug_id)
            images.append(preprocess_input(get_input(full_path)))
            demographics.append([row.age, row.sex])
            labels.append(row.dx)

        # Every image arrives with a leading axis of length 1; joining along
        # that axis gives one array per batch.
        batch_x1 = np.concatenate(images, axis=0)
        batch_x2 = np.array(demographics, dtype="float32")
        batch_y = lb.transform(np.array(labels)).astype("float32")

        # ([input, input], output) to feed the two-input network
        yield [batch_x1, batch_x2], batch_y

In [160]:
# Training batches of 10 rows each, read from train_dir; shuffle is left at
# its default (True).
train_batches = multi_input_generator(train_pr,10,train_dir)

In [170]:
# Pull one batch for inspection: aa receives the inputs, bb the labels.
# NOTE(review): test_batches is only created in a cell further down, so this
# fails on a fresh top-to-bottom run; it also advances that generator by one
# batch before it is used for evaluation.
aa,bb = next(test_batches)

In [171]:
# Display the inputs of that batch: a list holding the image array and the
# demographic array.
aa

[array([[[[ 0.78039217, -0.01960784,  0.09019613],
          [ 0.7647059 , -0.05098039,  0.00392163],
          [ 0.78039217, -0.01960784,  0.09019613],
          ...,
          [ 0.8039216 ,  0.05882359,  0.10588241],
          [ 0.77254903,  0.02745104,  0.05882359],
          [ 0.7882353 ,  0.02745104,  0.11372554]],
 
         [[ 0.7647059 , -0.02745098,  0.03529418],
          [ 0.75686276,  0.00392163,  0.09019613],
          [ 0.7490196 , -0.01176471,  0.07450986],
          ...,
          [ 0.78039217,  0.05098045,  0.11372554],
          [ 0.7647059 ,  0.03529418,  0.09803927],
          [ 0.7647059 ,  0.0196079 ,  0.082353  ]],
 
         [[ 0.77254903,  0.01176476,  0.09803927],
          [ 0.7882353 ,  0.00392163,  0.09803927],
          [ 0.77254903, -0.03529412, -0.00392157],
          ...,
          [ 0.8117647 ,  0.05098045,  0.13725495],
          [ 0.75686276,  0.0196079 ,  0.14509809],
          [ 0.81960785,  0.02745104,  0.15294123]],
 
         ...,
 
         [[ 

In [23]:
# Validation batches of 10 rows each, read from validation_dir; shuffle is
# left at its default (True).
valid_batches = multi_input_generator(validation_pr,10,validation_dir)

In [169]:
# Unshuffled single-row batches over the validation frame, used below for
# evaluation and prediction.
test_batches = multi_input_generator(validation_pr,1, validation_dir,shuffle=False)

## Combine models

### Setup model input

In [25]:
num_train_samples = 8912 # length of original not-augmented train data
num_val_samples = 1103
train_batch_size = 10
val_batch_size = 10
image_size = 224

# Number of batches per epoch, rounded up so that a partial last batch still
# counts. Cast to int: step counts are used as loop bounds by Keras, and
# np.ceil alone returns a float.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

In [26]:
# Load a previously saved model without its optimizer/loss state
# (compile=False); it is compiled again in a later cell.
# NOTE(review): this is the same file name the training checkpoint further
# down writes to, so a completed training run replaces what is loaded here -
# confirm that this is the intended base model.
cnn = tensorflow.keras.models.load_model('combined_model_new.h5', compile=False)

W0818 18:10:03.331333 4680467904 deprecation.py:506] From /Users/leona/anaconda3/envs/finalpro/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0818 18:10:03.332475 4680467904 deprecation.py:506] From /Users/leona/anaconda3/envs/finalpro/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0818 18:10:03.333572 4680467904 deprecation.py:506] From /Users/leona/anaconda3/envs/finalpro/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling

In [66]:
# Use the output of the third-from-last layer of the loaded model as the
# image feature vector for the merged model.
# NOTE(review): the layer type named in the trailing comment cannot be
# verified from this notebook - check cnn.summary().
cnn_output = cnn.layers[-3].output # global average pooling layer

In [68]:
# Demographic branch: a 2-value input followed by a 16-unit ReLU layer and
# 25 % dropout.
input_dem = Input(shape=(2,))
x_dem = layers.Dense(16, activation="relu", name="dense_dem")(input_dem)
x_dem = layers.Dropout(0.25, name="dropout_dem")(x_dem)

In [69]:
# Join the image features and the demographic features into one tensor.
merge_layer = layers.concatenate(inputs=[cnn_output,x_dem])

In [70]:
# Classification head on top of the merged features:
# 64 ReLU units -> 25 % dropout -> 32 ReLU units -> softmax over 7 outputs.
x = layers.Dense(64, activation="relu")(merge_layer)
x = layers.Dropout(0.25)(x)
x = layers.Dense(32, activation="relu")(x)
predictions = layers.Dense(7, activation='softmax')(x)

In [75]:
# Assemble the merged model: image input(s) of the loaded model plus the
# demographic input, ending in the softmax head.
# NOTE(review): if cnn.input is itself a list (multi-input model) this nests
# lists and adds a further input - confirm against the generator, which
# yields exactly two inputs.
combined_model_new = Model(inputs=[cnn.input, input_dem], outputs=predictions)

### Compile merged model

In [76]:
# We need to choose how many layers we actually want to be trained.

# Here we are freezing the weights of all layers except the
# last 10 layers in the new model (the slice below is [:-10]).
# The last 10 layers keep their current trainable setting and are the only
# ones that can be updated during training.

for layer in combined_model_new.layers[:-10]:
    layer.trainable = False

In [29]:
# Define Top3 Accuracy
def top_3_accuracy(y_true, y_pred):
    """Top-k categorical accuracy with k fixed to 3.

    Thin wrapper so the metric can be handed to ``compile`` and shows up in
    the training log under the name ``top_3_accuracy``.
    """
    return top_k_categorical_accuracy(y_true=y_true, y_pred=y_pred, k=3)

In [30]:
# Compile both models with identical settings:
# - combined_model_new is the model that is fitted below and must be compiled
#   after its layers were frozen, otherwise fit_generator refuses to run;
# - cnn is evaluated further down and was loaded with compile=False.
# Each model gets its own optimizer instance.
for _model in (cnn, combined_model_new):
    _model.compile(Adam(lr=0.01), loss='categorical_crossentropy',
                   metrics=[categorical_accuracy, top_3_accuracy])

#### Define class weights to account for class imbalance

In [59]:
# Per-class loss weights keyed by class index. Only melanoma deviates from 1
# to make the model more sensitive to it.
class_weights = dict(enumerate([
    1.0,  # 0: akiec
    1.0,  # 1: bcc
    1.0,  # 2: bkl
    1.0,  # 3: df
    3.0,  # 4: mel
    1.0,  # 5: nv
    1.0,  # 6: vasc
]))

In [78]:
# File that receives the model whenever the validation accuracy improves.
filepath = "combined_model_new.h5"

# Keep only the best model, judged by validation categorical accuracy.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_categorical_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# Halve the learning rate after 2 epochs without improvement, but never go
# below 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_categorical_accuracy',
    factor=0.5,
    patience=2,
    verbose=1,
    mode='max',
    min_lr=0.00001,
)

callbacks_list = [checkpoint, reduce_lr]

# Train the merged model for up to 30 epochs on the generator batches.
combined_new_history = combined_model_new.fit_generator(
    train_batches,
    steps_per_epoch=train_steps,
    epochs=30,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=valid_batches,
    validation_steps=val_steps,
    class_weight=class_weights,
)

Epoch 1/30
Epoch 00001: val_categorical_accuracy improved from -inf to 0.71712, saving model to combined_model_new.h5
Epoch 2/30
Epoch 00002: val_categorical_accuracy improved from 0.71712 to 0.77477, saving model to combined_model_new.h5
Epoch 3/30
Epoch 00003: val_categorical_accuracy did not improve from 0.77477
Epoch 4/30
Epoch 00004: val_categorical_accuracy did not improve from 0.77477

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
Epoch 5/30
Epoch 00005: val_categorical_accuracy did not improve from 0.77477
Epoch 6/30
146/892 [===>..........................] - ETA: 8:24 - loss: 0.6769 - categorical_accuracy: 0.8068 - top_3_accuracy: 0.9726

KeyboardInterrupt: 

In [44]:
# Restore the weights stored in the checkpoint file.
cnn.load_weights('combined_model_new.h5')

# Evaluate on every validation row exactly once, one row per step. A fresh
# generator is created here so that the pass starts at the first row no
# matter how far an earlier cell has advanced a shared generator.
val_loss, val_cat_acc, val_top_3_acc = cnn.evaluate_generator(
    multi_input_generator(validation_pr, 1, validation_dir, shuffle=False),
    steps=len(validation_pr))

print('val_loss:', val_loss)
print('val_cat_acc:', val_cat_acc)
print('val_top_3_acc:', val_top_3_acc)

val_loss: 0.6949561887735702
val_cat_acc: 0.77425206
val_top_3_acc: 0.9428831


### Get predictions, ytrue and ypred

In [47]:
# Predict every validation row once, in frame order. A fresh generator is
# used so that row k of `predictions` belongs to row k of validation_pr;
# a generator that was already consumed by evaluation (Keras also reads
# ahead of the requested steps) would shift the two against each other.
predictions = cnn.predict_generator(
    multi_input_generator(validation_pr, 1, validation_dir, shuffle=False),
    steps=len(validation_pr), verbose=1)



In [None]:
# cnn.predict()  # disabled: without input data this call raises a TypeError and stops "Run All"

In [83]:
# Integer class codes of the true labels. The categories are pinned to the
# class order of the fitted label binarizer, i.e. the order of the model's
# output columns; inferring them from the validation labels alone would shift
# the codes whenever a class is missing from the validation set.
ytrue = pd.Categorical(validation_pr.dx, categories=lb.classes_).codes
ytrue

array([5, 5, 5, ..., 5, 5, 5], dtype=int8)

In [84]:
# Predicted class = index of the largest probability in each row.
y_pred = predictions.argmax(axis=1)
y_pred

array([5, 5, 5, ..., 5, 4, 5])

In [77]:
# Display names of the 7 classes in alphabetical order.
# NOTE(review): presumably this matches the integer codes in ytrue / y_pred -
# verify against lb.classes_.
plot_labels = ["akiec", "bcc", "bkl", "df", "mel", "nv", "vasc"]

In [85]:
from sklearn.metrics import classification_report

# Generate a classification report. The label codes are listed explicitly so
# that each name in plot_labels stays attached to its own code even if a
# class occurs neither in ytrue nor in y_pred; without `labels` a missing
# class makes the number of names and classes disagree.
report = classification_report(
    ytrue,
    y_pred,
    labels=list(range(len(plot_labels))),
    target_names=plot_labels,
)

print(report)

              precision    recall  f1-score   support

       akiec       0.67      0.27      0.38        30
         bcc       1.00      0.17      0.29        35
         bkl       0.44      0.17      0.25        88
          df       0.55      0.75      0.63         8
         mel       0.15      0.70      0.25        46
          nv       0.96      0.88      0.92       883
        vasc       0.39      0.69      0.50        13

    accuracy                           0.77      1103
   macro avg       0.59      0.52      0.46      1103
weighted avg       0.87      0.77      0.80      1103



### Print confusion matrix

In [89]:
# Confusion matrix of true codes vs. arg-max predictions. With scikit-learn's
# convention, rows are true classes and columns predicted classes.
# NOTE(review): class order presumably follows plot_labels - confirm.
confusion_matrix(ytrue, predictions.argmax(axis=1))

array([[  8,   0,   3,   1,  16,   1,   1],
       [  0,   6,   5,   0,  18,   4,   2],
       [  3,   0,  15,   0,  55,  13,   2],
       [  0,   0,   0,   6,   1,   1,   0],
       [  1,   0,   3,   1,  32,   9,   0],
       [  0,   0,   8,   3,  85, 778,   9],
       [  0,   0,   0,   0,   2,   2,   9]])

### Display loss and accuracy curves

In [None]:
# display the loss and accuracy curves

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette= "Set2", style= 'darkgrid')
sns.set_context("notebook",font_scale=1.5, rc={"lines.linewidth": 2.5})

# The training result is stored in `combined_new_history` by the fit cell;
# no variable called `history` is defined in this notebook.
history_log = combined_new_history.history

acc = history_log['categorical_accuracy']
val_acc = history_log['val_categorical_accuracy']
loss = history_log['loss']
val_loss = history_log['val_loss']
train_top3_acc = history_log['top_3_accuracy']
val_top3_acc = history_log['val_top_3_accuracy']
epochs = range(1, len(acc) + 1)

# One figure per metric: (title, training values, label, validation values, label)
curves = [
    ('Training and validation loss',
     loss, 'Training loss', val_loss, 'Validation loss'),
    ('Training and validation cat accuracy',
     acc, 'Training cat acc', val_acc, 'Validation cat acc'),
    ('Training and validation top3 accuracy',
     train_top3_acc, 'Training top3 acc', val_top3_acc, 'Validation top3 acc'),
]

for title, train_values, train_label, val_values, val_label in curves:
    plt.figure(figsize=(15,5))
    plt.plot(epochs, train_values, label=train_label)
    plt.plot(epochs, val_values, label=val_label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.legend()

plt.show()