In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_files
import keras
from keras.utils import np_utils
from glob import glob

Using TensorFlow backend.


Load the dataset

In [2]:
def load_dataset(path, num_classes=3):
    """Collect shuffled image file paths and one-hot labels from a directory.

    Args:
        path: Root directory handed to ``sklearn.datasets.load_files``.
        num_classes: Width of the one-hot label vectors. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        Tuple ``(img_files, img_targets)``: an array of file paths and the
        matching one-hot targets built by ``np_utils.to_categorical``.
    """
    # Only 'filenames' and 'target' are used, so skip reading every file's
    # bytes into memory (load_content=False). The shuffle is applied to the
    # paths and targets regardless of whether the contents are loaded.
    data = load_files(path, shuffle=True, load_content=False)
    img_files = np.array(data['filenames'])
    img_targets = np_utils.to_categorical(np.array(data['target']), num_classes)
    return img_files, img_targets

In [None]:
# Shuffled file paths and one-hot labels for the training and validation splits.
(train_files, train_labels), (valid_files, valid_labels) = (
    load_dataset('data/train'),
    load_dataset('data/valid'),
)

In [None]:
def load_dataset_no_shuffle(path, num_classes=3):
    """Collect image file paths and one-hot labels without shuffling them.

    Args:
        path: Root directory handed to ``sklearn.datasets.load_files``.
        num_classes: Width of the one-hot label vectors. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        Tuple ``(img_files, img_targets)``: an array of file paths in the
        order ``load_files`` lists them, and the matching one-hot targets
        built by ``np_utils.to_categorical``.
    """
    # Only 'filenames' and 'target' are used, so skip reading every file's
    # bytes into memory (load_content=False).
    data = load_files(path, shuffle=False, load_content=False)
    img_files = np.array(data['filenames'])
    img_targets = np_utils.to_categorical(np.array(data['target']), num_classes)
    return img_files, img_targets

In [None]:
# The test split goes through the non-shuffling loader; `test_files` is later
# written out as the 'Id' column of the submission file.
test_files, test_labels = load_dataset_no_shuffle('data/test')

In [None]:
from keras.preprocessing import image
from tqdm import tqdm

def path_to_tensor(path):
    """Load a single image file as a batch-of-one array.

    The image is resized to 224x224 while loading, converted to an array,
    and given a new leading axis so several results can be stacked on axis 0.
    """
    resized = image.load_img(path, target_size=(224, 224))
    pixels = image.img_to_array(resized)
    return pixels[np.newaxis, ...]

def paths_to_tensor(paths):
    """Load every image listed in ``paths`` and stack them along axis 0.

    Loading progress is reported through a tqdm progress bar.
    """
    batches = []
    for img_path in tqdm(paths):
        batches.append(path_to_tensor(img_path))
    return np.vstack(batches)

In [None]:
# Decode each split into one stacked float32 array and divide the pixel values by 255.
train_tensors = paths_to_tensor(train_files).astype(np.float32) / 255
valid_tensors = paths_to_tensor(valid_files).astype(np.float32) / 255
test_tensors = paths_to_tensor(test_files).astype(np.float32) / 255

Train on InceptionV3

In [None]:
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential

In [None]:
from keras.applications.inception_v3 import InceptionV3

# ImageNet-pretrained InceptionV3 without its top classification layers
# (include_top=False); it is only used through .predict() to produce features.
model_inception = InceptionV3(weights = 'imagenet', include_top = False)

In [None]:
# Run every split through the InceptionV3 base once and keep the outputs;
# the classifier head defined afterwards is trained on these feature arrays.
train_features_inception = model_inception.predict(train_tensors, verbose=1)
valid_features_inception = model_inception.predict(valid_tensors, verbose=1)
test_features_inception = model_inception.predict(test_tensors, verbose=1)

In [None]:
# Classifier head for the pre-computed InceptionV3 features: global average
# pooling, three ReLU dense layers each preceded by dropout, then a 3-way softmax.
model = Sequential([
    GlobalAveragePooling2D(input_shape=train_features_inception.shape[1:]),
    Dropout(0.2),
    Dense(1024, activation='relu'),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(3, activation='softmax'),
])

model.summary()

In [None]:
# Adam optimiser with a small learning rate and a slight learning-rate decay.
opt = keras.optimizers.Adam(lr=1e-4, decay=1e-6)
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
import os

from keras.callbacks import ModelCheckpoint

# The checkpoint file lives in saved_models/; create the folder up front so
# the first save cannot fail on a fresh checkout where it does not exist yet.
os.makedirs('saved_models', exist_ok=True)

# save_best_only=True: the file is only overwritten when the monitored
# validation metric improves, so it ends up holding the best epoch's weights.
checkpoint = ModelCheckpoint(save_best_only=True, verbose=1, filepath='saved_models/weights.best.from_inception.hdf5')

model.fit(train_features_inception,
          train_labels,
          epochs=25,
          batch_size=64,
          validation_data=(valid_features_inception, valid_labels),
          callbacks=[checkpoint],
          verbose=1)

In [None]:
# Restore the weights written by the ModelCheckpoint callback (best epoch only,
# because it was created with save_best_only=True).
model.load_weights('saved_models/weights.best.from_inception.hdf5')

In [None]:
# Predicted class index for every test image.
test_predictions = np.argmax(model.predict(test_features_inception), axis=1)
# Percentage of test images whose predicted class equals the true class.
# The mean is taken over the boolean match array; previously a misplaced
# parenthesis divided the true labels by N *before* the comparison, which
# produced a meaningless number.
accuracy = 100 * np.mean(test_predictions == np.argmax(test_labels, axis=1))
print('Accuracy of Inception model on test set = %.4f%%' % accuracy)

In [None]:
# Spot-check test sample 25: true class index first, predicted class index second.
# NOTE(review): relies on `test_predictions` still holding class indices, i.e.
# this cell must run before the later cell that reassigns it to probabilities.
print(np.argmax(test_labels[25]))
print(test_predictions[25])


In [None]:
import cv2

# cv2.imread yields BGR channel order; convert to RGB before handing the
# image to matplotlib so the colours are displayed correctly.
img = cv2.cvtColor(cv2.imread(test_files[25]), cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.show()

In [None]:
# Raw softmax outputs for the test set; the submission uses the scores in
# column 0 (task 1) and column 2 (task 2).
test_predictions = model.predict(test_features_inception)
test_predictions_task1, test_predictions_task2 = test_predictions[:, 0], test_predictions[:, 2]

In [None]:
# One row per test image: its file path plus the two task scores.
submission_inception = pd.DataFrame({
    'Id': test_files,
    'task_1': test_predictions_task1,
    'task_2': test_predictions_task2,
})
submission_inception.to_csv('submission.csv', index=False)

In [None]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

from sklearn.metrics import roc_curve, auc, confusion_matrix

def plot_roc_auc(y_true, y_pred):
    """Plot the ROC curves of the two tasks and print their AUC scores.

    Args:
        y_true: 2-D array; column ``i`` holds the ground-truth labels of
            task ``i + 1`` (only columns 0 and 1 are read).
        y_pred: 2-D array with the same layout holding the predicted scores.

    Prints three scores: the ROC AUC of task 1, of task 2, and (as
    "Category 3") the average of the two.
    """
    #initialise dictionaries and array
    fpr = dict()
    tpr = dict()
    roc_auc = np.zeros(3)

    #prepare for figure
    plt.figure()
    colors = ['aqua', 'cornflowerblue']

    #for both classification tasks (categories 1, 2)
    for i in range(2):
        #obtain ROC curve
        fpr[i], tpr[i], _ = roc_curve(y_true[:,i], y_pred[:,i])
        #obtain ROC AUC
        roc_auc[i] = auc(fpr[i], tpr[i])
        #plot ROC curve (the format spec needs its closing brace, otherwise
        #str.format raises ValueError before anything is drawn)
        plt.plot(fpr[i], tpr[i], color=colors[i], lw=2,
                 label='ROC curve for task {d} (area = {f:.2f})'.format(d=i+1, f=roc_auc[i]))

    #get score for category 3: the mean of the two task scores
    roc_auc[2] = np.average(roc_auc[:2])

    #format figure
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curves')
    plt.legend(loc="lower right")
    plt.show()

    #print scores
    for i in range(3):
        print('Category {d} Score: {f:.3f}'.format(d=i+1, f=roc_auc[i]))

def plot_confusion_matrix(y_true, y_pred, thresh, classes):
    """Plot a row-normalised confusion matrix for one task.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores; values ``>= thresh`` become class 1,
            everything else class 0.
        thresh: Decision threshold applied to ``y_pred``.
        classes: Class names used as tick labels on both axes.
    """
    #obtain class predictions from probabilities
    y_pred = (y_pred>=thresh)*1
    #obtain unnormalised confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    #normalise confusion matrix so that every true-label row sums to 1
    cm = cm.astype('float')  / cm.sum(axis=1)[:, np.newaxis]

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    #cells darker than half the maximum get white text, the rest black;
    #kept in its own name so the `thresh` argument is not shadowed
    text_thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f'),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > text_thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

if __name__ == '__main__':

    # The predictions path used to be taken from sys.argv[1] and then ignored.
    # Inside a notebook sys.argv belongs to the kernel launcher (and indexing
    # it fails when no argument is present), so the path is set explicitly.
    preds_path = 'submission.csv'

    thresh = 0.5

    #get ground truth labels for test dataset
    truth = pd.read_csv('ground_truth.csv')
    # DataFrame.as_matrix was removed in pandas 1.0; selecting the columns and
    # taking .values yields the same array on old and new pandas alike.
    y_true = truth[['task_1', 'task_2']].values

    #get model predictions for test dataset
    y_pred = pd.read_csv(preds_path)
    y_pred = y_pred[['task_1', 'task_2']].values

    #plot roc curves and print scores
    plot_roc_auc(y_true, y_pred)

    #plot confusion matrix
    classes = ['benign', 'malignant']
    plot_confusion_matrix(y_true[:,0], y_pred[:,0], thresh, classes)

Train on InceptionResNetV2

In [None]:
from keras.applications.inception_resnet_v2 import InceptionResNetV2

model_inception_resnet = InceptionResNetV2(weights = 'imagenet', include_top = False)

In [None]:
model_inception_resnet = Sequential()

model_inception_resnet.add(GlobalAveragePooling2D(input_shape = train_features_inception.shape[1:]))
model_inception_resnet.add(Dropout(0.2))
model_inception_resnet.add(Dense(1024, activation = 'relu'))
model_inception_resnet.add(Dropout(0.2))
model_inception_resnet.add(Dense(512, activation = 'relu'))
model_inception_resnet.add(Dropout(0.2))
model_inception_resnet.add(Dense(128, activation = 'relu'))
model_inception_resnet.add(Dropout(0.2))
model_inception_resnet.add(Dense(3, activation = 'softmax'))

model_inception_resnet.summary()

In [None]:
opt = keras.optimizers.Adam(lr=0.0001, decay=1e-6)
model_inception_resnet.compile(optimizer=opt, metrics=['accuracy'], loss='categorical_crossentropy')

In [None]:
from keras.callbacks import ModelCheckpoint

checkpoint_inception = ModelCheckpoint(
    save_best_only = True,
    verbose = 1,
    filepath = 'saved_models/weights.best.from_inception_resnet_v2.hdf5'
)

model_inception_resnet.fit(train_features_inception,
                           train_labels,
                           epochs=35,
                           batch_size=64,
                           validation_data=(valid_features_inception, valid_labels), callbacks=[checkpoint_inception], verbose=1)

In [None]:
model_inception_resnet.load_weights('saved_models/weights.best.from_inception_resnet_v2.hdf5')

In [None]:
test_predictions = np.argmax(model_inception_resnet.predict(test_features_inception), axis=1)
accuracy = 100*np.sum(np.array(test_predictions) == np.argmax(test_labels, axis=1))/len(test_predictions)

print('Accuracy of Inception model on test set = %.4f%%' % accuracy)

In [None]:
submission_inception_resnet = pd.DataFrame({'Id':test_files, 'task_1':test_predictions_task1, 'task_2':test_predictions_task2})
pd.DataFrame.to_csv(submission_inception_resnet, 'submission.csv', index=False)

In [None]:
# The predictions path used to be taken from sys.argv[1] and then ignored.
# Inside a notebook sys.argv belongs to the kernel launcher (and indexing it
# fails when no argument is present), so the path is set explicitly.
preds_path = 'submission.csv'

thresh = 0.5

#get ground truth labels for test dataset
truth = pd.read_csv('ground_truth.csv')
# DataFrame.as_matrix was removed in pandas 1.0; selecting the columns and
# taking .values yields the same array on old and new pandas alike.
y_true = truth[["task_1", "task_2"]].values

#get model predictions for test dataset
y_pred = pd.read_csv(preds_path)
y_pred = y_pred[["task_1", "task_2"]].values

#plot ROC curves and print scores
plot_roc_auc(y_true, y_pred)
#plot confusion matrix
classes = ['benign', 'malignant']
plot_confusion_matrix(y_true[:,0], y_pred[:,0], thresh, classes)
