In [1]:
import os
import sys
import numpy as np
import keras.callbacks as cb
import keras.utils.np_utils as np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras.layers.core import Activation
from keras import applications # For easy loading the VGG_16 Model
from skimage import color
# Image loading and other helper functions
import dwdii_bc_model_helper as bc

Using Theano backend.


### VGG_Prep
The following function takes the 8-bit grayscale images that we are using and converts them to 8-bit RGB while at the same time changing the pixels to a scale of 0 to 255. These image parameters are required by the VGG_16 model. 

In [2]:
def VGG_Prep(img_data):
    """
    Convert a stack of grayscale images into the 3-channel layout fed to VGG16.

    Every pixel value v is mapped to 255 - 255 * v and the result is copied
    into three identical channels.
    NOTE(review): this assumes pixel values are scaled to [0, 1], so the output
    is an inverted image on a 0-255 scale -- confirm against bc.load_data.

    :param img_data: training or test images of shape [#images, height, width]
                     (a numpy array or anything np.asarray can convert)
    :return: float64 array of shape [#images, height, width, 3]
    :raises ValueError: if img_data is not 3-dimensional
    """
    img_data = np.asarray(img_data)
    if img_data.ndim != 3:
        raise ValueError(
            'img_data must have shape [#images, height, width], got {0}'.format(img_data.shape))

    # Original ImageNet images were not rescaled, so move the pixels onto 0-255.
    inverted = 255 - (img_data * 255)

    # np.empty defaults to float64, matching the dtype of the np.zeros buffer
    # used previously; broadcasting the trailing axis replicates the single
    # gray channel into the three RGB channels in one vectorised step.
    images = np.empty(img_data.shape + (3,))
    images[...] = inverted[..., np.newaxis]
    return images

### VGG_16 Bottleneck
The following function leverages Daniel's image loader function and performs the following:
1. Loads in the images using the train, test, and validation csv files.
2. Prepares the images using the VGG_Prep function
3. Loads the VGG_16 model with the classification layers removed.
4. Runs each of the images for the training, test, and validation sets (if included) through the model.
5. Saves out .npy files containing the bottleneck features from the VGG_16 model predictions and the corresponding labels.

In [3]:
def _save_bottleneck_features(model, images, labels, modelPath, split):
    """Run images through model and save the predictions and labels for one split.

    Writes bottleneck_features_<split>.npy and labels_<split>.npy into modelPath.
    """
    features = model.predict(images)
    # Hand numpy the path instead of an open() handle that is never closed;
    # numpy then opens and closes the file itself.
    np.save(os.path.join(modelPath, 'bottleneck_features_{0}.npy'.format(split)), features)
    np.save(os.path.join(modelPath, 'labels_{0}.npy'.format(split)), labels)


def vgg16_bottleneck(trainPath, testPath, imagePath, modelPath, size, balance = True, verbose = True, verboseFreq = 50, valPath = 'None'):
    """
    Compute VGG16 bottleneck features for the train, test and (optionally)
    validation splits and save them, with their labels, as .npy files.

    :param trainPath: csv file describing the training images
    :param testPath: csv file describing the test images
    :param imagePath: directory passed to bc.load_data as the image location
    :param modelPath: directory the .npy output files are written to
    :param size: passed to bc.load_data as imgResize
    :param balance: forwarded to bc.load_training_metadata for train and test
    :param verbose: forwarded to bc.load_training_metadata
    :param verboseFreq: forwarded to bc.load_data
    :param valPath: csv file describing the validation images; the string
                    'None' (the default) or None skips the validation split
    :return: None. Files written to modelPath:
             bottleneck_features_{train,test,validation}.npy and
             labels_{train,test,validation}.npy
    """
    # Loading data
    metaTr, _, _ = bc.load_training_metadata(trainPath, balance, verbose)
    X_train, Y_train = bc.load_data(trainPath, imagePath, maxData = len(metaTr), verboseFreq = verboseFreq, imgResize=size)

    metaTest, _, _ = bc.load_training_metadata(testPath, balance, verbose)
    # The test set is capped by its own metadata length; this previously
    # passed the training-set length by mistake.
    X_test, Y_test = bc.load_data(testPath, imagePath, maxData = len(metaTest), verboseFreq = verboseFreq, imgResize=size)

    X_train = VGG_Prep(X_train)
    X_test = VGG_Prep(X_test)

    print('Loading the VGG_16 Model')
    # include_top=False drops the classification layers so that predict()
    # returns the convolutional (bottleneck) features.
    model = applications.VGG16(include_top=False, weights='imagenet')

    # Generating and saving the bottleneck features for the training data
    print('Evaluating the VGG_16 Model on the Training Data')
    _save_bottleneck_features(model, X_train, Y_train, modelPath, 'train')

    # Generating and saving the bottleneck features for the test data
    print('Evaluating the VGG_16 Model on the Test Data')
    _save_bottleneck_features(model, X_test, Y_test, modelPath, 'test')

    # 'None' is the historical "no validation set" sentinel; a real None is
    # accepted as well.
    if valPath is not None and valPath != 'None':
        metaVal, _, _ = bc.load_training_metadata(valPath, verbose = verbose, balanceViaRemoval = False)
        X_val, Y_val = bc.load_data(valPath, imagePath, maxData = len(metaVal), verboseFreq = verboseFreq, imgResize=size)
        X_val = VGG_Prep(X_val)

        # Generating and saving the bottleneck features for the validation data
        print('Evaluating the VGG_16 Model on the Validataion Data')
        _save_bottleneck_features(model, X_val, Y_val, modelPath, 'validation')

## Running the model on the Train, Test, and Validation Data
1) The first test is on the rescaled and squared off images maintaining aspect ratio without the artifacts removed.

In [4]:
# Global variables for loading the data.
# All paths are relative, so they resolve against the notebook's working directory.
imagePath = '../images/ddsm/png/'  # image directory handed to the loader
trainDataPath = '../images/ddsm/ddsm_train.csv'  # csv describing the training split
testDataPath = '../images/ddsm/ddsm_test.csv'  # csv describing the test split
valDataPath = '../images/ddsm/ddsm_val.csv'  # csv describing the validation split
imgResize = (150, 150) # size images are resized to on load; can go up to (224, 224)
modelPath = '../model/'  # directory the bottleneck .npy files are written to

In [5]:
# Compute the VGG16 bottleneck features for the train, test and validation
# splits and write the feature/label .npy files into modelPath.
vgg16_bottleneck(trainDataPath, testDataPath, imagePath, modelPath, imgResize, 
                 balance = True, verbose = True, verboseFreq = 50, valPath = valDataPath)

Raw Balance
----------------
benign 531
malignant 739
normal 2685
balanaceViaRemoval.avgE: 1318
balanaceViaRemoval.theshold: 1318.0

After Balancing
----------------
benign 531
malignant 739
normal 862
Raw Balance
----------------
benign 531
malignant 739
normal 2685
balanaceViaRemoval.avgE: 1318
balanaceViaRemoval.theshold: 1318.0

After Balancing
----------------
benign 531
malignant 739
normal 862
0.0000: A_0152_1.RIGHT_MLO.LJPEG.png


  X_data = np.zeros([total, x, y])
  Y_data = np.zeros([total, 1], dtype=np.int8)


0.0235: C_0091_1.LEFT_CC.LJPEG.png
0.0469: A_0707_1.LEFT_CC.LJPEG.png
0.0704: A_0534_1.RIGHT_CC.LJPEG.png
0.0938: A_1055_1.LEFT_CC.LJPEG.png
0.1173: C_0415_1.RIGHT_MLO.LJPEG.png
0.1407: C_0305_1.LEFT_CC.LJPEG.png
0.1642: A_1061_1.LEFT_CC.LJPEG.png
0.1876: C_0383_1.RIGHT_CC.LJPEG.png
0.2111: B_3120_1.RIGHT_CC.LJPEG.png
0.2345: C_0321_1.LEFT_MLO.LJPEG.png
0.2580: B_3412_1.LEFT_CC.LJPEG.png
0.2814: A_0139_1.RIGHT_CC.LJPEG.png
0.3049: B_3098_1.RIGHT_MLO.LJPEG.png
0.3283: A_0056_1.LEFT_MLO.LJPEG.png
0.3518: C_0396_1.LEFT_CC.LJPEG.png
0.3752: C_0337_1.RIGHT_CC.LJPEG.png
0.3987: A_0572_1.RIGHT_MLO.LJPEG.png
0.4221: A_1057_1.LEFT_CC.LJPEG.png
0.4456: B_3662_1.LEFT_MLO.LJPEG.png
0.4690: C_0032_1.RIGHT_CC.LJPEG.png
0.4925: A_1056_1.RIGHT_CC.LJPEG.png
0.5159: B_3458_1.RIGHT_MLO.LJPEG.png
0.5394: A_0067_1.RIGHT_CC.LJPEG.png
0.5629: B_3469_1.RIGHT_CC.LJPEG.png
0.5863: A_0227_1.LEFT_CC.LJPEG.png
0.6098: C_0280_1.RIGHT_MLO.LJPEG.png
0.6332: A_0417_1.LEFT_CC.LJPEG.png
0.6567: C_0130_1.LEFT_MLO.LJPEG.p

In [6]:
class LossHistory(cb.Callback):
    """Keras callback that records the loss reported at the end of every batch.

    After training, ``losses`` holds one entry per batch, in the order the
    batches finished.
    """

    def on_train_begin(self, logs=None):
        # Start from an empty list on every fit so an instance can be reused.
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # `logs` carries the batch metrics; fall back to an empty dict when it
        # is omitted (None default replaces the shared mutable `{}` default).
        logs = logs or {}
        self.losses.append(logs.get('loss'))

## Train Top Model
This function takes the bottleneck features from the bottleneck function and applies a shallow fully connected network to these features to classify the images. The function needs to be pointed at the locations of the training and test features along with the training and test labels. You can use the epoch and batch size variables to control the number of training epochs and the number of images shown to the model per batch. The model save variable allows for saving of the final model weights.

In [7]:
def train_top_model(train_feats, train_lab, test_feats, test_lab, model_path, model_save, epoch = 50, batch = 64):
    """
    Train a small dense classifier on saved bottleneck features and save its weights.

    :param train_feats: file name of the training features (.npy) inside model_path
    :param train_lab: file name of the training labels (.npy) inside model_path
    :param test_feats: file name of the test features (.npy) inside model_path
    :param test_lab: file name of the test labels (.npy) inside model_path
    :param model_path: directory containing the four .npy files
    :param model_save: path the trained weights are written to; used exactly as
                       given (it is NOT joined with model_path)
    :param epoch: number of training epochs
    :param batch: training batch size
    :return: None. Prints the test [loss, accuracy] and writes the weights file.
    """
    num_classes = 3  # labels are one-hot encoded into three classes

    # Records the per-batch loss during fit.
    # NOTE(review): the recorded losses are never returned or saved -- confirm
    # whether callers need them.
    history = LossHistory()

    X_train = np.load(os.path.join(model_path, train_feats))
    Y_train = np_utils.to_categorical(np.load(os.path.join(model_path, train_lab)),
                                      nb_classes=num_classes)

    X_test = np.load(os.path.join(model_path, test_feats))
    Y_test = np_utils.to_categorical(np.load(os.path.join(model_path, test_lab)),
                                     nb_classes=num_classes)

    # Flatten the bottleneck features, then one hidden layer and a softmax output.
    model = Sequential()
    model.add(Flatten(input_shape=X_train.shape[1:]))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    # try Adadelta and Adam
    model.compile(optimizer='adadelta',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # The test split doubles as the validation data reported after each epoch.
    model.fit(X_train, Y_train,
              nb_epoch=epoch,
              batch_size=batch,
              callbacks=[history],
              validation_data=(X_test, Y_test),
              verbose=2)

    score = model.evaluate(X_test, Y_test, batch_size=16, verbose=0)

    # print() with a single argument behaves the same on Python 2 and 3 and
    # matches the print(...) calls used elsewhere in this notebook.
    print("Network's test score [loss, accuracy]: {0}".format(score))

    model.save_weights(model_save)

## Running the Top Model
The following runs the top model classifier on the bottleneck features.

In [9]:
# Locations for the bottleneck and labels files that we need.
# The .npy names are file names inside modelPath and must match the names
# written by vgg16_bottleneck.
modelPath = '../model/'
train_bottleneck = 'bottleneck_features_train.npy'
train_labels = 'labels_train.npy'
test_bottleneck = 'bottleneck_features_test.npy'
test_labels = 'labels_test.npy'
# Fixed typo ('valdation'): the file is saved as bottleneck_features_validation.npy
validation_bottleneck = 'bottleneck_features_validation.npy'
validation_label = 'labels_validation.npy'
# Output path for the trained top-model weights
top_model_weights_path = 'top_weights02.h5'

In [10]:
# Train the top model on the saved train features and evaluate it on the test
# features; the weights are written to top_model_weights_path.
train_top_model(train_feats=train_bottleneck, train_lab=train_labels, test_feats=test_bottleneck, test_lab=test_labels,
                model_path=modelPath, model_save=top_model_weights_path)

Train on 2132 samples, validate on 536 samples
Epoch 1/50
0s - loss: 6.5851 - acc: 0.4508 - val_loss: 5.6644 - val_acc: 0.4571
Epoch 2/50
0s - loss: 3.4335 - acc: 0.5563 - val_loss: 2.2755 - val_acc: 0.5000
Epoch 3/50
0s - loss: 1.0971 - acc: 0.6393 - val_loss: 1.2428 - val_acc: 0.5243
Epoch 4/50
0s - loss: 0.6560 - acc: 0.7317 - val_loss: 1.3331 - val_acc: 0.4963
Epoch 5/50
0s - loss: 0.5417 - acc: 0.7767 - val_loss: 1.3829 - val_acc: 0.5019
Epoch 6/50
0s - loss: 0.4130 - acc: 0.8354 - val_loss: 1.4304 - val_acc: 0.5392
Epoch 7/50
0s - loss: 0.3163 - acc: 0.8752 - val_loss: 1.5420 - val_acc: 0.5187
Epoch 8/50
0s - loss: 0.2728 - acc: 0.8963 - val_loss: 1.5069 - val_acc: 0.5168
Epoch 9/50
0s - loss: 0.2050 - acc: 0.9282 - val_loss: 1.6816 - val_acc: 0.5243
Epoch 10/50
0s - loss: 0.1788 - acc: 0.9273 - val_loss: 1.6940 - val_acc: 0.5485
Epoch 11/50
0s - loss: 0.1626 - acc: 0.9465 - val_loss: 1.7204 - val_acc: 0.5299
Epoch 12/50
0s - loss: 0.1236 - acc: 0.9578 - val_loss: 1.7611 - val_ac

## Results
All results below are run against the train, test, validate csv files located at [Breast Cancer Github Data](https://github.com/jnarhan/Breast_Cancer/tree/master/data)

### Aspect Ratio Squared Raw DDSM Images with Artifacts
1) Run 1: 150x150 image size 50 Epochs, Batch Size 64
    * Network's test score [loss, accuracy]: [2.4609192387381595, 0.58582089552238803]