In [None]:
import datetime
import Image
import gc
import numpy as np
import os
import random
from scipy import misc
import string
import time
import sys
import sklearn.metrics as skm

# Set some Theano config before initializing
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=cpu,floatX=float32,allow_gc=False,openmp=True"
import theano

# MatPlotLib - Setup for Jupyter notebook output
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt

# Our modules
import dwdii_bc_model_helper as bc
import bc_models as models

# And Keras so we can emit the version
import keras

# Seed both the Python and the NumPy global random number generators so that
# anything drawing from either one is repeatable across kernel restarts.
# NOTE(review): presumably the bc helpers / Keras initialisers draw from
# NumPy's global RNG - confirm.
random.seed(20275)
np.random.seed(20275)

# Show NumPy arrays with two decimal places in notebook output.
np.set_printoptions(precision=2)

In [None]:
# Report interpreter / library versions and the effective Theano settings
# so each run documents the environment it was produced in.
print("Python v" + sys.version)
print("Numpy v: " + np.__version__)
print("keras v: " + keras.__version__)
print("device: %s" % (theano.config.device,))
print("floatX: %s" % (theano.config.floatX,))
print("mode: %s" % (theano.config.mode,))
print("openmp: %s" % (theano.config.openmp,))
print("allow_gc: %s" % (theano.config.allow_gc,))

In [None]:
# Image directory used for the test load and, via trainImagePath, the training load.
#imagePath = "/root/bc_data/ddsm-png.25"
imagePath = "/root/bc_data/Data_Thresholded/DDSM"
trainImagePath = imagePath
# CSV handed to bc.load_training_metadata and bc.load_data for the training set.
trainDataPath = "../../data/ddsm_train.csv"

# Category mapping passed to bc.load_data; its length sizes the model output.
# normalVsAbnormal is forwarded to the bc loaders alongside it.
# NOTE(review): presumably these two must be changed together when switching
# the labeling scheme - confirm against dwdii_bc_model_helper.
categories = bc.bcNormVsAbnormNumerics()
normalVsAbnormal = True

#
# Simulated training data
#
#trainImagePath = "/root/bc_data/simulated_images"
#trainDataPath = "/root/bc_data/simulated_images/simulated_images.csv"
#trainImagePath = "/root/bc_data/simulated_images_new"
#trainDataPath = "/root/bc_data/simulated_images_new/simulated_images.csv"

# CSV handed to bc.load_data for the test set (images are read from imagePath).
testDataPath = "../../data/ddsm_test.csv"
# Target image size forwarded to bc.load_data as imgResize.
imgResize = (150, 150)

In [None]:
# Sanity check: show the contents of the directory that holds the train/test CSV files
# (see trainDataPath / testDataPath).
os.listdir('../../data')

# Load Training and Test Data

In this section, the training/validation data is loaded. The load_data function pre-balances the data set by removing images from over-represented classes.

### Training Data

In [None]:
# Read the training metadata, balancing the classes by removal, so that the
# number of usable rows is known before any image is loaded.
metaData, meta2, mCounts = bc.load_training_metadata(
    trainDataPath,
    balanceViaRemoval=True,
    verbose=True,
    normalVsAbnormal=normalVsAbnormal)

In [None]:
# Load the training images and labels, capped at the number of rows in the
# training metadata, and report the resulting array shapes.
maxData = len(metaData)
X_data, Y_data = bc.load_data(
    trainDataPath, trainImagePath,
    categories=categories,
    maxData=maxData,
    verboseFreq=50,
    imgResize=imgResize,
    normalVsAbnormal=normalVsAbnormal)
print(X_data.shape)
print(Y_data.shape)

### Load Test Set 

In [None]:
# Load the test images and labels with the same settings as the training load.
# NOTE(review): the cap is the length of the *training* metadata, so a test
# CSV with more rows than that would be truncated - confirm this is intended.
maxData = len(metaData)
X_test, Y_test = bc.load_data(
    testDataPath, imagePath,
    categories=categories,
    maxData=maxData,
    verboseFreq=50,
    imgResize=imgResize,
    normalVsAbnormal=normalVsAbnormal)
print(X_test.shape)
print(Y_test.shape)

## Transformations

In this section, we apply transformations to the existing images to increase the amount of training data, and to add a bit of noise in the hope of improving the overall training results.

In [None]:
# Number of augmented copies produced per source image. Only the single
# reflection transform is active (an ImageDataGenerator count of 12 was
# tried previously and is currently disabled).
transformCount = 1

# Pre-allocate zero-filled buffers for the augmented images and their labels.
newImgs = np.zeros((X_data.shape[0] * transformCount,) + X_data.shape[1:3])
newYs = np.zeros((Y_data.shape[0] * transformCount, Y_data.shape[1]), dtype=np.int8)
print(newImgs.shape)
print(newYs.shape)

In [None]:
# Peek at the first training image to confirm the per-image array shape.
img = X_data[0]
img.shape

In [None]:
# Augmentation pass: store a bc.reflectY() copy of every training image in
# newImgs together with the unchanged label of its source image.
# ndx is the write position in newImgs/newYs; it advances once per generated
# image so further transforms can be appended inside the loop.
ndx = 0
for i in range(X_data.shape[0]):
    newImgs[ndx] = bc.reflectY(X_data[i])
    newYs[ndx] = Y_data[i]
    ndx += 1

# Any row left unfilled would stay an all-zero image with an all-zero label,
# so fail loudly if transformCount and the transforms above get out of step.
assert ndx == newImgs.shape[0], "augmentation buffer only partially filled"

# Single-string print: under the Python 2 print statement used in this
# notebook, print("Done", x) would emit the tuple repr ('Done', '...').
print("Done: " + str(datetime.datetime.now()))

In [None]:
# Stack the original samples and their augmented copies along the sample axis.
X_data2 = np.concatenate([X_data, newImgs], axis=0)
Y_data2 = np.concatenate([Y_data, newYs], axis=0)
print(X_data2.shape)
print(Y_data2.shape)

In [None]:
# Train on the augmented set when the transforms were run, otherwise on the
# data exactly as loaded.
performedTransforms = True
X_train, Y_train = (X_data2, Y_data2) if performedTransforms else (X_data, Y_data)

## Training/Test Set Distribution
The following code segment splits the data into training and test data sets. Currently this is a standard 80/20 split for training and test respectively after performing a random shuffle using the unison_shuffled_copies helper method.

In [None]:
# Shapes of the image arrays ...
print(X_train.shape)
print(X_test.shape)

# ... and of the matching label arrays.
print(Y_train.shape)
print(Y_test.shape)

In [None]:
import collections
def yDist(y):
    """Count how often each value occurs in the first column of ``y``.

    ``y`` must expose ``shape`` and support ``y[row][0]`` indexing. Returns a
    ``collections.defaultdict(int)`` mapping each first-column value to the
    number of rows that carry it.
    """
    tally = collections.defaultdict(int)
    for rowNdx in range(y.shape[0]):
        label = y[rowNdx][0]
        tally[label] = tally[label] + 1
    return tally

# Show the per-class sample counts for both sets to check the class balance.
print("Y_train Dist: " + str(yDist(Y_train)))
print("Y_test Dist: " + str(yDist(Y_test)))


## Define and Load Trained Model

In [None]:
# Show the category mapping and its size; the size is the class count handed
# to the model constructor.
print(categories)
print(len(categories))

In [None]:
# Build the model with our helper function: one output per category and a
# single-channel input matching the training image dimensions.
model = models.bc_model_v01(
    len(categories),
    verbose=True,
    input_shape=(1,) + X_train.shape[1:3])

## Training the Model

The following code segment trains the model using the run_network helper function. 

In [None]:
# File name (inside weights/) used for loading and saving the model weights.
weightsFileName = "dwdii-bc-v01-normVsabnorm150-thresholded-5474-20170430.hdf5"

# Flip to True to initialise the model from the saved weights.
loadWeights = False
if loadWeights:
    model.load_weights('weights/' + weightsFileName)

In [None]:
# Insert a singleton channel axis after the sample axis so each array has the
# (samples, 1, dim1, dim2) layout given to the model as input_shape.
testX = X_test.reshape((X_test.shape[0], 1) + X_test.shape[1:3])
trainX = X_train.reshape((X_train.shape[0], 1) + X_train.shape[1:3])
In [None]:
# Log the start time, then train via the run_network helper.
print("Training start: " + str(datetime.datetime.now()))
m, h = models.run_network(
    [trainX, testX, Y_train, Y_test],
    model,
    batch=50,
    epochs=30,
    verbosity=1)

In [None]:
# Persist the current model weights as weights/<weightsFileName>, replacing any existing file (overwrite=True).
# NOTE(review): presumably the weights/ directory must already exist - confirm.
model.save_weights('weights/' + weightsFileName, overwrite=True)

### Experiment Results

#### Raw DDSM Images

Initial results based on "normal" being masked as "benign":
* bc_model_v0 (150x150, 800/200): 182s - loss: 0.0560 - acc: 0.9813 - val_loss: 1.9918 - val_acc: 0.6800
* bc_model_v0 (150x150, 2000/500): 473s - loss: 0.0288 - acc: 0.9925 - val_loss: 1.4040 - val_acc: 0.7260
   * somewhat balanced, Y_train Dist {0: 1223, 1: 777}, Y_test Dist: {0: 321, 1: 179}

Revised with "normal", "benign" and "malignant" labeled separately:
* bc_model_v0 (150x150, 1311/328): 298s - loss: 0.0411 - acc: 0.9786 - val_loss: 1.3713 - val_acc: 0.6616

After creating fixed "train", "test" and "validate" data sets, using "train" and "test" as well as including the DDSM Benign cases:
* bc_model_v0 (150x150, 1554/363, 03.27.2017): 264s - loss: 0.0512 - acc: 0.9730 - val_loss: 1.3120 - val_acc: 0.6116
* bc_model_v0 (150x150, 2155/539, 04.02.2017): 362s - loss: 0.0600 - acc: 0.9763 - val_loss: 1.5315 - val_acc: 0.4805

bc_model_v01 - categorical_crossentropy
* bc_model_v01 (150x150, 2155/539, 04.03.2017): 361s - loss: 0.0935 - acc: 0.9800 - val_loss: 2.7872 - val_acc: 0.5065
* bc_model_v01 (150x150, 2132/536, 04.05.2017): 369s - loss: 0.0718 - acc: 0.9794 - val_loss: 2.5604 - val_acc: 0.5243

#### Thresholded Images

Using the "Data_Thresholded" images
* bc_model_v0 (150x150, Thresholded, 661/171, 03.28.2017): 124s - loss: 0.0529 - acc: 0.9743 - val_loss: 1.4331 - val_acc: 0.4971

#### Simulated Images

Using the "simulated_images" images
* bc_model_v01 (150x150, 7776/536, 04.24.2017): 1250s - loss: 0.5543 - acc: 0.7885 - val_loss: 7.1153 - val_acc: 0.4123

#### Normal Vs Abnormal

##### Raw
* bc_model_v01 (150x150, 2893/536, 04.25.2017):  496s - loss: 0.0522 - acc: 0.9865 - val_loss: 2.2328 - val_acc: 0.6309

##### Data Thresholded
* bc_model_v01 (150x150, 924/231,04.26.2017): 154s - loss: 0.0365 - acc: 0.9892 - val_loss: 1.9738 - val_acc: 0.5628
* bc_model_v01 (150x150, 2737/694,04.29.2017): 463s - loss: 0.0390 - acc: 0.9898 - val_loss: 2.4042 - val_acc: 0.6326
* bc_model_v01 (150x150, 5474/694,04.30.2017): 1317s - loss: 0.0552 - acc: 0.9845 - val_loss: 2.8678 - val_acc: 0.6138

In [None]:
import dwdii_test as dwdii

# Validation accuracy keyed by training-set size for the thresholded,
# two-class runs listed under "Data Thresholded" in the results above.
resultsValAcc = {
    924: 0.5628,
    2737: 0.6326,
    5474: 0.6138,
}

dwdii.barChart(
    resultsValAcc,
    filename="../../figures/shallowCnn_thresholded_2class_results_valacc.png",
    title="Shallow CNN - DDSM Data Thresholded 2 Class Test Accuracy",
    yAxisLabel="Accuracy %")

### Analyze Predictions with Test Set

In [None]:
# Run the model over the reshaped test images in batches of 32.
# NOTE(review): presumably yields one row of per-class scores per test sample - confirm against the model's output layer.
predictOutput = model.predict(testX, batch_size=32, verbose=1)

In [None]:
# Lookup from numeric class id back to category name.
numBC = bc.reverseDict(categories)

# Predicted class of the first test sample: index of its largest score.
predClass = np.argmax(np.array(predictOutput[0]))
numBC[predClass]

In [None]:
# Category name of the first test sample's actual label, for comparison with its predicted class.
numBC[Y_test[0][0]]

In [None]:
# Predicted class index for every test sample: the position of the largest
# score in each row of predictOutput, computed in one vectorized call instead
# of a per-row Python loop.
# Assumes predictOutput is a 2-D (samples x classes) array - TODO confirm.
# Kept as a list so downstream code sees the same container type as before.
predClasses = list(np.argmax(predictOutput, axis=1))

### Confusion Matrix

In [None]:
# Build the confusion matrix of actual vs. predicted test labels with
# scikit-learn and display it.
cnf_matrix = skm.confusion_matrix(y_true=Y_test, y_pred=predClasses)
cnf_matrix

In [None]:
# Class display names in ascending class-id order. The confusion matrix rows
# and columns follow sorted label order, so the names must be ordered the same
# way rather than by dict iteration order.
# Assumes numBC maps numeric class id -> category name - TODO confirm.
class_names = [numBC[classId] for classId in sorted(numBC)]
np.set_printoptions(precision=2)

In [None]:
# Draw the raw-count confusion matrix and save it under the figures directory,
# naming the file after the weights file.
fileCfMatrix = ''.join(['../../figures/confusion_matrix-', weightsFileName, '.png'])
plt.figure()
bc.plot_confusion_matrix(
    cnf_matrix,
    title='Confusion matrix, without normalization, \n' + weightsFileName,
    classes=class_names)
plt.savefig(fileCfMatrix)

In [None]:
# Display the confusion-matrix figure that was just written to fileCfMatrix.
# NOTE: this import rebinds the name Image, shadowing the module brought in by
# `import Image` in the notebook's first cell.
from IPython.display import Image
Image(filename=fileCfMatrix)

In [None]:
# Draw the normalized confusion matrix and save it under the figures
# directory, naming the file after the weights file.
fileCfMatrixNorm = ''.join(['../../figures/confusion_matrix_norm-', weightsFileName, '.png'])
plt.figure()
bc.plot_confusion_matrix(
    cnf_matrix,
    title='Normalized confusion matrix \n' + weightsFileName,
    normalize=True,
    classes=class_names)
plt.savefig(fileCfMatrixNorm)

In [None]:
# Display the normalized confusion-matrix figure that was just written to fileCfMatrixNorm.
# NOTE: this import rebinds the name Image, shadowing the module brought in by
# `import Image` in the notebook's first cell.
from IPython.display import Image
Image(filename=fileCfMatrixNorm)