# All the imports

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import cPickle as pickle
import feature_calculations as fc
import os
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

# Load the inception model

In [None]:
def create_graph(graph_path='../data/classify_image_graph_def.pb'):
    """Import a serialized GraphDef file into the default TensorFlow graph.

    Args:
        graph_path: Path of the binary GraphDef file to read. Defaults to the
            graph file this notebook loads for the inception model.

    Returns:
        None. The graph's nodes are imported into the current default graph
        as a side effect (no saver or other object is returned).
    """
    # Read the raw bytes first so the file handle is released before parsing.
    with tf.gfile.FastGFile(graph_path, 'rb') as f:
        serialized_graph = f.read()
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(serialized_graph)
    # name='' imports the nodes without a name prefix, so tensors can be
    # looked up under the names stored in the file (e.g. 'pool_3:0').
    tf.import_graph_def(graph_def, name='')

# Load or compute pool3 for frames

In [None]:
my_dict = {} # dictionary to hold pool_3 features for each frame
# the structure is :
# my_dict = {fileID1: {label:'classification1',pool_3:[vector,of,pool3,1]},
#            fileID2: {label:'classification2', pool_3:[vector,of,pool3,2]}}

# first check if data already exists
if os.path.exists('../data/camelyon_pool3.pkl'):
    my_dict = pickle.load(open('../data/camelyon_pool3.pkl', 'rb'))

# otherwise, we need to compute pool3 weights    
else:
    # create inceptions graph
    create_graph()
    sess = tf.InteractiveSession()
    pool3_tensor = sess.graph.get_tensor_by_name('pool_3:0') # these are the weights we want!
    for i in ['../data/slide_data/camelyon_metastatic/', '../data/slide_data/camelyon_normal/']:
        # check that folder exists
        if not os.path.isdir(i): print "WARNING::Directory '", i, "' does not exist! Skipping..."; continue
        # get list of files
        files = os.listdir(i)
        for j,iFile in enumerate(files):
            if (j%100==0): print "working on file #", j
            if not '.jpeg' in iFile: continue # skip file that are not images
            file_path = os.path.join(i,iFile) # full file path
            img = cv2.imread(file_path) # load image for cv2
            if float(fc.compute_white_area_1(img)) > 0.95: continue # same cleaning applied on 6 feature classifier
            thisFrame = {} # dictionary for this frame
            thisFrame['label'] = ('metastatic' in i)*'metastatic' + ('normal' in i)*'normal' # label
            img_data = tf.gfile.FastGFile(file_path, 'rb').read() # load for tensorflow
            thisFrame['pool3'] = np.squeeze(sess.run(pool3_tensor, {'DecodeJpeg/contents:0':img_data})) # features
            file_ID = iFile.split('.')[1] #frame ID
            my_dict[file_ID] = thisFrame
    # save for later :)
    pickle.dump( my_dict, open( "../data/camelyon_pool3.pkl", "wb" ) )

# Use same test/train split as 6 feature classifier

In [None]:
# these were saved in the feature calculator
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
with open('../data/train_cv_test_fileIDs.pkl', 'rb') as ids_file:
    setIDs = pickle.load(ids_file)
# subset into training, cv, test
trainIDs = setIDs['train']
cvIDs = setIDs['cv']
testIDs = setIDs['test']

In [None]:
# [x for x in trainIDs] gives a list of 1-d arrays.  need to grab the first element to get fileID
# X = pool3 feature vector of each frame, Y = True when the frame is labelled 'metastatic'
trainX = [my_dict[x[0]]['pool3'] for x in trainIDs]
trainY = [my_dict[x[0]]['label']=='metastatic' for x in trainIDs]
cvX = [my_dict[x[0]]['pool3'] for x in cvIDs]
cvY = [my_dict[x[0]]['label']=='metastatic' for x in cvIDs]
testX = [my_dict[x[0]]['pool3'] for x in testIDs]
# encode the test labels as booleans too, so all three splits share one target format
testY = [my_dict[x[0]]['label']=='metastatic' for x in testIDs]

In [None]:
# number of labelled frames in each split: (train, cv, test)
tuple(len(labels) for labels in (trainY, cvY, testY))

In [None]:
# array shape of the feature lists in each split: (train, cv, test)
tuple(np.shape(features) for features in (trainX, cvX, testX))

# Let's look at the training set 
## Try first six PCA features (e.g. get a comparison to my engineered 6 features)

In [None]:
# standardize every feature to zero mean / unit std, using training data only
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(trainX)

In [None]:
# six principal components only, to compare against the 6-feature classifier
scaled_trainX = scaler.transform(trainX)
pca = PCA(n_components=6).fit(scaled_trainX)
# fraction of the total variance kept by these six components
sum(pca.explained_variance_ratio_)

In [None]:
# project the scaled training features onto the fitted components for plotting
scaled_trainX = scaler.transform(trainX)
pca6_trainX = pca.transform(scaled_trainX)

In [None]:
# histogram helper: class-wise distribution of a single principal component
def plot_pca_component(component, dataX, dataY, nBins, title, xlabel, ylabel):
    """Overlay 'normal' and 'metastatic' histograms of one column of dataX.

    component -- 1-based component number (1 selects column 0 of dataX)
    dataX     -- 2-d collection of projected samples, one row per sample
    dataY     -- per-sample truthy/falsy flags; truthy rows are 'metastatic'
    nBins     -- number of points passed to np.linspace for the bin edges
    title, xlabel, ylabel -- text for the figure

    Returns the newly created matplotlib figure.
    """
    column = component - 1  # PC1 lives in column 0
    fig = plt.figure()
    # bin edges are shared by both classes and span the value range of ALL of dataX
    edges = np.linspace(np.min(dataX), np.max(dataX), nBins)

    n_metastatic = sum(dataY)
    n_normal = len(dataY) - n_metastatic
    normal_values = [row[column] for k, row in enumerate(dataX) if not dataY[k]]
    metastatic_values = [row[column] for k, row in enumerate(dataX) if dataY[k]]

    # each sample is weighted by 1/N of its class, so both histograms are
    # normalized to the same total and can be compared directly
    plt.hist(normal_values, edges,
             weights=np.ones(n_normal)/n_normal,
             alpha=0.5, label='normal', color="#00ff00")
    plt.hist(metastatic_values, edges,
             weights=np.ones(n_metastatic)/n_metastatic,
             alpha=0.5, label='metastatic', color="#990099")

    # annotate the figure
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc='best')
    return fig

In [None]:
# make the labels bigger, plot pretty
import seaborn
seaborn.set(font_scale=1.9)
seaborn.set_style('darkgrid')
%matplotlib inline
# plot the PCs
for i in range(6):
    plot_pca_component(i, pca.transform(scaler.transform(trainX)), trainY, 20, 'Principal Component '+str(i+1), 'arbitrary units', 'arbitrary units')

# Let's try logistic regression as a function of the number of PCs
## ...and compare regularization parameters!

In [None]:
# dictionary to keep track score for different # PCs
# {#PC1: score1, #PC2: score2}
nPCAScoreDict = {}
for i in range(1,500,10):
    if (i-1)%100 == 0: print "iteration", i
    pca = PCA(n_components=i) # new PCA each time
    pca.fit(scaler.transform(trainX)) # fit the PCA
    model = LogisticRegression(C=0.01).fit(pca.transform(scaler.transform(trainX)), trainY) # apply logistic regression
    nPCAScoreDict[i] = model.score(pca.transform(scaler.transform(cvX)), cvY) # score logistic regression

In [None]:
# redo the above, but turn down amount of regulatization
nPCAScoreDictNoReg = {}
for i in range(1,500,10):
    if (i-1)%100 == 0: print "iteration", i
    pca = PCA(n_components=i)
    pca.fit(scaler.transform(trainX))
    model = LogisticRegression(C=30).fit(pca.transform(scaler.transform(trainX)), trainY)
    nPCAScoreDictNoReg[i] = model.score(pca.transform(scaler.transform(cvX)), cvY)

In [None]:
# cross-validation accuracy versus number of principal components
seaborn.set(font_scale=1.6)
# one scatter series per logistic-regression setting
pc_counts_reg, cv_scores_reg = nPCAScoreDict.keys(), nPCAScoreDict.values()
plt.scatter(pc_counts_reg, cv_scores_reg, color='red', label='reg. param. = 0.01')
pc_counts_noreg, cv_scores_noreg = nPCAScoreDictNoReg.keys(), nPCAScoreDictNoReg.values()
plt.scatter(pc_counts_noreg, cv_scores_noreg, color='blue', label='reg. param. = 30')
# horizontal reference line: score of the CV-based classifier
plt.axhline(y=0.973, color='black', label = 'Computer vision + MLP Classifier')
plt.ylabel('Cross-validation Accuracy')
plt.xlabel('No. Principal Components')
plt.legend(loc=4)

In [None]:
# highest cross-validation accuracy reached in the C=0.01 sweep
best_cv_score = max(nPCAScoreDict.values())
best_cv_score

# Learning curve!

In [None]:
# plot the cv score as a function of number of training data points
nTrainScoreDict = {}
for i in range(2, len(trainX), 20):
    if (i-2)%1000 == 0: print "i=",i
    nPCA = min(i,200) # can't have more PCs than number of training data points
    # subset the training data
    thisTrainX = trainX[:i]
    thisTrainY = trainY[:i]
    # need a new scaler 
    thisScaler = preprocessing.StandardScaler().fit(thisTrainX)
    # do the pca / model fitting / model scoring
    thisPCA = PCA(n_components=nPCA).fit(thisScaler.transform(thisTrainX))
    model = LogisticRegression(C=0.01).fit(thisPCA.transform(thisScaler.transform(thisTrainX)),thisTrainY)
    nTrainScoreDict[i] = model.score(thisPCA.transform(thisScaler.transform(cvX)), cvY)

In [None]:
# repeat for no regularization
nTrainScoreDictNoReg = {}
for i in range(2, len(trainX), 20):
    if (i-2)%1000 == 0: print "i=",i
    nPCA = min(i,200)
    thisTrainX = trainX[:i]
    thisTrainY = trainY[:i]
    thisScaler = preprocessing.StandardScaler().fit(thisTrainX)
    thisPCA = PCA(n_components=nPCA).fit(thisScaler.transform(thisTrainX))
    model = LogisticRegression(C=30).fit(thisPCA.transform(thisScaler.transform(thisTrainX)),thisTrainY)
    nTrainScoreDictNoReg[i] = model.score(thisPCA.transform(thisScaler.transform(cvX)), cvY)

In [None]:
# learning curves: cross-validation accuracy versus training-set size
train_sizes_reg, cv_scores_reg = nTrainScoreDict.keys(), nTrainScoreDict.values()
plt.scatter(train_sizes_reg, cv_scores_reg, color='red', label='reg. param. = 0.01')
train_sizes_noreg, cv_scores_noreg = nTrainScoreDictNoReg.keys(), nTrainScoreDictNoReg.values()
plt.scatter(train_sizes_noreg, cv_scores_noreg, color='blue', label='reg. param. = 30')
#plt.axhline(0.973, label = 'Computer vision + MLPClassifier', color='black')
plt.ylabel('Cross-validation Accuracy')
plt.xlabel('No. Training Examples')
plt.legend(loc=4)

In [None]:
# open the dictionary of CV scores as a function of number of training examples
# from feature classifier
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
with open('../data/mlpCVScore.pkl', 'rb') as scores_file:
    mlpCVScore = pickle.load(scores_file)

In [None]:
# learning-curve comparison: this notebook's model versus the feature classifier
inception_sizes, inception_scores = nTrainScoreDict.keys(), nTrainScoreDict.values()
plt.scatter(inception_sizes, inception_scores, color='red', label='Inception-v3 + Log. Reg.')
mlp_sizes, mlp_scores = mlpCVScore.keys(), mlpCVScore.values()
plt.scatter(mlp_sizes, mlp_scores, color='blue', label='Computer vision + MLPClassifier')
#plt.axhline(0.973, label = 'Computer vision + MLPClassifier', color='black')
plt.ylabel('Cross-validation Accuracy')
plt.xlabel('No. Training Examples')
plt.legend(loc=4)

In [None]:
# train a final model to evaluate precision / recall/ etc.
nPCA = 200
# fit the whole pipeline on the full training set.  every step must use the
# objects created here (scaler -> pca -> logReg), not the thisScaler / thisPCA
# left over from the last learning-curve iteration: those were fit on a subset
# of the data and are not the transforms used when the model is evaluated below.
scaler = preprocessing.StandardScaler().fit(trainX)
scaled_trainX = scaler.transform(trainX)
pca = PCA(n_components=nPCA).fit(scaled_trainX)
logReg = LogisticRegression(C=0.01).fit(pca.transform(scaled_trainX), trainY)

In [None]:
# other performance metrics
# run the scaler -> pca -> logReg pipeline on the CV set once and reuse the
# predictions, rather than re-predicting the whole set for every element
cvPred = logReg.predict(pca.transform(scaler.transform(cvX)))
# confusion-matrix counts; cvY is True for metastatic frames
tp = sum(1 for pred, truth in zip(cvPred, cvY) if truth and pred == truth)      # metastatic, predicted correctly
tn = sum(1 for pred, truth in zip(cvPred, cvY) if not truth and pred == truth)  # normal, predicted correctly
fn = sum(1 for pred, truth in zip(cvPred, cvY) if truth and pred != truth)      # metastatic, missed
fp = sum(1 for pred, truth in zip(cvPred, cvY) if not truth and pred != truth)  # normal, flagged as metastatic
tp, tn, fp, fn, len(cvY)
#precision = float(tp) / (tp+fp)
#recall = float(tp) / (tp + fn)
#f1 = 2.*float(precision*recall) / (precision + recall)
#precision, recall, f1

In [None]:
# accuracy of the final model on the cross-validation set
cv_features = pca.transform(scaler.transform(cvX))
logReg.score(cv_features, cvY)