In [37]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import re
import random
import time # just to see how fast/slow things run

from scipy.misc import imread
from sklearn.model_selection import train_test_split
from PIL import Image
from shutil import copyfile

Data is organized in folders, where in every folder there are two subfolders: ```0``` and ```1```. They respectively represent benign and malignant tumours.

In [2]:
data_folder  = 'IDC_regular_ps50_idx5/'
num_positive = 78786
num_negative = 198738

In [3]:
_, _, file_names = list(zip(*(list(os.walk(data_folder)))))
file_names       = list(filter(lambda ls: len(ls) not in [0, 1], file_names))

'''
file_names contains all files names in following format:

file_names = [['9036_idx5_x1051_y2401_class0.png',
               '9036_idx5_x2951_y951_class0.png',
               ...],
               ['10257_idx5_x2101_y601_class1.png',
                '10257_idx5_x1651_y1251_class1.png',
               ...],
              ...]
               
'''

"\nfile_names contains all files names in following format:\n\nfile_names = [['9036_idx5_x1051_y2401_class0.png',\n               '9036_idx5_x2951_y951_class0.png',\n               ...],\n               ['10257_idx5_x2101_y601_class1.png',\n                '10257_idx5_x1651_y1251_class1.png',\n               ...],\n              ...]\n               \n"

In [4]:
def get_train_test(list_file_names, test_ratio=0.2):
    '''
    list_file_names should be under the same format as file_names as above
    '''
    x_train = []
    x_test  = []
    
    for ls in list_file_names:
        
        label      = ls[0][-5] # this list is benign or malignant
        patient_id = re.findall('\d+', ls[0])[0]
        num_pics   = len(ls)
        
        assert label in ['0', '1']
        
        path = data_folder + patient_id + '/' + label + '/'
        to_add_train, to_add_test = train_test_split(ls, test_size=test_ratio)
        
        x_train = x_train + [path + png for png in to_add_train]
        x_test  = x_test + [path + png for png in to_add_test]
    
    
    # We should be able to safely ignore pictures of shape (x, 50, 3) for x /0 50
    # for computational and consistency purposes
    x_train = list(filter(lambda pic: Image.open(pic).size == (50, 50), x_train))
    x_test  = list(filter(lambda pic: Image.open(pic).size == (50, 50), x_test))
    
    random.shuffle(x_train)
    random.shuffle(x_test)
    
    y_train = np.array([int(png[-5]) for png in x_train]).reshape(-1, 1)
    y_test  = np.array([int(png[-5]) for png in x_test]).reshape(-1, 1)
    
    return x_train, y_train, x_test, y_test

class ManualError(Exception):
    pass

In [5]:
# takes 75 seconds
x_train, y_train, x_test, y_test = get_train_test(file_names)

In [6]:
def new_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def new_biases(length):
    return tf.Variable(tf.constant(0.05, shape=[length]))

def new_conv_layer(input,             
                   num_input_channels,
                   filter_size,       
                   num_filters,  
                   name,
                   use_pooling=True): 

    shape   = [filter_size, filter_size, num_input_channels, num_filters]
    weights = new_weights(shape=shape)
    biases  = new_biases(length=num_filters) # one for each filter

    layer = tf.nn.conv2d(input=input,
                         filter=weights,
                         strides=[1, 1, 1, 1],
                         padding='SAME')

    layer += biases

    if use_pooling:
        layer = tf.nn.max_pool(value=layer,
                               ksize=[1, 2, 2, 1],
                               strides=[1, 2, 2, 1],
                               padding='SAME')

    layer = tf.nn.relu(layer, name=name)

    return layer, weights


def flatten_layer(layer):
    layer_shape  = layer.get_shape()
    num_features = layer_shape[1:4].num_elements()
    layer_flat   = tf.reshape(layer, [-1, num_features])

    # Return both the flattened layer and the number of features.
    return layer_flat, num_features


def new_fc_layer(input,          # The previous layer.
                 num_inputs,     # Num. inputs from prev. layer.
                 num_outputs,    # Num. outputs.
                 name,
                 use_relu=True): # Use Rectified Linear Unit (ReLU)?

    # Create new weights and biases.
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_biases(length=num_outputs)

    # Calculate the layer as the matrix multiplication of
    # the input and weights, and then add the bias-values.

    # Use ReLU?
    if use_relu:
        layer = tf.nn.relu(tf.add(tf.matmul(input, weights), biases), name=name)
    else:
        layer = tf.add(tf.matmul(input, weights), biases, name=name)

    return layer

def create_batch_tuples(length, batch_size):
    output = np.arange(0, length, batch_size)
    
    if output[-1] != length:
        output = np.append(output, length)
        
    return list(zip(output[:-1], output[1:]))

def indicator(array):
    return np.concatenate(array > 0)

In [52]:
def convolutional_neural_network(x_train, y_train, x_test, y_test, batch_size):
    '''
    x_train is the path of the images
    '''
    NUM_EPOCHS = 1
    
    n = len(x_train)
    
    x = tf.placeholder(dtype=tf.float32, shape=[None, 50, 50, 3], name='x')
    y = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='y')
    
    layer_conv1, weights_conv1 = new_conv_layer(input=x, 
                                                num_input_channels=3, 
                                                filter_size=5, 
                                                num_filters=8, 
                                                name='layer_conv1')
    layer_conv2, weights_conv2 = new_conv_layer(input=layer_conv1, 
                                                num_input_channels=8, 
                                                filter_size=7, 
                                                num_filters=32,
                                                name='layer_conv2')
    layer_flat, num_features = flatten_layer(layer_conv2)
    
    layer_fc = new_fc_layer(input=layer_flat,
                         num_inputs=num_features,
                         num_outputs=32,
                         name='layer_fc',
                         use_relu=True)
    
    out = new_fc_layer(input=layer_fc,
                       num_inputs=32,
                       num_outputs=1,
                       name='out',
                       use_relu=False)
    # range of out is \mathbb{R}^d, with d = 1
    
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=out, labels=y)
    cost          = tf.reduce_mean(cross_entropy)
    optimizer     = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
    
    batch_tuples = create_batch_tuples(len(x_train), batch_size)
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for e in range(NUM_EPOCHS):
            
            global prediction  # for debugging
            prediction = []

            for (start, end) in batch_tuples:
                
                print(start)
                x_reshaped = np.array([imread(pic) for pic in x_train[start: end]])
                
                sess.run([optimizer], feed_dict={x: x_reshaped,
                                                 y: y_train[start: end]})
        
            '''for (start, end) in batch_tuples:
                
                x_test_reshaped = np.array([imread(pic) for pic in x_test[start: end]])
                
                batch_preds = list(indicator(out.eval({x: x_test_reshaped})))
                prediction += batch_preds
                
            
            accuracy = np.mean(prediction == y_test)
            print('Epoch {}, accuracy: {}%'.format(e, np.mean(accuracy)*100))'''
        
        '''
        batch_tuples_test = create_batch_tuples(len(x_test), batch_size)
        x_test_reshaped   = x_test.reshape(-1, 50, 50, 3)
        
        for (start, end) in batch_tuples_test:
            
            batch_y_pred = y_pred.eval({x: x_test_reshaped[start:end]})
            y_output.append(batch_y_pred)'''

        saver = tf.train.Saver()
        saver.save(sess, './conv_nn_breast_cancer')

In [56]:
tic = time.clock()
convolutional_neural_network(x_train[:20000], y_train[:20000], x_test, y_test, 64)
toc = time.clock()

0


`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.


64
128
192
256
320
384
448
512
576
640
704
768
832
896
960
1024
1088
1152
1216
1280
1344
1408
1472
1536
1600
1664
1728
1792
1856
1920
1984
2048
2112
2176
2240
2304
2368
2432
2496
2560
2624
2688
2752
2816
2880
2944
3008
3072
3136
3200
3264
3328
3392
3456
3520
3584
3648
3712
3776
3840
3904
3968
4032
4096
4160
4224
4288
4352
4416
4480
4544
4608
4672
4736
4800
4864
4928
4992
5056
5120
5184
5248
5312
5376
5440
5504
5568
5632
5696
5760
5824
5888
5952
6016
6080
6144
6208
6272
6336
6400
6464
6528
6592
6656
6720
6784
6848
6912
6976
7040
7104
7168
7232
7296
7360
7424
7488
7552
7616
7680
7744
7808
7872
7936
8000
8064
8128
8192
8256
8320
8384
8448
8512
8576
8640
8704
8768
8832
8896
8960
9024
9088
9152
9216
9280
9344
9408
9472
9536
9600
9664
9728
9792
9856
9920
9984
10048
10112
10176
10240
10304
10368
10432
10496
10560
10624
10688
10752
10816
10880
10944
11008
11072
11136
11200
11264
11328
11392
11456
11520
11584
11648
11712
11776
11840
11904
11968
12032
12096
12160
12224
12288
12352
12416
12480
12

In [44]:
for pic in x_test[:20]:
    copyfile(pic, 'breast_cancer_test_data/' + pic.split('/')[-1])

In [53]:
def restore_model(x_test_file_path):
    # x_test is one data point
    # assuming it will be in color too
    assert isinstance(x_test_file_path, str)
    
    loaded_graph = tf.Graph()
    

    image = imread(x_test_file_path)

    if np.shape(image) != (50, 50, 3):
        image = image.resize((50, 50))

    if np.shape(image) != (50, 50, 3):
        print('Picture is not in colour!')
        raise ManualError
    
    
    with tf.Session(graph=loaded_graph) as sess:
        sess.run(tf.global_variables_initializer())
        
        model = tf.train.import_meta_graph('conv_nn_breast_cancer.meta')
        model.restore(sess, tf.train.latest_checkpoint('./'))
        
        x_   = loaded_graph.get_tensor_by_name('x:0')
        out_ = loaded_graph.get_tensor_by_name('out:0')
        
        pred_labels = out_.eval({x_: np.array([image])})
    
    return int(pred_labels[0] > 0)

In [None]:
def active_learning():
    pass