In [131]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import re
import random
import time # just to see how fast/slow things run

from scipy.misc import imread
from sklearn.model_selection import train_test_split
from PIL import Image

Data is organized in folders, where in every folder there are two subfolders: ```0``` and ```1```. They respectively represent benign and malignant tumours.

In [132]:
data_folder  = 'IDC_regular_ps50_idx5/'
num_positive = 78786
num_negative = 198738

In [233]:
_, _, file_names = list(zip(*(list(os.walk(data_folder)))))
file_names       = list(filter(lambda ls: len(ls) not in [0, 1], file_names))

'''
file_names contains all files names in following format:

file_names = [['9036_idx5_x1051_y2401_class0.png',
               '9036_idx5_x2951_y951_class0.png',
               ...],
               ['10257_idx5_x2101_y601_class1.png',
                '10257_idx5_x1651_y1251_class1.png',
               ...],
              ...]
               
'''

"\nfile_names contains all files names in following format:\n\nfile_names = [['9036_idx5_x1051_y2401_class0.png',\n               '9036_idx5_x2951_y951_class0.png',\n               ...],\n               ['10257_idx5_x2101_y601_class1.png',\n                '10257_idx5_x1651_y1251_class1.png',\n               ...],\n              ...]\n               \n"

In [224]:
def get_train_test(list_file_names, test_ratio=0.2):
    '''
    list_file_names should be under the same format as file_names as above
    '''
    x_train = []
    x_test  = []
    
    for ls in list_file_names:
        
        label      = ls[0][-5] # this list is benign or malignant
        patient_id = re.findall('\d+', ls[0])[0]
        num_pics   = len(ls)
        
        assert label in ['0', '1']
        
        path = data_folder + patient_id + '/' + label + '/'
        to_add_train, to_add_test = train_test_split(ls, test_size=test_ratio)
        
        x_train = x_train + [path + png for png in to_add_train]
        x_test  = x_test + [path + png for png in to_add_test]
    
    
    # We should be able to safely ignore pictures of shape (x, 50, 3) for x /0 50
    # for computational and consistency purposes
    x_train = list(filter(lambda pic: Image.open(pic).size == (50, 50), x_train))
    x_test  = list(filter(lambda pic: Image.open(pic).size == (50, 50), x_test))
    
    random.shuffle(x_train)
    random.shuffle(x_test)
    
    y_train = np.array([int(png[-5]) for png in x_train]).reshape(-1, 1)
    y_test  = np.array([int(png[-5]) for png in x_test]).reshape(-1, 1)
    
    return x_train, y_train, x_test, y_test

In [225]:
# takes 75 seconds
x_train, y_train, x_test, y_test = get_train_test(file_names)

In [226]:
def new_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def new_biases(length):
    return tf.Variable(tf.constant(0.05, shape=[length]))

def new_conv_layer(input,             
                   num_input_channels,
                   filter_size,       
                   num_filters,       
                   use_pooling=True): 

    shape   = [filter_size, filter_size, num_input_channels, num_filters]
    weights = new_weights(shape=shape)
    biases  = new_biases(length=num_filters) # one for each filter

    layer = tf.nn.conv2d(input=input,
                         filter=weights,
                         strides=[1, 1, 1, 1],
                         padding='SAME')

    layer += biases

    if use_pooling:
        layer = tf.nn.max_pool(value=layer,
                               ksize=[1, 2, 2, 1],
                               strides=[1, 2, 2, 1],
                               padding='SAME')

    layer = tf.nn.relu(layer)

    return layer, weights


def flatten_layer(layer):
    layer_shape  = layer.get_shape()
    num_features = layer_shape[1:4].num_elements()
    layer_flat   = tf.reshape(layer, [-1, num_features])

    # Return both the flattened layer and the number of features.
    return layer_flat, num_features


def new_fc_layer(input,          # The previous layer.
                 num_inputs,     # Num. inputs from prev. layer.
                 num_outputs,    # Num. outputs.
                 use_relu=True): # Use Rectified Linear Unit (ReLU)?

    # Create new weights and biases.
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_biases(length=num_outputs)

    # Calculate the layer as the matrix multiplication of
    # the input and weights, and then add the bias-values.
    layer = tf.matmul(input, weights) + biases

    # Use ReLU?
    if use_relu:
        layer = tf.nn.relu(layer)

    return layer

def create_batch_tuples(length, batch_size):
    output = np.arange(0, length, batch_size)
    
    if output[-1] != length:
        output = np.append(output, length)
        
    return list(zip(output[:-1], output[1:]))

def indicator(array):
    return np.concatenate(array > 0)

In [227]:
def convolutional_neural_network(x_train, y_train, x_test, batch_size):
    '''
    x_train is the path of the images
    '''
    NUM_EPOCHS = 1
    
    n = len(x_train)
    
    x = tf.placeholder(dtype=tf.float32, shape=[None, 50, 50, 3])
    
    y = tf.placeholder(dtype=tf.float32, shape=[None, 1])
    
    layer_conv1, weights_conv1 = new_conv_layer(input=x, num_input_channels=3, filter_size=5, num_filters=8)
    layer_conv2, weights_conv2 = new_conv_layer(input=layer_conv1, num_input_channels=8, filter_size=7, num_filters=32)
    layer_flat, num_features   = flatten_layer(layer_conv2)
    
    layer_fc1 = new_fc_layer(input=layer_flat,
                         num_inputs=num_features,
                         num_outputs=128,
                         use_relu=True)
    
    layer_fc2 = new_fc_layer(input=layer_fc1,
                         num_inputs=128,
                         num_outputs=1,
                         use_relu=False)
    # range of layer_fc2 is \mathbb{R}^d, with d = 1
    
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=layer_fc2,
                                                            labels=y)
    cost      = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
    
    batch_tuples = create_batch_tuples(len(x_train), batch_size)
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for e in range(NUM_EPOCHS):
            
            global prediction  # for debugging
            prediction = []

            for (start, end) in batch_tuples:
                
                print(start)
                x_reshaped = np.array([imread(pic) for pic in x_train[start: end]])
                
                sess.run([optimizer], feed_dict={x: x_reshaped,
                                                 y: y_train[start: end]})
        
            for (start, end) in batch_tuples:
                
                x_reshaped = np.array([imread(pic) for pic in x_train[start: end]])
                
                global batch_preds
                
                batch_preds = list(indicator(layer_fc2.eval({x: x_reshaped})))
                prediction += batch_preds
                
            
            accuracy = np.mean(prediction == y_train)
            print('Epoch {}, accuracy: {}%'.format(e, np.mean(accuracy)*100))
        
        batch_tuples_test = create_batch_tuples(len(x_test), batch_size)
        '''x_test_reshaped   = x_test.reshape(-1, 50, 50, 3)
        
        for (start, end) in batch_tuples_test:
            
            batch_y_pred = y_pred.eval({x: x_test_reshaped[start:end]})
            y_output.append(batch_y_pred)'''

In [234]:
tic = time.clock()
convolutional_neural_network(x_train[:200], y_train[:200], x_test, 64)
toc = time.clock()

0


`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.


64
128
192


`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.


Epoch 0, accuracy: 72.5%


In [235]:
print(toc - tic)

8.108036999999968


In [232]:
(toc - tic)/60

13.982608066666668