### Import Libraries

In [1]:
#Import Libraries:
from __future__ import print_function
import numpy as np
import pandas as pd
import random
import math
import time
import joblib
import shutil
from shutil import copyfile
import os

import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers import Dense, GlobalAveragePooling2D
from keras.optimizers import adam
from keras.models import Model
from keras.applications.vgg19 import VGG19
from keras.applications.inception_v3 import InceptionV3


Using TensorFlow backend.


### Variables and Paths - Need To Be Customized

In [5]:
#Global variables
# root_dir is the project folder. The training cell writes its history CSV to
# root_dir + 'results\\' and the saved model to root_dir + 'models\\', so it must
# be defined here; every other path below is derived from it.
root_dir = 'C:\\Users\\Aadi\\Documents\\GitHub\\KovalCNN\\' #Project root - customize this first
all_image_dir = root_dir + 'all_images\\' #This is the directory with all images (128x128 etc) in it
temp_image_dir = root_dir + 'tmp_images\\' #This is the directory where images will be stored during training (it is wiped on every run)
img_history_dir = root_dir + 'image_history\\' #This is where we keep records about image partitions for previous training rounds
csv_path = all_image_dir + 'data.csv' #This is the csv with all of the metadata for the images
class_variable_in_data_csv = 'benign_malignant' #The column name in the csv where the class names are found
filename_col_name_in_csv = 'filename' #The column name in the csv that holds each image's file name
useFilter = True #When True, pandas_filter_string is executed to build data_filtered
pandas_filter_string = "data_filtered = data" #pandas string to be used for filtering e.g. data_filtered = data[data['age'] == 'young']


### Supporting functions

In [6]:
#if not os.path.exists(temp_image_dir):
#    os.makedirs(temp_image_dir)

def create_directories(image_dir, class_names):
    """Reset ``image_dir`` and build an empty train/val class-folder tree in it.

    Everything currently under ``image_dir`` is deleted. Afterwards the layout is
    ``image_dir/<split>/<class_name>/`` for each split in ``('train', 'val')`` and
    each entry of ``class_names``.

    Parameters
    ----------
    image_dir : str
        Root of the working image tree. Removed recursively if it already exists.
    class_names : iterable of str
        Folder names to create beneath both the ``train`` and ``val`` folders.
    """
    # rmtree raises when the target does not exist (e.g. on the very first run),
    # so only wipe the tree when there is one.
    if os.path.exists(image_dir):
        shutil.rmtree(image_dir)

    for split in ('train', 'val'):
        # Create the split folder itself so it exists even with no classes.
        split_dir = os.path.join(image_dir, split)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
        for class_name in class_names:
            class_dir = os.path.join(split_dir, class_name)
            if not os.path.exists(class_dir):
                os.makedirs(class_dir)
    
def copyfile_(data, i, dst_dir, src_dir):
    """Copy the image named in row ``i`` of ``data`` from ``src_dir`` to ``dst_dir``.

    The file name is read from the column named by the module-level
    ``filename_col_name_in_csv``. Both directory arguments are joined to the file
    name by plain string concatenation, so they must end with a path separator.
    The file name and row position are printed as a progress trace.
    """
    row = data.iloc[i]
    image_name = row[filename_col_name_in_csv]
    print(image_name, i)
    source_path = src_dir + image_name
    target_path = dst_dir + image_name
    copyfile(src=source_path, dst=target_path)

def split_data(data, class_names, id_var, var_name, train_prop = 0.8):
    """Partition ``data`` into train/val sets and copy the images into class folders.

    A fixed-seed random sample of ``train_prop`` of the rows becomes the training
    set; the remaining rows become the validation set. Both partitions are written
    to ``img_history_dir`` as CSV files tagged with ``id_var``. Each image is then
    copied from ``all_image_dir`` into
    ``temp_image_dir\\<train|val>\\<class>\\``.

    Parameters
    ----------
    data : pandas.DataFrame
        Image metadata; must contain ``var_name`` and the file-name column.
        NOTE(review): the validation set is built with ``drop(train_df.index)``,
        which presumes the index labels are unique - confirm for filtered data.
    class_names : iterable of str
        Class labels; one destination folder per label and split.
    id_var : any
        Identifier appended (via ``str``) to the partition CSV file names.
    var_name : str
        Column of ``data`` holding the class label.
    train_prop : float, optional
        Fraction of rows sampled into the training set (default 0.8).
    """
    # random_state is fixed, so the same input frame always yields the same split.
    train_df = data.sample(frac=train_prop, random_state=200)
    val_df = data.drop(train_df.index)

    # to_csv does not create missing folders; make sure the history folder exists.
    if not os.path.exists(img_history_dir):
        os.makedirs(img_history_dir)
    train_df.to_csv(img_history_dir+'train_images_'+str(id_var)+'.csv')
    val_df.to_csv(img_history_dir+'val_images_'+str(id_var)+'.csv')

    # Pair each split folder with its frame once instead of re-deciding per class.
    partitions = [('train', train_df), ('val', val_df)]
    for dir_a, df in partitions:
        for class_ in class_names:
            directory = temp_image_dir + dir_a + '\\' + class_ + '\\'
            # Positional index is reset because copyfile_ addresses rows with iloc.
            df_ = df[df[var_name]==class_].reset_index(drop=True)
            joblib.Parallel(n_jobs=8)(joblib.delayed(copyfile_)(df_, i, directory, all_image_dir) for i in range(0, df_.shape[0]))


### Model Architecture and Training

Ignore the accuracy figures in the output below: this run used made-up data, so the numbers are meaningless. The output only demonstrates that the pipeline runs end to end.

In [None]:

if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Run settings
    # ------------------------------------------------------------------
    id_var = math.floor(time.time()) #timestamp used to tag this run's partition CSVs and model files
    colormode = 'rgb'
    channels = 3 #color images have 3 channels. grayscale images have 1 channel
    batchsize = 1 #Number of images to be used in each processing batch
    trainingsamples = 25 #Number of images drawn from the training generator per epoch
    validationsamples = 25 #Number of images drawn from the validation generator per epoch
    model_name = 'KovalModel_'+str(id_var) #Any name for saving and keeping track of this model

    # Resolve the output folders BEFORE training so a path problem surfaces now
    # and not after the last epoch. root_dir is not defined in this cell; if no
    # earlier cell defined it, fall back to the parent folder of all_image_dir.
    try:
        output_root = root_dir
    except NameError:
        output_root = os.path.dirname(os.path.dirname(all_image_dir)) + '\\'
    results_dir = output_root+'results\\'
    models_dir = output_root+'models\\'
    for out_dir in (results_dir, models_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # ------------------------------------------------------------------
    # Data preparation
    # ------------------------------------------------------------------
    print('importing data')
    data = pd.read_csv(csv_path)
    # Default to the unfiltered frame so the run also works when useFilter is False.
    data_filtered = data
    if useFilter is True:
        # NOTE(review): exec runs arbitrary code taken from the configuration
        # string - only use trusted values. It rebinds data_filtered here because
        # this block executes at module (notebook cell) scope.
        exec(pandas_filter_string)
    class_names = data_filtered[class_variable_in_data_csv].unique() #class names of interest
    numclasses = len(class_names)
    print('creating directories')
    create_directories(temp_image_dir, class_names)
    print ('partitioning data')
    split_data(data_filtered, class_names, id_var, class_variable_in_data_csv)

    # ------------------------------------------------------------------
    # Model: InceptionV3 base + new classification head
    # ------------------------------------------------------------------
    # create the base pre-trained model (ImageNet weights, without its classifier)
    base_model = InceptionV3(weights='imagenet', include_top=False)

    # add a global spatial average pooling layer
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    # add a fully-connected layer
    x = Dense(1024, activation='relu')(x)
    # and a softmax output layer with one unit per class
    predictions = Dense(numclasses, activation='softmax')(x)

    # this is the model we will train (Keras 2 keyword names: inputs/outputs)
    model = Model(inputs=base_model.input, outputs=predictions)

    # Fine-tune the whole network: every layer, including all InceptionV3
    # layers, is left trainable. model.layers already contains the base layers,
    # so a single pass is enough.
    for layer in model.layers:
        layer.trainable = True

    # compile the model (must be done *after* setting the trainable flags).
    # The generators below yield one-hot labels and the head is a softmax over
    # numclasses units, so the matching loss is categorical cross-entropy.
    model.compile(optimizer='adam', loss='categorical_crossentropy',  metrics=['accuracy'])

    # use ImageDataGenerator to enlarge the training set by randomly flipping
    # images; validation images are fed through unchanged
    train_datagen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True)
    test_datagen = ImageDataGenerator()

    # ------------------------------------------------------------------
    # Read images, train the model, save the training history and the model
    # ------------------------------------------------------------------
    train_generator = train_datagen.flow_from_directory(
            temp_image_dir+"train",
            target_size=(150, 150),
            batch_size=batchsize,
            color_mode=colormode,
            class_mode='categorical')

    validation_generator = test_datagen.flow_from_directory(
            temp_image_dir+"val",
            target_size=(150, 150),
            batch_size=batchsize,
            color_mode=colormode,
            class_mode='categorical')

    # Step counts must be whole numbers; plain "/" yields a float.
    train_steps = int(math.ceil(trainingsamples / float(batchsize)))
    val_steps = int(math.ceil(validationsamples / float(batchsize)))

    history = model.fit_generator(
            train_generator,
            steps_per_epoch=train_steps,
            epochs=100,
            validation_data=validation_generator,
            validation_steps=val_steps)

    hist = pd.DataFrame(history.history)
    hist.to_csv(results_dir+model_name+'.csv')
    model.save(models_dir+model_name+'.h5')


importing data
creating directories
partitioning data




Found 1797 images belonging to 2 classes.
Found 449 images belonging to 2 classes.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100