In [1]:
# Sets the path based on the environment being used

def get_path(computer_str):
    """
    Returns the correct path based on where the program is run
    """

    mac_data_path = '/Users/justinwhatley/Dropbox/FevensLab'
    linux_data_path = '/home/justin/Dropbox/FevensLab'
    
    if computer_str.lower() == 'mac':
        return os.path.join(mac_data_path)

    elif computer_str.lower() == 'linux': 
        return os.path.join(linux_data_path)

    elif computer_str.lower() == 'colab':
        # Google colab path
        return os.path.join('')

    else: 
        print('Incorrect base path option')
        exit(0)

Separates dataset into folds

In [2]:
import preprocessing
import os

In [6]:
!rm -rf training_validation_dataset/

base_directory = get_path('linux')
raw_data_directory_path = os.path.join(base_directory, 'FNAB_raw')
preprocessed_directory_path = os.path.join(base_directory, 'FNAB_preprocessed')
training_validation_path = os.path.join(base_directory, 'training_validation_dataset')

class_keyword_1 = 'MG2'
class_keyword_2 = 'MG3'
classes_list = [class_keyword_1, class_keyword_2]

# Prepares data
height, width = 224, 224

files_list_by_class = preprocessing.get_raw_file_list(raw_data_directory_path, classes_list)
preprocessing.create_preprocessed_directory(classes_list, files_list_by_class, preprocessed_directory_path, height, width, overwrite_previous_preprocessed_data = False)

# Gets patch file data
patched_class_file_list = preprocessing.get_data_by_class(os.path.join(preprocessed_directory_path, 'patched_data'), classes_list)
original_class_file_list = preprocessing.get_data_by_class(os.path.join(preprocessed_directory_path, 'original_data'), classes_list)

# Separate data into k-folds
number_of_folds = 5
files_per_fold =  [4000, 700]
patched_files_in_folds = preprocessing.separate_into_k_folds(number_of_folds, patched_class_file_list, files_per_fold)
# print(patched_files_in_folds)

files_per_fold =  [90, 20]
original_files_in_folds = preprocessing.separate_into_k_folds(number_of_folds, original_class_file_list, files_per_fold)
# print(original_files_in_folds)

# Iterates through folds

for i in range(number_of_folds):   
    validation_fold = i
    
    # Removes previous training and validation directories
    preprocessing.remove_dir(training_validation_path)
                                       
    # Selects a validation and training set on the original data
    preprocessing.assign_folds_to_training_and_validation(preprocessed_directory_path, training_validation_path, classes_list, original_files_in_folds, validation_fold, type = 'original_data')

    # Selects a validation and training set on the patched data
#     preprocessing.assign_folds_to_training_and_validation(preprocessed_directory_path, training_validation_path, classes_list, patched_files_in_folds, validation_fold, type = 'patched_data')

    # TODO call training and validation from here
    print('Finished! ')
    break


Loading raw data from path: /home/justin/Dropbox/FevensLab/FNAB_raw
Bin size: 4332
Bin size: 4584
Bin size: 4410
Bin size: 4506
Bin size: 4272
Bin size: 993
Bin size: 718
Bin size: 869
Bin size: 885
Bin size: 1011
Bin size: 91
Bin size: 98
Bin size: 94
Bin size: 92
Bin size: 93
Bin size: 22
Bin size: 22
Bin size: 20
Bin size: 20
Bin size: 22
Removing directory: /home/justin/Dropbox/FevensLab/training_validation_dataset
Taking files from input directory: /home/justin/Dropbox/FevensLab/FNAB_preprocessed/original_data/MG2
Storing them in output directory: /home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG2
Taking files from input directory: /home/justin/Dropbox/FevensLab/FNAB_preprocessed/original_data/MG2
Storing them in output directory: /home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/validation/MG2
Taking files from input directory: /home/justin/Dropbox/FevensLab/FNAB_preprocessed/original_data/MG3
Storing them in output direct

Neural Network

In [7]:
import numpy as np
import keras
import os.path as path
from keras.preprocessing.image import ImageDataGenerator

from keras import optimizers
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
from keras.layers.normalization import BatchNormalization 
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
""" Hyperparameters """

# Dimensions images will be resized to for processing
img_width, img_height = 224, 224

class_list = ['MG2', 'MG3']

epochs = 50
batch_size = 32

if K.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)

In [9]:
""" Calculates the number of training and validation samples"""
# dataset_name = 'training_validation_dataset/patched_data'

# Loads the unaltered images that have been place in training and validation datasets 
# dataset_name = os.path.join(training_validation_path, 'patched_data')
dataset_name = os.path.join(training_validation_path, 'original_data')
train_data_dir = path.join(dataset_name, 'training')
validation_data_dir = path.join(dataset_name, 'validation')


In [10]:
""" Up Samples classes by copying images at random """
# *Algorithm*
# Calculate number of files in directory 
import os
import glob
import shutil
import random

global_counter = 0 

def get_file_list(path_regex):
    file_list = []
    for filename in glob.glob(path_regex): 
        file_list.append(filename)
    return file_list

def copy_class_file(base_path, _class, file_name, i):
    src_dir = path.join(base_path, _class, file_name)
    
    file_name_list = file_name.split('.')
    file_name, file_ext = file_name_list[0], file_name_list[1] 
    dst_dir = path.join(base_path, _class, file_name + '_copy'+ str(i) + '.' + file_ext)
    print(dst_dir)
    shutil.copy(src_dir, dst_dir)

def upsample(base_path, _class, complete_copies_count, partial_copies_count):
    print(_class)
    
    # Makes copies of the 
    file_list = get_file_list(path.join(base_path, _class, '*'))
    counter1 = 0
    for i in range(complete_copies_count):
        print('Copying:')
        for file_name in file_list:
            copy_class_file(base_path, _class, file_name, i)
            counter1 += 1
    print('Full copies counter:  ' + str(counter1))
    
    # Get a boolean list of which file_indexes to copy
    upsampling_selection = [False] * len(file_list)
    sample_counter = 0 
    while not sample_counter >= partial_copies_count:
        random_file_index = random.randint(0, len(file_list)-1)
        if not upsampling_selection[random_file_index]:
            upsampling_selection[random_file_index] = True
            sample_counter += 1
    
    # Copies the remaining files
    print('Copying:')
    counter2 = 0 
    for file_selected_bool in upsampling_selection:
        if file_selected_bool: 
            copy_class_file(base_path, _class, file_name, i + 1)
            counter2 += 1
    print('Partial copies counter: ' + str(counter2))

def balance_dataset(base_path, class_list, strategy = 'upsample'):
    
    # Loads jpg files using regex, getting the sizes of each dataset
    size_list = []
    for _class in class_list: 
        path_regex = path.join(base_path, _class, '*')
        size_list.append(len(get_file_list(path_regex)))    
        
    # Draws at random from the indexes in the smaller dataset until a list of the size of the larger = smaller + list
    print(size_list)
    if strategy == 'upsample':
        largest_class = max(size_list)
        for i, _class in enumerate(class_list):
            # Check that the class is not already the largest class
            if largest_class > size_list[i]:
                # Get the number of complete copies of the class list to reduce bias toward any given sample
                complete_copies_count = int((largest_class-size_list[i]) / size_list[i])
                # Get the number of partial copies of the class which will be selected at random without replacement
                partial_copies_count = (largest_class-size_list[i]) % size_list[i]
                # Upsamples directory by copying files to the same directory
                upsample(base_path, _class, complete_copies_count, partial_copies_count)

balance_dataset(train_data_dir, class_list)
balance_dataset(validation_data_dir, class_list)

# TODO figure out uneven assignment of files

# Prints number of samples following upsampling
path_regex = path.join(train_data_dir + '/*/*')
nb_training_samples = len(get_file_list(path_regex))
print('Number of training samples: '+ str(nb_training_samples))

path_regex = path.join(validation_data_dir + '/*/*')
nb_validation_samples = len(get_file_list(path_regex))
print('Number of validation samples: '+ str(nb_validation_samples))


# If the larger > 2 * smaller, make two lists, etc


[360, 80]
MG3
Copying:
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/P4-2b_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/251-3b_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/P4-1b_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/171-2a_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/179-1b_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/P5-2b_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/35425-2a_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/251-4a_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/45476-1b_copy0.jpg
/home/justin/Dropbox/FevensLab/training_validatio

/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_dataset/original_data/training/MG3/43246-1a_copy3.jpg
/home/justin/Dropbox/FevensLab/training_validation_data

In [15]:
""" Defines pretrained VGG"""

vgg16_model = keras.applications.vgg16.VGG16()
model = Sequential()
# Transfers the layers from the vgg16 model to a new model that can be trained
for layer in vgg16_model.layers:
    model.add(layer)

# Remove the last (output) layer
model.layers.pop()

# Replace the last layer with two layers for a binary classification
model.add(Dense(2, activation='softmax'))


In [16]:
from keras import optimizers
opt = optimizers.SGD(lr=0.001, decay=1e-6)
model.compile(opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Dataloading
train_batches = ImageDataGenerator().flow_from_directory(train_data_dir, 
                                                         target_size = (img_width, img_height), 
                                                         classes = class_list )

valid_batches = ImageDataGenerator().flow_from_directory(validation_data_dir, 
                                                         target_size = (img_width, img_height), 
                                                         classes = class_list)

Found 681 images belonging to 2 classes.
Found 171 images belonging to 2 classes.


In [None]:
%%time
model.fit_generator(train_batches,
                    steps_per_epoch=4, 
                    validation_data=valid_batches, 
                    validation_steps=4, 
                    epochs=200, 
                    verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200

In [None]:
""" Defines the model """

model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=input_shape))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# model.summary()

In [None]:
# Laya's first CNN

model = Sequential()

model.add(Conv2D(128, kernel_size=(3, 3), strides=(1, 1), input_shape=input_shape))
model.add(Activation(Dense(128, activation='relu')))

model.add(Conv2D(128, kernel_size=(3, 3), strides=(1, 1)))
model.add(Activation(Dense(128, activation='relu')))

model.add(Dropout(0.5))
model.add(AveragePooling2D(pool_size=(2, 2), strides=(1, 1)))

model.add(Conv2D(64, kernel_size=(3, 3), strides=(1, 1)))
model.add(Activation(Dense(64, activation='relu')))

model.add(Conv2D(64, kernel_size=(3, 3), strides=(1, 1)))
model.add(Activation(Dense(64, activation='relu')))

model.add(Dropout(0.5))
model.add(AveragePooling2D(pool_size=(2, 2), strides=(1, 1)))

model.add(Conv2D(32, kernel_size=(3, 3), strides=(1, 1)))
model.add(Activation(Dense(32, activation='relu')))

model.add(Conv2D(32, kernel_size=(3, 3), strides=(1, 1)))
model.add(Activation(Dense(32, activation='relu')))

model.add(Dropout(0.5))
model.add(AveragePooling2D(pool_size=(2, 2), strides=(1, 1)))

model.add(Flatten())
model.add(Dense(1000, activation='relu'))
model.add(Dense(1, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
metrics=['accuracy'])

In [None]:
class AccuracyHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.acc = []

    def on_epoch_end(self, batch, logs={}):
        self.acc.append(logs.get('acc'))
        
history = AccuracyHistory()

history = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

In [None]:
""" Training and validation for regular network """

# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=False)

# this is the augmentation configuration we will use for testing:
# only rescaling
test_datagen = ImageDataGenerator(rescale=1. / 255)

train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='binary')

validation_generator = test_datagen.flow_from_directory(
    validation_data_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='binary')

model.fit_generator(
    train_generator,
    steps_per_epoch=nb_train_samples // batch_size,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps= nb_validation_samples // batch_size)

model.save_weights('first_try.h5')

In [None]:
ls training_validation_dataset/patched_data/training/MG3/

In [None]:
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model CNN:  accuracy history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
# plt.grid(b=False)
# plt.savefig('accuracy.png', bbox_inches='tight', dpi=300)
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model CNN:  loss history')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
# plt.grid(b=False, )
# plt.savefig('loss.png', bbox_inches='tight', dpi=300)
plt.show()

# from google.colab import files
# files.download('loss.png')
# files.download('accuracy.png')