# Imports

In [10]:
%matplotlib inline

import math, os, datetime, random, shutil
import numpy as np
from numpy.random import permutation
from matplotlib import pyplot as plt

from keras.preprocessing import image
from keras import backend as kb
from keras.utils.data_utils import get_file
from keras.models import Sequential
from keras.layers.core import Lambda, Dense, Flatten, Dropout
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import RMSprop, Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, LambdaCallback
from keras_tqdm import TQDMNotebookCallback

from sklearn.metrics import confusion_matrix, roc_auc_score

# Organize data

Data used is from kaggle competition invaisive species monitoring.
https://www.kaggle.com/c/invasive-species-monitoring

By organising data in different folders for different classes, keras parses the classes from the folder hierarchy.

Data was divided into subdirectories for training, validation and test data.

Folder tree:
```
data
    train
        invasive
        non-invasive
    valid
        invasive
        non-invasive
    test
        unknown
```

In [11]:
dataPath = 'data/'
modelPath = 'data/models/'
imageSize = (512,512)

# Load Data

In [12]:
# Batch size can be increased if GPU memory can take it.
batch_size = 12

# Keras ImageDataGenerator class modifies the images in different random ways so that the same image is never seen twice.
# That way there is more data to learn from.

def getTrainingDataGenerator():
    return ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.2,
        zoom_range=0.3,
        horizontal_flip=True,
        channel_shift_range=0.1,
        fill_mode='nearest')

# flow_from_directory iterates the directory structure and returns batches of images 
# with the class name parsed from the directory.
def getTrainingDataBatches():
    return getTrainingDataGenerator().flow_from_directory(
        dataPath + 'train', 
        target_size=imageSize,
        class_mode='binary', 
        shuffle=True, 
        batch_size=batch_size)

def getValidationDataBatches(): 
    return ImageDataGenerator().flow_from_directory(
        dataPath + 'valid', 
        target_size=imageSize, 
        class_mode='binary', 
        batch_size=batch_size, 
        shuffle=False)

def getTestDataBatches():
    return  ImageDataGenerator().flow_from_directory(
        dataPath + 'test/', 
        target_size=imageSize,
        class_mode=None,
        shuffle=False,
        batch_size=batch_size)

# Build Model

In [19]:
def addConvLayer(m, filterCount):
    m.add(ZeroPadding2D((1,1)))
    m.add(Conv2D(filterCount,(3,3), activation='relu'))
    m.add(BatchNormalization(axis=1))
    
def addConvBlock(m, filterCount, layerCount):
    for i in range(layerCount):
        addConvLayer(m, filterCount)
    m.add(MaxPooling2D((2,2), strides=(2,2)))

def addDenseLayer(m, nodeCount, dropout):
    m.add(Dense(nodeCount,activation='relu'))
    m.add(Dropout(dropout))
    m.add(BatchNormalization())

def build(dropout):
    input_shape = (imageSize[0], imageSize[1],3)
    m = Sequential()

    # By applying BatchNormalization as the first layer the input data is automatically normalized.
    m.add(BatchNormalization(axis=1,input_shape=input_shape))

    # To handle the larger image sizes I had to apply a larger conv layer on the input data, with a bigger stride.
    m.add(ZeroPadding2D((1,1)))
    m.add(Conv2D(32,(5,5),strides=(2,2), activation='relu'))    
    m.add(MaxPooling2D((3,3), strides=(2,2)))
    m.add(BatchNormalization(axis=1))
    
    # The rest of the conv layers are 3x3 with a regular 1 step stride.
    addConvBlock(m, 64, 2)
    addConvBlock(m, 128, 2)
    addConvBlock(m, 256, 2)
    addConvBlock(m, 512, 2)
    
    m.add(Flatten())

    addDenseLayer(m, 1024, dropout)
    addDenseLayer(m, 1024, dropout)
    
    m.add(Dense(1,activation='sigmoid'))
    
    return m

# Help methods

In [20]:
def predict(m, test_batches):
    return m.predict_generator(test_batches, test_batches.nb_sample) 

def getDateTimeString():
    return datetime.datetime.now().strftime("%Y-%m-%d.%H-%M-%S")

def fitGenerator(m, lr, nb_epoch):
    m.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy', metrics=['accuracy'])
    
    trainBatches = getTrainingDataBatches()
    valBatches = getValidationDataBatches()
        
    return m.fit_generator(
        trainBatches,
        samples_per_epoch=trainBatches.nb_sample, 
        nb_epoch=nb_epoch,
        validation_data=valBatches,
        nb_val_samples=valBatches.nb_sample,
        verbose=2,
        callbacks=[TQDMNotebookCallback()])

def plotAllHistory(history):
    plotAccHistory(history)
    plotLossHistory(history)

def plotLossHistory(history):
    plotHistory(history, 'loss', 'Loss')

def plotAccHistory(history):
    plotHistory(history, 'acc', 'Accuracy')
    
def plotHistory(history, metric, metricFullName):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(metricFullName)
    plt.ylabel(metricFullName)
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()


# Training

In [21]:
model = build(dropout=0.1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_5 (Batch (None, 512, 512, 3)       2048      
_________________________________________________________________
zero_padding2d_5 (ZeroPaddin (None, 514, 514, 3)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 255, 255, 32)      2432      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 127, 127, 32)      0         
_________________________________________________________________
batch_normalization_6 (Batch (None, 127, 127, 32)      508       
_________________________________________________________________
zero_padding2d_6 (ZeroPaddin (None, 129, 129, 32)      0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 127, 127, 64)      18496     
__________

In [None]:
history = fitGenerator(model, lr=0.001, nb_epoch=20)
plotAllHistory(history)
model.load_weights(modelPath + 'model.h5')

# Predictions

## Helper methods

In [23]:
def getFileIndices():
    return [int(x.split('\\')[1].split('.')[0]) for x in getTestDataBatches().filenames]

def combineIndicesWithPredictions(fileIdxs, preds):
    combined = np.column_stack((fileIdxs, preds))
    combined = combined[combined[:,0].argsort()]
    return combined;
    
def writePredictionsToFile(preds, predictionsPath):
    np.savetxt(
        predictionsPath, 
        preds,
        fmt ='%1.1d, %1.2f',
        header='name,invasive', 
        comments='') #'id,label', 

## Predictions for submitting

In [63]:
test_batches = getTestDataBatches()
predictions = predict(model, test_batches)
fileIdxs = getFileIndices()
combined = combineIndicesWithPredictions(fileIdxs, predictions)
writePredictionsToFile(combined, data + 'predictions/predictions.csv')

Found 1531 images belonging to 1 classes.
