# Create test and validation set

## Initialization

In [1]:
%matplotlib inline
import os, sys
#import modules
from utils import *
from keras.applications.inception_v3 import InceptionV3
from keras.applications.vgg16 import VGG16
from keras.models import Model, load_model
from keras import backend as K

import os, json

from keras.layers import Input
from keras.optimizers import SGD, RMSprop, Adam
from enum import Enum
from keras.layers.core import Dense
from utils import save_array, load_array


# DATA_HOME_DIR = '/srv/data/dogscats'
DATA_HOME_DIR = '/home/ubuntu/blocks_classifier/data/dogscats' # on ec2

reload(K)
K.image_dim_ordering()

Using TensorFlow backend.
Using TensorFlow backend.


'tf'

In [2]:
%cd $DATA_HOME_DIR

# Base directory for the data; point it at DATA_HOME_DIR + '/sample/' to work on the small sample set.
# `path` always ends with '/', so sub-directories are appended without a leading slash.
path = DATA_HOME_DIR + '/' #'/sample/'
test_path = DATA_HOME_DIR + '/test/' #We use all the test data
results_path=DATA_HOME_DIR + '/results/'
train_path=path + 'train/'
valid_path=path + 'valid/'

/home/ubuntu/blocks_classifier/data/dogscats


In [3]:
# Training constants. You can experiment with no_of_epochs to improve the model.
batch_size=128 # largest batch size the author could fit on the K80 GPU
no_of_epochs=2 # number of training epochs; noted by the author as seemingly optimal

In [None]:
def get_batches(dirname, gen=None, shuffle=True, batch_size=4, class_mode='categorical',
                target_size=(224,224)):
    """Return the batch iterator produced by `gen.flow_from_directory` for `dirname`.

    dirname     -- directory handed to `flow_from_directory`.
    gen         -- object providing `flow_from_directory`; when None a plain
                   `ImageDataGenerator()` is created on demand.
    shuffle, batch_size, class_mode, target_size -- forwarded unchanged to
                   `flow_from_directory`.
    """
    if gen is None:
        # Build the default lazily: a default evaluated at definition time
        # required `image` to be in scope before this cell ran.
        from keras.preprocessing import image
        gen = image.ImageDataGenerator()
    return gen.flow_from_directory(dirname, target_size=target_size,
            class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

In [None]:
from keras.preprocessing import image

#Helper function to plot images by index in the validation set 
#Plots is a helper function in utils.py
def plots_array(fpath, filenames, titles=None):
    """Load every file in `filenames` (each appended to `fpath`) and pass the images to `plots`."""
    loaded = []
    for name in filenames:
        loaded.append(image.load_img(fpath + name))
    plots(loaded, titles=titles)
    

    
#Number of images to view for each visualization task
n_view = 4

# Fine tuning

In [None]:
# Class names; only their count is used here, to size the new output layer.
output_classes = ["cats", "dogs"]

base_model = VGG16(include_top=True, weights='imagenet') ## functional API
## Replace the last dense layer with a new dense layer sized to the number of classes
x = base_model.layers[-2].output ## output of the layer just before the original last layer
predictions = Dense(len(output_classes), activation='softmax', name = "predictions")(x)
ft_model = Model(input=base_model.input, output=predictions)
optimizer = Adam(lr=0.001) # alternative tried: SGD(lr=0.001); author notes Adam seemed to work much better

## Freeze every layer first
for layer in ft_model.layers:
    layer.trainable = False

## Re-enable training for the last layer only (layers[-1:] is a one-element slice: the new dense layer)
for layer in ft_model.layers[-1:]:
    layer.trainable = True

ft_model.compile(optimizer=optimizer,
                loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
batches = get_batches(train_path, batch_size=batch_size)
val_batches = get_batches(valid_path, batch_size=batch_size)

In [None]:
ft_model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=no_of_epochs,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
# Name the checkpoint after the number of epochs actually trained, so the file
# loaded in the prediction section is the one written here.
ft_model.save(path + 'cats_dogs_ep_%d.h5' % no_of_epochs)

# Predict the output

In [None]:
ft_model = load_model(path+'cats_dogs_ep_2.h5')

### Evaluate the model again

In [None]:
def model_evaluate(val_path):
    """Run `ft_model.evaluate_generator` over all batches read from `val_path`, unshuffled."""
    eval_batches = get_batches(val_path, shuffle=False, class_mode='categorical',
                               batch_size=batch_size)
    n_samples = eval_batches.nb_sample
    return ft_model.evaluate_generator(eval_batches, n_samples)

In [None]:
metrics = model_evaluate(valid_path)

In [None]:
print metrics

### Predicting with the model

In [None]:
def model_predict(image_path, batch_size = batch_size):
    """Predict on every image under `image_path`.

    Returns the unshuffled, label-free batch iterator together with the
    output of `ft_model.predict_generator` over all of its samples.
    """
    unlabeled_batches = get_batches(image_path, class_mode=None, shuffle=False,
                                    batch_size=batch_size)
    n_images = unlabeled_batches.nb_sample
    outputs = ft_model.predict_generator(unlabeled_batches, n_images)
    return unlabeled_batches, outputs
    
#test_batches = get_batches(test_path, batch_size=batch_size, shuffle=False, class_mode=None)
#val_batches = get_batches(valid_path, batch_size=batch_size, shuffle=False, class_mode=None)
#t_batches = get_batches(train_path, batch_size=batch_size, shuffle=False, class_mode=None)

In [None]:
val_batches, val_predict = model_predict(valid_path,batch_size)

In [None]:
# Keep the validation file names and class labels at hand, then save them
# alongside the predictions.
val_filenames = val_batches.filenames
val_classes = val_batches.classes
save_array(path + 'val_predict.dat', val_predict)
save_array(path + 'val_filenames.dat', val_filenames)
save_array(path + 'val_classes.dat', val_classes)

In [None]:
test_batches, test_predict = model_predict(test_path,batch_size)

In [None]:
save_array(path + 'test_predict.dat', test_predict)
save_array(path + 'test_filenames.dat', test_batches.filenames)

## Loading saved predictions

In [None]:
# Reload the arrays written by the save cells above.
val_predict = load_array(path + 'val_predict.dat')
val_filenames = load_array(path + 'val_filenames.dat')
val_classes = load_array(path + 'val_classes.dat')
# test_predict = load_array(path + 'test_predict.dat')
# test_filenames =  load_array(path + 'test_filenames.dat') 
# no known classes for test

### A couple of correctly classified cat examples

In [None]:
# our_prediction is 1 minus column 0 of the predictions; rounding gives a 0/1 class.
our_prediction = 1-val_predict[:,0]
our_class_prediction = np.round(our_prediction)
# Validation-set indices where label and rounded prediction are both 0.
correct_cats = np.where((val_classes == 0) & (our_class_prediction == 0))[0]
print("correct cats %d" % len(correct_cats))
# idx holds validation-set indices (values of correct_cats), so it must index
# the full arrays directly, not the already-filtered subset.
idx = permutation(correct_cats)[:n_view]
plots_array(valid_path, np.array(val_filenames)[idx], our_class_prediction[idx])

### A couple of correctly classified dog examples

In [None]:
# Validation-set indices where label and rounded prediction are both 1.
correct_dogs = np.where((val_classes == 1) & (our_class_prediction == 1))[0]
print("correct dogs %d" % len(correct_dogs))
# idx holds validation-set indices, so index the full arrays with it
# (convert the file names to an array first so fancy indexing works).
idx = permutation(correct_dogs)[:n_view]
plots_array(valid_path, np.array(val_filenames)[idx], our_class_prediction[idx])
# Also show the last 8 validation images with their rounded predictions and labels.
plots_array(valid_path, val_batches.filenames[-8:], our_class_prediction[-8:])
val_batches.classes[-8:]

### Most wrong cats

In [None]:
incorrect_cats = np.where((val_batches.classes == 0) & (our_class_prediction == 1))[0]

In [None]:
print "total incorrect cats: ", len(incorrect_cats), " which is %", 100*len(incorrect_cats)/len(np.where(val_batches.classes == 0)[0])

In [None]:
idx_most_incorrect = np.argsort(our_prediction[incorrect_cats])[::-1][:n_view]

In [None]:
# idx_most_incorrect indexes into the incorrect_cats subset; map it back through
# incorrect_cats to get validation-set indices before picking file names.
plots_array(valid_path, np.array(val_batches.filenames)[incorrect_cats[idx_most_incorrect]], our_prediction[incorrect_cats][idx_most_incorrect])

### Borderline cats misclassified as dogs

In [None]:
# Ascending sort: the misclassified cats with the lowest our_prediction values first.
idx_most_incorrect = np.argsort(our_prediction[incorrect_cats])[:n_view]
# These positions are relative to incorrect_cats; map back to validation-set
# indices before selecting the file names.
plots_array(valid_path, np.array(val_batches.filenames)[incorrect_cats[idx_most_incorrect]], our_prediction[incorrect_cats][idx_most_incorrect])

### Most borderline false positives

In [None]:
# Descending sort of our_prediction over the misclassified cats, keeping the first n_view.
# NOTE(review): descending order selects the highest scores, not borderline ones — confirm
# the intended direction against the section heading.
idx_most_incorrect = np.argsort(our_prediction[incorrect_cats])[::-1][:n_view]
# Positions are relative to incorrect_cats; map back to validation-set indices
# for both the file names and the displayed scores.
plots_array(valid_path, np.array(val_batches.filenames)[incorrect_cats[idx_most_incorrect]], our_prediction[incorrect_cats][idx_most_incorrect])

### Most wrong dogs

In [None]:
#t_predict = np.round(1-t_predict[:,0])
# NOTE(review): this rebinds val_predict to the rounded 1-D result, so any cell that
# indexes val_predict[:,0] afterwards (including this one on re-run) would presumably
# fail — confirm this overwrite is intended.
val_predict = np.round(1-val_predict[:,0])
val_batches.classes[:1]
#t_predict[:]


In [None]:
# Show labels next to predictions for the validation and test sets.
# `v_predict` was never defined; the validation predictions live in `val_predict`.
print(val_batches.classes[:8])
print(val_predict[:])
print(test_batches.classes[:8])
print(test_predict[:])

In [None]:
val_batches.classes[:,0]

In [None]:
# The second argument is the number of samples to predict, not the batch size;
# pass the full sample count, as model_predict does.
test_predict = ft_model.predict_generator(test_batches, test_batches.nb_sample)

In [None]:
test_batches.filenames[:10]

In [None]:
np.round(test_predict[:10,0])

In [None]:
plots_array(test_path,test_batches.filenames[:4])

## Prep data

In [None]:
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown/

# Prep data

In [None]:
%cd $DATA_HOME_DIR
%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p test/unknown

In [None]:
%cd $DATA_HOME_DIR/train


In [None]:
# Move a random selection of up to 2000 images from the current directory into valid/.
g = glob('*.jpg')
shuf = np.random.permutation(g)
# Slicing (instead of indexing range(2000)) avoids an IndexError part-way
# through when fewer than 2000 images are present.
for fname in shuf[:2000]: os.rename(fname, DATA_HOME_DIR+'/valid/' + fname)

In [None]:
from shutil import copyfile

In [None]:
g = glob("*.jpg")

In [None]:
# Copy a random selection of up to 200 images into sample/train/.
shuf = np.random.permutation(g)
# Slicing tolerates fewer than 200 available files.
for fname in shuf[:200]: copyfile(fname, DATA_HOME_DIR+'/sample/train/' + fname)

In [None]:
%cd $DATA_HOME_DIR/valid
# Copy a random selection of up to 50 images from the current directory into sample/valid/.
g = glob('*.jpg')
shuf = np.random.permutation(g)
# Slicing tolerates fewer than 50 available files.
for fname in shuf[:50]: copyfile(fname, DATA_HOME_DIR+'/sample/valid/' + fname)

In [None]:
#Divide cat/dog images into separate directories

%cd $DATA_HOME_DIR/sample/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

%cd $DATA_HOME_DIR/sample/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

%cd $DATA_HOME_DIR/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

%cd $DATA_HOME_DIR/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

In [None]:
# Create single 'unknown' class for test set
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown/