# Dogs vs Cats Redux (KT version)

In [1]:
#Verify we are in the nbs directory
#(the directory references created in the next cell are derived from the current working directory)
%pwd

u'/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs'

In [2]:
#Anchor every path used later in the notebook on the directory it was launched from
import os, sys
current_dir = os.getcwd()
#Lesson root: the notebook's working directory
LESSON_HOME_DIR = current_dir
#Root of the competition data, located under the lesson root
DATA_HOME_DIR = '/'.join([current_dir, 'data', 'redux'])

In [3]:
#import modules
from utils import *
from vgg16 import Vgg16

#Instantiate plotting tool
#In Jupyter notebooks, you will need to run this command before doing any plotting
%matplotlib inline

Using Theano backend.


## Action Plan
1. Create validation and sample sets
2. Re-arrange image files into their respective directories
3. Finetune and train model
4. Generate predictions
5. Validate predictions
6. Submit predictions to Kaggle

## Create validation set

In [None]:
#Set constants
#Number of images moved out of train/ to form the validation set
valid_size = 2000

In [None]:
#Create directories
%cd "$DATA_HOME_DIR"
%mkdir valid
%mkdir results
%mkdir -p test/unknown

In [None]:
#The validation split in the next cell globs relative paths, so work from inside train/
%cd $DATA_HOME_DIR/train

In [None]:
#Move a random subset of the training images (current directory) into valid/.
#Only top up to valid_size, counting images already in valid/ or its class
#sub-directories, so that re-running the cell does not move a further batch.
#Slicing (rather than indexing) also avoids an IndexError when fewer images remain.
valid_dir = DATA_HOME_DIR + '/valid/'
already_held_out = len(glob(valid_dir + '*.jpg')) + len(glob(valid_dir + '*/*.jpg'))
g = glob('*.jpg')
shuf = np.random.permutation(g)
for fname in shuf[:max(0, valid_size - already_held_out)]:
    os.rename(fname, valid_dir + fname)

## Create sample sets

In [4]:
#Set constants
#Number of images copied into sample/train, sample/valid and sample/test respectively
sample_train_size = 200
sample_valid_size = 50
sample_test_size = 200

#--local
#Alternative, smaller sizes (left disabled)
# sample_train_size = 16
# sample_valid_size = 8
# sample_test_size = 16

In [5]:
#Create directories
#mkdir -p creates any missing parent directories and does not fail on ones that already exist
%cd "$DATA_HOME_DIR"
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p sample/test/unknown

/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux


In [8]:
from shutil import copyfile

In [9]:
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(sample_train_size): copyfile(shuf[i], DATA_HOME_DIR + '/sample/train/' + shuf[i])

In [10]:
#The next cell globs relative paths, so work from inside valid/
%cd $DATA_HOME_DIR/valid

/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux/valid


In [11]:
#Copy a random selection of the validation images (current directory) into sample/valid
g = glob('*.jpg')
shuf = np.random.permutation(g)
sample_valid_dir = DATA_HOME_DIR + '/sample/valid/'
for idx in range(sample_valid_size):
    picked = shuf[idx]
    copyfile(picked, sample_valid_dir + picked)

In [12]:
#The next cell globs relative paths, so work from inside test/
%cd $DATA_HOME_DIR/test

/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux/test


In [13]:
#Copy a random selection of the test images (current directory) into sample/test
g = glob('*.jpg')
shuf = np.random.permutation(g)
sample_test_dir = DATA_HOME_DIR + '/sample/test/'
for idx in range(sample_test_size):
    picked = shuf[idx]
    copyfile(picked, sample_test_dir + picked)

## Rearrange image files into their respective directories

In [14]:
#Divide cat/dog images into separate directories
subpaths = ['/sample/train', '/sample/valid', '/train', '/valid']
path_names = ['cats', 'dogs']
class_names = ['cat', 'dog']

for subpath in subpaths:
    %cd $DATA_HOME_DIR$subpath
    for i, path in enumerate(path_names):
        %mkdir $path
        file_name = class_names[i] + '.*.jpg'
        %mv $file_name $path/

/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux/sample/train
/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux/sample/valid
/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux/train
/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux/valid


In [15]:
# Create single 'unknown' class for test set and sample/test set
# (the unknown/ sub-directories were created earlier with mkdir -p)
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown/

%cd $DATA_HOME_DIR/sample/test
%mv *.jpg unknown/

/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux/test
/home/krit/github-projects/fast-ai/courses/deeplearning1/nbs/data/redux/sample/test


## Finetuning and Training

In [None]:
%cd $DATA_HOME_DIR

#Set path to sample/ path if desired
path = DATA_HOME_DIR + '/'
#path = DATA_HOME_DIR + '/sample'

test_path = path + '/test/'
results_path = path + '/results/'
train_path = path + '/train/'
valid_path = path + '/valid/'

In [None]:
#Instantiate the Vgg16 helper class (imported at the top of the notebook)
vgg = Vgg16()

In [None]:
#Set constants. You can experiment with no_of_epochs to improve the model
#batch_size is passed to get_batches for training; validation and test use batch_size*2
batch_size = 64
#batch_size = 4

#Number of single-epoch fit/save rounds run by the training loop
no_of_epochs = 3
#no_of_epochs = 1

In [None]:
#Finetune the model
#Build batch objects over the train and validation directories (validation uses twice the batch size)
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
#NOTE(review): finetune presumably adapts the model's output layer to the classes found in `batches` -- confirm in vgg16.py
vgg.finetune(batches)

In [None]:
#Not sure if we set this for all fits
#Learning rate assigned to the model's optimizer before the training loop runs
learning_rate = 0.01
vgg.model.optimizer.lr = learning_rate

In [None]:
#Train one epoch at a time and save the weights after every epoch, so each
#intermediate state is kept on disk under results_path as ft<epoch>.h5
latest_weights_filename = None
for epoch in range(no_of_epochs):
    print("Running epoch: %d" % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft%d.h5' % epoch
    weights_file = results_path + latest_weights_filename
    vgg.model.save_weights(weights_file)
print("Completed %s fit operations" % no_of_epochs)

## Generate Predictions

Use our new model to make predictions on the test dataset

In [None]:
#Run the model over the test directory; returns a batches object (its .filenames is read below) and the predictions
#NOTE: this rebinds `batches`, which until now held the training batches
batches, preds = vgg.test(test_path, batch_size=batch_size*2)

In [None]:
#vgg.test() yields two probabilities per image, ordered like the
#class directories (cats, dogs): column one is cats, column two is dogs
print(preds[:5])

#File names in the same order as the prediction rows
filenames = batches.filenames
print(filenames[:5])

In [None]:
#Verify the column ordering by viewing some images
from PIL import Image
#filenames are relative to test_path (e.g. they include the class sub-directory)
preview_image_path = test_path + filenames[2]
Image.open(preview_image_path)

In [None]:
#Save our test results arrays so we can use them again later
#(written under results_path; the submission section loads them back with load_array)
save_array(results_path + 'test_preds.dat', preds)
save_array(results_path + 'filenames.dat', filenames)

## Submit Predictions to Kaggle!

Kaggle wants the imageId followed by the probability of the image being a dog. Kaggle uses a metric called [Log Loss](http://wiki.fast.ai/index.php/Log_Loss) to evaluate your submission.

In [None]:
#Load our test predictions from file
#(the same two files written by save_array in the previous section)
preds = load_array(results_path + 'test_preds.dat')
filenames = load_array(results_path + 'filenames.dat')

In [None]:
#Grab the dog prediction column (second column of preds)
isdog = preds[:,1]
print("Raw Predictions: " + str(isdog[:5]))
#Predictions strictly between .4 and .6
near_half = (isdog < .6) & (isdog > .4)
print("Mid Predictions: " + str(isdog[near_half]))
#Predictions that are exactly 0 or exactly 1
at_edge = (isdog == 1) | (isdog == 0)
print("Edge Predictions: " + str(isdog[at_edge]))

[Log Loss](http://wiki.fast.ai/index.php/Log_Loss) doesn't support probability values of 0 or 1--they are undefined (and we have many). Fortunately, Kaggle helps us by offsetting our 0s and 1s by a very small value. So if we upload our submission now we will have lots of .99999999 and .000000001 values. This seems good, right?

Not so. There is an additional twist due to how log loss is calculated--log loss rewards predictions that are confident and correct (p=.9999,label=1), but it punishes predictions that are confident and wrong far more (p=.0001,label=1). See visualization below.

In [None]:
#So to play it safe, we pull over-confident predictions back from the edges:
#every value above .95 becomes .95 and every value below .05 becomes .05
#(this clamps the whole tail, not only exact ones and zeros)
isdog = np.clip(isdog, 0.05, 0.95)

In [None]:
#Extract imageIds from the filenames in our test/unknown directory
#Each entry looks like '<class_dir>/<id>.jpg'. Strip the directory and the
#extension rather than relying on a hard-coded offset of 8 (= len('unknown/')),
#so the ids are still parsed correctly if the class directory is renamed.
filenames = batches.filenames
ids = np.array([int(os.path.splitext(os.path.basename(f))[0]) for f in filenames])

Here we join the two columns into an array of [imageId, isDog]

In [None]:
#Pair each image id with its dog probability: one row per image, columns [imageId, isDog]
subm = np.column_stack((ids, isdog))
subm[:5]

In [None]:
#Write the submission CSV into the data directory
%cd $DATA_HOME_DIR
submission_file_name = 'submission1.csv'
#fmt writes the id as an integer and the label with 5 decimals; comments='' stops numpy prefixing the header line with '# '
np.savetxt(submission_file_name, subm, fmt='%d,%.5f', header='id,label', comments='')

In [None]:
#Return to the lesson directory and display a link to the submission file
from IPython.display import FileLink
%cd $LESSON_HOME_DIR
#The path is given relative to LESSON_HOME_DIR, hence the data/redux/ prefix
FileLink('data/redux/'+submission_file_name)

You can download this file and submit on the Kaggle website or use the Kaggle command line tool's "submit" method.