# State Farm Distracted Driver Detection


[State Farm Distracted Driver Detection](https://www.kaggle.com/c/state-farm-distracted-driver-detection)
    

## Imports und Konstanten

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os, shutil
import sys

from utils import *
from vgg16bn import Vgg16BN

from IPython.display import FileLink
from keras.preprocessing import image

#Instantiate plotting tool
#In Jupyter notebooks, you will need to run this command before doing any plotting
%matplotlib inline

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


In [2]:
%pwd

# Base directories, resolved against the notebook's current working directory
path = os.getcwd()
data_path = os.path.join(path, 'data/')
sample_path = os.path.join(path, 'data', 'sample/')

# Sub-directories of the data directory (joined without a trailing separator)
train_path = os.path.join(data_path, 'train')
valid_path = os.path.join(data_path, 'valid')
test_path = os.path.join(data_path, 'test')
results_path = os.path.join(data_path, 'results')
subm_path = os.path.join(data_path, 'submissions')

# Extension appended to weight-file names (see gen_and_save_preds)
weights_postfix = 'h5'
# CSV with the driver / class / image assignment; relative to the working directory
driver_list_path = 'data/driver_imgs_list.csv'

## Vorbereitung der Daten

### CSV-Datei mit Fahrer-Zuordnung analysieren

In [None]:
def get_driver_df(file_path):
    """Load the driver/image assignment CSV into a DataFrame.

    The header row of the file is discarded (``header=0``) and the columns
    are named ``driver``, ``class`` and ``img``.
    """
    column_names = ['driver', 'class', 'img']
    driver_table = pd.read_csv(
        file_path,
        sep=',',
        header=0,
        names=column_names,
    )
    return driver_table

def get_driver_imgs(df, driver):
    """Return the rows of *df* whose ``driver`` column equals *driver*."""
    return df.loc[df['driver'] == driver]

In [None]:
driver_df = get_driver_df(driver_list_path)
print(driver_df.head())

In [None]:
p002 = get_driver_imgs(driver_df, 'p002')
print(p002.head())

In [None]:
drivers = driver_df['driver']
drivers = drivers.drop_duplicates()
print(drivers.values)

In [None]:
result = driver_df.groupby('driver') \
             .agg({'class': pd.Series.nunique, 'img':'count'}).reset_index()
print(result)

In [None]:
num_total = result['img'].sum()
num_move = num_total * 0.2
print(num_total)
print(num_move)

### Verzeichnisse erstellen und Daten bereitstellen

In [None]:
def create_class_dir(parent_path):
    """Create the class sub-directories ``c0`` .. ``c9`` below *parent_path*.

    Paths that already exist are left untouched.
    """
    class_dirs = [os.path.join(parent_path, 'c' + str(idx)) for idx in range(10)]
    for class_dir in class_dirs:
        if os.path.exists(class_dir):
            continue
        os.mkdir(class_dir)

def create_test_dir(parent_path):
    """Create ``test`` and ``test/unknown`` below *parent_path* if missing."""
    test_dir = os.path.join(parent_path, 'test')
    # Parent first, so that os.mkdir can create the nested directory.
    for directory in (test_dir, os.path.join(test_dir, 'unknown')):
        if not os.path.exists(directory):
            os.mkdir(directory)

def make_dir(parent_path, directory):
    """Create *directory* below *parent_path* unless the path already exists."""
    new_dir = os.path.join(parent_path, directory)
    if os.path.exists(new_dir):
        return
    os.mkdir(new_dir)
    
def create_train_dir(parent_path):
    """Create ``train`` below *parent_path* together with its class folders."""
    train_dir = os.path.join(parent_path, 'train')
    already_there = os.path.exists(train_dir)
    if not already_there:
        os.mkdir(train_dir)
    # The class folders are (re-)checked even if 'train' existed before.
    create_class_dir(train_dir)
    
def create_valid_dir(parent_path):
    """Create ``valid`` below *parent_path* together with its class folders."""
    valid_dir = os.path.join(parent_path, 'valid')
    already_there = os.path.exists(valid_dir)
    if not already_there:
        os.mkdir(valid_dir)
    # The class folders are (re-)checked even if 'valid' existed before.
    create_class_dir(valid_dir)
    
def create_sample_dir(parent_path):
    """Build the complete ``sample`` directory tree below *parent_path*.

    The tree consists of the train, valid and test structures plus the
    ``results`` and ``submissions`` folders.
    """
    sample_dir = os.path.join(parent_path, 'sample')
    if not os.path.exists(sample_dir):
        os.mkdir(sample_dir)
    for build_subtree in (create_train_dir, create_valid_dir, create_test_dir):
        build_subtree(sample_dir)
    for extra_dir in ('results', 'submissions'):
        make_dir(sample_dir, extra_dir)
    
def fill_valid_dir():
    """Move the listed images of the hold-out drivers from train to valid.

    Splitting by driver keeps all images of one person in the same set.
    """
    holdout_drivers = ['p002', 'p024', 'p051', 'p049']
    driver_table = get_driver_df(driver_list_path)
    for driver_id in holdout_drivers:
        move_driver_imgs(train_path, valid_path,
                         get_driver_imgs(driver_table, driver_id))

def prepare_test_dir():
    """Move all ``*.jpg`` files lying directly in the test directory into
    its ``unknown`` sub-directory."""
    jpg_pattern = os.path.join(test_path, '*.jpg')
    unknown_dir = os.path.join(test_path, 'unknown')
    for jpg_file in glob(jpg_pattern):
        shutil.move(jpg_file, unknown_dir)
        
def prepare_sample_dir():
    """Populate the sample data set with a small, driver-based subset.

    Copies the training images of three drivers into ``sample/train``, the
    training images of one further driver into ``sample/valid`` and up to
    200 randomly chosen test images into ``sample/test/unknown``.
    """
    sample_train = os.path.join(sample_path, 'train')
    sample_valid = os.path.join(sample_path, 'valid')
    # Test images belong into the 'unknown' sub-directory, matching the
    # layout used by create_test_dir(), prepare_test_dir() and
    # prepare_sample_dir2().
    sample_test = os.path.join(sample_path, 'test', 'unknown')

    drivers = ['p014', 'p045', 'p042']
    df = get_driver_df(driver_list_path)

    for driver in drivers:
        driver_list = get_driver_imgs(df, driver)
        move_driver_imgs(train_path, sample_train, driver_list, copy=True)

    driver_list = get_driver_imgs(df, 'p072')
    move_driver_imgs(train_path, sample_valid, driver_list, copy=True)

    test_files = glob(os.path.join(test_path, 'unknown', '*.jpg'))
    shuf = np.random.permutation(test_files)
    # Slicing (instead of indexing 0..199) copes with fewer than 200 files.
    for test_file in shuf[:200]:
        shutil.copy(test_file, sample_test)

def move_driver_imgs(source_path, target_path, driver_list, copy=False):
    """Move (or copy) the images listed in *driver_list* between data sets.

    Parameters:
        source_path: directory containing the class sub-directories to read.
        target_path: directory receiving the images, again per class.
        driver_list: DataFrame whose rows are (driver, class, image name);
            only its ``values`` are used.
        copy: copy the files when true, move them otherwise.

    Images that are missing in the source or already present in the target
    are skipped. A missing class directory in the target is created first;
    otherwise shutil would write a plain file named like the directory.
    """
    transfer = shutil.copy if copy else shutil.move
    for entry in driver_list.values:
        class_name, img_name = entry[1], entry[2]
        source_file = os.path.join(source_path, class_name, img_name)
        target_dir = os.path.join(target_path, class_name)
        target_file = os.path.join(target_dir, img_name)
        if not os.path.exists(source_file) or os.path.exists(target_file):
            continue
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        transfer(source_file, target_file)
                
def prepare_data():
    """Create the directory layout and distribute the images.

    Creates the results/submissions folders and the valid, sample and test
    trees below ``data_path``, then fills the validation set, moves the test
    images into ``test/unknown`` and populates the sample set.
    """
    make_dir(data_path, 'results')
    # Must match subm_path ('submissions'), which is where the submission
    # files are written.
    make_dir(data_path, 'submissions')
    create_valid_dir(data_path)
    create_sample_dir(data_path)
    create_test_dir(data_path)
    fill_valid_dir()
    prepare_test_dir()
    prepare_sample_dir()
    

In [None]:
# Reset all training, validation & sample data
def prepare_sample_dir2():
    """Fill the sample data set with a random, class-balanced subset.

    For each class ``c0`` .. ``c9`` up to 200 random training images and up
    to 50 random validation images are copied into the sample tree; in
    addition up to 200 random test images go to ``sample/test/unknown``.
    """
    sample_train = os.path.join(sample_path, 'train')
    sample_valid = os.path.join(sample_path, 'valid')
    sample_test = os.path.join(sample_path, 'test', 'unknown')

    # Copy train and validation files
    for class_num in range(10):
        class_name = 'c' + str(class_num)
        train_files = glob(os.path.join(train_path, class_name, '*.jpg'))
        valid_files = glob(os.path.join(valid_path, class_name, '*.jpg'))
        target_train = os.path.join(sample_train, class_name)
        target_valid = os.path.join(sample_valid, class_name)

        # Slicing instead of fixed-range indexing: classes holding fewer
        # images than requested no longer raise an IndexError.
        for train_file in np.random.permutation(train_files)[:200]:
            shutil.copy(train_file, target_train)

        for valid_file in np.random.permutation(valid_files)[:50]:
            shutil.copy(valid_file, target_valid)

    # Copy a couple test files
    test_files = glob(os.path.join(test_path, 'unknown', '*.jpg'))
    for test_file in np.random.permutation(test_files)[:200]:
        shutil.copy(test_file, sample_test)

In [None]:
prepare_data()

In [None]:
# Restructure sample directory
create_sample_dir(data_path)
prepare_sample_dir2()

## Validate Data

In [None]:
def get_class_name(class_id):
    """Translate a class id (``'c0'`` .. ``'c9'``) into its description.

    Raises KeyError for an unknown class id.
    """
    descriptions = {
        'c0': 'safe driving',
        'c1': 'texting - right',
        'c2': 'talking on the phone - right',
        'c3': 'texting - left',
        'c4': 'talking on the phone - left',
        'c5': 'operating the radio',
        'c6': 'drinking',
        'c7': 'reaching behind',
        'c8': 'hair and makeup',
        'c9': 'talking to passenger',
    }
    return descriptions[class_id]
    

In [None]:
print(get_class_name('c2'))

In [None]:
# Helper function to plot images by index in the validation set 
# Plots is a helper function in utils.py
#def plots_idx(idx, titles=None):
#    plots([image.load_img(valid_path + filenames[i]) for i in idx], titles=titles)
    
#Number of images to view for each visualization task
n_view = 4

In [None]:
def plot_random_class_files(parent_path, class_id, n_imgs=4):
    """Plot randomly chosen images of one class.

    Parameters:
        parent_path: directory containing the class sub-directories.
        class_id: class directory name, e.g. ``'c3'``.
        n_imgs: maximum number of images to show (default 4).

    Prints the class description and the glob pattern, then hands the
    images to ``plots`` with the file names (without extension) as titles.
    """
    print(class_id + ' - ' + get_class_name(class_id))
    pattern = os.path.join(parent_path, class_id, '*.jpg')
    print(pattern)
    # Slicing the shuffled list tolerates classes with fewer than n_imgs files.
    chosen = np.random.permutation(glob(pattern))[:n_imgs]
    imgs = [image.load_img(img_file) for img_file in chosen]
    titles = [os.path.splitext(os.path.basename(img_file))[0]
              for img_file in chosen]
    plots(imgs, titles=titles)

### Validation Data

In [None]:
plot_random_class_files(valid_path,'c1')

In [None]:
plot_random_class_files(valid_path,'c2')

In [None]:
plot_random_class_files(valid_path,'c3')

In [None]:
plot_random_class_files(valid_path,'c4')

In [None]:
plot_random_class_files(valid_path,'c5')

In [None]:
plot_random_class_files(valid_path,'c6')

In [None]:
plot_random_class_files(valid_path,'c7')

In [None]:
plot_random_class_files(valid_path,'c8')

In [None]:
plot_random_class_files(valid_path,'c9')

### Training data

In [None]:
plot_random_class_files(train_path,'c1')

In [None]:
plot_random_class_files(train_path,'c2')

In [None]:
plot_random_class_files(train_path,'c3')

In [None]:
plot_random_class_files(train_path,'c4')

In [None]:
plot_random_class_files(train_path,'c5')

In [None]:
plot_random_class_files(train_path,'c6')

In [None]:
plot_random_class_files(train_path,'c7')

In [None]:
plot_random_class_files(train_path,'c8')

In [None]:
plot_random_class_files(train_path,'c9')

Einfaches Model

## Finetuning und Training

### Linear Model

In [None]:
batch_size = 64
batches = get_batches(train_path, batch_size=batch_size)
val_batches = get_batches(valid_path, batch_size=batch_size*2, shuffle=False)

In [None]:
model = Sequential([
    BatchNormalization(axis=1, input_shape=(3,224,224)),
    Flatten(), 
    Dense(10, activation='softmax')
])

In [None]:
model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batches, batches.nb_sample, nb_epoch=2, 
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

In [None]:
model.summary()

### VGG16 (Imagenet-Features)

In [3]:
vgg = Vgg16()
model = vgg.model
last_conv_idx = [i for i,l in enumerate(model.layers) if type(l) is Convolution2D][-1]
conv_layers = model.layers[:last_conv_idx+1]

In [4]:
conv_model = Sequential(conv_layers)

In [5]:
(val_classes, train_classes, val_labels, train_labels,
    val_filenames, filenames, test_filenames) = get_classes(data_path)

Found 18542 images belonging to 10 classes.
Found 3882 images belonging to 10 classes.
Found 79726 images belonging to 1 classes.


In [6]:
batch_size = 64

In [7]:
batches = get_batches(train_path, batch_size=batch_size, shuffle=False)
val_batches = get_batches(valid_path, batch_size=batch_size*2, shuffle=False)
test_batches = get_batches(test_path, batch_size=batch_size*2, shuffle=False)

Found 18542 images belonging to 10 classes.
Found 3882 images belonging to 10 classes.
Found 79726 images belonging to 1 classes.


In [None]:
conv_feat = conv_model.predict_generator(batches, batches.nb_sample)
conv_val_feat = conv_model.predict_generator(val_batches, val_batches.nb_sample)
conv_test_feat = conv_model.predict_generator(test_batches, test_batches.nb_sample)

In [None]:
train_data = get_data(train_path)
val_data = get_data(valid_path)

In [8]:
conv_val_feat_file = os.path.join(results_path, 'conv_val_feat.dat')
conv_feat_file = os.path.join(results_path, 'conv_feat.dat')
conv_test_feat_file = os.path.join(results_path, 'conv_test_feat.dat')

In [None]:
val_data_file = os.path.join(results_path, 'val_data.dat')
train_data_file = os.path.join(results_path, 'train_data.dat')

In [None]:
save_array(conv_feat_file, conv_feat)
save_array(conv_val_feat_file, conv_val_feat)

In [None]:
save_array(train_data_file, train_data)
save_array(val_data_file, val_data)

In [None]:
save_array(conv_test_feat_file, conv_test_feat)

In [None]:
train_data = load_array(train_data_file)
val_data = load_array(val_data_file)

In [9]:
conv_feat = load_array(conv_feat_file)
conv_val_feat = load_array(conv_val_feat_file)

In [None]:
conv_test_feat = load_array(conv_test_feat_file)

In [10]:
conv_val_feat.shape

(3882, 512, 14, 14)

### Batchnorm dense layer on pretrained conv layers

Die Convolutional-Layer des VGG16-Netzwerks wurden vom Rest des Modells getrennt; ihre Ausgaben wurden vorberechnet und als bcolz-Arrays auf der Festplatte gesichert.

In [11]:
def get_bn_layers(p):
    """Return the layer list of the batch-norm dense head.

    The stack takes the output shape of ``conv_layers[-1]`` as input and
    ends in a 10-way softmax. The two hidden blocks use dropout ``p/2``,
    the final dropout before the softmax uses ``p``.
    """
    head = [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
    ]
    # Two identical hidden blocks: dropout -> dense -> batch norm.
    for _ in range(2):
        head.append(Dropout(p/2))
        head.append(Dense(1024, activation='relu'))
        head.append(BatchNormalization())
    head.append(Dropout(p))
    head.append(Dense(10, activation='softmax'))
    return head

In [12]:
p = 0.8
bn_model = Sequential(get_bn_layers(p))
bn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [13]:
bn_model.fit(conv_feat, train_labels, batch_size=batch_size, nb_epoch=3,
            validation_data=(conv_val_feat, val_labels))

Train on 18542 samples, validate on 3882 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f07b3231d90>

### Bcolz-Array-Iterator

Alternativ zum Laden der gesamten vorberechneten Werte in den Arbeitsspeicher soll ein Array-Iterator verwendet werden.

In [22]:
train_label_file = os.path.join(results_path, 'train_labels.dat')
val_label_file = os.path.join(results_path, 'val_labels.dat')

In [23]:
save_array(train_label_file, train_labels)
save_array(val_label_file, val_labels)

In [28]:
bc_conv_features = bcolz.open(conv_feat_file, mode='r')
bc_train_labels = bcolz.open(train_label_file, mode='r')

bc_val_features = bcolz.open(conv_val_feat_file, mode='r')
bc_val_labels = bcolz.open(val_label_file, mode='r')

train_batches = BcolzArrayIterator(bc_conv_features, bc_train_labels, 
                                   batch_size=bc_conv_features.chunklen * 10, shuffle=True)

val_batches = BcolzArrayIterator(bc_val_features, bc_val_labels, 
                                 batch_size=bc_val_features.chunklen * 10, shuffle=True)

bn_model.fit_generator(generator=train_batches, samples_per_epoch=train_batches.N, 
                       validation_data=val_batches, nb_val_samples=val_batches.N, nb_epoch=1)

Epoch 1/1


<keras.callbacks.History at 0x7f09d79ab6d0>

### Pre-computed data augmentation + dropout

In [19]:
print(train_labels.shape)

(18542, 10)


Die Ergebnisse des VGG-Convolution-Modells (ohne Dense-Layer) werden vorberechnet. Als Eingabedaten werden die augmentierten Trainingsdaten (5-fache Grösse) verwendet.

In [20]:
print(conv_features.shape)

(18542, 512, 14, 14)


In [None]:
gen_t = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.05, 
                shear_range=0.1, channel_shift_range=20, width_shift_range=0.1)
da_batches = get_batches(train_path, gen_t, batch_size=batch_size, shuffle=False)

In [None]:
da_conv_feat_small = conv_model.predict_generator(da_batches, da_batches.nb_sample*2)

In [24]:
conv_da_small_feat_file = os.path.join(results_path, 'conv_da_small_feat.dat')

In [None]:
save_array(conv_da_small_feat_file, da_conv_feat_small)

In [None]:
da_conv_feat = load_array(conv_da_feat_file)

In [25]:
da_con_feat_small = load_array(conv_da_small_feat_file)

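Die vorberechneten Ergebnisse der augmentierten Daten werden mit den vorberechneten Ergebnissen der ursprünglichen Trainingsdaten zusammengeführt.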

In [None]:
da_conv_feat = np.concatenate([da_con_feat_small, conv_feat])

In [14]:
da_train_labels = np.concatenate([train_labels]*3)

In [39]:
def get_bn_da_layers(p):
    """Return the layer list of the dense head used with augmented features.

    Same structure as the batch-norm head, but every dropout layer uses the
    full rate ``p``.
    """
    head = [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
    ]
    # Two identical hidden blocks: dropout -> dense -> batch norm.
    for _ in range(2):
        head.append(Dropout(p))
        head.append(Dense(1024, activation='relu'))
        head.append(BatchNormalization())
    head.append(Dropout(p))
    head.append(Dense(10, activation='softmax'))
    return head

In [40]:
p = 0.8

In [41]:
bn_model = Sequential(get_bn_da_layers(p))
bn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [18]:
bn_model.fit(da_conv_feat, da_train_labels, batch_size=batch_size, nb_epoch=1,
            validation_data=(conv_val_feat, val_labels))

Train on 55626 samples, validate on 3882 samples
Epoch 1/1


<keras.callbacks.History at 0x7f9d40951350>

In [19]:
bn_model.optimizer.lr=0.01

In [20]:
bn_model.fit(da_conv_feat, da_train_labels, batch_size=batch_size, nb_epoch=1,
            validation_data=(conv_val_feat, val_labels))

Train on 55626 samples, validate on 3882 samples
Epoch 1/1


<keras.callbacks.History at 0x7fa80e7ae490>

In [21]:
bn_model.optimizer.lr=0.0001

In [22]:
bn_model.fit(da_conv_feat, da_train_labels, batch_size=batch_size, nb_epoch=3,
            validation_data=(conv_val_feat, val_labels))

Train on 55626 samples, validate on 3882 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa80ee6c750>

In [11]:
latest_weights_file = os.path.join(results_path, 'da_conv8_1.h5')

In [24]:
bn_model.save_weights(latest_weights_file)

### Pre-computed data augmentation + dropout (BcolzArrayIterator)

In [36]:
da_conv_feat_file = os.path.join(results_path, 'conv_da_small_feat.dat')
da_conv_labels_file = os.path.join(results_path, 'conv_da_labels.dat')
conv_labels_file = train_label_file
conv_val_labels_file = val_label_file



In [43]:
da_train_labels = np.concatenate([train_labels]*2)

In [44]:
save_array(da_conv_labels_file, da_train_labels)

In [51]:
# Trainingsdaten
conv_features = bcolz.open(conv_feat_file, mode='r')
conv_labels = bcolz.open(conv_labels_file, mode='r')

# Augmentierte Trainingsdaten
da_conv_features = bcolz.open(da_conv_feat_file, mode='r')
da_conv_labels = bcolz.open(da_conv_labels_file, mode='r')

# Validierungsdaten
val_features = bcolz.open(conv_val_feat_file, mode='r')
val_labels = bcolz.open(conv_val_labels_file, mode='r')

train_batches = BcolzArrayIterator(conv_features, conv_labels, 
                                   batch_size=conv_features.chunklen * 10, shuffle=True)

da_batches = BcolzArrayIterator(da_conv_features, da_conv_labels, 
                                   batch_size=da_conv_features.chunklen * 10, shuffle=True)

val_batches = BcolzArrayIterator(val_features, val_labels, 
                                 batch_size=val_features.chunklen * 10, shuffle=True)

mix_batches = MixIterator((train_batches, da_batches))


In [64]:
bn_model.fit_generator(generator=mix_batches, samples_per_epoch=da_batches.N+train_batches.N, 
                       validation_data=val_batches, nb_val_samples=val_batches.N, nb_epoch=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f078d7299d0>

In [63]:
bn_model.optimizer.lr=0.0001

### Pseudo labeling

In [25]:
val_pseudo = bn_model.predict(conv_val_feat, batch_size=batch_size)

In [26]:
comb_pseudo = np.concatenate([da_train_labels, val_pseudo])

In [27]:
comb_feat = np.concatenate([da_conv_feat, conv_val_feat])

MemoryError: 

In [None]:
bn_model.load_weights(latest_weights_file)

In [None]:
bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=1,
            validation_data=(conv_val_feat, val_labels))

In [None]:
bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=4,
            validation_data=(conv_val_feat, val_labels))

In [None]:
bn_model.optimizer.lr=0.00001

In [None]:
bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=4,
            validation_data=(conv_val_feat, val_labels))

In [None]:
latest_weights_file = os.path.join(results_path, 'bn_ps8.h5')
bn_model.save_weights(latest_weights_file)

### Submit

In [12]:
bn_model.load_weights(latest_weights_file)

In [53]:
def do_clip(arr, mx):
    """Clip *arr* element-wise to the interval ``[(1 - mx) / 9, mx]``."""
    lower_bound = (1-mx)/9
    return np.clip(arr, lower_bound, mx)

In [54]:
conv_test_feat = load_array(conv_test_feat_file)

In [65]:
preds = bn_model.predict(conv_test_feat, batch_size=batch_size*2)

In [66]:
subm = do_clip(preds,0.93)

In [67]:
subm_name = 'submission_da_3.gz'

In [68]:
classes = sorted(batches.class_indices, key=batches.class_indices.get)

In [69]:
submission = pd.DataFrame(subm, columns=classes)
submission.insert(0, 'img', [a[8:] for a in test_filenames])
submission.head()

Unnamed: 0,img,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,img_81601.jpg,0.03396,0.007778,0.007778,0.007778,0.007778,0.007778,0.027016,0.010161,0.007778,0.91915
1,img_14887.jpg,0.011181,0.007778,0.007778,0.007778,0.007778,0.010554,0.007778,0.007778,0.007778,0.93
2,img_62885.jpg,0.048401,0.007778,0.007778,0.114139,0.817953,0.008196,0.007778,0.007778,0.007778,0.007778
3,img_45125.jpg,0.007778,0.007778,0.059568,0.007778,0.009552,0.007778,0.517102,0.007778,0.388131,0.014077
4,img_22633.jpg,0.040571,0.036788,0.013607,0.007778,0.007778,0.009744,0.007778,0.007778,0.125493,0.766594


In [70]:
submission.to_csv(subm_name, index=False, compression='gzip')

In [71]:
FileLink(subm_name)

### VGG16 Imagenet

In [None]:
# --> 0,59
#gen = image.ImageDataGenerator(rotation_range=5, width_shift_range=0.05, 
#                               height_shift_range=0.05, zoom_range=0.05, horizontal_flip=False)

#gen = image.ImageDataGenerator(rotation_range=10, width_shift_range=0.1, 
#                               height_shift_range=0.1, zoom_range=0.1, horizontal_flip=False)

gen = image.ImageDataGenerator(rotation_range=15,width_shift_range=0.15, height_shift_range=0.15)

#gen = image.ImageDataGenerator()



#gen = image.ImageDataGenerator(rotation_range=20, width_shift_range=0.2, 
#       height_shift_range=0.2, zoom_range=0.2, horizontal_flip=False)

In [None]:
#import Vgg16 helper class
vgg = Vgg16BN()

#Set constants. You can experiment with no_of_epochs to improve the model
batch_size = 64
no_of_epochs = 3

#Finetune the model
batches = vgg.get_batches(train_path, gen=gen,batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
vgg.finetune(batches)

layers = vgg.model.layers

In [None]:
# Get the index of the first dense layer...
first_dense_idx = [index for index,layer in enumerate(layers) if type(layer) is Dense][0]

# ...and set this and all subsequent layers to trainable
for layer in layers[first_dense_idx:]: 
    layer.trainable=True
    #print(layer.name)

In [None]:
vgg.model.summary()

In [None]:
layers = vgg.model.layers
for layer in layers:
    print(layer.trainable)

In [None]:
vgg.model.optimizer.lr = 0.0001
#print(vgg.model.optimizer.lr.get_value())

In [None]:
#Notice we are passing in the validation dataset to the fit() method
#For each epoch we test our model against the validation set
#latest_weights_filename = None
# Train epoch by epoch and store the weights after each one; the file name
# is fixed, so it always holds the most recent epoch.
no_of_epochs = 1
for epoch in range(no_of_epochs):
    print("Running epoch: %d" % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft-bn-final.h5'
    weights_path = os.path.join(results_path, latest_weights_filename)
    vgg.model.save_weights(weights_path)
print("Completed %s fit operations" % no_of_epochs)

In [None]:
vgg.fit(batches, val_batches, nb_epoch=1)
latest_weights_filename = 'ft-e-1.h5'
weights_path = os.path.join(results_path, latest_weights_filename)
vgg.model.save_weights(weights_path)

In [None]:
vgg.fit(batches, val_batches, nb_epoch=1)
latest_weights_filename = 'ft-e-2.h5'
weights_path = os.path.join(results_path, latest_weights_filename)
vgg.model.save_weights(weights_path)

In [None]:
vgg.model.optimizer.lr = 0.0001
vgg.fit(batches, val_batches, nb_epoch=1)
latest_weights_filename = 'ft-e-3.h5'
weights_path = os.path.join(results_path, latest_weights_filename)
vgg.model.save_weights(weights_path)


2-layer (4096, 200) trainable with 4 epochs and learning-rate 0.001 and drop-out = 0.5, 0.3
another 4 epochs with lr 0.0001
    
                        acc     val_acc               acc     val_acc
w-0.05,h-0.1,r-5         .8     0.64 
w-0.1,h=0.15,r-10       0.69    0.66                  0.79     0.7


2-layer (4096, 200) trainable with 4 epochs and learning-rate 0.001 and drop-out = 0.5, 0.5
another 4 epochs with lr 0.0001

                        acc     val_acc               acc     val_acc
w-0.1,h=0.15,r-10       


2-layer (200, 200) trainable with 4 epochs and learning-rate 0.001 and drop-out = 0.5, 0.3
another 2 epochs with lr 0.0001

                        acc     val_acc               acc     val_acc
w-0.1,h=0.15,r-10       0.76     0.75                 0.86     0.82

with all data          



2-layer trainable with 2 epochs and learning-rate 0.001 and drop-out = 0.5

                          acc    val_acc
standard:                 0.5     0.4
width-shift (0.1)         0.4     0.4
width-shift (0.2)         0.38    0.36
height-shift (0.1)        0.4     0.43
height-shift (0.2)        0.37    0.46
height-shift (0.3)        0.33    0.37
rotation (10)             0.41    0.47
rotation (20)             0.37    0.38
zoom-range (0.1)          0.41    0.35


2-layer trainable with 2 epochs and learning-rate 0.001 and drop-out = 0.3

                          acc    val_acc
standard:                 0.68     0.48
w-0.05,h-0.1,r-0.05       0.48     0.38

2-layer trainable with 2 epochs and learning-rate 0.001 and drop-out = 0.6, 0.3

                          acc    val_acc        acc   val_acc
standard:                 0.55     0.48 
w-0.1,h-0.2,r-0.1         0.33     0.42         0.44    0.46     
w-0.05,h-0.1,r-0.05       0.42     0.40         0.52    0.45

2-layer trainable with 2 epochs and learning-rate 0.001 and drop-out = 0.3, 0.6

                          acc    val_acc        acc   val_acc
standard:                 0.52     0.43         0.68   0.53
 
2-layer trainable with 2 epochs and learning-rate 0.001 and drop-out = 0.5, 0.2

                          acc    val_acc
standard:                 0.65     0.48

2-layer trainable with 2 epochs and learning-rate 0.01 and drop-out = 0.6, 0.3

                          acc    val_acc
standard:                 0.63     0.40
        

2-layer trainable with 2 epochs and learning-rate 0.0001 and drop-out = 0.6, 0.3

                          acc    val_acc
standard:                 0.16    0.16

In [None]:
vgg.model.optimizer.lr = 0.0001

In [None]:
# Train epoch by epoch; the weights of every epoch go to their own file.
no_of_epochs = 3
for epoch in range(no_of_epochs):
    print("Running epoch: %d" % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft-bn-%d.h5' % epoch
    weights_path = os.path.join(results_path, latest_weights_filename)
    vgg.model.save_weights(weights_path)
print("Completed %s fit operations" % no_of_epochs)

In [None]:
vgg.model.optimizer.lr = 0.001

In [None]:
#Notice we are passing in the validation dataset to the fit() method
#For each epoch we test our model against the validation set
#latest_weights_filename = None
# Train epoch by epoch; the weights of every epoch go to their own file.
no_of_epochs = 8
for epoch in range(no_of_epochs):
    print("Running epoch: %d" % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft-bn-%d.h5' % epoch
    weights_path = os.path.join(results_path, latest_weights_filename)
    vgg.model.save_weights(weights_path)
print("Completed %s fit operations" % no_of_epochs)

## Generate Predictions

Let's use our new model to make predictions on the test dataset

In [None]:
def load_weights(weights_file):
    """Load the weights stored in *weights_file* into the global ``vgg`` model."""
    # presumably vgg.ft(10) rebuilds the output layer for 10 classes so that
    # the stored weights fit the model — TODO confirm in vgg16bn.Vgg16BN.ft
    vgg.ft(10)
    vgg.model.load_weights(weights_file)

In [None]:
def gen_and_save_preds(weights_file):
    """Predict the test set with stored weights and save the results.

    *weights_file* is the weight-file name without extension. Predictions
    and the corresponding file names are written to
    ``preds-<weights_file>.dat`` and ``files-<weights_file>.dat`` in the
    results directory.
    """
    preds_file = os.path.join(results_path, 'preds-' + weights_file + '.dat')
    names_file = os.path.join(results_path, 'files-' + weights_file + '.dat')
    weights_full_path = os.path.join(results_path, weights_file + '.' + weights_postfix)

    load_weights(weights_full_path)
    test_batches, test_preds = vgg.test(test_path, batch_size = batch_size*2)
    save_array(preds_file, test_preds)
    save_array(names_file, test_batches.filenames)

In [None]:
weights_file = 'ft-d-2'
gen_and_save_preds(weights_file)

## Validate Predictions

Keras' *fit()* function conveniently shows us the value of the loss function, and the accuracy, after every epoch ("*epoch*" refers to one full run through all training examples). The most important metrics for us to look at are for the validation set, since we want to check for over-fitting. 

- **Tip**: with our first model we should try to overfit before we start worrying about how to reduce over-fitting - there's no point even thinking about regularization, data augmentation, etc if you're still under-fitting! (We'll be looking at these techniques shortly).

As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
1. A few correct labels at random
2. A few incorrect labels at random
3. The most correct labels of each class (ie those with highest probability that are correct)
4. The most incorrect labels of each class (ie those with highest probability that are incorrect)
5. The most uncertain labels (ie those with probability closest to 0.5).

Let's see what we can learn from these examples. (In general, this is a particularly useful technique for debugging problems in the model. However, since this model is so simple, there may not be too much to learn at this stage.)

Calculate predictions on validation set, so we can find correct and incorrect examples:

In [None]:
latest_weights_filename = os.path.join(results_path, 'ft-c-2.h5')
vgg.model.load_weights(latest_weights_filename)

In [None]:
val_batches, probs = vgg.test(valid_path, batch_size = batch_size)

In [None]:
filenames = val_batches.filenames
expected_labels = val_batches.classes  # class index per validation image

# Probability assigned to the first class (column 0) for every image
our_predictions = probs[:,0]
# Predicted label: index of the most probable class in each row
our_labels = np.argmax(probs, axis=1)

In [None]:
our_pred = probs[1200]
our_pred = our_pred.clip(0.05, 0.95)
label = np.argmax(our_pred)
print(our_pred)
print(label)

In [None]:
our_preds = probs
our_preds = our_preds.clip(0.05, 0.95)
our_labels = np.argmax(our_preds, axis=1)

In [None]:
print(expected_labels[:600])
print(our_labels[:600])

In [None]:
from keras.preprocessing import image

#Helper function to plot images by index in the validation set 
#Plots is a helper function in utils.py
def plots_idx(idx, titles=None):
    """Plot the validation images at the positions *idx* of ``filenames``."""
    # valid_path carries no trailing separator, so the path has to be built
    # with os.path.join instead of plain string concatenation.
    imgs = [image.load_img(os.path.join(valid_path, filenames[i])) for i in idx]
    plots(imgs, titles=titles)
    
#Number of images to view for each visualization task
n_view = 4

In [None]:
#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
print("Found %d correct labels" % len(correct))
idx = permutation(correct)[:n_view]
print(idx)
title_label = our_labels[idx]
title_pred = our_preds[idx]
print(title_label)
print(title_pred)
#plots_idx(idx, [our_labels[idx], 'test'])

In [None]:
print(title_pred[3][5])

In [None]:
#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
print("Found %d incorrect labels" % len(incorrect))
idx = permutation(incorrect)[:n_view]
# Titles show the class-0 probability of each image (our_predictions)
plots_idx(idx, our_predictions[idx])

In [None]:
#3a. The images we were most confident belong to class 0, and actually do
#    (the "cats" naming is left over from a two-class notebook)
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print("Found %d confident correct cats labels" % len(correct_cats))
# Highest class-0 probability first
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])

In [None]:
#3b. The images we were most confident belong to class 1, and actually do
#    (the "dogs" naming is left over from a two-class notebook)
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print("Found %d confident correct dogs labels" % len(correct_dogs))
# With more than two classes a low class-0 probability does not imply a
# high class-1 probability, so rank by the class-1 column directly.
class1_probs = probs[:,1]
most_correct_dogs = np.argsort(class1_probs[correct_dogs])[::-1][:n_view]
plots_idx(correct_dogs[most_correct_dogs], class1_probs[correct_dogs][most_correct_dogs])

In [None]:
#4a. The images we were most confident belong to class 0, but do not
#    (the "cats" naming is left over from a two-class notebook)
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print("Found %d incorrect cats" % len(incorrect_cats))
if len(incorrect_cats):
    # Highest class-0 probability first
    most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
    plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])

In [None]:
#4b. The images we were most confident belong to class 1, but do not
#    (the "dogs" naming is left over from a two-class notebook)
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print("Found %d incorrect dogs" % len(incorrect_dogs))
if len(incorrect_dogs):
    # With more than two classes a low class-0 probability does not imply
    # a high class-1 probability, so rank by the class-1 column directly.
    class1_probs = probs[:,1]
    most_incorrect_dogs = np.argsort(class1_probs[incorrect_dogs])[::-1][:n_view]
    plots_idx(incorrect_dogs[most_incorrect_dogs], class1_probs[incorrect_dogs][most_incorrect_dogs])

In [None]:
#5. The most uncertain labels (ie those with probability closest to 0.5).
# NOTE(review): our_predictions only holds the class-0 probability, so this
# ranks by |p(class 0) - 0.5|; for the 10-class problem a ranking by the
# highest probability per image may be intended — confirm.
most_uncertain = np.argsort(np.abs(our_predictions-0.5))
plots_idx(most_uncertain[:n_view], our_predictions[most_uncertain])

Perhaps the most common way to analyze the result of a classification model is to use a [confusion matrix](http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/). Scikit-learn has a convenient function we can use for this purpose:

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(expected_labels, our_labels)

We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for dependent variables with a larger number of categories).

In [None]:
plot_confusion_matrix(cm, val_batches.class_indices)

## Submit Predictions to Kaggle!

In [None]:
#Load our test predictions from file
def load_preds(weights_file):
    """Load the stored test predictions belonging to *weights_file*.

    Reads ``preds-<weights_file>.dat`` and ``files-<weights_file>.dat``
    from the results directory.

    Returns:
        (preds, file_ids): the prediction array and a numpy array with the
        bare image file names (directory part removed).
    """
    preds_path = os.path.join(results_path, 'preds-' + weights_file + '.dat')
    files_path = os.path.join(results_path, 'files-' + weights_file + '.dat')
    preds = load_array(preds_path)
    files = load_array(files_path)
    # basename() strips the sub-directory prefix whatever its length; the
    # former fixed slice [8:] was only correct for the prefix 'unknown/'.
    file_ids = np.array([os.path.basename(f) for f in files])
    return preds, file_ids

In [None]:
def prepare_result_df(preds, file_ids):
    """Combine file ids and class predictions into one DataFrame.

    The result is indexed by ``img`` and has one column per class
    (``c0`` .. ``c9``).
    """
    class_columns = ['c0', 'c1','c2','c3','c4','c5','c6','c7','c8','c9']
    id_frame = pd.DataFrame(data=file_ids, columns=['img'])
    pred_frame = pd.DataFrame(data=preds, columns=class_columns)
    return pd.concat([id_frame, pred_frame], axis=1).set_index('img')

In [None]:
def write_submission_df(df, weights_file):
    """Write *df* as ``subm-<weights_file>.csv`` into the submission
    directory (floats with three decimals) and return the file path."""
    csv_name = 'subm-' + weights_file + '.csv'
    csv_path = os.path.join(subm_path, csv_name)
    df.to_csv(csv_path, sep=',', float_format='%.3f')
    return csv_path

In [None]:
def create_submission(weights_file):
    """Create the submission CSV for the predictions of *weights_file*.

    Loads the stored predictions, builds the result table and writes it to
    the submission directory.

    Returns:
        A ``FileLink`` to the written file. It has to be returned: a link
        that is merely constructed inside the function is never displayed
        by the notebook.
    """
    preds, file_ids = load_preds(weights_file)
    result_df = prepare_result_df(preds, file_ids)
    file_path = write_submission_df(result_df, weights_file)
    return FileLink(file_path)

In [None]:
weights_file = 'ft-bn-1'
create_submission(weights_file)