In [None]:
# This notebook is the joint work of Joseph Palermo and Alok Singh

In [1]:
import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Input
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
from keras.metrics import categorical_crossentropy, categorical_accuracy
from keras.layers.convolutional import *
from keras.preprocessing import image, sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, CSVLogger

from os.path import abspath, expanduser
import numpy as np
import pandas
import bcolz
import PIL
from PIL import Image
from matplotlib import pyplot as plt
%matplotlib inline

import random
from shutil import move, copy

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
# utilities

def save_array(fname, arr):
    """Write `arr` to disk as a bcolz carray rooted at the directory `fname`.

    The carray is opened with mode 'w' and flushed before the function returns.
    """
    carr = bcolz.carray(arr, rootdir=fname, mode='w')
    carr.flush()
    
def load_array(fname):
    """Open the bcolz array stored under `fname` and return the opened object."""
    stored = bcolz.open(fname)
    return stored

def get_raw_batches(batch_gen, batch_size, n_epochs):
    """Pull batches from `batch_gen` and return their first elements concatenated.

    `n_epochs * batch_gen.samples // batch_size` batches are requested through
    `batch_gen.next()`; element 0 of every batch is kept and the pieces are joined
    with `np.concatenate`. The total batch count and every tenth batch index are
    printed as progress output.
    """
    total_batches = (n_epochs * batch_gen.samples) // batch_size
    print(total_batches)
    collected = []
    for step in range(total_batches):
        if not step % 10:
            print(step)
        collected.append(batch_gen.next()[0])
    return np.concatenate(collected)

def plots(ims, figsize=(24,12), rows=1, interp=False, titles=None):
    """Display a sequence of images in a grid with `rows` rows.

    Parameters
    ----------
    ims : sequence of images. When the first element is a numpy array the whole
        sequence is converted to a uint8 array, and if its last axis is not of
        size 3 it is transposed with (0,2,3,1) before plotting.
    figsize : size passed to `plt.figure`.
    rows : number of grid rows.
    interp : when falsy, images are drawn with interpolation='none'.
    titles : optional sequence of per-image titles (indexed like `ims`).
    """
    if type(ims[0]) is np.ndarray:
        ims = np.array(ims).astype(np.uint8)
        if (ims.shape[-1] != 3):
            ims = ims.transpose((0,2,3,1))
    f = plt.figure(figsize=figsize)
    # Ceil-divide so rows * cols always has room for every image. The previous
    # parity-based test (len % 2) produced too few slots for some rows/len
    # combinations and a spare empty column for others.
    cols = -(-len(ims) // rows)
    for i in range(len(ims)):
        sp = f.add_subplot(rows, cols, i+1)
        sp.axis('Off')
        if titles is not None:
            sp.set_title(titles[i], fontsize=16)
        # draw on the subplot just created rather than on pyplot's implicit current axes
        sp.imshow(ims[i], interpolation=None if interp else 'none')

## Data Preprocessing

In [None]:
# 1 - build a validation set made up of drivers that do not appear in the training set

data_path = "data/statefarm/"
driver_data = pandas.read_csv(data_path+"driver_imgs_list.csv")
unique_subjects = driver_data["subject"].unique() # 26 unique subjects
# the three per-image columns as plain lists, all in CSV row order
subjects, classes, imgs = (driver_data[column].tolist() for column in ("subject", "classname", "img"))

In [None]:
# get random sample of unique_subjects to hold out for validation
# Draw from a dedicated, seeded RandomState so the same drivers are selected after every
# kernel restart; an unseeded draw would pick a different validation split on each run.
split_rng = np.random.RandomState(42)
validation_subjects = split_rng.choice(unique_subjects, size=4, replace=False) # select 4 without replacement
train_subjects = [subj for subj in unique_subjects if subj not in validation_subjects]

In [None]:
# build a dictionary mapping each classname to the list of image filenames whose subject is one of
# validation_subjects
subject_mapping = {classname: [] for classname in driver_data["classname"].unique()}
# subjects, classes and imgs are parallel lists, so walk them together
for subj, classname, img in zip(subjects, classes, imgs):
    if subj in validation_subjects:
        subject_mapping[classname].append(img)

In [None]:
# inspect the distribution of classes in the validation set:
# first the held-out subjects, then the number of validation images per class, c0 through c9
print(validation_subjects)
for classname in ('c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9'):
    print(len(subject_mapping[classname]))

In [None]:
# use the constructed dictionary to selectively move files to the validation set

# %cd ~/nbs/data/statefarm
# for classname in subject_mapping:
#     train_classpath = "train/" + classname + "/"
#     valid_classpath = "valid/" + classname + "/"
#     for filename in subject_mapping[classname]:
#         move(train_classpath + filename, valid_classpath + filename)

In [None]:
# 2 - construct sample data by the same method

# construct sample training data
# seeded generator so the same sample drivers are drawn on every run
sample_train_rng = np.random.RandomState(42)
sample_train = sample_train_rng.choice(train_subjects, size=2, replace=False) # select 2 without replacement
# classname -> image filenames of the sampled training subjects
train_subject_mapping = {c: [] for c in driver_data["classname"].unique()}
# subjects, classes and imgs are parallel lists, so walk them together
for subj, classname, img in zip(subjects, classes, imgs):
    if subj in sample_train:
        train_subject_mapping[classname].append(img)

In [None]:
# construct sample validation data
# seeded generator so the same sample driver is drawn on every run
sample_valid_rng = np.random.RandomState(42)
sample_valid = sample_valid_rng.choice(validation_subjects, size=1, replace=False) # select 1 without replacement
# classname -> image filenames of the sampled validation subject
valid_subject_mapping = {c: [] for c in driver_data["classname"].unique()}
# subjects, classes and imgs are parallel lists, so walk them together
for subj, classname, img in zip(subjects, classes, imgs):
    if subj in sample_valid:
        valid_subject_mapping[classname].append(img)

In [None]:
# actually copy the sample training data

# %cd ~/nbs/data/statefarm
# for classname in train_subject_mapping:
#     train_path = "train/" + classname + "/"
#     sample_path = "sample/train/" + classname + "/"
#     for filename in train_subject_mapping[classname]:
#         copy(train_path + filename, sample_path + filename)

In [None]:
# actually copy the sample validation data

# %cd ~/nbs/data/statefarm
# for classname in valid_subject_mapping:
#     valid_path = "valid/" + classname + "/"
#     sample_path = "sample/valid/" + classname + "/"
#     for filename in valid_subject_mapping[classname]:
#         copy(valid_path + filename, sample_path + filename)

## Train some models

### 1 - simple conv net

In [None]:
def simple_conv(batch_gen, val_batch_gen):
    """Build, compile and train a small two-block convolutional classifier.

    Parameters
    ----------
    batch_gen : training batch iterator; its `samples` and `batch_size` attributes
        determine the number of steps per epoch.
    val_batch_gen : validation batch iterator; its `samples` and `batch_size`
        attributes determine the number of validation steps.

    Returns
    -------
    The trained model: 2 epochs with Adam at lr=1e-4, then 4 epochs at lr=1e-3.
    """
    # The declared input is channels-last (224,224,3), so the feature axis that
    # BatchNormalization must normalise is the last one; axis=1 would normalise
    # per image row instead of per channel.
    model = Sequential([
            BatchNormalization(axis=-1, input_shape=(224,224,3)),
            Convolution2D(32,(3,3), activation='relu'),
            BatchNormalization(axis=-1),
            MaxPooling2D((3,3)),
            Convolution2D(64,(3,3), activation='relu'),
            BatchNormalization(axis=-1),
            MaxPooling2D((3,3)),
            Flatten(),
            Dense(200, activation='relu'),
            BatchNormalization(),
            Dense(10, activation='softmax')
        ])
    model.compile(Adam(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
    # Take batch sizes from the iterators themselves rather than from a notebook
    # global, so the step counts always match the data actually being fed.
    train_steps = batch_gen.samples // batch_gen.batch_size
    val_steps = val_batch_gen.samples // val_batch_gen.batch_size
    model.fit_generator(batch_gen, train_steps, epochs=2, validation_data=val_batch_gen, validation_steps=val_steps)
    # Update the learning-rate variable in place: assigning a plain float to
    # model.optimizer.lr after training has started does not reach the training
    # function that was already built.
    K.set_value(model.optimizer.lr, 1e-3)
    # steps_per_epoch is derived from the training set here as well
    model.fit_generator(batch_gen, train_steps, epochs=4, validation_data=val_batch_gen, validation_steps=val_steps)
    return model

In [None]:
# relative data paths below are resolved from this working directory
%cd ~/nbs
# batch size used for both directory iterators in this cell
batch_size = 128
data_path = "data/statefarm/"
# ImageDataGenerator with default arguments (no augmentation options set); the same
# instance produces both the training and the validation iterator
gen = image.ImageDataGenerator()
# read images from the train/ and valid/ subdirectories with target_size (224, 224)
batch_gen = gen.flow_from_directory(data_path+"train", batch_size=batch_size, target_size=(224, 224))
val_batch_gen = gen.flow_from_directory(data_path+"valid", batch_size=batch_size, target_size=(224, 224))

In [None]:
# build and train the simple conv net on the current batch_gen / val_batch_gen iterators
model = simple_conv(batch_gen, val_batch_gen)

In [None]:
# the previous training overfits so try adding data augmentation
gen = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.05, shear_range=0.1, 
                               channel_shift_range=20, width_shift_range=0.1)
batch_gen = gen.flow_from_directory(data_path+"train", batch_size=batch_size, target_size=(224, 224))
# Augmentation is for training only: validation images come from a plain generator so the
# validation metrics are measured on undistorted inputs.
val_batch_gen = image.ImageDataGenerator().flow_from_directory(data_path+"valid", batch_size=batch_size, target_size=(224, 224))
model = simple_conv(batch_gen, val_batch_gen)

In [None]:
# Continue training at lr=1e-4. Assigning a plain float to model.optimizer.lr after training has
# started does not reach the training function that was already built, so recompile with a fresh
# Adam instead; this takes effect regardless of what model.optimizer.lr currently holds.
# Note that recompiling also resets Adam's moment estimates; the layer weights are kept.
model.compile(Adam(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batch_gen, batch_gen.samples//batch_size, epochs=12, validation_data=val_batch_gen, validation_steps=val_batch_gen.samples//batch_size)

### 2 - Finetune VGG16

In [3]:
# config
# root directory of the full dataset (contains the train/ and valid/ image folders)
data_path = "data/statefarm/"
# root directory of the small sample subset
sample_data_path = "data/statefarm/sample/"
# directory where precomputed VGG outputs and model weights are stored
model_path = "data/statefarm/models/"
# image size handed to the directory iterators
target_size = (224,224)

In [4]:
# load VGG
# include_top=False leaves out the fully-connected classifier layers, so the network
# outputs convolutional feature maps that a separate dense model is trained on below
from keras.applications.vgg16 import VGG16
vgg = VGG16(include_top=False)

In [5]:
# define a preprocessing function
# per-channel mean in RGB order, shaped (1,1,3) to broadcast over channels-last images
vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape((1,1,3))
def vgg_preprocess(x):
    """Subtract the per-channel mean and convert RGB to BGR.

    `x` is a channels-last image array (the channel axis is the last one, matching
    the (1,1,3) shape of `vgg_mean`). Returns a new array of the same shape; the
    input is not modified.
    """
    x = x - vgg_mean
    # Reverse the LAST axis (channels). Slicing with x[:, ::-1] on a single
    # (height, width, channel) image would reverse the width axis instead,
    # i.e. mirror the picture horizontally and leave the channels in RGB order.
    return x[..., ::-1] # reverse channel axis rgb->bgr

In [6]:
# test some data augmentation

# gen = image.ImageDataGenerator(rotation_range=5, width_shift_range=0.05, height_shift_range=0.05, shear_range=3.14/8, zoom_range=0.1)
# batch_gen = gen.flow_from_directory(sample_data_path+"train", batch_size=4, target_size=target_size, shuffle=False)
# batch = batch_gen.next()[0]
# plots(batch)

In [7]:
# relative data paths below are resolved from this working directory
%cd ~/nbs
# get augmented training data generator
batch_size = 5 # use smaller batch size to lose fewer examples
# every image goes through vgg_preprocess in addition to the random augmentations
gen = image.ImageDataGenerator(preprocessing_function=vgg_preprocess, rotation_range=15, height_shift_range=0.05, 
                               shear_range=0.1, channel_shift_range=20, width_shift_range=0.1)
# shuffle=False: presumably so that the precomputed outputs stay in the same order as
# batch_gen.classes, which the label-saving cell relies on -- TODO confirm
batch_gen = gen.flow_from_directory(data_path+"train", batch_size=batch_size, target_size=target_size, shuffle=False)
# get validation data generator
val_batch_size = 2
# validation images are only preprocessed, no augmentation options are set
val_gen = image.ImageDataGenerator(preprocessing_function=vgg_preprocess)
val_batch_gen = val_gen.flow_from_directory(data_path+"valid", batch_size=val_batch_size, target_size=target_size, shuffle=False)


/home/ubuntu/nbs
Found 19093 images belonging to 10 classes.
Found 3331 images belonging to 10 classes.


In [None]:
# precompute VGG outputs
n_epochs = 5 # number of epochs of augmented data to generate
# Run the augmenting training iterator for n_epochs passes over the training set. The step count
# is derived from the iterator's own sample count and batch size; a fixed small step count would
# only produce features for a handful of batches.
vgg_output = vgg.predict_generator(batch_gen, n_epochs * batch_gen.samples // batch_gen.batch_size, verbose=1)
# a single pass over the validation set
vgg_val_output = vgg.predict_generator(val_batch_gen, val_batch_gen.samples // val_batch_size, verbose=1)


In [None]:
# save the VGG outputs

# save_array(model_path+"vgg_output.bc", vgg_output)
# save_array(model_path+"vgg_val_output.bc", vgg_val_output)

In [None]:
# save the corresponding labels

# labels = np.concatenate([batch_gen.classes for _ in range(n_epochs)])[:vgg_output.shape[0]]
# val_labels = val_batch_gen.classes[:vgg_val_output.shape[0]]
# save_array(data_path+"raw/labels.bc", labels)
# save_array(data_path+"raw/val_labels.bc", val_labels)

In [8]:
# load precomputed vgg outputs
# (the relative model_path / data_path below are resolved from ~/nbs)
%cd ~/nbs
vgg_output = load_array(model_path+"vgg_output.bc")
vgg_val_output = load_array(model_path+"vgg_val_output.bc")
# load the corresponding labels
labels = load_array(data_path+"raw/labels.bc")
val_labels = load_array(data_path+"raw/val_labels.bc")

/home/ubuntu/nbs


In [9]:
# sanity check: print the shape of each loaded array, features first, then labels
for loaded in (vgg_output, vgg_val_output, labels, val_labels):
    print(loaded.shape)

(95457, 7, 7, 512)
(3330, 7, 7, 512)
(95457,)
(3330,)


In [10]:
# define a dense model that is trained on the precomputed (7,7,512) VGG outputs
model = Sequential()
model.add(Flatten(batch_input_shape=(None,7,7,512)))
model.add(Dropout(.8))
# two identical hidden blocks: Dense(256, relu) -> BatchNormalization -> Dropout(.8)
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(.8))
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(.8))
# 10-way softmax output
model.add(Dense(10, activation='softmax'))
# sparse loss: the targets passed to fit are integer class labels
model.compile(Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [11]:
# log training metrics to a CSV file; append=True so successive fit calls add to the same file
CALLBACKS = [CSVLogger(abspath(expanduser('~/nbs/data/statefarm/results.csv')), append=True)]

In [12]:
# NOTE(review): the model.fit calls in the following cells pass literal batch_size values
# (128 and 256) rather than this variable -- confirm whether it is still needed
batch_size = 256

In [13]:
# first pass: a single epoch at a very low learning rate (1e-5), batch size 128;
# K.set_value updates the optimizer's learning-rate variable in place
K.set_value(model.optimizer.lr, 1e-5)
model.fit(vgg_output, labels, batch_size=128, epochs=1, validation_data=(vgg_val_output, val_labels), callbacks=CALLBACKS)

Train on 95457 samples, validate on 3330 samples
Epoch 1/1


<keras.callbacks.History at 0x7f5408eb1690>

In [14]:
# raise the learning rate to 1e-3 and train 4 more epochs at batch size 128
K.set_value(model.optimizer.lr, 1e-3)
model.fit(vgg_output, labels, batch_size=128, epochs=4, validation_data=(vgg_val_output, val_labels), callbacks=CALLBACKS)

Train on 95457 samples, validate on 3330 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f5408eb1890>

In [15]:
# same learning rate (1e-3), batch size doubled to 256, 4 more epochs
K.set_value(model.optimizer.lr, 1e-3)
model.fit(vgg_output, labels, batch_size=256, epochs=4, validation_data=(vgg_val_output, val_labels), callbacks=CALLBACKS)

Train on 95457 samples, validate on 3330 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f542e6e8050>

In [16]:
# lower the learning rate to 1e-4 for 4 more epochs at batch size 256
K.set_value(model.optimizer.lr, 1e-4)
model.fit(vgg_output, labels, batch_size=256, epochs=4, validation_data=(vgg_val_output, val_labels), callbacks=CALLBACKS)

Train on 95457 samples, validate on 3330 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f5408e52f50>

In [22]:
# save the dense model's weights under model_path
model.save_weights(model_path+"dense.h5")