In [1]:
# import module
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

from glob import glob
from time import time
from pickle import load, dump
from pandas import DataFrame, read_csv
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from sklearn.metrics import log_loss
from keras.models import Sequential, Model
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


In [2]:
dataset = read_csv('data/driver_imgs_list.csv')
dataset.head(5)

Unnamed: 0,subject,classname,img
0,p002,c0,img_44733.jpg
1,p002,c0,img_72999.jpg
2,p002,c0,img_25094.jpg
3,p002,c0,img_69092.jpg
4,p002,c0,img_92629.jpg


In [3]:
def get_image(path, img_rows, img_cols, color_type):
    if color_type == 1:
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    else:
        img = cv2.imread(path, cv2.IMREAD_COLOR)
    return cv2.resize(img, (img_rows, img_cols))

def load_train(img_rows, img_cols, color_type):
    start = time()
    images = []
    labels = []
    img_ap = images.append
    lab_ap = labels.append
    for i in range(10):
        print('loading directory c{}'.format(i), flush=True)
        start_i = time()
        files = glob('data/imgs/train/c{}/*.jpg'.format(i))
        for file in files:
            img_ap(get_image(file, img_rows, img_cols, color_type))
            lab_ap(i)
        print('directory c{} loaded in {:.2f} seconds'.format(i, time() - start_i))
    print('data loaded in {:.2f} seconds'.format(time() - start))
    return images, labels

def normalized_train(img_rows, img_cols, color_type):
    images, labels = load_train(img_rows, img_cols, color_type)
    labels = np_utils.to_categorical(labels, 10)
    train_images, valid_images, train_labels, valid_labels = train_test_split(images, labels, test_size=0.2)
    train_images = np.array(train_images, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)
    valid_images = np.array(valid_images, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)
    return train_images, valid_images, train_labels, valid_labels

def load_test(img_rows, img_cols, color_type):
    start = time()
    images = []
    names = []
    img_ap = images.append
    nam_ap = names.append
    files = glob('../input/state-farm-distracted-driver-detection/imgs/test/*.jpg')
    for file in files:
        img_ap(get_image(file, img_rows, img_cols, color_type))
        nam_ap(os.path.basename(file))
    return images, names

def normalized_test(img_rows, img_cols, color_type):
    images, names = load_test(img_rows, img_cols, color_type)
    images = np.array(images, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)
    return images, names

In [5]:
# load train data
if os.path.exists('train_data.pickle'):
    try:
        print('loading train data from pickle', flush=True)
        with open('train_data.pickle', 'rb') as f:
            train_images, valid_images, train_labels, valid_labels = load(f)
        print('complete!', flush=True)
    except EOFError:
        print('EOFError raised.', flush=True)
        print('loading train data...', flush=True)
        os.system('rm -f ../working/train_data.pickle')
        train_images, valid_images, train_labels, valid_labels = normalized_train(img_rows, img_cols, color_type)
        print('complete!', flush=True)
        print('pickling train data...', flush=True)
        with open('train_data.pickle', 'wb') as f:
            dump((train_images, valid_images, train_labels, valid_labels), f)
        print('complete!', flush=True)
else:
    print('loading train data...', flush=True)
    train_images, valid_images, train_labels, valid_labels = normalized_train(img_rows, img_cols, color_type)
    print('complete!', flush=True)
    print('pickling train data...', flush=True)
    with open('train_data.pickle', 'wb') as f:
        dump((train_images, valid_images, train_labels, valid_labels), f)


loading train data from pickle
complete!


In [6]:
# stats
train_size = len(train_images)
valid_size = len(valid_images)
test_size = len(glob('data/imgs/test/*.jpg'))
print('stats:', flush=True)
print('{} train images'.format(train_size), flush=True)
print('{} validation images'.format(valid_size), flush=True)
print('{} test images'.format(test_size), flush=True)
print('train_images.shape = {}'.format(train_images.shape), flush=True)
print('train_labels.shape = {}'.format(train_labels.shape), flush=True)
print('valid_images.shape = {}'.format(valid_images.shape), flush=True)
print('valid_labels.shape = {}'.format(valid_labels.shape), flush=True)

stats:
17939 train images
4485 validation images
79726 test images
train_images.shape = (17939, 224, 224, 1)
train_labels.shape = (17939, 10)
valid_images.shape = (4485, 224, 224, 1)
valid_labels.shape = (4485, 10)


In [7]:
batch_size = 8
nb_epoch = 30

In [9]:
import shutil
cache = 'e:/kaggle_imgs/cache/statefarm'
if os.path.exists(cache):
    shutil.rmtree(cache)
os.mkdir(cache)
weights_file="{}/weights_best.hdf5".format(cache)
print(weights_file)
os.system('rm -f ../working/saved_models/weights_best.hdf5')

e:/kaggle_imgs/cache/statefarm/weights_best.hdf5


1

In [10]:
# set checkpoints
saved_path="{}/saved_models".format(cache)
if not os.path.exists(saved_path):
    os.makedirs(saved_path)
filepath='{}/weights_best.hdf5'.format(saved_path)
checkpointer = ModelCheckpoint(filepath=filepath, 
                               monitor='val_loss', mode='min',
                               verbose=1, save_best_only=True)
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=20)
callbacks = [checkpointer, es]

In [11]:
def return_model(img_rows, img_cols, color_type):
    input_layer = Input((img_rows, img_cols, color_type))
    x = Conv2D(32, (3, 3))(input_layer)
    x = Conv2D(32, (3, 3))(x)
    x = Conv2D(32, (3, 3))(x)
    x = MaxPooling2D(pool_size=2)(x)
    x = Conv2D(64, (3, 3))(x)
    x = Conv2D(64, (3, 3))(x)
    x = Conv2D(64, (3, 3))(x)
    x = MaxPooling2D(pool_size=4)(x)
    x = Conv2D(128, (3, 3))(x)
    x = Conv2D(128, (3, 3))(x)
    x = Conv2D(128, (3, 3))(x)
    x = MaxPooling2D(pool_size=8)(x)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)
    model = Model(inputs = input_layer, outputs = predictions)
    return model
model = return_model(img_rows, img_cols, color_type)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 222, 222, 32)      320       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 220, 220, 32)      9248      
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 218, 218, 32)      9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 109, 109, 32)      0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 107, 107, 64)      18496     
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 105, 105, 64)      3692

In [12]:
# data augmentation configuration
train_datagen = ImageDataGenerator(rescale=1.0 / 255, 
                                   shear_range=0.2, 
                                   zoom_range=0.2, 
                                   horizontal_flip=True, 
                                   validation_split=0.2)

test_datagen = ImageDataGenerator(rescale=1.0 / 255, validation_split=0.2)

In [13]:
nb_train_samples = train_images.shape[0]
nb_validation_samples = valid_images.shape[0]
training_generator = train_datagen.flow(train_images, train_labels, batch_size=batch_size)
validation_generator = test_datagen.flow(valid_images, valid_labels, batch_size=batch_size)

# training_generator = train_datagen.flow_from_directory(
#     '../input/state-farm-distracted-driver-detection/imgs/train', 
#     target_size=(img_rows, img_cols), 
#     batch_size=batch_size,
#     color_mode='grayscale',
#     shuffle=True,
#     class_mode='categorical', subset="training")

# validation_generator = test_datagen.flow_from_directory(
#     '../input/state-farm-distracted-driver-detection/imgs/train', 
#     target_size=(img_rows, img_cols), 
#     batch_size=batch_size,
#     color_mode='grayscale',
#     shuffle=False,
#     class_mode='categorical', subset="validation")

nb_train_samples = 17943
nb_validation_samples = 4481

In [15]:
# train a model
checkpoint = ModelCheckpoint(weights_file, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
history = model.fit_generator(
    training_generator,
    steps_per_epoch = nb_train_samples // batch_size,
    epochs = 4, 
    callbacks=[es, checkpoint],
    verbose = 1,
    class_weight='auto',
    validation_data = validation_generator,
    validation_steps = nb_validation_samples // batch_size)

Epoch 1/4

Epoch 00001: val_accuracy improved from -inf to 0.89866, saving model to e:/kaggle_imgs/cache/statefarm/weights_best.hdf5
Epoch 2/4
  89/2242 [>.............................] - ETA: 3:17 - loss: 0.5204 - accuracy: 0.8610 ETA: 3:18 - loss: 0.5470 

KeyboardInterrupt: 