# State Farm Distracted Driver Detection


[State Farm Distracted Driver Detection](https://www.kaggle.com/c/state-farm-distracted-driver-detection)
    
    

## Imports und Konstanten

In [5]:
import numpy as np
import pandas as pd
from glob import glob
import os, shutil
import sys

from utils import *
from vgg16 import Vgg16

from IPython.display import FileLink

#Instantiate plotting tool
#In Jupyter notebooks, you will need to run this command before doing any plotting
%matplotlib inline

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


In [50]:
%pwd

# Project root: the directory the notebook kernel was started in.
path = os.getcwd()
# NOTE(review): data_path and sample_path are the same directory, so every path
# below refers to the sample set, and prepare_data() would build a nested
# data/sample/sample tree. Presumably data_path is switched to
# os.path.join(path, 'data') for runs on the full data set — confirm.
data_path = os.path.join(path, 'data', 'sample')
sample_path = os.path.join(path, 'data', 'sample')

# Working directories, all derived from data_path.
train_path = os.path.join(data_path, 'train')
valid_path = os.path.join(data_path, 'valid')
test_path = os.path.join(data_path, 'test')
results_path = os.path.join(data_path, 'results')
subm_path = os.path.join(data_path, 'submissions')

# Extension of saved weight files; gen_and_save_preds appends it after a '.'.
weights_postfix = 'h5'
# CSV with the driver / class / image mapping, relative to the working directory.
driver_list_path = 'data/driver_imgs_list.csv'

## Vorbereitung der Daten

### CSV-Datei mit Fahrer-Zuordnung analysieren

In [7]:
def get_driver_df(file_path):
    """Read the driver/image mapping CSV at ``file_path`` into a DataFrame.

    The file's own header row is consumed (``header=0``) and replaced by the
    column names ``driver``, ``class`` and ``img``.
    """
    mapping = pd.read_csv(
        file_path,
        sep=',',
        names=['driver', 'class', 'img'],
        header=0,
    )
    return mapping

def get_driver_imgs(df, driver):
    """Return the rows of ``df`` whose ``driver`` column equals ``driver``.

    The original index labels of the matching rows are preserved.
    """
    return df.loc[df['driver'] == driver]

In [9]:
# Load the driver / class / image mapping and preview its first rows.
driver_df = get_driver_df(driver_list_path)
print(driver_df.head())

  driver class            img
0   p002    c0  img_44733.jpg
1   p002    c0  img_72999.jpg
2   p002    c0  img_25094.jpg
3   p002    c0  img_69092.jpg
4   p002    c0  img_92629.jpg


In [13]:
# Spot check of get_driver_imgs: all rows that belong to driver 'p002'.
p002 = get_driver_imgs(driver_df, 'p002')
print(p002.head())

  driver class            img
0   p002    c0  img_44733.jpg
1   p002    c0  img_72999.jpg
2   p002    c0  img_25094.jpg
3   p002    c0  img_69092.jpg
4   p002    c0  img_92629.jpg


In [15]:
# Distinct driver ids, in order of first appearance in the CSV.
drivers = driver_df['driver'].drop_duplicates()
print(drivers.values)

['p002' 'p012' 'p014' 'p015' 'p016' 'p021' 'p022' 'p024' 'p026' 'p035' 'p039' 'p041' 'p042' 'p045'
 'p047' 'p049' 'p050' 'p051' 'p052' 'p056' 'p061' 'p064' 'p066' 'p072' 'p075' 'p081']


In [17]:
# Per driver: number of distinct classes and number of images.
result = (
    driver_df
    .groupby('driver')
    .agg({'class': pd.Series.nunique, 'img': 'count'})
    .reset_index()
)
print(result)

   driver  class   img
0    p002     10   725
1    p012     10   823
2    p014     10   876
3    p015     10   875
4    p016     10  1078
5    p021     10  1237
6    p022     10  1233
7    p024     10  1226
8    p026     10  1196
9    p035     10   848
10   p039     10   651
11   p041     10   605
12   p042     10   591
13   p045     10   724
14   p047     10   835
15   p049     10  1011
16   p050     10   790
17   p051     10   920
18   p052     10   740
19   p056     10   794
20   p061     10   809
21   p064     10   820
22   p066     10  1034
23   p072     10   346
24   p075     10   814
25   p081     10   823


In [18]:
# Total number of labelled images across all drivers.
num_total = result['img'].sum()
# 20% of all images — presumably the target size of the validation split
# (TODO confirm); num_move is only printed here, never used to pick files.
num_move = num_total * 0.2
print(num_total)
print(num_move)

22424
4484.8


### Verzeichnisse erstellen und Daten bereitstellen

In [3]:
def create_class_dir(parent_path):
    """Create the class sub-directories ``c0`` .. ``c9`` below ``parent_path``.

    Directories that already exist are left untouched.
    """
    class_dirs = [os.path.join(parent_path, 'c' + str(idx)) for idx in range(10)]
    for class_dir in class_dirs:
        if os.path.exists(class_dir):
            continue
        os.mkdir(class_dir)

def create_test_dir(parent_path):
    """Ensure ``<parent_path>/test`` and ``<parent_path>/test/unknown`` exist."""
    test_dir = os.path.join(parent_path, 'test')
    # Parent first, then the single 'unknown' sub-directory inside it.
    for directory in (test_dir, os.path.join(test_dir, 'unknown')):
        if not os.path.exists(directory):
            os.mkdir(directory)

def make_dir(parent_path, directory):
    """Create ``directory`` inside ``parent_path`` unless it already exists."""
    target = os.path.join(parent_path, directory)
    if os.path.exists(target):
        return
    os.mkdir(target)
    
def create_train_dir(parent_path):
    """Ensure ``<parent_path>/train`` exists and contains the class directories."""
    make_dir(parent_path, 'train')
    create_class_dir(os.path.join(parent_path, 'train'))
    
def create_valid_dir(parent_path):
    """Ensure ``<parent_path>/valid`` exists and contains the class directories."""
    make_dir(parent_path, 'valid')
    create_class_dir(os.path.join(parent_path, 'valid'))
    
def create_sample_dir(parent_path):
    """Build the ``sample`` directory tree below ``parent_path``.

    Creates ``sample`` with ``train``, ``valid`` and ``test`` trees plus the
    ``results`` and ``submissions`` directories. Existing directories are kept.
    """
    make_dir(parent_path, 'sample')
    sample_root = os.path.join(parent_path, 'sample')
    for build_tree in (create_train_dir, create_valid_dir, create_test_dir):
        build_tree(sample_root)
    for extra in ('results', 'submissions'):
        make_dir(sample_root, extra)
    
def fill_valid_dir():
    """Move all images of the held-out drivers from ``train_path`` to ``valid_path``.

    The held-out drivers are p002, p024, p051 and p049; their image lists are
    taken from the CSV at ``driver_list_path``.
    """
    mapping = get_driver_df(driver_list_path)
    for held_out in ('p002', 'p024', 'p051', 'p049'):
        held_out_imgs = get_driver_imgs(mapping, held_out)
        move_driver_imgs(train_path, valid_path, held_out_imgs)

def prepare_test_dir():
    """Move every ``*.jpg`` lying directly in ``test_path`` into ``test_path/unknown``."""
    loose_images = glob(os.path.join(test_path, '*.jpg'))
    unknown_dir = os.path.join(test_path, 'unknown')
    for image_file in loose_images:
        shutil.move(image_file, unknown_dir)
        
def prepare_sample_dir():
    """Copy a small subset of the data into the sample directory tree.

    * Training images of drivers p014, p045 and p042 are copied from
      ``train_path`` into ``<sample_path>/train``.
    * Images of driver p072 are copied from ``train_path`` into
      ``<sample_path>/valid``.
    * Up to 200 randomly chosen images from ``<test_path>/unknown`` are copied
      into ``<sample_path>/test``.

    The random choice is not seeded, so the selected test images differ
    between runs.
    """
    sample_train = os.path.join(sample_path, 'train')
    sample_valid = os.path.join(sample_path, 'valid')
    sample_test = os.path.join(sample_path, 'test')

    df = get_driver_df(driver_list_path)

    for driver in ['p014', 'p045', 'p042']:
        driver_list = get_driver_imgs(df, driver)
        move_driver_imgs(train_path, sample_train, driver_list, copy=True)

    driver_list = get_driver_imgs(df, 'p072')
    move_driver_imgs(train_path, sample_valid, driver_list, copy=True)

    test_files = glob(os.path.join(test_path, 'unknown', '*.jpg'))
    # Slice instead of indexing range(200): with fewer than 200 test images the
    # old loop raised IndexError; now every available image is copied.
    for test_file in np.random.permutation(test_files)[:200]:
        shutil.copy(test_file, sample_test)

def move_driver_imgs(source_path, target_path, driver_list, copy=False):
    """Move (or copy) the images listed in ``driver_list`` into ``target_path``.

    Args:
        source_path: directory holding one sub-directory per class.
        target_path: destination root; each image ends up at
            ``target_path/<class>/<img>``.
        driver_list: DataFrame whose rows are ``(driver, class, img)`` in that
            column order (as returned by ``get_driver_imgs``).
        copy: when true the source file is kept (copy), otherwise it is moved.

    Images missing from the source, or already present at the destination,
    are skipped.
    """
    transfer = shutil.copy if copy else shutil.move
    for entry in driver_list.values:
        class_name, img_name = entry[1], entry[2]
        source_file = os.path.join(source_path, class_name, img_name)
        target_dir = os.path.join(target_path, class_name)
        target_file = os.path.join(target_dir, img_name)
        if not os.path.exists(source_file) or os.path.exists(target_file):
            continue
        # If the class directory were missing, shutil would treat its path as a
        # file name and every image would overwrite the previous one.
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        transfer(source_file, target_file)
                
def prepare_data():
    """Create the directory layout under ``data_path`` and distribute the images.

    Steps, in order: create the ``results`` and ``submissions`` directories,
    the validation, sample and test trees, then move the held-out drivers into
    the validation set, move loose test images into ``test/unknown`` and fill
    the sample directories.
    """
    make_dir(data_path, 'results')
    # Must match subm_path ('submissions'), which write_submission_df writes to;
    # the former name 'submission' left that directory missing.
    make_dir(data_path, 'submissions')
    create_valid_dir(data_path)
    create_sample_dir(data_path)
    create_test_dir(data_path)
    fill_valid_dir()
    prepare_test_dir()
    prepare_sample_dir()
    

In [19]:
# Builds the directory layout and redistributes the images. fill_valid_dir and
# prepare_test_dir MOVE files, so this changes the data directory on disk.
prepare_data()

## Finetuning und Training

In [34]:
#Instantiate the Vgg16 helper class (imported from vgg16 at the top of the notebook)
vgg = Vgg16()

#Set constants. You can experiment with no_of_epochs to improve the model
batch_size = 64
no_of_epochs = 3

#Finetune the model
#Training batches use batch_size; validation batches use twice that size
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
vgg.finetune(batches)

#Not sure if we set this for all fits
#vgg.model.optimizer.lr = 0.01

Found 2191 images belonging to 10 classes.
Found 346 images belonging to 10 classes.


In [35]:
#Notice we are passing in the validation dataset to the fit() method
#For each epoch we test our model against the validation set
latest_weights_filename = None
for epoch in range(no_of_epochs):
    #print() with a single argument behaves the same on Python 2 and 3 and
    #matches the print() calls used in the other cells
    print("Running epoch: %d" % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    #Checkpoint after every epoch; the extension comes from weights_postfix so
    #the name matches what gen_and_save_preds later looks for
    latest_weights_filename = 'ft%d.%s' % (epoch, weights_postfix)
    weights_path = os.path.join(results_path, latest_weights_filename)
    vgg.model.save_weights(weights_path)
print("Completed %s fit operations" % no_of_epochs)

Running epoch: 0
Epoch 1/1
Running epoch: 1
Epoch 1/1
Running epoch: 2
Epoch 1/1
Completed 3 fit operations


## Generate Predictions

Let's use our new model to make predictions on the test dataset

In [55]:
def load_weights(weights_file, num_classes=10):
    """Load saved weights from ``weights_file`` into the global ``vgg`` model.

    ``vgg.ft(num_classes)`` is called before loading; presumably it re-heads
    the model for ``num_classes`` outputs so the layer shapes match the saved
    weights — confirm in vgg16.py.

    Args:
        weights_file: path handed to ``vgg.model.load_weights``.
        num_classes: class count passed to ``vgg.ft``; defaults to 10, the
            value that was previously hard-coded.
    """
    vgg.ft(num_classes)
    vgg.model.load_weights(weights_file)

In [58]:
def gen_and_save_preds(weights_file):
    """Predict on ``test_path`` with the given weights and save the results.

    ``weights_file`` is the weight file's base name without extension. The
    predictions go to ``preds-<weights_file>.dat`` and the matching file names
    to ``files-<weights_file>.dat``, both inside ``results_path``.
    """
    stored_weights = os.path.join(results_path, weights_file + '.' + weights_postfix)
    load_weights(stored_weights)

    test_batches, test_preds = vgg.test(test_path, batch_size = batch_size*2)

    save_array(os.path.join(results_path, 'preds-' + weights_file + '.dat'), test_preds)
    save_array(os.path.join(results_path, 'files-' + weights_file + '.dat'), test_batches.filenames)

In [59]:
# 'ft2' is the checkpoint the training loop writes after epoch index 2,
# i.e. the last one when no_of_epochs is 3.
weights_file = 'ft2'
gen_and_save_preds(weights_file)

Found 200 images belonging to 1 classes.


## Validate Predictions

Keras' *fit()* function conveniently shows us the value of the loss function, and the accuracy, after every epoch ("*epoch*" refers to one full run through all training examples). The most important metrics for us to look at are for the validation set, since we want to check for over-fitting. 

- **Tip**: with our first model we should try to overfit before we start worrying about how to reduce over-fitting - there's no point even thinking about regularization, data augmentation, etc if you're still under-fitting! (We'll be looking at these techniques shortly).

As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
1. A few correct labels at random
2. A few incorrect labels at random
3. The most correct labels of each class (ie those with highest probability that are correct)
4. The most incorrect labels of each class (ie those with highest probability that are incorrect)
5. The most uncertain labels (ie those with probability closest to 0.5).

Let's see what we can learn from these examples. (In general, this is a particularly useful technique for debugging problems in the model. However, since this model is so simple, there may not be too much to learn at this stage.)

Calculate predictions on validation set, so we can find correct and incorrect examples:

In [None]:
# results_path has no trailing separator, so the file name has to be joined to
# it; plain concatenation pointed at a non-existent '...resultsft2.h5'.
vgg.model.load_weights(os.path.join(results_path, latest_weights_filename))

In [19]:
# Predict on the validation directory. This rebinds val_batches, which until
# now held the generator created by vgg.get_batches for training.
val_batches, probs = vgg.test(valid_path, batch_size = batch_size)

Found 2000 images belonging to 10 classes.


In [20]:
filenames = val_batches.filenames
expected_labels = val_batches.classes #class index per image (10 classes here, not just 0 or 1)

#Probability assigned to class index 0 for every image (used by the class-0 cells below)
our_predictions = probs[:,0]
#Predicted label = index of the highest probability in each row; argmax over
#a single column would collapse to one scalar instead of a label per image
our_labels = np.argmax(probs, axis=1)

In [43]:
# Inspect one prediction row: clip the probabilities to [0.05, 0.95] and
# report the index of the largest value.
our_pred = probs[1200].clip(0.05, 0.95)
label = our_pred.argmax()
print(our_pred)
print(label)

[ 0.05  0.05  0.05  0.05  0.05  0.05  0.95  0.05  0.05  0.05]
6


In [47]:
# Clip all probabilities to [0.05, 0.95]; the predicted label is the column
# with the highest (clipped) probability in each row.
our_preds = probs.clip(0.05, 0.95)
our_labels = our_preds.argmax(axis=1)

In [None]:
# Compare the first 600 true labels with the first 600 predicted labels by eye.
print(expected_labels[:600])
print(our_labels[:600])

In [53]:
from keras.preprocessing import image

#Helper function to plot images by index in the validation set
#Plots is a helper function in utils.py
def plots_idx(idx, titles=None):
    """Plot the validation images at the positions given in ``idx``.

    Args:
        idx: iterable of positions into the global ``filenames`` list.
        titles: optional titles, forwarded unchanged to ``plots``.
    """
    #filenames are relative to valid_path, which has no trailing separator, so
    #they must be joined rather than concatenated
    plots([image.load_img(os.path.join(valid_path, filenames[i])) for i in idx], titles=titles)
    
#Number of images to view for each visualization task
n_view = 4

In [63]:
#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
#print() instead of the Python 2 print statement, as in the other cells
print("Found %d correct labels" % len(correct))
idx = permutation(correct)[:n_view]
print(idx)
#Predicted label and full (clipped) probability row of each sampled image
title_label = our_labels[idx]
title_pred = our_preds[idx]
print(title_label)
print(title_pred)
#plots_idx(idx, [our_labels[idx], 'test'])

Found 1740 correct labels
[1763  772 1955 1016]
[8 3 9 5]
[[ 0.05    0.05    0.05    0.05    0.05    0.05    0.1015  0.05    0.7608  0.05  ]
 [ 0.05    0.05    0.05    0.9239  0.05    0.05    0.05    0.05    0.05    0.05  ]
 [ 0.0969  0.05    0.05    0.05    0.05    0.05    0.1933  0.05    0.05    0.6416]
 [ 0.05    0.05    0.05    0.05    0.05    0.95    0.05    0.05    0.05    0.05  ]]


In [69]:
print(title_pred[3][5])

0.95


In [None]:
#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
#print() instead of the Python 2 print statement, as in the other cells
print("Found %d incorrect labels" % len(incorrect))
idx = permutation(incorrect)[:n_view]
#NOTE(review): the titles are the class-0 probabilities (our_predictions);
#for 10 classes the predicted label may be the more useful title - confirm
plots_idx(idx, our_predictions[idx])

In [None]:
#3a. The images we were most confident were class c0, and actually are c0
#(the cats/dogs wording was left over from a two-class notebook)
correct_c0 = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print("Found %d confident correct c0 labels" % len(correct_c0))
#our_predictions holds the class-0 probability, so descending order puts the
#most confident predictions first
most_correct_c0 = np.argsort(our_predictions[correct_c0])[::-1][:n_view]
plots_idx(correct_c0[most_correct_c0], our_predictions[correct_c0][most_correct_c0])

In [None]:
#3b. The images we were most confident were class c1, and actually are c1
correct_c1 = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print("Found %d confident correct c1 labels" % len(correct_c1))
#With 10 classes a low class-0 probability no longer implies a high class-1
#probability, so rank by the class-1 column itself, highest first
c1_probs = probs[:,1]
most_correct_c1 = np.argsort(c1_probs[correct_c1])[::-1][:n_view]
plots_idx(correct_c1[most_correct_c1], c1_probs[correct_c1][most_correct_c1])

In [None]:
#4a. The images we were most confident were class c0, but actually belong to another class
incorrect_c0 = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print("Found %d incorrect c0" % len(incorrect_c0))
if len(incorrect_c0):
    #our_predictions holds the class-0 probability; highest first
    most_incorrect_c0 = np.argsort(our_predictions[incorrect_c0])[::-1][:n_view]
    plots_idx(incorrect_c0[most_incorrect_c0], our_predictions[incorrect_c0][most_incorrect_c0])

In [None]:
#4b. The images we were most confident were class c1, but actually belong to another class
incorrect_c1 = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print("Found %d incorrect c1" % len(incorrect_c1))
if len(incorrect_c1):
    #Rank by the class-1 probability itself (highest first); the class-0
    #column only worked as a proxy in the two-class case
    c1_probs = probs[:,1]
    most_incorrect_c1 = np.argsort(c1_probs[incorrect_c1])[::-1][:n_view]
    plots_idx(incorrect_c1[most_incorrect_c1], c1_probs[incorrect_c1][most_incorrect_c1])

In [None]:
#5. The most uncertain labels: with 10 classes "closest to 0.5" on the class-0
#probability is not meaningful, so uncertainty is measured as the lowest
#top-class probability instead
top_probs = probs.max(axis=1)
most_uncertain = np.argsort(top_probs)
plots_idx(most_uncertain[:n_view], top_probs[most_uncertain])

Perhaps the most common way to analyze the result of a classification model is to use a [confusion matrix](http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/). Scikit-learn has a convenient function we can use for this purpose:

In [None]:
from sklearn.metrics import confusion_matrix
# First argument: true labels; second argument: predicted labels.
cm = confusion_matrix(expected_labels, our_labels)

We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for dependent variables with a larger number of categories).

In [None]:
# plot_confusion_matrix is not defined in this notebook (it comes from the
# utils star-import); it receives the matrix and the class-name -> index
# mapping of the validation batches, presumably for the axis labels - confirm.
plot_confusion_matrix(cm, val_batches.class_indices)

## Submit Predictions to Kaggle!

In [61]:
#Load our test predictions from file
def load_preds(weights_file):
    """Load the predictions and file ids saved by ``gen_and_save_preds``.

    Args:
        weights_file: base name of the weights the predictions were made with.

    Returns:
        ``(preds, file_ids)``: the stored prediction array and a numpy array
        with the bare image file names (directory prefix removed).
    """
    preds_path = os.path.join(results_path, 'preds-' + weights_file + '.dat')
    files_path = os.path.join(results_path, 'files-' + weights_file + '.dat')
    preds = load_array(preds_path)
    files = load_array(files_path)
    #basename instead of the former f[8:], which assumed the prefix is exactly
    #the 8 characters of 'unknown/' and would corrupt ids for any other name
    file_ids = np.array([os.path.basename(f) for f in files])
    return preds, file_ids

In [121]:
def prepare_result_df(preds, file_ids):
    """Combine file ids and class probabilities into one DataFrame.

    Args:
        preds: 2-D array with ten columns, one probability per class.
        file_ids: sequence of image file names, one per row of ``preds``.

    Returns:
        DataFrame indexed by ``img`` with the columns ``c0`` .. ``c9``.
    """
    class_columns = ['c%d' % class_idx for class_idx in range(10)]
    img_frame = pd.DataFrame(data=file_ids, columns=['img'])
    prob_frame = pd.DataFrame(data=preds, columns=class_columns)
    return pd.concat([img_frame, prob_frame], axis=1).set_index('img')

In [117]:
def write_submission_df(df, weights_file):
    """Write ``df`` as ``subm-<weights_file>.csv`` into ``subm_path``.

    Floats are written with three decimals. Returns the path of the CSV file.
    """
    file_name = 'subm-' + weights_file + '.csv'
    file_path = os.path.join(subm_path, file_name)
    df.to_csv(file_path, sep=',', float_format='%.3f')
    return file_path

In [115]:
def create_submission(weights_file):
    """Create the submission CSV for the predictions saved under ``weights_file``.

    Loads the stored predictions, builds the result DataFrame and writes it
    to the submissions directory.

    Returns:
        A ``FileLink`` to the written CSV. When the call is the last expression
        of a cell, the notebook renders it as a download link.
    """
    preds, file_ids = load_preds(weights_file)
    result_df = prepare_result_df(preds, file_ids)
    file_path = write_submission_df(result_df, weights_file)
    #A bare FileLink(...) expression inside a function is discarded; it must be
    #returned (or displayed) to show up in the notebook
    return FileLink(file_path)

In [123]:
# Create the Kaggle submission CSV from the predictions saved for the 'ft2' weights.
weights_file = 'ft2'
create_submission(weights_file)