# State Farm

In [1]:
from theano.sandbox import cuda
cuda.use('gpu2')

%matplotlib inline
from __future__ import division,print_function

import os, json
from shutil import copyfile
from glob import glob
import numpy as np
import pandas as pd
import re
from keras.utils.data_utils import get_file
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
from PIL import Image
import keras
from keras.optimizers import SGD, Adam
from keras.applications.vgg16 import VGG16
from keras.models import Model, Sequential
from keras.applications.vgg16 import preprocess_input
from keras.layers import Dense, Dropout, Flatten, Input, BatchNormalization, Convolution2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras_tqdm import TQDMNotebookCallback
from utils import plots, get_batches, plot_confusion_matrix, get_data


Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


## Load data

## Create validation set

Important that the validation set not only contains a random selection of each class but also for each person, since there is multiple images of each person

In [62]:
path = "data/statefarm/"  
#path = "data/statefarm/sample/"

In [39]:
model_path = path + 'models/'
valid_path = path + 'valid/'

if not os.path.exists(model_path): os.mkdir(model_path)
batch_size=64

Read csv file and create set of person and classes and images

In [40]:
if not os.path.exists(valid_path): os.mkdir(valid_path) 
person_dict = {}
with open(path + 'driver_imgs_list.csv') as f:
    for line in f:
        words = line.rstrip().split(',')
        person_id = words[0]
        label = words[1]
        img = words[2]
        if(person_id == 'subject'):
            continue
        
        if person_id not in person_dict:
            person_dict[person_id] = {}
            person_dict[person_id][label] = [img]    
        elif label not in person_dict[person_id]:
            person_dict[person_id][label] = [img]
        else: 
            person_dict[person_id][label].append(img)
            


In [41]:
number_of_ppl = len(person_dict.keys())
valid_index = int(round(0.8 * number_of_ppl))
np.random.permutation(person_dict.keys())[valid_index:]

array(['p014', 'p041', 'p035', 'p012', 'p081'], 
      dtype='|S4')

In [42]:
number_of_ppl = len(person_dict.keys())
valid_index = int(round(0.8 * number_of_ppl))
valid_ppl = np.random.permutation(person_dict.keys())[valid_index:]
for p in valid_ppl:
    for c in person_dict[p].keys():
        g = person_dict[p][c]
        random_order_images = np.random.permutation(g)
        if not os.path.exists(valid_path + c): os.mkdir(valid_path + c) 
        for image in random_order_images:
            os.rename((path + "train/" + c + "/" + image), (valid_path + c + "/" + image))


Create a sample set with 50 picture of training images and 10 validation images

In [44]:
sample_path = path + 'sample/'
sample_valid_path = sample_path + 'valid/'
sample_train_path = sample_path + 'train/'
number_of_ppl = len(person_dict.keys())
classes = person_dict[person_dict.keys()[0]].keys()
if not os.path.exists(sample_path): os.mkdir(sample_path) 
if not os.path.exists(sample_valid_path): os.mkdir(sample_valid_path)
if not os.path.exists(sample_train_path): os.mkdir(sample_train_path) 

for c in classes:
    if not os.path.exists(sample_train_path + c): os.mkdir(sample_train_path + c)
    if not os.path.exists(sample_valid_path + c): os.mkdir(sample_valid_path + c) 
    try:
        train_files_in_class = os.listdir(path + "train/" + c) 
        valid_files_in_cass = os.listdir(valid_path + c)
        for i in range(140):
            copyfile((path + "train/" + c + "/" + train_files_in_class[i]), (sample_train_path + c + "/" + train_files_in_class[i]))
        for i in range(140, 180):
            copyfile((path + "valid/" + c + "/" + valid_files_in_cass[i]), (sample_valid_path + c + "/" + valid_files_in_cass[i]))
    except:
        print("Something went wrong")
    

### Load data and fit to net

In [6]:
im_size=224

### Using data augmentation that was provided in course

In [50]:
gen_t = ImageDataGenerator(rotation_range=15, height_shift_range=0.05, 
                shear_range=0.1, channel_shift_range=20, width_shift_range=0.1)

In [64]:
train_gen = gen_t.flow_from_directory(path + "train",
                                              batch_size=batch_size,
                                              class_mode='categorical',
                                              target_size=(224, 224),
                                              shuffle=True);
val_gen = gen_t.flow_from_directory(path + "valid",
                                          batch_size=batch_size*2,
                                          class_mode='categorical',
                                          target_size=(224, 224),
                                          shuffle=False)
num_class=len(train_gen.class_indices)


Found 17876 images belonging to 10 classes.
Found 4548 images belonging to 10 classes.


### Creating a linear model

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, 3, 224, 224)   0                                            
____________________________________________________________________________________________________
flatten_4 (Flatten)              (None, 150528)        0           input_8[0][0]                    
____________________________________________________________________________________________________
dense_10 (Dense)                 (None, 10)            1505290     flatten_4[0][0]                  
Total params: 1505290
____________________________________________________________________________________________________


In [31]:
inputs = Input(shape=(3, im_size, im_size))
x = BatchNormalization(inputs)
x = Flatten()(inputs)
x = Dense(num_class, activation='softmax')(x)

model = Model(input=inputs, output=x)

model.compile(Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

### Fit linear model

In [60]:
nb_epoch=1

In [61]:
model.fit_generator(train_gen, samples_per_epoch=train_gen.N, nb_epoch=nb_epoch, 
                        validation_data=val_gen, nb_val_samples=val_gen.N, callbacks=[TQDMNotebookCallback()])

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


Epoch 1/1


<keras.callbacks.History at 0x7f8ed5e85f50>

In [62]:
model.optimizer.lr=0.001

In [64]:
model.fit_generator(train_gen, samples_per_epoch=train_gen.N, nb_epoch=nb_epoch, 
                        validation_data=val_gen, nb_val_samples=val_gen.N, callbacks=[TQDMNotebookCallback()])

Epoch 1/1


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"




<keras.callbacks.History at 0x7f8ee55c0150>

### A super simple CNN network

In [58]:
#Simple VGG inspired CNN
inputs = Input(shape=(3, im_size, im_size))
x = BatchNormalization(axis=1)(inputs)
x = Convolution2D(64, 3, 3, activation='relu', name='block1_conv1')(x)
x = BatchNormalization(axis=1)(x)
x = MaxPooling2D((3, 3), name='block1_pool')(x)
x = Convolution2D(64, 3, 3, activation='relu', name='block1_conv2')(x)
x = BatchNormalization(axis=1)(x)
x = MaxPooling2D((3, 3), name='block2_pool')(x)
x = Flatten()(x)
x = Dense(2000, activation='relu')(x)
x = BatchNormalization(axis=1)(x)
x = Dropout(0.5)(x)
x = Dense(200, activation='relu')(x)
x = BatchNormalization(axis=1)(x)
x = Dropout(0.5)(x)
x = BatchNormalization(axis=1)(x)
x = Dense(num_class, activation='softmax')(x)

cnn = Model(input=inputs, output=x)

cnn.compile(Adam(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

cnn.summary()

INFO (theano.gof.compilelock): Refreshing lock /home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/lock_dir/lock


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 3, 224, 224)   0                                            
____________________________________________________________________________________________________
batchnormalization_13 (BatchNorma(None, 3, 224, 224)   6           input_4[0][0]                    
____________________________________________________________________________________________________
block1_conv1 (Convolution2D)     (None, 64, 222, 222)  1792        batchnormalization_13[0][0]      
____________________________________________________________________________________________________
batchnormalization_14 (BatchNorma(None, 64, 222, 222)  128         block1_conv1[0][0]               
___________________________________________________________________________________________

In [59]:
nb_epoch=2
cnn.fit_generator(train_gen, samples_per_epoch=train_gen.N, nb_epoch=nb_epoch, 
                        validation_data=val_gen, nb_val_samples=val_gen.N)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f2dbeabea50>

In [68]:
cnn.optimizer.lr=0.0001
nb_epoch=4
cnn.fit_generator(train_gen, samples_per_epoch=train_gen.N, nb_epoch=nb_epoch, 
                        validation_data=val_gen, nb_val_samples=val_gen.N)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f2dbe2317d0>

### Load weights

In [55]:
weigth_file_name =  model_path + 'simple-cnn-%d-one-epoch.h5'

In [12]:
cnn.load_weights(weigth_file_name)

IOError: Unable to open file (Unable to open file: name = 'data/statefarm/models/simple-cnn-%d-one-epoch.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

### Save weights

In [69]:
cnn.save_weights(weigth_file_name)

In [32]:
train_gen.class_indices.keys()


['c9', 'c8', 'c3', 'c2', 'c1', 'c0', 'c7', 'c6', 'c5', 'c4']

### Semi-supervised learning pseudo labeling

Predict labels for the validation set and use that for the traning, notice that we don't use the validation lables, this would make our model biased

In [None]:
val_pred = cnn.predict_generator(val_gen, val_gen.nb_sample)
comb_pseudo = np.concatenate([da_trn_labels, val_pred])
comb_feat = np.concatenate([da_conv_feat, conv_val_feat])

In [None]:
bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=1, 
             validation_data=(conv_val_feat, val_labels))

# Run on test data

In [71]:
print(path + "test")
test_gen = ImageDataGenerator().flow_from_directory(path + "test",
                                          batch_size=batch_size*2,
                                          class_mode=None,
                                          target_size=(224, 224),
                                          shuffle=False)
test_predictions = cnn.predict_generator(test_gen, test_gen.nb_sample)

data/statefarm/test
Found 79726 images belonging to 1 classes.


In [72]:
classes = sorted(train_gen.class_indices.keys())
ids = list(map(lambda x: [re.search('.+\/(.+\.jpg)', x).group(1)], test_gen.filenames))
subm = np.hstack((ids, test_predictions))
class_str = ','.join(['img'] + classes)
N = len(classes)
format_str = ','.join(['%s'] + ['%s']*N)
submission_file_name = 'statefarm.csv'
np.savetxt(submission_file_name, subm, fmt=format_str, header=class_str, comments='')

### Downloadable link

In [73]:
from IPython.display import FileLink
submission_file_name = 'statefarm.csv'
FileLink(submission_file_name)

In [68]:

path

'data/statefarm/'

In [50]:
sorted(train_gen.class_indices.keys())

['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']