# 0. Config

In [1]:
# Target image height / width in pixels; every image is resized to this size
# and the model input shape is built from it.
img_rows = 224
img_cols = 224
# Number of image channels; used as the last axis of the data arrays and of
# the model input shape.
color_type = 1
# Mini-batch size used by the training data generator.
batch_size=16
# Number of epochs passed to fit_generator (early stopping may end sooner).
epochs=300
# Directory under which the 'saved_models' folder is created.
cache = 'e:/kaggle_imgs/cache/StateFarm'
# Root folder of the images; 'train/c0'..'train/c9' and 'test' are globbed
# for *.jpg files below it.
img_path="E:/kaggle_imgs/Statefarm/Data/imgs"
# Pickle files caching the pre-processed train/validation split and test set.
train_pickle=img_path+"/train_data2.pickle"
test_pickle=img_path+"/test_data2.pickle"
# NOTE(review): `subject` is not referenced anywhere in the visible notebook
# -- confirm it is still needed.
subject="Digit_model"

import os
import shutil
# Create the cache directory and the saved-models folder inside it.
# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail when the folder already exists, so no separate existence
# check is needed.
# (To start from a clean cache, run shutil.rmtree(cache) manually first.)
os.makedirs(cache, exist_ok=True)
saved_path="{}/saved_models".format(cache)
os.makedirs(saved_path, exist_ok=True)
# Checkpoint file written by the ModelCheckpoint callback.
file_path='{}/State_keras_200508_new_func.hdf5'.format(saved_path)

# 1. Import

In [2]:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(2016)
import os
import glob
import cv2
import math
import pickle
import datetime
import pandas as pd
import statistics

In [3]:
from PIL import Image as IM
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.callbacks import ModelCheckpoint, EarlyStopping,ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator

from keras.optimizers import SGD
from keras.utils import np_utils
from keras.models import model_from_json
from sklearn.metrics import log_loss
#from scipy.misc import imread, imresize

Using TensorFlow backend.


In [4]:
def cache_data(data, path):
    """Serialize ``data`` to ``path`` with pickle.

    Args:
        data: Any picklable object.
        path: Destination file path; an existing file is overwritten.
    """
    # The context manager closes the file even when pickling raises.
    with open(path, 'wb') as file:
        pickle.dump(data, file)
def restore_data(path):
    """Load and return the object pickled at ``path``.

    Args:
        path: Path of a file previously written with pickle.

    Returns:
        The unpickled object.
    """
    # NOTE(security): pickle.load can execute arbitrary code -- only use this
    # on cache files produced by this notebook.
    # The context manager makes sure the file handle is released.
    with open(path, 'rb') as file:
        return pickle.load(file)
def split_validation_set(train, target, test_size):
    """Split samples and labels into a training part and a hold-out part.

    Args:
        train: Sample array.
        target: Label array aligned with ``train``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    seed = 51  # fixed seed so the split is the same on every run
    parts = train_test_split(train, target,
                             test_size=test_size,
                             random_state=seed)
    return tuple(parts)

In [5]:
def rgb2gray(rgb):
    """Convert an RGB image array to a single gray-scale plane.

    Args:
        rgb: Array indexed as ``[row, col, channel]`` with channels 0, 1, 2
            holding red, green and blue.

    Returns:
        Weighted sum ``0.2989*R + 0.5870*G + 0.1140*B`` per pixel.
    """
    red = rgb[:, :, 0]
    green = rgb[:, :, 1]
    blue = rgb[:, :, 2]
    return 0.2989 * red + 0.5870 * green + 0.1140 * blue

def get_image(path, img_rows, img_cols, color_type):
    """Load an image with PIL and resize it to ``img_rows`` x ``img_cols``.

    Args:
        path: Image file path.
        img_rows: Output height in pixels.
        img_cols: Output width in pixels.
        color_type: ``3`` returns the resized RGB image; any other value
            (the notebook uses ``1``) returns the gray-scale plane computed
            by ``rgb2gray``.

    Returns:
        Array of shape ``(img_rows, img_cols)`` for gray-scale output, or
        ``(img_rows, img_cols, 3)`` when ``color_type`` is 3.
    """
    # The context manager releases the file handle deterministically.
    with IM.open(path) as source:
        # Force three channels so gray-scale or palette files do not break
        # the channel indexing in rgb2gray.
        rgb = source.convert('RGB')
        # PIL expects the size as (width, height), i.e. (cols, rows); passing
        # (rows, cols) would transpose the target size for non-square images.
        rgb = rgb.resize((img_cols, img_rows))
        pixels = np.array(rgb)
    if color_type == 3:
        return pixels
    return rgb2gray(pixels)

def get_im_cv2(path, img_rows, img_cols, color_type=1):
    """Load an image with OpenCV and resize it to ``img_rows`` x ``img_cols``.

    Args:
        path: Image file path.
        img_rows: Output height in pixels.
        img_cols: Output width in pixels.
        color_type: ``1`` loads the image as gray-scale, ``3`` loads it with
            OpenCV's default colour mode.

    Returns:
        The resized image array.

    Raises:
        ValueError: If ``color_type`` is neither 1 nor 3.
        IOError: If OpenCV could not read the file.
    """
    # Validate first: previously an unsupported value left `img` unassigned
    # and surfaced as an UnboundLocalError further down.
    if color_type not in (1, 3):
        raise ValueError(
            'color_type must be 1 (grayscale) or 3 (color), got {!r}'.format(color_type))
    if color_type == 1:
        # Load as grayscale
        img = cv2.imread(path, 0)
    else:
        img = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise IOError('Could not read image: {}'.format(path))
    # Reduce size; cv2.resize takes the size as (width, height).
    return cv2.resize(img, (img_cols, img_rows))

def load_train(img_rows, img_cols, color_type=1):
    """Read every training image from the class folders ``c0`` .. ``c9``.

    Args:
        img_rows: Target image height passed to ``get_image``.
        img_cols: Target image width passed to ``get_image``.
        color_type: Colour mode passed to ``get_image``.

    Returns:
        Tuple ``(X_train, y_train)`` of lists: the loaded images and, for
        each image, the index ``j`` of the folder ``c<j>`` it came from.
    """
    X_train = []
    y_train = []

    print('Read train images')
    for j in range(10):
        path = os.path.join(img_path, 'train', 'c' + str(j), '*.jpg')
        print(path)
        # glob returns files in a filesystem-dependent order; sorting keeps
        # the sample order (and therefore the seeded train/validation split)
        # reproducible across machines.
        for fl in sorted(glob.glob(path)):
            X_train.append(get_image(fl, img_rows, img_cols, color_type))
            y_train.append(j)
    return X_train, y_train

def read_and_normalize_train_data(img_rows, img_cols, color_type=1):
    """Return the normalized train/validation split, using the pickle cache.

    When ``train_pickle`` does not exist the images are read from disk,
    reshaped to ``(n, img_rows, img_cols, color_type)``, one-hot encoded over
    10 classes, split 80/20 and written to the cache. Otherwise the cached
    split is restored.

    Args:
        img_rows: Image height.
        img_cols: Image width.
        color_type: Number of channels.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid)``; the image arrays are
        float32 and divided by 255.

    Raises:
        ValueError: If the cached arrays do not match the requested image
            shape (i.e. the cache was built with other settings).
    """
    if not os.path.isfile(train_pickle):
        print('load train from files!')
        train_data, train_target = load_train(img_rows, img_cols, color_type)
        train_data = np.array(train_data, dtype=np.uint8)
        train_target = np.array(train_target, dtype=np.uint8)
        train_data = train_data.reshape(train_data.shape[0], img_rows, img_cols, color_type)
        train_target = np_utils.to_categorical(train_target, 10)
        X_train, X_valid, y_train, y_valid=split_validation_set(train_data,train_target,0.2)
        cache_data((X_train, X_valid, y_train, y_valid), train_pickle)
    else:
        print('Restore train from cache!')
        (X_train, X_valid, y_train, y_valid) = restore_data(train_pickle)
        # The cache is keyed by file name only, so a file built with other
        # dimensions would otherwise be accepted silently and fail later
        # inside the model with a far less helpful message.
        expected_shape = (img_rows, img_cols, color_type)
        for label, images in (('X_train', X_train), ('X_valid', X_valid)):
            if tuple(images.shape[1:]) != expected_shape:
                raise ValueError(
                    'Cached {} has image shape {} but {} was requested; '
                    'delete {} to rebuild the cache.'.format(
                        label, tuple(images.shape[1:]), expected_shape, train_pickle))

    # Scale pixel values by 1/255.
    X_train = X_train.astype('float32') / 255
    X_valid = X_valid.astype('float32') / 255
    return X_train, X_valid, y_train, y_valid

In [6]:
def load_test(img_rows, img_cols, color_type=1):
    """Read every ``*.jpg`` image from the ``test`` folder.

    Args:
        img_rows: Target image height passed to ``get_image``.
        img_cols: Target image width passed to ``get_image``.
        color_type: Colour mode passed to ``get_image``.

    Returns:
        Tuple ``(X_test, X_test_id)`` of lists: the loaded images and the
        base file name of each image, in the same order.
    """
    print('Read test images')
    path = os.path.join(img_path, 'test', '*.jpg')
    # Sorted so the order of images/ids does not depend on the filesystem.
    files = sorted(glob.glob(path))
    X_test = []
    X_test_id = []
    # Report progress about every 10% of the files. max(1, ...) prevents a
    # modulo-by-zero when the folder holds fewer than 10 images.
    thr = max(1, len(files) // 10)
    for (i,fl) in enumerate(files):
        flbase = os.path.basename(fl)
        img = get_image(fl, img_rows, img_cols, color_type)
        X_test.append(img)
        X_test_id.append(flbase)
        if i%thr == 0:
            print('Read {} images from {}'.format(i, len(files)))

    return X_test, X_test_id

def read_and_normalize_test_data(img_rows, img_cols, color_type=1):
    """Return the normalized test images and their ids, using the pickle cache.

    When ``test_pickle`` does not exist the images are read from disk,
    reshaped to ``(n, img_rows, img_cols, color_type)`` and written to the
    cache together with their ids. Otherwise the cached data is restored.

    Args:
        img_rows: Image height.
        img_cols: Image width.
        color_type: Number of channels.

    Returns:
        Tuple ``(test_data, test_id)``; ``test_data`` is float32 and divided
        by 255.

    Raises:
        ValueError: If the cached array does not match the requested image
            shape (i.e. the cache was built with other settings).
    """
    if not os.path.isfile(test_pickle):
        test_data, test_id = load_test(img_rows, img_cols, color_type)
        test_data = np.array(test_data, dtype=np.uint8)
        test_data = test_data.reshape(test_data.shape[0], img_rows, img_cols, color_type)
        cache_data((test_data, test_id), test_pickle)
    else:
        print('Restore test from cache!')
        (test_data, test_id) = restore_data(test_pickle)
        # The cache is keyed by file name only; reject data that was built
        # with different image settings instead of failing later on.
        expected_shape = (img_rows, img_cols, color_type)
        if tuple(test_data.shape[1:]) != expected_shape:
            raise ValueError(
                'Cached test data has image shape {} but {} was requested; '
                'delete {} to rebuild the cache.'.format(
                    tuple(test_data.shape[1:]), expected_shape, test_pickle))

    # Scale pixel values by 1/255.
    test_data = test_data.astype('float32') / 255
    return test_data, test_id

In [7]:
def disp_img(img):
    """Show one image in a new matplotlib figure.

    Args:
        img: Array that can be reshaped to ``(img_rows, img_cols)``.
    """
    plane = img.reshape(img_rows, img_cols)
    plt.figure()
    plt.imshow(np.array(plane))
def get_callback():
    """Build the list of Keras callbacks used during training.

    Returns:
        ``[checkpoint, early_stopping, lr_reduction]`` where
        * checkpoint saves the model to ``file_path`` whenever ``val_loss``
          improves,
        * early stopping ends training after 7 epochs without a
          ``val_accuracy`` improvement,
        * the learning-rate reduction halves the LR after 3 epochs without a
          ``val_accuracy`` improvement, down to 1e-5.
    """
    checkpoint = ModelCheckpoint(filepath=file_path,
                                 monitor='val_loss', mode='min',
                                 verbose=1, save_best_only=True)
    es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=7)
    # Must use the same metric key as EarlyStopping and the history plots
    # ('val_accuracy'); with the former 'val_acc' key the metric is not found
    # in the logs and the learning rate is never reduced.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                                mode='max',
                                                patience=3,
                                                verbose=1,
                                                factor=0.5,
                                                min_lr=0.00001)
    return [checkpoint, es, learning_rate_reduction]

In [8]:
def get_model():
    """Build, summarize and compile the CNN classifier.

    Architecture: two 5x5 convolutions (32 filters), max-pooling and dropout;
    two 3x3 convolutions (64 filters), max-pooling and dropout; then a
    32-unit dense layer, dropout and a 10-way softmax output.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        rmsprop optimizer, accuracy metric).
    """
    layers = [
        # Block 1: 5x5 convolutions on the raw input.
        Conv2D(filters = 32, kernel_size = (5,5), padding = 'Same',
               activation ='relu',
               input_shape = (img_rows,img_cols,color_type)),
        Conv2D(filters = 32, kernel_size = (5,5), padding = 'Same',
               activation ='relu'),
        MaxPooling2D(pool_size=(2,2)),
        Dropout(0.2),
        # Block 2: 3x3 convolutions on the pooled feature maps.
        Conv2D(filters = 64, kernel_size = (3,3), padding = 'Same',
               activation ='relu'),
        Conv2D(filters = 64, kernel_size = (3,3), padding = 'Same',
               activation ='relu'),
        MaxPooling2D(pool_size=(2,2), strides=(2,2)),
        Dropout(0.2),
        # Classifier head.
        Flatten(),
        Dense(32, activation = "relu"),
        Dropout(0.2),
        Dense(10, activation = "softmax"),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [9]:
def get_ImgGen(X_train, X_valid, y_train, y_valid):
    """Create the augmented training generator and the validation tuple.

    Args:
        X_train: Training images.
        X_valid: Validation images.
        y_train: Training labels.
        y_valid: Validation labels.

    Returns:
        Tuple ``(training_generator, validation_data)`` where the generator
        yields augmented batches of ``batch_size`` samples and
        ``validation_data`` is ``(X_valid, y_valid)`` unchanged.
    """
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range = 0.1, # Randomly zoom image
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
        vertical_flip=False)  # randomly flip images

    # datagen.fit() is intentionally not called: it only computes statistics
    # for featurewise centering / std normalization / ZCA whitening, all of
    # which are disabled above, and it would copy the whole training array.
    training_generator = datagen.flow(X_train, y_train, batch_size=batch_size)
    validation_data = (X_valid,y_valid)
    return training_generator,validation_data

In [10]:
# Load the train/validation split and the test set at the configured image
# size (read from the image folders on the first run, restored from the
# pickle caches afterwards); pixel values are divided by 255.
X_train, X_valid, y_train, y_valid = read_and_normalize_train_data(img_rows, img_cols, color_type)
test_data, test_id = read_and_normalize_test_data(img_rows, img_cols, color_type)


Restore train from cache!
Restore test from cache!


In [None]:
#callbacks
callbacks=get_callback()

#get model
model = get_model()
training_generator,validation_data=get_ImgGen(X_train, X_valid, y_train, y_valid)


print('Start Single Run')
print('Split train: ', len(X_train), len(y_train))
print('Split valid: ', len(X_valid), len(y_valid))

# validation_data is a tuple of in-memory arrays, so the whole validation set
# is evaluated after every epoch. validation_steps only applies when the
# validation data is a generator and is therefore not passed.
history = model.fit_generator(
    training_generator,
    epochs = epochs,
    validation_data = validation_data,
    verbose = 1,
    steps_per_epoch = X_train.shape[0] // batch_size,
    callbacks=callbacks)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 224, 224, 32)      832       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 224, 224, 32)      25632     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 112, 112, 32)      0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 112, 112, 32)      0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 112, 112, 64)      18496     
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 112, 112, 64)      36928     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 56, 56, 64)       

In [None]:
# Plot the training curves recorded by fit_generator: accuracy first, then loss.
metrics = history.history

fig_acc, ax_acc = plt.subplots(figsize=(12,8))
ax_acc.plot(metrics['accuracy'],"r")
ax_acc.plot(metrics['val_accuracy'],"bo")
ax_acc.set_title('Model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['train', 'test'], loc='upper left')
plt.show()

fig_loss, ax_loss = plt.subplots(figsize=(12,8))
ax_loss.plot(metrics['loss'])
ax_loss.plot(metrics['val_loss'],"bo")
ax_loss.set_title('Model loss')
ax_loss.set_ylabel('loss')
ax_loss.set_xlabel('epoch')
ax_loss.legend(['train', 'test'], loc='upper left')

In [None]:
# predictions_valid = model.predict(X_valid, batch_size=128, verbose=1)
# score = log_loss(Y_valid, predictions_valid)
# print('Score log_loss: ', score)

# # Store valid predictions
# for i in range(len(test_index)):
#     yfull_train[test_index[i]] = predictions_valid[i]

# # Store test predictions
# test_prediction = model.predict(test_data, batch_size=128, verbose=1)
# yfull_test.append(test_prediction)

# print('Final log_loss: {}, rows: {} cols: {} epoch: {}'.format(score, img_rows, img_cols, nb_epoch))
# info_string = 'loss_' + str(score) \
#                 + '_r_' + str(img_rows) \
#                 + '_c_' + str(img_cols) \
#                 + '_ep_' + str(nb_epoch)

# test_res = merge_several_folds_mean(yfull_test, 1)
# create_submission(test_res, test_id, info_string)