In [1]:
import os
img_rows = 224
img_cols = 224
color_type = 3
batch_size=80
epochs=300
cache_path = 'e:/kaggle_imgs/StateFarm'
img_path = os.path.join(cache_path,"Data","imgs")
saved_path=os.path.join(cache_path,"saved_models")
file_path=os.path.join(cache_path,"state_vgg16_200515.hdf5")

import shutil
paths=[cache_path,img_path,saved_path]
for mypath in paths:
    if not os.path.exists(mypath):
        os.mkdir(mypath)
        
train_pickle=cache_path+"/train_data_np_200515.npy"
test_pickle=cache_path+"/test_data_np_200515.npy"


In [2]:
# import module
from PIL import Image as IM
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from glob import glob
from time import time
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dropout,Input, Conv2D, MaxPooling2D, Flatten, Dense, MaxPool2D
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from keras.callbacks import ModelCheckpoint, EarlyStopping,ReduceLROnPlateau

Using TensorFlow backend.


In [3]:
def get_image(path, img_rows, img_cols, color_type):
    color_mode = "rgb" if color_type==3 else "grayscale"
    grayscale = False if color_type==3 else True
    img = image.load_img(path, grayscale=grayscale, color_mode=color_mode, 
                         target_size=(img_rows, img_cols), interpolation="nearest")
    img = np.array(img)
    return img
    

def load_train(img_rows, img_cols, color_type, paths):
    images = []
    labels = []
    for i in range(10):
        start = time()
        files = glob(paths[i])
        for j,file in enumerate(files):
            images.append(get_image(file, img_rows, img_cols, color_type))
            labels.append(i)
        print('directory {} loaded in {:.2f} seconds, count:{}'.format(paths[i], time() - start, len(files)))
    return images, labels

def normalized_train(img_rows, img_cols, color_type,image_path):
    path=[]
    for i in range(10):
        path.append('{}/train/c{}/*.jpg'.format(img_path,i))
    images, labels = load_train(img_rows, img_cols, color_type,path)
    labels = np_utils.to_categorical(labels, 10)
    return images,labels

def load_test(img_rows, img_cols, color_type, path):
    images = []
    files = glob(path)
    print("test images count :",len(files))
    img_trace_cnt = len(files)//10
    for i,file in enumerate(files):
        images.append(get_image(file, img_rows, img_cols, color_type))
        if(i%img_trace_cnt==0):
            print("loading count is :",i)
    return images

def normalized_test(img_rows, img_cols, color_type,path):
    path='{}/test/*.jpg'.format(path)
    images = load_test(img_rows, img_cols, color_type, path)
    return images


In [4]:
# load train data
fp=train_pickle
if os.path.exists(fp):
    try:
        print('loading train data from pickle', flush=True)
        [train_images, valid_images, train_labels, valid_labels]=np.load(fp,allow_pickle=True)
        print('complete!', flush=True)
    except EOFError:
        print('EOFError raised.', flush=True)
        print('loading train data...', flush=True)
        os.system('rm -f train_data.pickle')
else:
    print('loading train data...', flush=True)
    trains,labels=normalized_train(img_rows, img_cols, color_type,img_path)
    train_images, valid_images, train_labels, valid_labels = train_test_split(trains, labels, test_size=0.2)
    train_images = np.array(train_images, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)
    valid_images = np.array(valid_images, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)
    print('train load complete!', flush=True)
    print('pickling train data...', flush=True)
    start=time()
    np.save(fp, np.array([train_images, valid_images, train_labels, valid_labels]))
    print("np save complete, {}".format(time()-start))

loading train data from pickle
complete!


In [5]:
# load test data
# fp=test_pickle
# if os.path.exists(fp):
#     try:
#         print('loading test data from pickle', flush=True)
#         start=time()
#         test=np.load(fp,allow_pickle=True)
#         print('complete!, {}'.format(time()-start), flush=True)
#     except EOFError:
#         print('EOFError raised.', flush=True)
#         print('loading test data...', flush=True)
#         os.system('rm -f test_data.pickle')
# else:
#     print('loading test data...', flush=True)
#     test=normalized_test(img_rows, img_cols, color_type, img_path)
#     test = np.array(test, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)
#     print('test load complete!', flush=True)
#     print('np test data saving...', flush=True)
#     start=time()
#     np.save(fp,test)
#     print('np test data complete...,{}'.format(time()-start), flush=True)


loading test data from pickle
complete!, 98.65489935874939


In [6]:
train_images=train_images/255.
valid_images=valid_images/255.
#test=test/255.
#plt.imshow(train_images[100])

In [7]:
3745
# stats
train_size = len(train_images)
valid_size = len(valid_images)
test_size = len(glob('data/imgs/test/*.jpg'))
print('stats:', flush=True)
print('{} train images'.format(train_size), flush=True)
print('{} validation images'.format(valid_size), flush=True)
print('{} test images'.format(test_size), flush=True)
print('train_images.shape = {}'.format(train_images.shape), flush=True)
print('train_labels.shape = {}'.format(train_labels.shape), flush=True)
print('valid_images.shape = {}'.format(valid_images.shape), flush=True)
print('valid_labels.shape = {}'.format(valid_labels.shape), flush=True)

stats:
17939 train images
4485 validation images
0 test images
train_images.shape = (17939, 224, 224, 3)
train_labels.shape = (17939, 10)
valid_images.shape = (4485, 224, 224, 3)
valid_labels.shape = (4485, 10)


In [8]:
checkpoint = ModelCheckpoint(filepath=file_path, 
                               monitor='val_loss', mode='min',
                               verbose=1, save_best_only=True)
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=7)
learning_rate_reduction=ReduceLROnPlateau(monitor="val_acc",
                                          patience=3,
                                          verbose=1,
                                          factor=0.5,
                                          min_lr=0.00001)
#callbacks = [checkpoint, es,learning_rate_reduction]
callbacks = [checkpoint, es]

In [10]:
from keras.applications import VGG16
from keras import optimizers

def get_model():
    conv_base=VGG16(weights="imagenet",
                   include_top=False,
                   input_shape=(img_rows,img_cols,color_type))
    conv_base.trainable=False
    model = Sequential()
    model.add(conv_base)
    model.add(Flatten())
    model.add(Dense(128, activation = "relu"))
    model.add(Dense(10, activation = "softmax"))
    return model

model = get_model()
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=optimizers.RMSprop(lr=1e-4), metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 7, 7, 512)         14714688  
_________________________________________________________________
flatten_1 (Flatten)          (None, 25088)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               3211392   
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1290      
Total params: 17,927,370
Trainable params: 3,212,682
Non-trainable params: 14,714,688
_________________________________________________________________


In [11]:
# data augmentation configuration
datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range = 0.1, # Randomly zoom image 
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
        vertical_flip=False)  # randomly flip images


datagen.fit(train_images)

In [12]:
training_generator = datagen.flow(train_images, train_labels, batch_size=batch_size)
validation_data = (valid_images,valid_labels)

In [13]:
if os.path.exists(file_path):
    model.load_weights(file_path)
    print("load weight complete")
history = model.fit_generator(
    training_generator,
    epochs = epochs, 
    validation_data = validation_data,
    verbose = 1,
    steps_per_epoch = train_images.shape[0] // batch_size,
    callbacks=callbacks,
    validation_steps = valid_images.shape[0] // batch_size)

Epoch 1/300

Epoch 00001: val_loss improved from inf to 0.35570, saving model to e:/kaggle_imgs/StateFarm\state_vgg16_200515.hdf5
Epoch 2/300

Epoch 00002: val_loss improved from 0.35570 to 0.20873, saving model to e:/kaggle_imgs/StateFarm\state_vgg16_200515.hdf5
Epoch 3/300

Epoch 00003: val_loss improved from 0.20873 to 0.10039, saving model to e:/kaggle_imgs/StateFarm\state_vgg16_200515.hdf5
Epoch 4/300

Epoch 00004: val_loss improved from 0.10039 to 0.06819, saving model to e:/kaggle_imgs/StateFarm\state_vgg16_200515.hdf5
Epoch 5/300

Epoch 00005: val_loss improved from 0.06819 to 0.05644, saving model to e:/kaggle_imgs/StateFarm\state_vgg16_200515.hdf5
Epoch 6/300

Epoch 00006: val_loss improved from 0.05644 to 0.05401, saving model to e:/kaggle_imgs/StateFarm\state_vgg16_200515.hdf5
Epoch 7/300

Epoch 00007: val_loss improved from 0.05401 to 0.04578, saving model to e:/kaggle_imgs/StateFarm\state_vgg16_200515.hdf5
Epoch 8/300

Epoch 00008: val_loss did not improve from 0.04578
Ep

KeyboardInterrupt: 

In [None]:
# plot history
plt.subplots(figsize=(12,8))
plt.plot(history.history['accuracy'],"r",label="train")
plt.plot(history.history['val_accuracy'],"bo",label="valid")
plt.title('Model accuracy')
plt.legend()
plt.show()

plt.subplots(figsize=(12,8))
plt.plot(history.history['loss'],label="Model")
plt.plot(history.history['val_loss'],"bo",label="loss")
plt.title('Model loss')
plt.legend()

In [None]:
# submission=pd.DataFrame({"ImageId":pd.Series(range(1,28001)),"Label":results.values})

In [None]:
# from datetime import datetime
# now=datetime.now()
# submission.to_csv("data/{0:02d}{1:02d}{2:02d}{3:02d}_{}.csv".\
#                   format(now.year,now.month,now.day,now.hour,subject),index=False)

In [None]:
DFGFDG

#  Submission

In [None]:
if os.path.exists(file_path):
    model.load_weights(file_path)
    print("model load complete")
if os.path.exists(test_pickle):
    try:
        print('loading test data from pickle', flush=True)
        with open(test_pickle, 'rb') as f:
            (test_data,test_id) = load(f)
        print('complete!', flush=True)
    except EOFError:
        print('EOFError raised.', flush=True)
        print('loading test data...', flush=True)
test_data = test_data.astype('float32')
test_data =test_data/ 255

In [None]:
test_prediction = model.predict(test_data, batch_size=20, verbose=1)

In [None]:
yfull_test=[]
yfull_test.append(test_prediction)

In [None]:
import datetime
import pandas as pd
def merge_several_folds_mean(data, nfolds):
    a = np.array(data[0])
    for i in range(1, nfolds):
        a += np.array(data[i])
    a /= nfolds
    return a.tolist()

def create_submission(predictions, test_id, info):
    result1 = pd.DataFrame(predictions, columns=['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9'])
    result1.loc[:, 'img'] = pd.Series(test_id, index=result1.index)
    now = datetime.datetime.now()
    suffix = info + '_' + str(now.strftime("%Y-%m-%d-%H-%M"))
    sub_file = os.path.join(cache, 'submission_' + suffix + '.csv')
    result1.to_csv(sub_file, index=False)

In [None]:
info_string = 'r_' + str(img_rows) \
                    + '_c_' + str(img_cols) \
                    + '_ep_' + str(epochs)
test_res = merge_several_folds_mean(yfull_test, 1)
create_submission(test_res, test_id, info_string)