## Library import

In [1]:
# Switch for the temporary image folders used further down:
#   truthy -> fresh, timestamped folder paths are built and the train/validation
#             split is copied again;
#   0      -> the hard-coded folders and sample counts of an earlier run are reused.
make_folder=0

In [2]:
import keras
from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, MaxPooling2D
from keras.layers.core import Dense, Dropout,Flatten,Activation
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping,ModelCheckpoint,Callback
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [3]:
#file io
import shutil
import subprocess
import os
from glob import glob
from datetime import datetime
import argparse

In [4]:
#data processing
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [5]:
#image processing
import cv2
from scipy.ndimage import rotate
import scipy.misc

## Command-line arguments

In [6]:
# Argument parser.
# The notebook kernel is started with its own command line, so options that are
# not declared here must be tolerated: parse_known_args() sets them aside instead
# of terminating the kernel with SystemExit the way parse_args() does.
parser=argparse.ArgumentParser()
# Presumably the kernel's "-f <connection file>" option; kept so args.f still exists.
parser.add_argument("-f")
parser.add_argument('--model', required=False, default='vgg16', help='Model Architecture')
parser.add_argument('--weights', required=False, default=None)
parser.add_argument('--learning-rate', required=False, type=float, default=1e-2)
parser.add_argument('--semi-train', required=False, default=None)
parser.add_argument('--batch-size', required=False, type=int, default=8)
parser.add_argument('--random-split', required=False, type=int, default=0)
parser.add_argument('--data-augment', required=False, type=int, default=0)
# Only the recognised namespace is kept; unrecognised options are discarded.
args = parser.parse_known_args()[0]

## Data load

In [7]:
# Listing of every training image with its class and the driver it shows.
drivers=pd.read_csv("Data/driver_imgs_list.csv")
# Distinct driver ids (pandas returns them in order of first appearance).
uniq_drivers=drivers.subject.unique()
# Map image file name -> {'label': class name, 'driver': driver id}.
# Built column-wise in one pass; DataFrame.iterrows() would construct a Series
# per row to produce exactly the same dictionary.
img_to_driver={
    img: {'label': classname, 'driver': subject}
    for img, classname, subject in zip(drivers['img'], drivers['classname'], drivers['subject'])
}
# Example of the resulting structure:
# {'img_44733.jpg': {'label': 'c0', 'driver': 'p002'},
#  'img_72999.jpg': {'label': 'c0', 'driver': 'p002'},
#  'img_25094.jpg': {'label': 'c0', 'driver': 'p002'}, ...}

## config

In [8]:
# Configuration
# -- model / task
fc_size = 2048      # dense-layer width referenced by the commented-out VGG16 head
n_class = 10        # number of target classes
# -- seeding and fold counts
seed = 10
nfolds = 5
test_nfolds = 3
# -- input image size (rows, cols)
img_row_size = 224
img_col_size = 224
# -- dataset locations
train_path = 'Data/imgs/train'
test_path = 'Data/imgs/test'
# -- class folder names: 'c0' ... 'c9'
labels = ['c{}'.format(class_idx) for class_idx in range(n_class)]

In [9]:
# Tag identifying this run: launch time followed by the settings of the experiment.
suffix = 'd{}.m{}.w{}.lr{}.s{}.nf{}.semi{}.b{}.row{}col{}.rsplit{}.augment{}'.format(
    datetime.now().strftime("%m%d_%H%M"),
    args.model,
    args.weights,
    args.learning_rate,
    seed,
    nfolds,
    args.semi_train,
    args.batch_size,
    img_row_size,
    img_col_size,
    args.random_split,
    args.data_augment,
)

# The staged test images live in the same folder whichever branch is taken.
temp_test = 'e:/kaggle_imgs/StateFarm/test'

if not make_folder:
    # Reuse the folders written by an earlier run.
    # NOTE(review): train/valid carry stamp d0503_1739 while cache/subm carry
    # d0503_1418 -- confirm that mixing the two runs is intended.
    temp_train_fold = 'e:/kaggle_imgs/StateFarm/train_d0503_1739.mvgg16.wNone.lr0.0001.s10.nf5.semiNone.b8.row224col224.rsplit0.augment0'
    temp_valid_fold = 'e:/kaggle_imgs/StateFarm/valid_d0503_1739.mvgg16.wNone.lr0.0001.s10.nf5.semiNone.b8.row224col224.rsplit0.augment0'
    cache = 'e:/kaggle_imgs/cache/d0503_1418.mvgg16.wNone.lr0.0001.s10.nf5.semiNone.b8.row224col224.rsplit0.augment0'
    subm = 'e:/kaggle_imgs/subm/d0503_1418.mvgg16.wNone.lr0.0001.s10.nf5.semiNone.b8.row224col224.rsplit0.augment0'
    print("load old data")
else:
    # Fresh run: every location is derived from the new suffix.
    temp_train_fold = 'e:/kaggle_imgs/StateFarm/train_{}'.format(suffix)
    temp_valid_fold = 'e:/kaggle_imgs/StateFarm/valid_{}'.format(suffix)
    cache = 'e:/kaggle_imgs/cache/{}'.format(suffix)
    subm = 'e:/kaggle_imgs/subm/{}'.format(suffix)

load old data


## model - vgg16

In [10]:
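# Disabled alternative: VGG16 backbone with a fully connected head.
# Left commented out; the CNN cell below supplies the get_model() that is actually used.
# def get_model():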
#     base_model=keras.applications.vgg16.VGG16(include_top=False,
#                                               weights=args.weights,
#                                               input_shape=(img_row_size,img_col_size,3))
    
#     out = Flatten()(base_model.output)
#     out=Dense(fc_size,activation="relu")(out)
#     out=Dropout(0.5)(out)
#     out=Dense(fc_size,activation="relu")(out)
#     out=Dropout(0.5)(out)
#     output=Dense(n_class,activation="softmax")(out)
#     model=Model(inputs=base_model.input,outputs=output)
    
#     sgd=SGD(lr=args.learning_rate,decay=1e-6,momentum=0.9,nesterov=True)
#     model.compile(optimizer=sgd,loss="categorical_crossentropy",metrics=["accuracy"])
    
#     return model

## model - cnn

In [11]:
def get_model():
    """Build the small CNN classifier (not yet compiled).

    Architecture: three stages of three 3x3 convolutions with 32, 64 and 128
    filters, each stage closed by max pooling (pool sizes 2, 4, 8), followed by
    a 64-unit ReLU dense layer and a softmax over ``n_class`` classes.

    Every convolution applies ReLU. ``Conv2D`` has no activation unless one is
    given, and a stack of activation-free convolutions is just one linear
    operation, so the extra layers added no modelling capacity. Layer shapes
    and parameter counts are unchanged by the activation.

    Returns:
        keras ``Model`` mapping an (img_row_size, img_col_size, 3) image to
        ``n_class`` class probabilities.
    """
    def _conv_stage(tensor, filters, pool):
        # Three same-width 3x3 ReLU convolutions, then one max-pooling layer.
        for _ in range(3):
            tensor = Conv2D(filters, (3, 3), activation='relu')(tensor)
        return MaxPooling2D(pool_size=pool)(tensor)

    input_layer = Input((img_row_size, img_col_size, 3))
    x = input_layer
    for filters, pool in ((32, 2), (64, 4), (128, 8)):
        x = _conv_stage(x, filters, pool)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    # Output width follows the configured class count instead of a literal 10.
    predictions = Dense(n_class, activation='softmax')(x)
    model = Model(inputs = input_layer, outputs = predictions)
    return model
# Instantiate the network, print its layer table, then attach the training setup.
model = get_model()
model.summary()
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 222, 222, 32)      896       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 220, 220, 32)      9248      
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 218, 218, 32)      9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 109, 109, 32)      0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 107, 107, 64)      18496     
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 105, 105, 64)      3692

In [12]:
def _clear_dir(path):
    """Leave ``path`` as an existing, empty directory.

    Whatever is already stored under ``path`` is removed recursively first.
    Missing parent directories are created too, so the call also works the
    first time a new folder hierarchy is used.

    Args:
        path: directory to wipe and recreate.
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    # makedirs instead of mkdir: mkdir raises when the parent does not exist yet.
    os.makedirs(path)

In [13]:
def generate_driver_based_split(img_to_driver, train_drivers):
    """Copy the training images into temporary train/validation folders, split by driver.

    An image goes to the training folder when its driver is listed in
    ``train_drivers`` and to the validation folder otherwise, so no driver
    appears on both sides. (The previous body ignored both arguments and drew
    an unseeded random number per image, which mixed every driver into both
    sets and gave a different split on each run.)

    Args:
        img_to_driver: dict mapping image file name to
            ``{'label': class name, 'driver': driver id}``.
        train_drivers: iterable of driver ids whose images form the training set.

    Returns:
        Tuple ``(train_samples, valid_samples)`` with the number of images
        copied to each side.

    Raises:
        KeyError: if an image found on disk has no entry in ``img_to_driver``.
    """
    # Create (or wipe) the temporary folders the image generators read from,
    # with one sub-folder per class.
    def _generate_temp_folder(root_path):
        _clear_dir(root_path)
        for i in range(n_class):
            os.mkdir('{}/c{}'.format(root_path, i))
    _generate_temp_folder(temp_train_fold)
    _generate_temp_folder(temp_valid_fold)

    # Set membership keeps the per-image driver check constant time.
    train_driver_set = set(train_drivers)
    train_samples = 0
    valid_samples = 0

    for label in labels:
        files = glob('{}/{}/*jpg'.format(train_path, label))
        for fl in files:
            basename = os.path.basename(fl)
            if img_to_driver[basename]['driver'] in train_driver_set:
                # Driver belongs to the training fold.
                dest_root = temp_train_fold
                train_samples += 1
            else:
                # Held-out driver: image is used for validation only.
                dest_root = temp_valid_fold
                valid_samples += 1
            # Copy the original image into the chosen temporary class folder.
            shutil.copy(fl, "{}/{}/{}".format(dest_root, label, basename))

    # Report how many images ended up on each side.
    print('# {} train samples | {} valid samples'.format(train_samples, valid_samples))
    return train_samples, valid_samples

### test_generator

In [14]:
# Generator without any augmentation or rescaling options.
datagen=ImageDataGenerator()
# Test images: no labels, one image per batch, fixed order.
test_generator=datagen.flow_from_directory(temp_test,
                                           target_size=(img_row_size,img_col_size),
                                           batch_size=1,
                                           class_mode=None,
                                           shuffle=False)
# Take the ids from the generator itself so they match its images one-to-one and
# in the same order. Globbing test_path lists a different folder than the one the
# generator reads (temp_test), and glob gives no ordering guarantee.
test_id=[os.path.basename(fl) for fl in test_generator.filenames]

Found 30216 images belonging to 1 classes.


In [15]:
if not make_folder:
    # Reusing an earlier split: the sample counts are those recorded for it.
    trn_samples,val_samples=17921,4503
    print(trn_samples,val_samples)
else:
    # Partition the drivers into folds and materialise only the first one.
    kf=KFold(n_splits=nfolds,shuffle=True, random_state=20)
    for i,(trn,val) in enumerate(kf.split(uniq_drivers)):
        print("trn",trn)
        print("val",val)
        # Translate fold indices back into driver ids.
        trn_drivers=[uniq_drivers[idx] for idx in trn]
        trn_samples,val_samples=generate_driver_based_split(img_to_driver,trn_drivers)
        break

17921 4503


In [16]:
# Labelled batches read from the temporary training folder.
train_generator=datagen.flow_from_directory(temp_train_fold,
                                       target_size=(img_row_size,img_col_size),
                                       batch_size=args.batch_size,
                                       class_mode="categorical",
                                       seed=seed)
# Labelled batches read from the temporary validation folder.
valid_generator = datagen.flow_from_directory(
        directory=temp_valid_fold,
        target_size=(img_row_size, img_col_size),
        batch_size=args.batch_size,
        class_mode='categorical',
        seed=seed)
# The checkpoint file is written inside `cache`. Nothing else in this notebook
# creates that folder, so make sure it exists before training starts; otherwise
# a freshly generated run folder would be missing when the checkpoint is saved.
os.makedirs(cache, exist_ok=True)
weight_path = "{}/weight.fold_{}.h5".format(cache, 0)
# Stop after 20 epochs without a better validation accuracy; keep only the best weights.
callbacks=[EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=20),
           ModelCheckpoint(weight_path,monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')]


Found 17921 images belonging to 10 classes.
Found 4503 images belonging to 10 classes.


In [None]:
# Step counts are numbers of batches, so they must be whole: round up to include
# the final partial batch (17921 / 8 = 2240.125 -> 2241 steps).
# class_weight='auto' was dropped: Keras takes a dict mapping class index to
# weight there, and the string is not a supported value.
model.fit_generator(train_generator,
                    steps_per_epoch=int(np.ceil(trn_samples/args.batch_size)),
                    epochs=4,
                    callbacks=callbacks,
                    verbose=1,
                    validation_data=valid_generator,
                    validation_steps=int(np.ceil(val_samples/args.batch_size)))

Epoch 1/4

Epoch 00001: val_accuracy improved from -inf to 0.11215, saving model to e:/kaggle_imgs/cache/d0503_1418.mvgg16.wNone.lr0.0001.s10.nf5.semiNone.b8.row224col224.rsplit0.augment0/weight.fold_0.h5
Epoch 2/4

In [16]:
# Inspection cell: display the temporary train/validation folders currently in use.
temp_train_fold,temp_valid_fold

('e:/kaggle_imgs/StateFarm/train_d0503_1739.mvgg16.wNone.lr0.0001.s10.nf5.semiNone.b8.row224col224.rsplit0.augment0',
 'e:/kaggle_imgs/StateFarm/valid_d0503_1739.mvgg16.wNone.lr0.0001.s10.nf5.semiNone.b8.row224col224.rsplit0.augment0')

In [18]:
# Inspection cell: batches per epoch implied by the sample count. The value is
# fractional whenever the batch size does not divide trn_samples evenly.
trn_samples/args.batch_size

2240.125