In [23]:
import sys
sys.path.append('../..')

import os
import json

import keras
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import pandas as pd
from tqdm import tqdm

import cr_interface as cri
import keras_utils as ku
import keras_bottle as kb

In [2]:
collection = cri.CrCollection.load().labeled().tri_label()

In [3]:
train = collection.filter_by(dataset_index=0)
test = collection.filter_by(dataset_index=1)

# Training Data Statistics

In [8]:
labels = ['obs', 'in', 'oap']
for label in labels:
    df = train.filter_by(label=label).df
    print('{}: {}'.format(label, len(df)))

obs: 328
in: 1808
oap: 386


## Load K-Fold Bottlenecks & Labels

In [9]:
splits = train.split_by(['dataset_index', 'pid'], [0.2] * 5)

for split in splits:
    print(len(split.df))
    
split_labels = []
split_aug_labels = []
split_bottles = []
split_aug_bottles = []

OUT_MULTIPLIER = 6
AUG_MULTIPLIER = 1

for i, split in enumerate(splits):
    print('Loading split {} of {}...'.format(i + 1, len(splits)))
    cr_codes = split.get_cr_codes()
    labels = split.get_labels()
    bottles = kb.load_bottlenecks(
        cr_codes, app.codename, app.get_model(),
        augmented=False, multiplier=1, verbose=0)
    split_bottles.append(bottles)
    split_labels.append(labels)
    
    labels = []
    cr_codes = split.filter_by(label='in').get_cr_codes()
    labels += split.filter_by(label='in').get_labels()
    in_bottles = kb.load_bottlenecks(
        cr_codes, app.codename, app.get_model(),
        augmented=True, multiplier=AUG_MULTIPLIER, verbose=0)
    
    cr_codes = split.filter_by(label=['oap', 'obs']).get_cr_codes()
    labels += split.filter_by(label=['oap', 'obs']).get_labels()* OUT_MULTIPLIER
    out_bottles = kb.load_bottlenecks(
        cr_codes, app.codename, app.get_model(),
        augmented=True, multiplier=AUG_MULTIPLIER * OUT_MULTIPLIER,
        verbose=0)
    
    bottles = np.concatenate((in_bottles, out_bottles))
    
    split_aug_bottles.append(bottles)
    split_aug_labels.append(labels)

502
482
514
510
514
Loading split 1 of 5...
Loading split 2 of 5...
Loading split 3 of 5...
Loading split 4 of 5...
Loading split 5 of 5...


## Form K-Fold Data

In [10]:
SPLITS = 5

labelize = LabelEncoder().fit_transform
onehot = OneHotEncoder(sparse=False).fit_transform
encode = lambda l: onehot(labelize(l).reshape(-1, 1))

i = 0
validation_labels = encode(split_labels[i])
validation_bottles = split_bottles[i]
train_labels = []
train_bottles = []
for j in list(range(0, i)) + list(range(i + 1, SPLITS)):
    train_labels += split_aug_labels[j]
    train_bottles.append(split_aug_bottles[j])
train_labels = encode(train_labels)
train_bottles = np.concatenate(train_bottles)

In [11]:
def compile_model(model, lr=1.0e-4):
    sgd = optimizers.SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=sgd,
        #optimizer='rmsprop',
        metrics=['accuracy'])
    
def load_top_model(app, compiled=True, lr=1.0e-4):
    model = Sequential()
    model.add(Flatten(input_shape=app.get_model().output_shape[1:]))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    if compiled:
        compile_model(model, lr)
    
    return model

In [16]:
app = ku.applications['mobilenet']
lr_factor = 3
lr = 0.1 ** 3
epochs = 10
top_model = load_top_model(app, lr=lr)

In [17]:
res = top_model.fit(train_bottles, train_labels,
                    validation_data=(validation_bottles, validation_labels),
                    shuffle=True, epochs=1)

Train on 4840 samples, validate on 502 samples
Epoch 1/1


In [18]:
max(res.history['val_acc'])

0.7749003988813119

## Save Learning Curve

In [73]:
HIST_DIR = 'history'
name = '{}_LR{}_E{:03d}'.format(app.codename, lr_factor, epochs)
j = 0
os.makedirs(HIST_DIR, exist_ok=True)
hist_path = os.path.join(HIST_DIR,'{}_S{:02d}.json'.format(name, j))
with open(hist_path, 'w') as f:
    json.dump(res.history, f)

# Modularized!

In [36]:
def compile_model(model, lr=1.0e-4):
    sgd = optimizers.SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=sgd,
        #optimizer='rmsprop',
        metrics=['accuracy'])
    
def load_top_model(app, compiled=True, lr=1.0e-4):
    model = Sequential()
    model.add(Flatten(input_shape=app.get_model().output_shape[1:]))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    if compiled:
        compile_model(model, lr)
    
    return model

def save_history(res: keras.callbacks.History, name):
    HIST_DIR = 'history'
    os.makedirs(HIST_DIR, exist_ok=True)
    hist_path = os.path.join(HIST_DIR,'{}.json'.format(name))
    with open(hist_path, 'w') as f:
        json.dump(res.history, f)

In [39]:
def train_top_models(train_collection: cri.CrCollection, app):
    '''
    Train top models using K-fold & learning rate search
    '''
    SPLITS = 5
    OUT_MULTIPLIER = 6
    AUG_MULTIPLIER = 1
    LR_RANGE = [3, 4, 5, 6]
    EPOCHS = [10, 10, 100, 100]
    EPOCHS = [1, 1, 1, 1]
    
    splits = train_collection.split_by(['dataset_index', 'pid'], [0.2] * 5)

    split_labels = []
    split_aug_labels = []
    split_bottles = []
    split_aug_bottles = []

    print('loading bottlenecks... ', end='')
    for i, split in enumerate(splits):
        #print('Loading split {} of {}...'.format(i + 1, len(splits)))
        cr_codes = split.get_cr_codes()
        labels = split.get_labels()
        bottles = kb.load_bottlenecks(
            cr_codes, app.codename, app.get_model(),
            augmented=False, multiplier=1, verbose=0)
        split_bottles.append(bottles)
        split_labels.append(labels)

        labels = []
        cr_codes = split.filter_by(label='in').get_cr_codes()
        labels += split.filter_by(label='in').get_labels()
        in_bottles = kb.load_bottlenecks(
            cr_codes, app.codename, app.get_model(),
            augmented=True, multiplier=AUG_MULTIPLIER, verbose=0)

        cr_codes = split.filter_by(label=['oap', 'obs']).get_cr_codes()
        labels += split.filter_by(label=['oap', 'obs']).get_labels()* OUT_MULTIPLIER
        out_bottles = kb.load_bottlenecks(
            cr_codes, app.codename, app.get_model(),
            augmented=True, multiplier=AUG_MULTIPLIER * OUT_MULTIPLIER,
            verbose=0)

        bottles = np.concatenate((in_bottles, out_bottles))

        split_aug_bottles.append(bottles)
        split_aug_labels.append(labels)
    print('done')
        
    labelize = LabelEncoder().fit_transform
    onehot = OneHotEncoder(sparse=False).fit_transform
    encode = lambda l: onehot(labelize(l).reshape(-1, 1))
    
    results = {}
    
    print('training models by split + learning rate')
    with tqdm(total=SPLITS * len(LR_RANGE)) as bar:
        for i in range(SPLITS):
            validation_labels = encode(split_labels[i])
            validation_bottles = split_bottles[i]
            train_labels = []
            train_bottles = []
            for j in list(range(0, i)) + list(range(i + 1, SPLITS)):
                train_labels += split_aug_labels[j]
                train_bottles.append(split_aug_bottles[j])
            train_labels = encode(train_labels)
            train_bottles = np.concatenate(train_bottles)

            for lr_factor, epochs in zip(LR_RANGE, EPOCHS):
                lr = 0.1 ** lr_factor
                top_model = load_top_model(app, lr=lr)

                res = top_model.fit(train_bottles, train_labels,
                                    validation_data=(validation_bottles, validation_labels),
                                    verbose=1, shuffle=True, epochs=epochs)

                name = '{}_LR{}_E{:03d}_S{:02d}'.format(
                    app.codename, lr_factor, epochs, i)
                save_history(res, name)
                
                MODEL_DIR = 'model'
                os.makedirs(MODEL_DIR, exist_ok=True)
                top_model.save(os.path.join(MODEL_DIR, '{}.hdf5'.format(name)))
                
                results[name] = res

                bar.update(1)
                
    for name, res in results.items():
        print(name, max(res.history['val_acc']))
    
    return results

In [40]:
collection = cri.CrCollection.load().labeled().tri_label()
train = collection.filter_by(dataset_index=0)
test = collection.filter_by(dataset_index=1)

all_results = []

for key, app in ku.applications.items():
    print(key.center(80, '-'))
    results = train_top_models(train, app)
    all_results.append(results)
    
with open('all_results.json') as f:
    json.dump(all_results, f)

-----------------------------------mobilenet------------------------------------
loading bottlenecks... 

  0%|          | 0/20 [00:00<?, ?it/s]

done
training models by split + learning rate
Train on 4824 samples, validate on 498 samples
Epoch 1/1


  5%|▌         | 1/20 [00:28<08:53, 28.10s/it]

Train on 4824 samples, validate on 498 samples
Epoch 1/1


 10%|█         | 2/20 [00:48<07:43, 25.74s/it]

Train on 4824 samples, validate on 498 samples
Epoch 1/1


 15%|█▌        | 3/20 [01:08<06:50, 24.13s/it]

Train on 4824 samples, validate on 498 samples
Epoch 1/1


 20%|██        | 4/20 [01:29<06:12, 23.26s/it]

Train on 4805 samples, validate on 532 samples
Epoch 1/1


 25%|██▌       | 5/20 [01:54<05:55, 23.73s/it]

Train on 4805 samples, validate on 532 samples
Epoch 1/1


 30%|███       | 6/20 [02:15<05:19, 22.85s/it]

Train on 4805 samples, validate on 532 samples
Epoch 1/1


 35%|███▌      | 7/20 [02:36<04:51, 22.42s/it]

Train on 4805 samples, validate on 532 samples
Epoch 1/1


 40%|████      | 8/20 [02:57<04:23, 21.94s/it]

Train on 4910 samples, validate on 492 samples
Epoch 1/1


 45%|████▌     | 9/20 [03:20<04:03, 22.12s/it]

Train on 4910 samples, validate on 492 samples
Epoch 1/1


 50%|█████     | 10/20 [03:42<03:40, 22.06s/it]

Train on 4910 samples, validate on 492 samples
Epoch 1/1


 55%|█████▌    | 11/20 [04:03<03:16, 21.82s/it]

Train on 4910 samples, validate on 492 samples
Epoch 1/1


 60%|██████    | 12/20 [04:25<02:54, 21.80s/it]

Train on 4943 samples, validate on 484 samples
Epoch 1/1


 65%|██████▌   | 13/20 [04:48<02:35, 22.22s/it]

Train on 4943 samples, validate on 484 samples
Epoch 1/1


 70%|███████   | 14/20 [05:10<02:13, 22.25s/it]

Train on 4943 samples, validate on 484 samples
Epoch 1/1


 75%|███████▌  | 15/20 [05:32<01:50, 22.17s/it]

Train on 4943 samples, validate on 484 samples
Epoch 1/1


 80%|████████  | 16/20 [05:55<01:29, 22.27s/it]

Train on 4886 samples, validate on 516 samples
Epoch 1/1


 85%|████████▌ | 17/20 [06:19<01:08, 22.76s/it]

Train on 4886 samples, validate on 516 samples
Epoch 1/1


 90%|█████████ | 18/20 [06:42<00:45, 22.79s/it]

Train on 4886 samples, validate on 516 samples
Epoch 1/1


 95%|█████████▌| 19/20 [07:05<00:22, 22.84s/it]

Train on 4886 samples, validate on 516 samples
Epoch 1/1


100%|██████████| 20/20 [07:27<00:00, 22.76s/it]


MOB_LR3_E001_S00 0.8393574287613712
MOB_LR4_E001_S00 0.8012048187983563
MOB_LR5_E001_S00 0.7570281129285513
MOB_LR6_E001_S00 0.5321285133380966
MOB_LR3_E001_S01 0.8139097735397798
MOB_LR4_E001_S01 0.740601502863088
MOB_LR5_E001_S01 0.7387218040631229
MOB_LR6_E001_S01 0.4981203005278021
MOB_LR3_E001_S02 0.2804878048780488
MOB_LR4_E001_S02 0.7296747962633768
MOB_LR5_E001_S02 0.6280487809723955
MOB_LR6_E001_S02 0.3556910570317168
MOB_LR3_E001_S03 0.7665289256198347
MOB_LR4_E001_S03 0.7892561983471075
MOB_LR5_E001_S03 0.6425619834710744
MOB_LR6_E001_S03 0.4049586776859504
MOB_LR3_E001_S04 0.8255813953488372
MOB_LR4_E001_S04 0.751937984496124
MOB_LR5_E001_S04 0.6007751937984496
MOB_LR6_E001_S04 0.37790697674418605
----------------------------------mobilenetv2-----------------------------------
loading bottlenecks... loading mobilenetv2 model


KeyboardInterrupt: 