# Train second set of one vs. rest (OVR) classifiers.

We train another set of classifiers that are used for classifications.  
These classifiers are trained using similar images for each target class; similarities between classes are computed in *classifier_similarity.ipynb*.

## Set up

In [1]:
import os
import sys

import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [2]:
BASE_MODEL_PATH="trained_model"
%mkdir -p $BASE_MODEL_PATH

In [3]:
from models.modelutils import ModelCompiler

Using TensorFlow backend.


In [4]:
compiler = ModelCompiler(BASE_MODEL_PATH)

In [5]:
from models.processor import create_generators

TRAIN_DATAGEN, VALID_DATAGEN = create_generators()

In [6]:
from models.modelutils import dir2filedict_sorted
import random

Load category and file path information.

In [7]:
trdict = dir2filedict_sorted("data_fgvc/train")
valdict = dir2filedict_sorted("data_fgvc/valid")
categories = [str(i) for i in range(0, 100)]

In [8]:
valdict['0'][0:5]

['data_fgvc/valid/0/0062781.jpg',
 'data_fgvc/valid/0/0113201.jpg',
 'data_fgvc/valid/0/0450014.jpg',
 'data_fgvc/valid/0/0602177.jpg',
 'data_fgvc/valid/0/0716386.jpg']

Here are the expected outputs.  
The outputs of this cell in all of {*train.ipynb*, *classifier_similarity.ipynb*, *train_multiclass_classifier.ipynb*, *train_second.ipynb*} must be the same. 

['data_fgvc/valid/0/0062781.jpg',  
 'data_fgvc/valid/0/0113201.jpg',  
 'data_fgvc/valid/0/0450014.jpg',  
 'data_fgvc/valid/0/0602177.jpg',  
 'data_fgvc/valid/0/0716386.jpg']

## Train second level classifiers

Define a class for training second level classifiers.

In [9]:
from models.one_vs_all import OneVsAllModelTrainer
from models.modelutils import split_files

In [10]:
trainer = OneVsAllModelTrainer(TRAIN_DATAGEN, VALID_DATAGEN)

In [11]:
from models.one_vs_all import FilesPair, TrValFiles

In [12]:
class SecondLevelClassifierTrainer:
    """Train "second level" one-vs-rest classifiers against a restricted set of classes.

    For a target class, the positive examples are the target's own files and the
    negative examples are the files of the classes in a caller-supplied key set
    (minus the target itself), instead of every other class.

    Models are saved under ``<basedir>/<base_model_name>_<model_key>``; second level
    models use the model key ``"sec_" + target_key``.
    """

    def __init__(self, base_model_name, basedir, trainer, compiler):
        """Store the collaborators used for training.

        Args:
            base_model_name: prefix of every model file name.
            basedir: directory in which model files are stored.
            trainer: object providing ``set_model``, ``set_savepath``, ``set_dataset``,
                ``train_model`` and ``remove_checkpoint`` (a ``OneVsAllModelTrainer``
                in this notebook).
            compiler: object providing ``generate_compiled_model(path)``.
        """
        self.base_model_name = base_model_name
        self.basedir = basedir

        self.compiler = compiler
        self.trainer = trainer

    def setup_filedict(self, train_files_dict, valid_files_dict):
        """Register the ``{class key: [file paths]}`` dicts for training and validation."""
        self.train_files_dict = train_files_dict
        self.valid_files_dict = valid_files_dict
        # Second reference to the same validation dict; not read anywhere in this class.
        self.valid_files_dict_org = self.valid_files_dict

    def _model_path(self, target_key):
        """Return the save-path prefix ``<basedir>/<base_model_name>_<target_key>``."""
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    @staticmethod
    def _false_keyset(target_key, highcat_keyset):
        """Return the keys in ``highcat_keyset`` other than ``target_key``.

        ``{target_key}`` is used on purpose: ``set(target_key)`` would split a string
        key into its characters (``set("10") == {"1", "0"}``), leaving the target in
        the negatives and dropping unrelated classes.
        """
        return set(highcat_keyset) - {target_key}

    def _split_by_set(self, target_key, false_keyset, files_dict):
        """Build a ``FilesPair`` of (target files, files of every key in ``false_keyset``)."""
        trues = files_dict[target_key]
        # Iterate in sorted key order so the negative file list does not depend on
        # set iteration order (which varies between interpreter runs for strings).
        falses = [path for key in sorted(false_keyset) for path in files_dict[key]]
        return FilesPair(trues, falses)

    def _split_files(self, targetkey, files_dict):
        """Build a ``FilesPair`` from ``split_files(targetkey, files_dict)``."""
        return FilesPair(*split_files(targetkey, files_dict))

    def train_second_level(self, target_key, highcat_keyset, eachepochs=10, retrainings=1, removecheckpoint=True):
        """Train the second level classifier of ``target_key``.

        Args:
            target_key: key of the positive class.
            highcat_keyset: keys of the classes considered similar to the target;
                may or may not contain ``target_key`` itself.
            eachepochs: forwarded to ``trainer.train_model``.
            retrainings: assigned to ``trainer.retrainings`` before training.
            removecheckpoint: when true, ``trainer.remove_checkpoint()`` is called
                after training.
        """
        self.trainer.retrainings = retrainings
        falseset = self._false_keyset(target_key, highcat_keyset)
        trs = self._split_by_set(target_key, falseset, self.train_files_dict)
        vals = self._split_by_set(target_key, falseset, self.valid_files_dict)
        trvals = TrValFiles(trs, vals)
        self._train_one_core("sec_" + target_key, trvals, eachepochs, removecheckpoint)

    def _train_one_setup(self, model_key, trvals):
        """Create a compiled model for ``model_key`` and hand model, path and data to the trainer."""
        model_save_path = self._model_path(model_key)

        model = self.compiler.generate_compiled_model(model_save_path)
        self.trainer.set_model(model)
        self.trainer.set_savepath(model_save_path)
        self.trainer.set_dataset(trvals)

    def _train_one_core(self, model_key, trvals, eachepochs, removecheckpoint):
        """Set up the trainer, run training, and optionally remove checkpoints."""
        self._train_one_setup(model_key, trvals)

        # NOTE(review): (100, 10) is presumably (training steps, validation steps)
        # per epoch -- confirm against OneVsAllModelTrainer.train_model.
        self.trainer.train_model(eachepochs=eachepochs, hard_coded_steps_per_epoch=(100, 10))
        if removecheckpoint:
            self.trainer.remove_checkpoint()

    def remove_checkpoint(self, model_key):
        """Utility for cleaning up after an interrupted run: remove checkpoints of ``model_key``."""
        self.trainer.set_savepath(self._model_path(model_key))
        self.trainer.remove_checkpoint()

In [13]:
sec_trainer = SecondLevelClassifierTrainer("modelfgvc", BASE_MODEL_PATH, trainer, compiler)

In [14]:
sec_trainer.setup_filedict(trdict, valdict)

Load $ClassSim$ results to gather similar classes for each target class. 

In [15]:
classsim = pd.read_pickle("results/valid_sim_df_fgvc.dat")

### Execute training

In [16]:
# SIM_THRESHOLD = 0.1
# This data set is much more fine-grained, so the threshold should be higher.
# We choose 0.4, which gives about 18 similar classes per target class on average.
SIM_THRESHOLD = 0.4

In [17]:
import glob
def list_checkpoints_except_last(model_abs_path_prefix):
    """Return the ``<prefix>-*.h5`` files whose path does not start with ``<prefix>-99``.

    NOTE(review): the ``-99`` tag presumably marks the weights saved at the end of
    training -- confirm against the trainer that writes these files.
    """
    last_prefix = "{}-99".format(model_abs_path_prefix)
    candidates = glob.glob("{}-*.h5".format(model_abs_path_prefix))
    kept = []
    for candidate in candidates:
        if candidate.startswith(last_prefix):
            continue
        kept.append(candidate)
    return kept

def remove_except_last(model_abs_path_prefix):
    """Delete every file returned by ``list_checkpoints_except_last`` for this prefix."""
    for stale_path in list_checkpoints_except_last(model_abs_path_prefix):
        os.remove(stale_path)

In [18]:
# removecheckpoint is one of "keeplast", "keepbest" or None.

def train_seconds(keys, eachepochs=5, removecheckpoint="keeplast"):
    """Train the second level classifier of every class key in ``keys``.

    For each key, the classes whose similarity in ``classsim`` is at least
    ``SIM_THRESHOLD`` form the candidate set. If that set contains no class other
    than the target itself, training is skipped.

    Args:
        keys: iterable of class keys (strings such as ``"10"``). Pass a list, not a
            bare string: iterating a string yields its characters.
        eachepochs: forwarded to ``sec_trainer.train_second_level``.
        removecheckpoint: ``"keepbest"`` asks the trainer to remove checkpoints after
            training (forwarded as ``removecheckpoint=True``); ``"keeplast"`` removes
            every checkpoint except the ``-99`` one via ``remove_except_last``; any
            other value keeps all checkpoints.

    A ``ValueError`` raised while training one key is reported and that key is skipped.
    """
    keepbest = removecheckpoint == "keepbest"
    keeplast = removecheckpoint == "keeplast"
    print(keepbest, keeplast)
    for targetkey in keys:
        similarity = classsim[targetkey]
        similarkeyset = set(similarity[similarity >= SIM_THRESHOLD].index)
        # Skip when nothing but the target itself is similar; comparing against
        # {targetkey} is robust even if the target is absent from the set.
        if not (similarkeyset - {targetkey}):
            print("no similar category. only first classifier is enough. skip second training.")
            continue
        try:
            sec_trainer.train_second_level(targetkey, similarkeyset, eachepochs=eachepochs, removecheckpoint=keepbest)
            if keeplast:
                remove_except_last(sec_trainer._model_path("sec_" + targetkey))
        except ValueError as e:
            print("ValueError, skip {0}: {1}".format(targetkey, e))

In [20]:
train_seconds(categories[0:], eachepochs=2, removecheckpoint="keepbest")

True False
Epoch 1/2
Epoch 2/2
Epoch 1/2


KeyboardInterrupt: 

# Check whether the model is properly learned

In [25]:
target_cat = "0"

In [26]:
similarkeyset = set(classsim[target_cat][classsim[target_cat] >= SIM_THRESHOLD].index)

In [27]:
len(similarkeyset)

11

In [30]:
# Remove the target class itself. Use {target_cat}, not set(target_cat): the latter
# splits a multi-character key such as "10" into {"1", "0"}.
falseset = sorted(similarkeyset - {target_cat})
falseset[0:5]

['10', '13', '15', '22', '45']

In [33]:
from models.modelutils import load_best_model_if_exist
import os
from models.processor import DataSet

In [42]:
model_0 = load_best_model_if_exist("trained_model/modelfgvc_sec_0")

In [41]:
model_0

In [31]:
test_files = valdict['0']+valdict[falseset[0]]

In [34]:
ds = DataSet()

In [35]:
datas = ds.files_to_dataset(test_files)

In [53]:
res = model_0.predict(datas)

In [54]:
res[:, 1]

array([ 0.12177239,  0.14247935,  0.13586961,  0.10310103,  0.11975336,
        0.29116184,  0.25873953,  0.07083958,  0.05815217,  0.05779626,
        0.06138925,  0.05145602,  0.09198892,  0.19282584], dtype=float32)

In [55]:
test_files

['data_fgvc/valid/0/0062781.jpg',
 'data_fgvc/valid/0/0113201.jpg',
 'data_fgvc/valid/0/0450014.jpg',
 'data_fgvc/valid/0/0602177.jpg',
 'data_fgvc/valid/0/0716386.jpg',
 'data_fgvc/valid/0/0869722.jpg',
 'data_fgvc/valid/0/1514481.jpg',
 'data_fgvc/valid/52/0062226.jpg',
 'data_fgvc/valid/52/0136197.jpg',
 'data_fgvc/valid/52/0171956.jpg',
 'data_fgvc/valid/52/0523171.jpg',
 'data_fgvc/valid/52/0523172.jpg',
 'data_fgvc/valid/52/0894317.jpg',
 'data_fgvc/valid/52/1627560.jpg']

In [40]:
# Slice to get a one-element list of keys; list(categories[0]) would split a
# multi-character key into its characters.
train_seconds(categories[0:1], eachepochs=3, removecheckpoint=False)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [21]:
from models.modelutils import load_model_from

In [22]:
# epoch 2
model_0 = load_model_from("trained_model/modelfgvc_sec_0.json", "trained_model/modelfgvc_sec_0-02-0.872.h5")

In [36]:
res = model_0.predict(datas)

In [37]:
res[:, 1]

array([ 0.39031339,  0.00246694,  0.24623962,  0.99023902,  0.00664667,
        0.08404001,  0.87686318,  0.17210899,  0.02940854,  0.0094766 ,
        0.98935497,  0.92093688,  0.81629044,  0.82622737], dtype=float32)

In [38]:
test_files

['data_fgvc/valid/0/0062781.jpg',
 'data_fgvc/valid/0/0113201.jpg',
 'data_fgvc/valid/0/0450014.jpg',
 'data_fgvc/valid/0/0602177.jpg',
 'data_fgvc/valid/0/0716386.jpg',
 'data_fgvc/valid/0/0869722.jpg',
 'data_fgvc/valid/0/1514481.jpg',
 'data_fgvc/valid/10/0074633.jpg',
 'data_fgvc/valid/10/0139685.jpg',
 'data_fgvc/valid/10/0688093.jpg',
 'data_fgvc/valid/10/0713645.jpg',
 'data_fgvc/valid/10/0809727.jpg',
 'data_fgvc/valid/10/0869644.jpg',
 'data_fgvc/valid/10/1068733.jpg']

In [46]:
def check(weight_path):
    """Load the class-0 second level model with the given weights and score ``datas``.

    Returns column 1 of the prediction array.
    NOTE(review): column 1 is presumably the "is target class" score -- confirm
    against the model's output layout.
    """
    predictions = load_model_from("trained_model/modelfgvc_sec_0.json", weight_path).predict(datas)
    return predictions[:, 1]

In [49]:
# epoch 2
check("trained_model/modelfgvc_sec_0-02-0.872.h5")

array([ 0.39031339,  0.00246694,  0.24623962,  0.99023902,  0.00664667,
        0.08404001,  0.87686318,  0.17210899,  0.02940854,  0.0094766 ,
        0.98935497,  0.92093688,  0.81629044,  0.82622737], dtype=float32)

In [47]:
# epoch 3
check("trained_model/modelfgvc_sec_0-01-0.845.h5")

array([  9.60957766e-01,   3.40920151e-03,   3.08307279e-02,
         9.85011578e-01,   3.60858347e-03,   4.93161939e-02,
         9.92795169e-01,   1.21976174e-01,   3.71880196e-02,
         1.83637065e-04,   6.54763103e-01,   1.94192082e-01,
         7.86803663e-01,   7.11530149e-01], dtype=float32)

In [43]:
# epoch 4
check("trained_model/modelfgvc_sec_0-02-0.804.h5")

array([  9.94575262e-01,   3.64112412e-03,   7.46889859e-02,
         9.87466872e-01,   5.67003945e-03,   1.41340062e-01,
         9.98587906e-01,   2.61859149e-01,   7.36977607e-02,
         4.96607448e-04,   8.58851254e-01,   1.38949797e-01,
         4.62956578e-01,   9.22738492e-01], dtype=float32)

In [48]:
# epoch 5
check("trained_model/modelfgvc_sec_0-03-0.818.h5")

array([ 0.99787402,  0.00530914,  0.04714767,  0.99593902,  0.00302442,
        0.13430484,  0.99962211,  0.28590381,  0.08145444,  0.00138246,
        0.87990385,  0.18785156,  0.76647055,  0.90059167], dtype=float32)

In [68]:
check("trained_model/modelfgvc_sec_0-99-0.831.h5")

array([ 0.04610051,  0.09499395,  0.6616056 ,  0.95639879,  0.09035208,
        0.54387975,  0.86188912,  0.64478827,  0.9523049 ,  0.00184466,
        0.70983821,  0.93802178,  0.87476081,  0.49245015], dtype=float32)

### Trial and error

In [23]:
vals = FilesPair(*split_files(target_cat, valdict))

In [26]:
vals.trues[0:5]

['data_fgvc/valid/0/0062781.jpg',
 'data_fgvc/valid/0/0113201.jpg',
 'data_fgvc/valid/0/0450014.jpg',
 'data_fgvc/valid/0/0602177.jpg',
 'data_fgvc/valid/0/0716386.jpg']

In [27]:
vals.falses[0:5]

['data_fgvc/valid/63/0063918.jpg',
 'data_fgvc/valid/63/0522914.jpg',
 'data_fgvc/valid/63/0917793.jpg',
 'data_fgvc/valid/63/0959054.jpg',
 'data_fgvc/valid/63/1950704.jpg']

In [29]:
all_files = [path for files in [vals.trues, vals.falses] for path in files]

In [30]:
len(all_files)

700