# Train second set of one vs. rest (OVR) classifiers.

We train another set of classifiers that are used for classifications.  
These classifiers are trained using similar images for each target class; similarities between classes are computed in *classifier_similarity.ipynb*.

## Set up

In [None]:
import os
import sys

import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [None]:
BASE_MODEL_PATH="trained_model"
%mkdir -p $BASE_MODEL_PATH

In [None]:
from models.modelutils import ModelCompiler

In [None]:
compiler = ModelCompiler(BASE_MODEL_PATH)

In [None]:
from models.processor import create_generators

TRAIN_DATAGEN, VALID_DATAGEN = create_generators()

In [None]:
from models.modelutils import dir2filedict, split_fdict
import random

Load category and file path information.

In [None]:
fdict = dir2filedict("data")

In [None]:
categories = sorted(fdict.keys())

Split the data into {train, validation, test} datasets.

In [None]:
# Hold out a test set first, then carve a validation set out of the remaining
# training files. The random_state values are fixed, presumably so that every
# notebook reproduces the same split (confirm in split_fdict).
trdict, testdict = split_fdict(fdict, test_size=0.2, random_state = 123)
trdict, valdict = split_fdict(trdict, test_size=0.2, random_state = 456)

In [None]:
valdict['clouds'][0:5]

Here is the expected output.  
The output may differ if you created the image URLs yourself or excluded some files for GMM, but the outputs in {*train.ipynb*, *classifier_similarity.ipynb*, *train_multiclass_classifier.ipynb*, *train_second.ipynb*} must all be the same. 

['data/clouds/0678.jpeg',  
 'data/clouds/0701.jpeg',  
 'data/clouds/0431.jpeg',  
 'data/clouds/0033.jpeg',  
 'data/clouds/0290.jpeg']

## Train second level classifiers

Define a class for training second level classifiers.

In [None]:
from models.one_vs_all import OneVsAllModelTrainer
from models.modelutils import split_files

In [None]:
trainer = OneVsAllModelTrainer(TRAIN_DATAGEN, VALID_DATAGEN)

In [None]:
from models.one_vs_all import FilesPair, TrValFiles

In [None]:
class SecondLevelClassifierTrainer:
    """Train second level one-vs-rest classifiers against a subset of classes.

    For a target class, the positive examples are the target's own files and
    the negative examples are the files of the other classes in a given key
    set. The actual training is delegated to ``trainer``; models are created
    by ``compiler``.
    """

    def __init__(self, base_model_name, basedir, trainer, compiler):
        """Store the collaborators and the naming scheme for saved models.

        Args:
            base_model_name: Prefix of every model file name.
            basedir: Directory the model paths are built under.
            trainer: Object providing ``set_model``, ``set_savepath``,
                ``set_dataset``, ``train_model``, ``remove_checkpoint`` and a
                ``retrainings`` attribute.
            compiler: Object providing ``generate_compiled_model(path)``.
        """
        self.base_model_name = base_model_name
        self.basedir = basedir

        self.compiler = compiler
        self.trainer = trainer

    def setup_filedict(self, train_files_dict, valid_files_dict):
        """Register the ``{class key: [file path, ...]}`` dicts used for training."""
        self.train_files_dict = train_files_dict
        self.valid_files_dict = valid_files_dict
        # Same object as valid_files_dict (no copy is made).
        self.valid_files_dict_org = self.valid_files_dict

    def _model_path(self, target_key):
        """Return ``<basedir>/<base_model_name>_<target_key>``."""
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    @staticmethod
    def _false_keyset(target_key, highcat_keyset):
        """Return the keys in ``highcat_keyset`` other than ``target_key``.

        The target is removed as a whole key. ``set(target_key)`` would split
        the string into characters and leave the target among the negatives.
        """
        return set(highcat_keyset) - {target_key}

    def _split_by_set(self, target_key, false_keyset, files_dict):
        """Build a ``FilesPair`` of target files and files of ``false_keyset``."""
        trues = files_dict[target_key]
        falses = [path for key in false_keyset for path in files_dict[key]]
        return FilesPair(trues, falses)

    def _split_files(self, targetkey, files_dict):
        """Build a ``FilesPair`` from the result of ``split_files``."""
        return FilesPair(*split_files(targetkey, files_dict))

    def train_second_level(self, target_key, highcat_keyset, eachepochs=10, retrainings=1, removecheckpoint=True):
        """Train the second level classifier of ``target_key``.

        Args:
            target_key: Class key whose files are the positive examples.
            highcat_keyset: Class keys to discriminate against; the target
                itself may be included and is excluded from the negatives.
            eachepochs: Passed to ``trainer.train_model``.
            retrainings: Assigned to ``trainer.retrainings`` before training.
            removecheckpoint: If true, call ``trainer.remove_checkpoint()``
                after training.
        """
        self.trainer.retrainings = retrainings
        falseset = self._false_keyset(target_key, highcat_keyset)
        trs = self._split_by_set(target_key, falseset, self.train_files_dict)
        vals = self._split_by_set(target_key, falseset, self.valid_files_dict)
        trvals = TrValFiles(trs, vals)
        # The "sec_" prefix gives second level models their own file names.
        self._train_one_core("sec_"+target_key, trvals, eachepochs, removecheckpoint)

    def _train_one_setup(self, model_key, trvals):
        """Create the model for ``model_key`` and hand it and the data to the trainer."""
        model_save_path = self._model_path(model_key)

        model = self.compiler.generate_compiled_model(model_save_path)
        self.trainer.set_model(model)
        self.trainer.set_savepath(model_save_path)
        self.trainer.set_dataset(trvals)

    def _train_one_core(self, model_key, trvals, eachepochs, removecheckpoint):
        """Set up and train one model, optionally removing its checkpoint."""
        self._train_one_setup(model_key, trvals)

        self.trainer.train_model(eachepochs=eachepochs)
        if removecheckpoint:
            self.trainer.remove_checkpoint()

    def remove_checkpoint(self, model_key):
        """Remove the checkpoint of ``model_key``.

        Utility method for cleaning up after an interrupted training run.
        """
        self.trainer.set_savepath(self._model_path(model_key))
        self.trainer.remove_checkpoint()

In [None]:
sec_trainer = SecondLevelClassifierTrainer("model", BASE_MODEL_PATH, trainer, compiler)

In [None]:
sec_trainer.setup_filedict(trdict, valdict)

Load $ClassSim$ results to gather similar classes for each target class. 

In [None]:
# Class-similarity table. train_seconds reads one column per target class and
# uses the index labels of the values above the threshold as class keys.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# produced by your own run of classifier_similarity.ipynb.
classsim = pd.read_pickle("results/valid_sim_df.dat")

### Execute training

In [None]:
# Minimum similarity for a class to count as "similar" to the target class;
# train_seconds keeps every class whose value is >= this threshold.
SIM_THRESHOLD = 0.1

In [None]:
def train_seconds(keys, eachepochs=5):
    """Train a second level classifier for each class key in ``keys``.

    For every key, the classes whose entry in that key's ``classsim`` column
    is at least ``SIM_THRESHOLD`` form the similar-class set. A set of size
    one is skipped with a message; otherwise ``sec_trainer`` is trained on
    it. A ``ValueError`` raised during training is reported and the loop
    continues with the next key.
    """
    for category in keys:
        scores = classsim[category]
        neighbours = set(scores[scores >= SIM_THRESHOLD].index)

        # A single-member set is treated as "no similar category" and skipped.
        if len(neighbours) == 1:
            print("no similar category. only first classifier is enough. skip second training.")
            continue

        try:
            sec_trainer.train_second_level(category, neighbours, eachepochs=eachepochs)
        except ValueError as err:
            print("ValueError, skip {0}: {1}".format(category, err))

In [None]:
# categories[0:] is the full category list (presumably the slice start can be
# raised to resume after an interrupted run — confirm with the author).
train_seconds(categories[0:], eachepochs=2)