# Performance test of $two \ level \ model$.

$two \ level \ model$ is composed of a first set of OVR (one-vs-rest) classifiers $f_{c, other}$ and a second set of OVR classifiers $f^{(2)}_{c, other}$.  
We test the model on a classification task and compare its results with those of a baseline model that uses only $f_{c, other}$.

## Set up

In [1]:
import os
import sys

import numpy as np

import pandas as pd
import glob

In [2]:
from models.modelutils import dir2filedict, split_fdict
import random

Using TensorFlow backend.


Load category and file path information.

In [3]:
# Build the category -> files mapping from the "data" directory and keep the
# category names in sorted order.
# (presumably each sub-directory of "data" is one category -- confirm in
# models.modelutils.dir2filedict)
fdict = dir2filedict("data")
categories = sorted(fdict.keys())

# First split holds out test_size=0.2 as the test set; the second split holds
# out test_size=0.2 of the remaining training part as the validation set.
# Fixed random_state values keep both splits reproducible.
trdict, testdict = split_fdict(fdict, test_size=0.2, random_state = 123)
trdict, valdict = split_fdict(trdict, test_size=0.2, random_state = 456)

Create (category, path) information of test datasets.

In [4]:
# Flatten the test split into (category, path) pairs, category by category.
testtup = [
    (cat, path)
    for cat in categories
    for path in testdict[cat]
]

In [5]:
len(testtup)

2366

Check the list.

In [6]:
testtup[0:5]

[('bay', 'data/bay/0585.jpeg'),
 ('bay', 'data/bay/0587.jpeg'),
 ('bay', 'data/bay/0718.jpeg'),
 ('bay', 'data/bay/0183.jpeg'),
 ('bay', 'data/bay/0808.jpeg')]

Convert the list into a pandas DataFrame for easier handling.

In [7]:
# One row per test image: its true category and its file path.
testdf = pd.DataFrame(testtup, columns=["category", "files"])

In [8]:
# Shuffle every row with a fixed seed, then renumber the rows from 0.
testdf_shuffled = (
    testdf
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)

In [9]:
testdf_shuffled.head()

Unnamed: 0,category,files
0,sunrise,data/sunrise/0093.jpeg
1,mountain,data/mountain/0179.jpeg
2,bay,data/bay/0589.jpeg
3,sky,data/sky/0477.png
4,beach,data/beach/0528.jpeg


Define classes for $two \ level \ model$ .

In [10]:
from models.modelutils import load_best_model_if_exist
import os

In [11]:
class ModelBinder:
    """Lazily loads and caches a family of per-key classifiers.

    Every classifier lives at ``<basedir>/<base_model_name>_<key>`` and is
    loaded on first use through ``load_best_model_if_exist``.
    """

    def __init__(self, base_model_name, basedir, cats):
        """Create a binder with an empty model cache.

        Args:
            base_model_name: Prefix shared by all model paths.
            basedir: Directory containing the models.
            cats: Category names; their order is the priority order used
                when a row has several scores at or above the threshold.
        """
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}  # key -> loaded model, filled by get_or_load_model
        self.verbose = True  # when True, notify() prints its messages
        self._categories = cats
        self._OTHER_LABEL = "other"  # label for rows where no category passes

    @classmethod
    def dup_from(cls, binder):
        """Return a new binder with ``binder``'s settings and an empty cache.

        Built through ``cls`` so that calling this on a subclass yields an
        instance of that subclass.
        """
        return cls(binder.base_model_name, binder.basedir, binder._categories)

    def model_path(self, target_key):
        """Return the path of the model stored for ``target_key``."""
        filename = "{}_{}".format(self.base_model_name, target_key)
        return os.path.join(self.basedir, filename)

    def get_or_load_model(self, target_key):
        """Return the model for ``target_key``, loading it on first access.

        The loaded value is cached as-is, whatever
        ``load_best_model_if_exist`` returns.
        """
        if target_key not in self._models:
            self.notify("load {}".format(target_key))
            self._models[target_key] = load_best_model_if_exist(
                self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        """Print ``msg`` when ``self.verbose`` is true."""
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        """Load (or reuse from the cache) the model of every key in ``keys``."""
        for key in keys:
            self.get_or_load_model(key)

    def predict_arrs(self, arrs):
        """Score ``arrs`` with every model currently in the cache.

        Returns:
            A DataFrame with one column per cached key, holding column 1 of
            that model's ``predict`` output (presumably the positive-class
            score -- confirm against the training notebooks).
        """
        scores = {key: model.predict(arrs)[:, 1]
                  for key, model in self._models.items()}
        return pd.DataFrame(scores)

    def _row2class(self, rowdf, threshold):
        """Return the first category whose score in ``rowdf`` is >= threshold.

        Categories are tried in ``self._categories`` order, so the result is
        the first match, not necessarily the highest score. Falls back to
        ``self._OTHER_LABEL`` when nothing passes.
        """
        for cat in self._categories:
            if rowdf[cat] >= threshold:
                return cat
        return self._OTHER_LABEL

    def df2classes(self, df, threshold = 0.5):
        """Convert a score DataFrame into a list with one label per row."""
        return [self._row2class(row, threshold) for _, row in df.iterrows()]

In [12]:
class TwoLevelModel:
    """Two-stage one-vs-rest classifier.

    The first-level binder scores every sample for every category. Each
    sample whose first-level score for a category exceeds
    ``_FIRST_THRESHOLD`` is re-scored by that category's second-level model
    (key ``"sec_<category>"``). The second-level scores are then turned into
    labels by the second-level binder's ``df2classes``.
    """

    def __init__(self, categories, h2binder, h1binder, otherlabel="OTHER"):
        """
        Args:
            categories: Category names.
            h2binder: Binder holding the first-level models, keyed by category.
            h1binder: Binder holding the second-level models, keyed by
                ``"sec_" + category``.
            otherlabel: Stored in ``_OTHERLABEL``.
        """
        self._categories = categories
        self._h2binder = h2binder
        self._h1binder = h1binder
        # NOTE(review): _OTHERCLASS and _OTHERLABEL are stored but never read
        # in this class; predict_arrs() takes its fallback label from
        # h1binder.df2classes(). Confirm whether `otherlabel` should be used.
        self._OTHERCLASS = -1
        self._OTHERLABEL = otherlabel
        self._FIRST_THRESHOLD = 0.5
        self._SECOND_THRESHOLD = 0.5

    def load_all(self):
        """Load every first-level and second-level model."""
        catkeys = self._categories
        self._h2binder.load_all_models(catkeys)
        self._h1binder.load_all_models(["sec_" + cat for cat in catkeys])

    def predict_arrs(self, arrs):
        """Return one predicted label per sample in ``arrs``.

        The second-level score table is also kept in ``self._df``.
        """
        df = self._predict_arrs(arrs)
        self._df = df
        return self._h1binder.df2classes(df, self._SECOND_THRESHOLD)

    def _predict_arrs(self, arrs):
        """Return the second-level score table for ``arrs``.

        The table has the same shape and columns as the first-level scores.
        Cells of samples that did not pass the first level stay at 0.
        """
        firstdf = pd.DataFrame(self._h2binder.predict_arrs(arrs))
        resultdf = pd.DataFrame(np.zeros(firstdf.shape), columns=firstdf.columns)

        for targetkey in self._categories:
            df = self._predict_second(targetkey, arrs, firstdf)
            if df is not None:
                resultdf.loc[df['orgindex'], targetkey] = df[targetkey].values

        return resultdf

    def _predict_second(self, targetcat, arrs, firstdf):
        """Re-score the samples that passed the first level for ``targetcat``.

        Returns:
            ``None`` when no sample passed. Otherwise a DataFrame with the
            second-level scores in column ``targetcat`` and the original row
            positions in column ``'orgindex'``.
        """
        filtered = firstdf[firstdf[targetcat] > self._FIRST_THRESHOLD]
        if filtered.empty:
            return None

        model = self._h1binder.get_or_load_model('sec_' + targetcat)

        # No second-level classifier: the first-level decision stands, so
        # every selected sample gets the maximum score.
        if model is None:
            scores = np.ones(len(filtered.index))
        else:
            # Column 1 of the model output is used as the score.
            scores = model.predict(arrs[filtered.index, :])[:, 1]

        return pd.DataFrame({targetcat: scores, 'orgindex': filtered.index})

Load classifiers.

In [13]:
binder = ModelBinder( "model", "trained_model", categories)

In [14]:
binder.load_all_models(categories)

load bay
load beach
load birds
load boeing
load buildings
load city
load clouds
load f-16
load face
load helicopter
load mountain
load ocean
load ships
load sky
load sunrise
load sunset


## Evaluate baseline model

Classifiers are trained in *train.ipynb*.

In [15]:
from models.processor import DataSet

In [16]:
ds = DataSet()

In [17]:
# Run the baseline (first-level only) classifiers over the shuffled test set
# in chunks of 500 rows.
# True labels are collected per chunk and concatenated once at the end:
# Series.append was removed in pandas 2.0 and re-copied the whole Series on
# every iteration.
label_chunks = []
all_results = []

for chunk in ds.chunked(testdf_shuffled, 500):
    x = ds.files_to_dataset(chunk['files'])
    label = chunk['category']
    resdf = binder.predict_arrs(x)
    resclasses = binder.df2classes(resdf)

    label_chunks.append(label)
    all_results.extend(resclasses)

# ignore_index renumbers the labels 0..n-1 so they line up with all_results.
if label_chunks:
    all_labels = pd.concat(label_chunks, ignore_index=True)
else:
    all_labels = pd.Series(dtype=object)

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


In [18]:
# Accuracy = number of predictions equal to the true label / number of images.
n_baseline_images = len(all_labels)
n_baseline_correct = sum(all_labels == all_results)
print("Number of test images: {}\nAccuracy: {}".format(
    n_baseline_images, float(n_baseline_correct) / n_baseline_images))

Number of test images: 2366
Accuracy: 0.5524091293322062


Save the results as pickle.

In [19]:
results_df = pd.DataFrame({"labels": all_labels, "prediction": all_results})

In [20]:
# to_pickle does not create missing directories, so make sure "results" exists.
os.makedirs("results", exist_ok=True)
results_df.to_pickle("results/baseline_model_prediction.dat")

## Evaluate $two \ level \ model$

Classifiers are trained in *train_second.ipynb*.

In [21]:
# Second-level binder: same model prefix, directory and categories as the
# first-level `binder`, but with its own empty model cache.
sec_binder = ModelBinder.dup_from(binder)

In [22]:
two_level_model = TwoLevelModel(categories, binder, sec_binder)

In [23]:
two_level_model.load_all()

load sec_bay
load sec_beach
load sec_birds
load sec_boeing
load sec_buildings
load sec_city
load sec_clouds
load sec_f-16
load sec_face
load sec_helicopter
load sec_mountain
load sec_ocean
load sec_ships
load sec_sky
load sec_sunrise
load sec_sunset


In [24]:
import tqdm

In [25]:
# Run the two-level model over the shuffled test set in chunks of 500 rows,
# with a tqdm progress bar.
# True labels are collected per chunk and concatenated once at the end:
# Series.append was removed in pandas 2.0 and re-copied the whole Series on
# every iteration.
label_chunks2 = []
all_results2 = []

for chunk in tqdm.tqdm(ds.chunked(testdf_shuffled, 500)):
    x = ds.files_to_dataset(chunk['files'])
    label = chunk['category']
    resclasses = two_level_model.predict_arrs(x)

    label_chunks2.append(label)
    all_results2.extend(resclasses)

# ignore_index renumbers the labels 0..n-1 so they line up with all_results2.
if label_chunks2:
    all_labels2 = pd.concat(label_chunks2, ignore_index=True)
else:
    all_labels2 = pd.Series(dtype=object)

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
100%|██████████| 5/5 [11:59<00:00, 143.99s/it]


In [26]:
# Accuracy = number of predictions equal to the true label / number of images.
n_two_level_images = len(all_labels2)
n_two_level_correct = sum(all_labels2 == all_results2)
print("Number of test images: {}\nAccuracy: {}".format(
    n_two_level_images, float(n_two_level_correct) / n_two_level_images))

Number of test images: 2366
Accuracy: 0.6111580726965342


Save the results as pickle.

In [27]:
results_df2 = pd.DataFrame({"labels": all_labels2, "prediction": all_results2})

In [28]:
# to_pickle does not create missing directories, so make sure "results" exists.
os.makedirs("results", exist_ok=True)
results_df2.to_pickle("results/twolevelmodel_prediction.dat")