# Performance test of $two \ level \ model$.

$two \ level \ model$ is composed of a first set of one-vs-rest (OVR) classifiers $f_{c, other}$ and a second set of OVR classifiers $f^{(2)}_{c, other}$.  
We test the model on a classification task and compare its results with those of a baseline model that uses only $f_{c, other}$.

### Fix typo of fgcv->fgvc

In [5]:
import glob, os

In [9]:
# Rename every path under trained_model/ that starts with the misspelled
# "modelfgcv" prefix so it starts with "modelfgvc" instead; whatever follows
# the prefix is carried over unchanged.
for old_path in glob.iglob("trained_model/modelfgcv*"):
    suffix = old_path[len('trained_model/modelfgcv'):]
    os.rename(old_path, "trained_model/modelfgvc" + suffix)

## Set up

In [1]:
import os
import sys

import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [2]:
from models.modelutils import dir2filedict_sorted
import random

Using TensorFlow backend.


Load category and file path information.

In [3]:
# Category keys are the strings "0" .. "99".
categories = list(map(str, range(100)))
# Indexed by category key below to obtain that category's test file paths.
testdict = dir2filedict_sorted("data_fgvc/test")

Create (category, path) information of test datasets.

In [29]:
#for debug
# org_testdict = testdict
# categories = [str(i) for i in range(0, 5)]
# testdict = {key: org_testdict[key] for key in categories}

In [5]:
# One (category, path) pair per test file, grouped in `categories` order.
testtup = [
    (category, path)
    for category in categories
    for path in testdict[category]
]

In [6]:
len(testtup)

166

Check the list.

In [16]:
testtup[0:5]

[('0', 'data_fgvc/test/0/0062765.jpg'),
 ('0', 'data_fgvc/test/0/0064932.jpg'),
 ('0', 'data_fgvc/test/0/0197342.jpg'),
 ('0', 'data_fgvc/test/0/0447936.jpg'),
 ('0', 'data_fgvc/test/0/0536515.jpg')]

Convert the list into a Pandas DataFrame for easier handling.

In [7]:
# Each (category, path) tuple becomes one row.
testdf = pd.DataFrame(testtup, columns=["category", "files"])

In [8]:
# Shuffle all rows with a fixed seed (reproducible order) and renumber them 0..N-1.
testdf_shuffled = (
    testdf
    .sample(frac=1, random_state=123)
    .reset_index(drop=True)
)

In [19]:
testdf_shuffled.head()

Unnamed: 0,category,files
0,13,data_fgvc/test/13/2233262.jpg
1,72,data_fgvc/test/72/1372357.jpg
2,53,data_fgvc/test/53/0773531.jpg
3,39,data_fgvc/test/39/1340322.jpg
4,13,data_fgvc/test/13/1313993.jpg


Define classes for $two \ level \ model$ .

In [9]:
from models.modelutils import load_best_model_if_exist
import os

In [10]:
class ModelBinder:
    """Loads, caches and applies a family of per-key models stored under one directory.

    Each model lives at ``<basedir>/<base_model_name>_<key>`` and is loaded on
    first use through ``load_best_model_if_exist``. Scores from all cached
    models can be collected into a DataFrame (one column per key) and turned
    into class labels by thresholding.
    """

    def __init__(self, base_model_name, basedir, cats):
        """
        Args:
            base_model_name: Common file-name prefix of the stored models.
            basedir: Directory that contains the stored models.
            cats: Category keys, in the order they are tried when a row of
                scores is converted into a label.
        """
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}  # key -> loaded model (cache)
        self.verbose = True
        self._categories = cats
        self._OTHER_LABEL = "other"  # label used when no category reaches the threshold

    @classmethod
    def dup_from(cls, binder):
        """Create a new binder with the same name, directory and categories as ``binder``.

        The model cache is not copied. The instance is built with ``cls`` so
        that calling this on a subclass returns an instance of that subclass.
        """
        return cls(binder.base_model_name, binder.basedir, binder._categories)

    def model_path(self, target_key):
        """Return the path ``<basedir>/<base_model_name>_<target_key>``."""
        file_name = "{}_{}".format(self.base_model_name, target_key)
        return os.path.join(self.basedir, file_name)

    def get_or_load_model(self, target_key):
        """Return the model for ``target_key``, loading and caching it on first request.

        The value returned by ``load_best_model_if_exist`` is cached as-is.
        """
        if target_key not in self._models:
            self.notify("load {}".format(target_key))
            self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        """Print ``msg`` when ``self.verbose`` is true."""
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        """Load (or fetch from the cache) the model for every key in ``keys``."""
        for key in keys:
            self.get_or_load_model(key)

    def predict_arrs(self, arrs):
        """Run every cached model on ``arrs`` and collect the scores.

        Returns:
            DataFrame with one column per cached model key, holding column 1 of
            that model's ``predict`` output (presumably the positive-class
            score; confirm against the training notebook).
        """
        preddict = {key: model.predict(arrs)[:, 1] for key, model in self._models.items()}
        return pd.DataFrame(preddict)

    def _row2class(self, rowdf, threshold):
        """Return the first category (in ``self._categories`` order) whose score is
        at least ``threshold``, or the "other" label when none qualifies."""
        for cat in self._categories:
            if rowdf[cat] >= threshold:
                return cat
        return self._OTHER_LABEL

    def df2classes(self, df, threshold = 0.5):
        """Convert each row of the score frame ``df`` into a class label.

        Returns:
            List with one label per row of ``df``, in row order.
        """
        return [self._row2class(df.iloc[i, :], threshold) for i in range(len(df))]

In [11]:
class TwoLevelModel:
    """Re-scores first-level predictions with per-category second-level models.

    For every category, the rows whose first-level score exceeds the first
    threshold are passed to that category's second-level model (binder key
    ``"sec_<category>"``). The second-level scores are then thresholded by the
    binder to produce the final class labels.
    """

    def __init__(self, categories, h1binder, otherlabel="OTHER"):
        """
        Args:
            categories: Category keys; must be columns of the first-level score frame.
            h1binder: Binder that provides ``load_all_models``, ``get_or_load_model``
                and ``df2classes`` for the second-level models.
            otherlabel: Stored on the instance; not read by any method of this class.
        """
        self._categories = categories
        self._h1binder = h1binder
        self._OTHERCLASS = -1  # stored only; not read by any method of this class
        self._OTHERLABEL = otherlabel
        self._FIRST_THRESHOLD = 0.5   # first-level score must be strictly greater
        self._SECOND_THRESHOLD = 0.5  # passed to the binder's df2classes

    def load_all(self):
        """Ask the binder to load the second-level model of every category."""
        self._h1binder.load_all_models(["sec_" + cat for cat in self._categories])

    def predict_arrs(self, arrs, firstdf):
        """Predict one class label per row.

        Args:
            arrs: Input array; row ``i`` must correspond to row ``i`` of ``firstdf``.
            firstdf: First-level scores, one column per category. Rows are
                matched to ``arrs`` by position, so its index may be arbitrary.

        Returns:
            List of labels as produced by the binder's ``df2classes``. The
            second-level score frame is kept in ``self._df`` for inspection.
        """
        df = self._predict_arrs(arrs, firstdf)
        self._df = df
        return self._h1binder.df2classes(df, self._SECOND_THRESHOLD)

    def _predict_arrs(self, arrs, firstdf):
        """Build the second-level score frame.

        The result has the same shape and columns as ``firstdf`` and a fresh
        0..n-1 index. Cells stay 0.0 unless the row passed the first-level
        threshold for that column's category.
        """
        resultdf = pd.DataFrame(np.zeros(firstdf.shape), columns=firstdf.columns)

        for targetkey in self._categories:
            second = self._predict_second(targetkey, arrs, firstdf)
            if second is not None:
                # 'orgindex' holds row positions, which are the labels of resultdf's range index.
                resultdf.loc[second['orgindex'].values, targetkey] = second[targetkey].values

        return resultdf

    def _predict_second(self, targetcat, arrs, firstdf):
        """Score the rows that passed the first level for ``targetcat``.

        Returns:
            ``None`` when no row passed; otherwise a DataFrame with the
            second-level score in column ``targetcat`` and the row position
            (within ``firstdf`` / ``arrs``) in column ``'orgindex'``.
        """
        # Use row positions instead of index labels so that a frame that is
        # not indexed 0..n-1 (e.g. a filtered subset) still lines up with arrs.
        passed = (firstdf[targetcat] > self._FIRST_THRESHOLD).values
        positions = np.flatnonzero(passed)

        if len(positions) == 0:
            return None

        farrs = arrs[positions, :]
        model = self._h1binder.get_or_load_model('sec_' + targetcat)

        # No second-level classifier: the first-level decision stands, so give full score.
        if model is None:
            return pd.DataFrame({targetcat: np.ones(len(positions)), 'orgindex': positions})

        scores = model.predict(farrs)[:, 1]
        return pd.DataFrame({targetcat: scores, 'orgindex': positions})

In [12]:
from models.processor import DataSet

In [13]:
ds = DataSet()

## Conduct first level prediction (h2 prediction) and store result.

Also evaluate baseline model.

In [23]:
binder = ModelBinder( "modelfgvc", "trained_model", categories)

In [24]:
binder.load_all_models(categories)

load 0
load 1
load 2
load 3
load 4
load 5
load 6
load 7
load 8
load 9
load 10
load 11
load 12
load 13
load 14
load 15
load 16
load 17
load 18
load 19
load 20
load 21
load 22
load 23
load 24
load 25
load 26
load 27
load 28
load 29
load 30
load 31
load 32
load 33
load 34
load 35
load 36
load 37
load 38
load 39
load 40
load 41
load 42
load 43
load 44
load 45
load 46
load 47
load 48
load 49
load 50
load 51
load 52
load 53
load 54
load 55
load 56
load 57
load 58
load 59
load 60
load 61
load 62
load 63
load 64
load 65
load 66
load 67
load 68
load 69
load 70
load 71
load 72
load 73
load 74
load 75
load 76
load 77
load 78
load 79
load 80
load 81
load 82
load 83
load 84
load 85
load 86
load 87
load 88
load 89
load 90
load 91
load 92
load 93
load 94
load 95
load 96
load 97
load 98
load 99


### Evaluate baseline model

Classifiers are trained in *train.ipynb*.

In [14]:
#for debug
# org_testdf_shuffled = testdf_shuffled
# testdf_shuffled = org_testdf_shuffled[0:5]
# testdf_shuffled.shape

(5, 2)

In [61]:
#for debug
# modelsbackup = binder._models
# binder._models = {key: modelsbackup[key] for key in categories}

In [63]:
# Run the first-level (baseline) classifiers over the test set in chunks of
# 500 rows, collecting true labels, predicted classes and raw score frames.
label_chunks = []
all_results = []
all_result_dfs = []

for chunk in ds.chunked(testdf_shuffled, 500):
    x = ds.files_to_dataset(chunk['files'])
    label = chunk['category']
    resdf = binder.predict_arrs(x)
    resclasses = binder.df2classes(resdf)

    label_chunks.append(label)
    all_results.extend(resclasses)
    all_result_dfs.append(resdf)

# Series.append was removed in pandas 2.0 and re-copied the whole Series on
# every call; concatenate the per-chunk labels once instead.
all_labels = (
    pd.concat(label_chunks, ignore_index=True)
    if label_chunks
    else pd.Series(dtype=object)
)

In [66]:
# Baseline accuracy: share of test images whose predicted class equals the label.
baseline_hits = sum(all_labels == all_results)
print("Number of test images: {}\nAccuracy: {}".format(len(all_labels), float(baseline_hits) / len(all_labels)))

Number of test images: 5
Accuracy: 0.2


Save the results as pickle.

In [67]:
results_df = pd.DataFrame({"labels": all_labels, "prediction": all_results})

In [68]:
results_df.to_pickle("results/baseline_model_prediction_fgvc.dat")

In [69]:
# Stack the per-chunk score frames into one table. Each chunk frame carries its
# own 0..len(chunk)-1 index, so a plain concat repeats index labels and an
# index-aligned assignment of "files" would give every later chunk the first
# chunk's paths. Renumber the rows and attach the paths by position instead.
# NOTE(review): assumes ds.chunked yields the rows of testdf_shuffled in order
# and covers all of them (a length mismatch raises here) - confirm.
h2df = pd.concat(all_result_dfs, ignore_index=True)
h2df['files'] = testdf_shuffled["files"].values

In [71]:
h2df.to_pickle("results/baseline_h2_df_fgvc.dat")

## Evaluate $two \ level \ model$

Classifiers are trained in *train_second.ipynb*.

In [15]:
h2df = pd.read_pickle("results/baseline_h2_df_fgvc.dat")

In [16]:
sec_binder = ModelBinder( "modelfgvc", "trained_model", categories)

In [17]:
two_level_model = TwoLevelModel(categories, sec_binder)

In [18]:
two_level_model.load_all()

load sec_0
load sec_1
load sec_2
load sec_3
load sec_4


In [21]:
import tqdm

In [22]:
def h2_subset(h2df, files):
    """Return the score columns of the rows of ``h2df`` whose file is in ``files``.

    Args:
        h2df: DataFrame of scores with an additional ``"files"`` column.
        files: Collection of file paths to keep.

    Returns:
        DataFrame with the matching rows (original index and row order) and
        every column except ``"files"``, in their original column order.
    """
    # A list keeps the column order stable; going through a set gives an
    # arbitrary order that can differ between kernel sessions.
    score_columns = [col for col in h2df.columns if col != "files"]
    return h2df.loc[h2df['files'].isin(files), score_columns]

In [23]:
# Run the two-level model over the test set in chunks of 500 rows, reusing the
# stored first-level scores (h2df) for the files of each chunk.
label_chunks2 = []
all_results2 = []

for chunk in tqdm.tqdm(ds.chunked(testdf_shuffled, 500)):
    x = ds.files_to_dataset(chunk['files'])
    label = chunk['category']
    firstdf = h2_subset(h2df, chunk['files'])
    resclasses = two_level_model.predict_arrs(x, firstdf)

    label_chunks2.append(label)
    all_results2.extend(resclasses)

# Series.append was removed in pandas 2.0 and re-copied the whole Series on
# every call; concatenate the per-chunk labels once instead.
all_labels2 = (
    pd.concat(label_chunks2, ignore_index=True)
    if label_chunks2
    else pd.Series(dtype=object)
)

100%|██████████| 1/1 [00:08<00:00,  8.18s/it]


In [24]:
# Two-level accuracy: share of test images whose predicted class equals the label.
twolevel_hits = sum(all_labels2 == all_results2)
print("Number of test images: {}\nAccuracy: {}".format(len(all_labels2), float(twolevel_hits) / len(all_labels2)))

Number of test images: 5
Accuracy: 0.4


Save the results as pickle.

In [25]:
results_df2 = pd.DataFrame({"labels": all_labels2, "prediction": all_results2})

In [26]:
results_df2.to_pickle("results/twolevelmodel_prediction_fgvc.dat")

### Precision and recall

In [3]:
# twolevelmodel_prediction.dat
results_df1 = pd.read_pickle("results/baseline_model_prediction_fgvc.dat")
results_df2 = pd.read_pickle("results/twolevelmodel_prediction_fgvc.dat")

In [4]:
categories = [str(i) for i in range(0, 100)]

In [5]:
def calc_tp(df, cat):
    """Count true positives for ``cat``: rows labelled ``cat`` and predicted as ``cat``."""
    labelled = df["labels"].eq(cat)
    predicted = df["prediction"].eq(cat)
    return int((labelled & predicted).sum())

In [6]:
def calc_fp(df, cat):
    """Count false positives for ``cat``: rows predicted as ``cat`` whose label differs."""
    other_label = df["labels"].ne(cat)
    predicted = df["prediction"].eq(cat)
    return int((other_label & predicted).sum())

In [7]:
def calc_fn(df, cat):
    """Count false negatives for ``cat``: rows labelled ``cat`` but predicted as something else."""
    labelled = df["labels"].eq(cat)
    missed = df["prediction"].ne(cat)
    return int((labelled & missed).sum())

In [8]:
TP2 = {cat: calc_tp(results_df2, cat) for cat in categories}

In [9]:
FP2 = {cat: calc_fp(results_df2, cat) for cat in categories}

In [10]:
FN2 = {cat: calc_fn(results_df2, cat) for cat in categories}

In [11]:
# Precision = TP / (TP + FP) for the two-level model. A category that was never
# predicted has TP + FP == 0; record NaN for it instead of raising
# ZeroDivisionError (pandas' mean() skips NaN).
Precision2 = {
    cat: TP2[cat] / (TP2[cat] + FP2[cat]) if (TP2[cat] + FP2[cat]) else float("nan")
    for cat in categories
}

In [12]:
# Recall = TP / (TP + FN) for the two-level model. A category with no test
# images has TP + FN == 0; record NaN for it instead of raising
# ZeroDivisionError.
Recall2 = {
    cat: TP2[cat] / (TP2[cat] + FN2[cat]) if (TP2[cat] + FN2[cat]) else float("nan")
    for cat in categories
}

In [13]:
TP1 = {cat: calc_tp(results_df1, cat) for cat in categories}

In [14]:
FP1 = {cat: calc_fp(results_df1, cat) for cat in categories}

In [15]:
FN1 = {cat: calc_fn(results_df1, cat) for cat in categories}

In [16]:
# Precision = TP / (TP + FP) for the baseline model. A category that was never
# predicted has TP + FP == 0; record NaN for it instead of raising
# ZeroDivisionError (pandas' mean() skips NaN).
Precision1 = {
    cat: TP1[cat] / (TP1[cat] + FP1[cat]) if (TP1[cat] + FP1[cat]) else float("nan")
    for cat in categories
}

In [17]:
# Recall = TP / (TP + FN) for the baseline model. A category with no test
# images has TP + FN == 0; record NaN for it instead of raising
# ZeroDivisionError.
Recall1 = {
    cat: TP1[cat] / (TP1[cat] + FN1[cat]) if (TP1[cat] + FN1[cat]) else float("nan")
    for cat in categories
}

In [98]:
precisiondf = pd.DataFrame({"classes": categories, "baseline": [Precision1[cat] for cat in categories], "twolevel": [Precision2[cat] for cat in categories]}).set_index("classes")

In [99]:
precisiondf

Unnamed: 0_level_0,baseline,twolevel
classes,Unnamed: 1_level_1,Unnamed: 2_level_1
bay,0.323877,0.41841
beach,0.368421,0.486486
birds,0.937853,0.922222
boeing,0.506073,0.737805
buildings,0.584746,0.792079
city,0.276596,0.554622
clouds,0.436567,0.544503
f-16,0.603774,0.814815
face,0.957055,0.929825
helicopter,0.907216,0.908333


In [105]:
precisiondf.mean()

baseline    0.564433
twolevel    0.684426
dtype: float64

In [100]:
recalldf = pd.DataFrame({"classes": categories, "baseline": [Recall1[cat] for cat in categories], "twolevel": [Recall2[cat] for cat in categories]}).set_index("classes")

In [101]:
recalldf

Unnamed: 0_level_0,baseline,twolevel
classes,Unnamed: 1_level_1,Unnamed: 2_level_1
bay,0.778409,0.568182
beach,0.321839,0.62069
birds,1.0,1.0
boeing,0.992063,0.960317
buildings,0.890323,0.516129
city,0.088435,0.44898
clouds,0.9,0.8
f-16,0.52459,0.721311
face,0.906977,0.924419
helicopter,0.598639,0.741497


In [104]:
# f1 score
2*precisiondf*recalldf/(precisiondf+recalldf)

Unnamed: 0_level_0,baseline,twolevel
classes,Unnamed: 1_level_1,Unnamed: 2_level_1
bay,0.457429,0.481928
beach,0.343558,0.545455
birds,0.96793,0.959538
boeing,0.670241,0.834483
buildings,0.705882,0.625
city,0.134021,0.496241
clouds,0.58794,0.647975
f-16,0.561404,0.765217
face,0.931343,0.927114
helicopter,0.721311,0.816479
