# Performance test of $two \ level \ model$.

The $two \ level \ model$ is composed of a first set of OVR (one-vs-rest) classifiers $f_{c, other}$ and a second set of OVR classifiers $f^{(2)}_{c, other}$.  
We test the model on a classification task and compare its results with those of a baseline model that uses only $f_{c, other}$.

## Set up

In [1]:
import os
import sys

import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [2]:
from models.modelutils import dir2filedict, split_fdict
import random

Using TensorFlow backend.


Load category and file path information.

In [3]:
# Build the category -> files mapping from the "data" directory (keys look like
# sub-directory names, judging by the paths shown below) and fix a sorted category order.
fdict = dir2filedict("data")
categories = sorted(fdict.keys())

# Two successive splits with fixed seeds: first carve out the test set, then carve a
# validation set out of what remains (presumably test_size is a fraction — confirm in modelutils).
trdict, testdict = split_fdict(fdict, test_size=0.2, random_state = 123)
trdict, valdict = split_fdict(trdict, test_size=0.2, random_state = 456)

Create a list of (category, path) tuples for the test dataset.

In [4]:
# Flatten the per-category test files into (category, path) pairs, in category order.
testtup = [
    (cat, path)
    for cat in categories
    for path in testdict[cat]
]

In [5]:
len(testtup)

2366

Check the list.

In [6]:
testtup[0:5]

[('bay', 'data/bay/0585.jpeg'),
 ('bay', 'data/bay/0587.jpeg'),
 ('bay', 'data/bay/0718.jpeg'),
 ('bay', 'data/bay/0183.jpeg'),
 ('bay', 'data/bay/0808.jpeg')]

Convert the list into a pandas DataFrame for easier handling.

In [7]:
# One row per test image: its true category and its file path.
testdf = pd.DataFrame(testtup, columns=["category", "files"])

In [8]:
# Shuffle all rows with a fixed seed (frac=1 keeps every row) and renumber the index from 0.
testdf_shuffled=testdf.sample(frac=1, random_state=123).reset_index(drop=True)

In [9]:
testdf_shuffled.head()

Unnamed: 0,category,files
0,sunrise,data/sunrise/0093.jpeg
1,mountain,data/mountain/0179.jpeg
2,bay,data/bay/0589.jpeg
3,sky,data/sky/0477.png
4,beach,data/beach/0528.jpeg


Define classes for $two \ level \ model$ .

In [10]:
from models.modelutils import load_best_model_if_exist
import os

In [11]:
class ModelBinder:
    """Keeps a set of per-key binary classifiers and converts their scores into labels.

    Models are loaded on demand through ``load_best_model_if_exist`` from the path
    ``<basedir>/<base_model_name>_<key>`` and cached in ``self._models``.
    """

    def __init__(self, base_model_name, basedir, cats):
        """
        Args:
            base_model_name: Prefix used when building each model path.
            basedir: Directory under which the model paths live.
            cats: Category names. Their order is the priority order used when a row
                is turned into a single label (see ``_row2class``).
        """
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}  # key -> loaded model (cache)
        self.verbose = True  # when True, ``notify`` prints progress messages
        self._categories = cats
        self._OTHER_LABEL = "other"  # label returned when no category passes the threshold

    @classmethod
    def dup_from(cls, binder):
        """Create a new, empty binder with the same name, directory and categories.

        The model cache is not copied. ``cls`` is used so that calling this on a
        subclass yields an instance of that subclass.
        """
        return cls(binder.base_model_name, binder.basedir, binder._categories)

    def model_path(self, target_key):
        """Return the path for ``target_key``: ``<basedir>/<base_model_name>_<target_key>``."""
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        """Return the cached model for ``target_key``, loading and caching it on first use.

        Whatever ``load_best_model_if_exist`` returns is cached as is
        (NOTE(review): it presumably returns None when nothing is found — confirm).
        """
        if target_key in self._models:
            return self._models[target_key]
        self.notify("load {}".format(target_key))
        self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        """Print ``msg`` when ``self.verbose`` is truthy."""
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        """Load (or reuse from the cache) the model of every key in ``keys``."""
        for key in keys:
            self.get_or_load_model(key)

    def predict_arrs(self, arrs):
        """Score ``arrs`` with every cached model.

        Returns:
            DataFrame with one column per cached model key; each column is column 1
            of that model's ``predict`` output (presumably the positive-class score).
        """
        preddict = {key: model.predict(arrs)[:, 1] for key, model in self._models.items()}
        return pd.DataFrame(preddict)

    def _row2class(self, rowdf, threshold):
        """Map one row of scores to a label.

        Returns the FIRST category (in ``self._categories`` order) whose score is
        ``>= threshold`` — not the highest-scoring one — or the "other" label.
        """
        for cat in self._categories:
            if rowdf[cat] >= threshold:
                return cat
        return self._OTHER_LABEL

    def df2classes(self, df, threshold = 0.5):
        """Return a list with one label per row of the score DataFrame ``df``."""
        return [self._row2class(df.iloc[i, :], threshold) for i in range(len(df))]

In [12]:
class TwoLevelModel:
    """Two-stage classifier built from two model binders.

    Stage 1 scores every sample with the models held by ``h2binder``. For each
    category, the samples whose stage-1 score exceeds ``_FIRST_THRESHOLD`` are
    re-scored by the ``"sec_<category>"`` model obtained from ``h1binder``. The
    stage-2 scores are then turned into labels by ``h1binder.df2classes``.

    Naming caveat: despite the numbers, ``h2binder`` is the FIRST stage and
    ``h1binder`` is the SECOND stage.
    """

    def __init__(self, categories, h2binder, h1binder, otherlabel="OTHER"):
        """
        Args:
            categories: Category names (also the stage-1 model keys).
            h2binder: Binder providing ``predict_arrs`` for the first stage.
            h1binder: Binder providing ``get_or_load_model`` / ``df2classes`` for the second stage.
            otherlabel: Stored in ``_OTHERLABEL``. NOTE(review): nothing in this class reads it;
                the label for unmatched rows is whatever ``h1binder.df2classes`` returns.
        """
        self._categories = categories
        self._h2binder = h2binder
        self._h1binder = h1binder
        self._OTHERCLASS = -1  # not read anywhere in this class
        self._OTHERLABEL = otherlabel
        self._FIRST_THRESHOLD = 0.5
        self._SECOND_THRESHOLD = 0.5
        self._df = None  # stage-2 score table of the most recent predict_arrs call

    def load_all(self):
        """Load the stage-1 models (category keys) and the stage-2 models ("sec_" + category)."""
        catkeys = self._categories
        self._h2binder.load_all_models(catkeys)
        self._h1binder.load_all_models(["sec_" + cat for cat in catkeys])

    def predict_arrs(self, arrs):
        """Return one label per sample in ``arrs``; the score table is kept in ``self._df``."""
        df = self._predict_arrs(arrs)
        self._df = df
        return self._h1binder.df2classes(df, self._SECOND_THRESHOLD)

    def _predict_arrs(self, arrs):
        """Return a DataFrame of stage-2 scores (0.0 where a sample did not pass stage 1)."""
        # pd.DataFrame(...) is a no-op wrapper when the binder already returns a DataFrame.
        firstdf = pd.DataFrame(self._h2binder.predict_arrs(arrs))

        # Same shape and columns as the stage-1 table, initialised to zero.
        resultdf = pd.DataFrame(np.zeros(firstdf.shape), columns=firstdf.columns)

        for targetkey in self._categories:
            df = self._predict_second(targetkey, arrs, firstdf)
            if df is not None:
                # Write the scores back to the rows they came from.
                resultdf.loc[df['orgindex'].values, targetkey] = df[targetkey].values

        return resultdf

    def _predict_second(self, targetcat, arrs, firstdf):
        """Re-score the samples that passed stage 1 for ``targetcat``.

        Returns:
            None when no sample passed; otherwise a DataFrame with the column
            ``targetcat`` (score) and ``orgindex`` (row position in ``arrs``).
        """
        filtered = firstdf[firstdf[targetcat] > self._FIRST_THRESHOLD]

        if len(filtered) == 0:
            return None

        rows = filtered.index.values
        farrs = arrs[rows, :]
        model = self._h1binder.get_or_load_model('sec_' + targetcat)

        # No second-level classifier: the stage-1 decision stands, so score everything 1.0.
        # Identity check on purpose: ``== None`` would go through the model's own __eq__.
        if model is None:
            return pd.DataFrame({targetcat: np.ones(len(rows)), 'orgindex': rows})

        # Column 1 of the prediction is used as the score, as in the first stage.
        scores = model.predict(farrs)[:, 1]

        return pd.DataFrame({targetcat: scores, 'orgindex': rows})

Load classifiers.

In [13]:
# Binder for the first-level classifiers: paths are "trained_model/model_<category>".
binder = ModelBinder( "model", "trained_model", categories)

In [14]:
binder.load_all_models(categories)

load bay
load beach
load birds
load boeing
load buildings
load city
load clouds
load f-16
load face
load helicopter
load mountain
load ocean
load ships
load sky
load sunrise
load sunset


## Evaluate baseline model

Classifiers are trained in *train.ipynb*.

In [15]:
from models.processor import DataSet

In [16]:
ds = DataSet()

In [17]:
# Collect the per-chunk label Series in a list and concatenate once after the loop.
# Series.append was removed in pandas 2.0, and it copied the whole Series on every call.
label_chunks = []
all_results = []

for chunk in ds.chunked(testdf_shuffled, 500):
    x = ds.files_to_dataset(chunk['files'])
    label = chunk['category']
    resdf = binder.predict_arrs(x)
    resclasses = binder.df2classes(resdf)

    label_chunks.append(label)
    all_results.extend(resclasses)

# ignore_index=True renumbers 0..n-1, the same result the repeated append(..., ignore_index=True) gave.
all_labels = pd.concat(label_chunks, ignore_index=True) if label_chunks else pd.Series(dtype=object)

In [18]:
# Accuracy = share of test images whose predicted label equals the true label.
n_test = len(all_labels)
n_correct = sum(all_labels == all_results)
print("Number of test images: {}\nAccuracy: {}".format(n_test, float(n_correct)/n_test))

Number of test images: 2366
Accuracy: 0.5524091293322062


Save the results as a pickle.

In [19]:
results_df = pd.DataFrame({"labels": all_labels, "prediction": all_results})

In [20]:
# Make sure the output directory exists so the write also works on a fresh checkout.
os.makedirs("results", exist_ok=True)
results_df.to_pickle("results/baseline_model_prediction.dat")

## Evaluate $two \ level \ model$

Classifiers are trained in *train_second.ipynb*.

In [21]:
# A second binder with the same name/directory settings; it is handed to TwoLevelModel below,
# which loads the "sec_<category>" models into it.
sec_binder = ModelBinder( "model", "trained_model", categories)

In [22]:
# Argument order: (categories, first-stage binder, second-stage binder).
two_level_model = TwoLevelModel(categories, binder, sec_binder)

In [23]:
two_level_model.load_all()

load sec_bay
load sec_beach
load sec_birds
load sec_boeing
load sec_buildings
load sec_city
load sec_clouds
load sec_f-16
load sec_face
load sec_helicopter
load sec_mountain
load sec_ocean
load sec_ships
load sec_sky
load sec_sunrise
load sec_sunset


In [24]:
import tqdm

In [25]:
# Collect the per-chunk label Series in a list and concatenate once after the loop.
# Series.append was removed in pandas 2.0, and it copied the whole Series on every call.
label_chunks2 = []
all_results2 = []

for chunk in tqdm.tqdm(ds.chunked(testdf_shuffled, 500)):
    x = ds.files_to_dataset(chunk['files'])
    label = chunk['category']
    resclasses = two_level_model.predict_arrs(x)

    label_chunks2.append(label)
    all_results2.extend(resclasses)

# ignore_index=True renumbers 0..n-1, the same result the repeated append(..., ignore_index=True) gave.
all_labels2 = pd.concat(label_chunks2, ignore_index=True) if label_chunks2 else pd.Series(dtype=object)

100%|██████████| 5/5 [13:13<00:00, 158.69s/it]


In [26]:
# Accuracy = share of test images whose predicted label equals the true label.
n_test2 = len(all_labels2)
n_correct2 = sum(all_labels2 == all_results2)
print("Number of test images: {}\nAccuracy: {}".format(n_test2, float(n_correct2)/n_test2))

Number of test images: 2366
Accuracy: 0.6111580726965342


Save the results as a pickle.

In [27]:
results_df2 = pd.DataFrame({"labels": all_labels2, "prediction": all_results2})

In [28]:
# Make sure the output directory exists so the write also works on a fresh checkout.
os.makedirs("results", exist_ok=True)
results_df2.to_pickle("results/twolevelmodel_prediction.dat")

### Precision and recall

In [6]:
# Reload the two prediction tables written above: results_df1 = baseline, results_df2 = two-level model.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by this notebook.
results_df1 = pd.read_pickle("results/baseline_model_prediction.dat")
results_df2 = pd.read_pickle("results/twolevelmodel_prediction.dat")

In [13]:
# Same two statements as in the set-up section: rebuild the sorted category list
# (presumably so that this section can be run after a kernel restart — dir2filedict must be imported).
fdict = dir2filedict("data")
categories = sorted(fdict.keys())

In [29]:
def calc_tp(df, cat):
    """Count true positives for ``cat``: rows labelled ``cat`` that were also predicted ``cat``.

    ``df`` must have the columns "labels" and "prediction".
    """
    is_actual = df["labels"].eq(cat)
    is_predicted = df["prediction"].eq(cat)
    return sum(is_actual & is_predicted)

In [36]:
def calc_fp(df, cat):
    """Count false positives for ``cat``: rows predicted ``cat`` whose label is something else.

    ``df`` must have the columns "labels" and "prediction".
    """
    is_not_actual = df["labels"].ne(cat)
    is_predicted = df["prediction"].eq(cat)
    return sum(is_not_actual & is_predicted)

In [38]:
def calc_fn(df, cat):
    """Count false negatives for ``cat``: rows labelled ``cat`` that were predicted as something else.

    ``df`` must have the columns "labels" and "prediction".
    """
    is_actual = df["labels"].eq(cat)
    is_not_predicted = df["prediction"].ne(cat)
    return sum(is_actual & is_not_predicted)

In [40]:
# Per-category true-positive counts for the two-level model predictions.
TP2 = {cat: calc_tp(results_df2, cat) for cat in categories}

In [50]:
# Per-category false-positive counts for the two-level model predictions.
FP2 = {cat: calc_fp(results_df2, cat) for cat in categories}

In [51]:
# Per-category false-negative counts for the two-level model predictions.
FN2 = {cat: calc_fn(results_df2, cat) for cat in categories}

In [58]:
# Precision = TP / (TP + FP); the denominator is zero for a category that is never predicted.
Precision2 = {cat: TP2[cat]/(TP2[cat]+FP2[cat]) for cat in categories}

In [59]:
# Recall = TP / (TP + FN); the denominator is the number of rows labelled with the category.
Recall2 = {cat: TP2[cat]/(TP2[cat]+FN2[cat]) for cat in categories}

In [61]:
# Per-category true-positive counts for the baseline model predictions.
TP1 = {cat: calc_tp(results_df1, cat) for cat in categories}

In [63]:
# Per-category false-positive counts for the baseline model predictions.
FP1 = {cat: calc_fp(results_df1, cat) for cat in categories}

In [64]:
# Per-category false-negative counts for the baseline model predictions.
FN1 = {cat: calc_fn(results_df1, cat) for cat in categories}

In [65]:
# Precision = TP / (TP + FP); the denominator is zero for a category that is never predicted.
Precision1 = {cat: TP1[cat]/(TP1[cat]+FP1[cat]) for cat in categories}

In [67]:
# Recall = TP / (TP + FN); the denominator is the number of rows labelled with the category.
Recall1 = {cat: TP1[cat]/(TP1[cat]+FN1[cat]) for cat in categories}

In [70]:
Precision1

{'bay': 0.32387706855791965,
 'beach': 0.3684210526315789,
 'birds': 0.9378531073446328,
 'boeing': 0.5060728744939271,
 'buildings': 0.5847457627118644,
 'city': 0.2765957446808511,
 'clouds': 0.43656716417910446,
 'f-16': 0.6037735849056604,
 'face': 0.9570552147239264,
 'helicopter': 0.9072164948453608,
 'mountain': 0.8058252427184466,
 'ocean': 0.4044943820224719,
 'ships': 0.9367088607594937,
 'sky': 0.05172413793103448,
 'sunrise': 0.43,
 'sunset': 0.5}

In [71]:
Precision2

{'bay': 0.41841004184100417,
 'beach': 0.4864864864864865,
 'birds': 0.9222222222222223,
 'boeing': 0.7378048780487805,
 'buildings': 0.7920792079207921,
 'city': 0.5546218487394958,
 'clouds': 0.5445026178010471,
 'f-16': 0.8148148148148148,
 'face': 0.9298245614035088,
 'helicopter': 0.9083333333333333,
 'mountain': 0.8099173553719008,
 'ocean': 0.6049382716049383,
 'ships': 0.912621359223301,
 'sky': 0.6363636363636364,
 'sunrise': 0.3778801843317972,
 'sunset': 0.5}

In [72]:
Recall1

{'bay': 0.7784090909090909,
 'beach': 0.3218390804597701,
 'birds': 1.0,
 'boeing': 0.9920634920634921,
 'buildings': 0.8903225806451613,
 'city': 0.08843537414965986,
 'clouds': 0.9,
 'f-16': 0.5245901639344263,
 'face': 0.9069767441860465,
 'helicopter': 0.5986394557823129,
 'mountain': 0.51875,
 'ocean': 0.2553191489361702,
 'ships': 0.5,
 'sky': 0.02158273381294964,
 'sunrise': 0.3706896551724138,
 'sunset': 0.027210884353741496}

In [73]:
Recall2

{'bay': 0.5681818181818182,
 'beach': 0.6206896551724138,
 'birds': 1.0,
 'boeing': 0.9603174603174603,
 'buildings': 0.5161290322580645,
 'city': 0.4489795918367347,
 'clouds': 0.8,
 'f-16': 0.7213114754098361,
 'face': 0.9244186046511628,
 'helicopter': 0.7414965986394558,
 'mountain': 0.6125,
 'ocean': 0.3475177304964539,
 'ships': 0.6351351351351351,
 'sky': 0.10071942446043165,
 'sunrise': 0.7068965517241379,
 'sunset': 0.047619047619047616}

### Temp code (scratch cells, run out of order)

In [54]:
Precision2["bay"]

0.41841004184100417

In [60]:
Recall2["bay"]

0.5681818181818182

In [66]:
Precision1["bay"]

0.32387706855791965

In [68]:
Recall1["bay"]

0.7784090909090909

In [41]:
TP2["bay"]

100

In [62]:
TP1["bay"]

137

In [30]:
calc_tp(results_df2, categories[0])

100

In [31]:
sum(trs)

176

In [33]:
sum(prds)

239

In [37]:
calc_fp(results_df2, categories[0])

139

In [39]:
calc_fn(results_df2, categories[0])

76

In [55]:
TP2["bay"]

100

In [56]:
FP2["bay"]

139

In [32]:
100/176

0.5681818181818182

In [20]:
trs = results_df2["labels"] == categories[0]

In [21]:
prds = results_df2["prediction"] == categories[0]

In [27]:
trs[0:25]

0     False
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
Name: labels, dtype: bool

In [26]:
prds[0:25]

0     False
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21     True
22    False
23    False
24    False
Name: prediction, dtype: bool

In [28]:
trs[0:25] & prds[0:25]

0     False
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
dtype: bool

In [15]:
results_df2["labels"] == categories[0]

0       False
1       False
2        True
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
2336    False
2337    False
2338    False
2339     True
2340    False
2341    False
2342    False
2343    False
2344    False
2345     True
2346    False
2347    False
2348    False
2349    False
2350    False
2351     True
2352    False
2353    False
2354    False
2355     True
2356     True
2357    False
2358     True
2359    False
2360    False
2361    False
2362    False
2363    False
2364    False
2365    False
Name: labels, Length: 2366, dtype: bool

Index(['labels', 'prediction'], dtype='object')

In [11]:
results_df2["prediction"][0:5]

0     sunrise
1    mountain
2         bay
3         sky
4       beach
Name: prediction, dtype: object

In [34]:
all_labels2, all_results2 = results_df2["labels"], results_df2["prediction"]

In [35]:
print("Number of test images: {}\nAccuracy: {}".format(len(all_labels2), float(sum(all_labels2 == all_results2))/len(all_labels2)))

Number of test images: 2366
Accuracy: 0.6107354184277262
