In [1]:
import os
import sys

import numpy as np

import pandas as pd
import glob

In [2]:
from models.modelutils import dir2filedict, split_fdict
import random

Using TensorFlow backend.


In [3]:
# Build the file dictionary for everything under "data"; its keys are used as
# the category names (sorted so the order is stable).
fdict = dir2filedict("data")
categories = sorted(fdict.keys())

# Split twice with fixed seeds: first hold out `testdict`, then split the
# remainder again so `trdict` is rebound to the training part and `valdict`
# holds the validation part.
# NOTE(review): test_size=0.2 presumably means a 20% hold-out -- confirm in split_fdict.
trdict, testdict = split_fdict(fdict, test_size=0.2, random_state = 123)
trdict, valdict = split_fdict(trdict, test_size=0.2, random_state = 456)

In [10]:
# Flatten the per-category file lists into (category, filepath) pairs.
testtup = [
    (category, path)
    for category, paths in testdict.items()
    for path in paths
]

In [11]:
len(testtup)

2366

In [12]:
testtup[0:5]

[('bay', 'data/bay/0585.jpeg'),
 ('bay', 'data/bay/0587.jpeg'),
 ('bay', 'data/bay/0718.jpeg'),
 ('bay', 'data/bay/0183.jpeg'),
 ('bay', 'data/bay/0808.jpeg')]

In [16]:
# One row per test file: first tuple element -> "category", second -> "files".
testdf = pd.DataFrame(testtup, columns=["category", "files"])

In [25]:
# Shuffle every row (frac=1) and renumber the index from 0.
# A fixed random_state keeps the shuffled order reproducible across kernel
# restarts, in line with the seeded train/test/validation splits.
testdf_shuffled = testdf.sample(frac=1, random_state=789).reset_index(drop=True)

In [26]:
len(testdf_shuffled)

2366

In [35]:
from models.modelutils import load_best_model_if_exist
import os

In [38]:
class ModelBinder:
    """Loads, caches and queries one model per target key.

    Model paths are built as ``os.path.join(basedir, "<base_model_name>_<target_key>")``.
    A model is loaded on first request and kept in an in-memory cache afterwards.
    """

    def __init__(self, base_model_name, basedir, cats):
        """
        Args:
            base_model_name: prefix shared by every model name.
            basedir: directory the model paths are built from.
            cats: category names; their order is the priority order used by
                ``_row2class`` when several categories pass the threshold.
        """
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}  # target_key -> loaded model, filled lazily
        self.verbose = True  # when True, notify() prints its message
        self._categories = cats
        self._OTHER_LABEL = "other"  # returned when no category passes the threshold

    @classmethod
    def dup_from(cls, binder):
        """Return a new binder with the same name/dir/categories as ``binder`` and an empty cache."""
        # Use cls (not the hard-coded class name) so subclasses duplicate as themselves.
        return cls(binder.base_model_name, binder.basedir, binder._categories)

    def model_path(self, target_key):
        """Return the path of the model for ``target_key``."""
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        """Return the cached model for ``target_key``, loading and caching it on first use.

        Whatever ``load_best_model_if_exist`` returns is cached as-is.
        """
        if target_key in self._models:
            return self._models[target_key]
        self.notify("load {}".format(target_key))
        self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        """Print ``msg`` when ``self.verbose`` is truthy."""
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        """Load (or reuse from cache) the model for every key in ``keys``."""
        for key in keys:
            self.get_or_load_model(key)

    def predict_arrs(self, arrs):
        """Score ``arrs`` with every cached model.

        Returns:
            DataFrame with one column per cached model key, holding column 1 of
            that model's ``predict(arrs)`` output.
        """
        # NOTE(review): column 1 is presumably the positive-class score -- confirm.
        preddict = {key: model.predict(arrs)[:, 1] for key, model in self._models.items()}
        return pd.DataFrame(preddict)

    def _row2class(self, rowdf, threshold):
        """Return the first category (in ``self._categories`` order) whose score is
        >= ``threshold``, or the "other" label when none qualifies."""
        for cat in self._categories:
            if rowdf[cat] >= threshold:
                return cat
        # Must match the attribute set in __init__; the misspelled
        # `_OTHERLABEL` raised AttributeError here.
        return self._OTHER_LABEL

    def df2classes(self, df, threshold = 0.5):
        """Map every row of the score frame ``df`` to a class label.

        Returns:
            list of labels, one per row, in row order.
        """
        return [self._row2class(df.iloc[i, :], threshold) for i in range(len(df))]
    


In [37]:
class TwoLevelClassifier:
    """Two-stage classifier built from two model binders.

    ``h2binder`` scores every sample for every category. For each category,
    only the samples whose first-level score exceeds ``_FIRST_THRESHOLD`` are
    re-scored by ``h1binder``'s model named ``"sec_<category>"``. All other
    samples keep a score of 0 for that category.
    """

    def __init__(self, categories, h2binder, h1binder, otherlabel="OTHER"):
        """
        Args:
            categories: category names handled by both levels.
            h2binder: binder providing the first-level models (``predict_arrs``).
            h1binder: binder providing the second-level ``"sec_<category>"``
                models and the final ``df2classes`` conversion.
            otherlabel: stored as ``_OTHERLABEL``.
        """
        self._categories = categories
        self._h2binder = h2binder
        self._h1binder = h1binder
        # NOTE(review): _OTHERCLASS and _OTHERLABEL are stored but not read by
        # any method of this class; the final labels come from h1binder.df2classes.
        self._OTHERCLASS = -1
        self._OTHERLABEL = otherlabel
        self._FIRST_THRESHOLD = 0.5
        self._SECOND_THRESHOLD = 0.5

    def load_all(self):
        """Load the first-level model and the ``"sec_"`` second-level model of every category."""
        # __init__ stores the names as `_categories`; `_catkeys` was never set.
        catkeys = self._categories
        self._h2binder.load_all_models(catkeys)
        self._h1binder.load_all_models(["sec_" + cat for cat in catkeys])

    def predict_arrs(self, arrs):
        """Classify ``arrs`` and return one label per sample.

        The intermediate two-level score frame is kept in ``self._df``.
        """
        df = self._predict_arrs(arrs)
        self._df = df
        return self._h1binder.df2classes(df, self._SECOND_THRESHOLD)

    def _predict_arrs(self, arrs):
        """Return a frame of second-level scores (0 where the first level rejected the sample)."""
        firstdf = pd.DataFrame(self._h2binder.predict_arrs(arrs))

        # Start from all zeros; only samples that pass the first level get a score.
        resultdf = pd.DataFrame(np.zeros(firstdf.shape), columns=firstdf.columns)

        for targetkey in self._categories:
            second = self._predict_second(targetkey, arrs, firstdf)
            if second is not None:
                resultdf.loc[second['orgindex'], targetkey] = second[targetkey].values

        return resultdf

    def _predict_second(self, targetcat, arrs, firstdf):
        """Re-score the samples that passed the first level for ``targetcat``.

        Returns:
            None when no sample passed, otherwise a frame with the column
            ``targetcat`` (second-level scores) and ``'orgindex'`` (row
            positions of those samples in ``arrs`` / ``firstdf``).
        """
        filtered = firstdf[firstdf[targetcat] > self._FIRST_THRESHOLD]

        if len(filtered) == 0:
            return None

        orgindex = filtered.index
        model = self._h1binder.get_or_load_model('sec_' + targetcat)

        if model is None:
            # no second level classifier, all score is already enough.
            scores = np.ones(len(orgindex))
        else:
            scores = model.predict(arrs[orgindex.values, :])[:, 1]

        # The column must be keyed by `targetcat`; the previous `targetkey`
        # was an undefined name in this scope.
        return pd.DataFrame({targetcat: scores, 'orgindex': orgindex})
    
    


In [27]:
from models.processor import DataSet

In [28]:
ds = DataSet()

In [39]:
binder = ModelBinder( "model", "trained_model", categories)

In [None]:
binder.load_all_models(categories)

load bay
load beach
load birds
load boeing
load buildings
load city
load clouds
load f-16
load face
load helicopter
load mountain
load ocean
load ships


In [None]:
# Predict the shuffled test set chunk by chunk (500 rows per ds.chunked call).
# Results are accumulated across chunks: `label` and `resdf` on their own only
# ever hold the values of the last chunk processed.
test_labels = []
chunk_preds = []
for chunk in ds.chunked(testdf_shuffled, 500):
    x = ds.files_to_dataset(chunk['files'])
    label = chunk['category']
    resdf = binder.predict_arrs(x)
    test_labels.extend(label)
    chunk_preds.append(resdf)

# One row of per-model scores per test file, in the same order as test_labels.
# pd.concat raises on an empty list, so fall back to an empty frame.
test_preds = pd.concat(chunk_preds, ignore_index=True) if chunk_preds else pd.DataFrame()

In [4]:
cat = "clouds"

In [6]:
len(testdict[cat])

130

In [16]:
testdict[cat][0:5]

['data/clouds/0367.jpeg',
 'data/clouds/0567.jpeg',
 'data/clouds/0143.jpeg',
 'data/clouds/0004.jpeg',
 'data/clouds/0502.jpeg']

In [11]:
x = ds.files_to_dataset(testdict[cat])

In [12]:
x.shape

(130, 224, 224, 3)

In [None]:
    
# below here is not necessary
    def bottlenecks2predict(self, btlnks):
        """Score ``btlnks`` with every model in ``self._models``.

        Returns a dict mapping each model key to column 1 of that model's
        ``predict(btlnks)`` output.
        """
        scores = {}
        for key, model in self._models.items():
            scores[key] = model.predict(btlnks)[:, 1]
        return scores

    def files2bottlenecks(self, files):
        """Collect everything ``self.extractor.map_code(files)`` yields into one numpy array."""
        codes = [code for code in self.extractor.map_code(files)]
        return np.array(codes)

    def predict(self, files):
        """Score ``files`` and return the per-model scores as a DataFrame.

        The frame has one column per key returned by ``bottlenecks2predict``
        plus a ``"filepaths"`` column holding ``files``.
        """
        scores = self.bottlenecks2predict(self.files2bottlenecks(files))
        scores["filepaths"] = files
        return pd.DataFrame(scores)



    def add_argmax_max(self, df):
        """Add ``'argmax'`` and ``'max'`` columns to ``df`` in place.

        The first ``len(self._models)`` columns of ``df`` are taken as the
        score columns: ``'argmax'`` gets the name of the highest-scoring
        column per row and ``'max'`` gets that highest score.
        """
        n_scores = len(list(self._models.keys()))
        score_cols = df.columns[:n_scores]
        df['argmax'] = df[score_cols].idxmax(axis=1)
        df['max'] = df[score_cols].max(axis=1)

    def arrs2bottlenecks(self, arrs):
        """Delegate to ``self.extractor.arrs2bottlenecks`` and return its result."""
        extractor = self.extractor
        return extractor.arrs2bottlenecks(arrs)

In [None]:
    # below here is not necessary
    def _predict_bottlenecks_df(self, btlnks):
        """Build the two-level score frame from precomputed bottlenecks ``btlnks``.

        First-level scores come from ``self._h2binder.bottlenecks2predict``; each
        category is then re-scored through ``self._predict_second``.

        NOTE(review): iterates ``self._catkeys``, which is not assigned anywhere
        in the visible notebook -- confirm before reviving this code.
        """
        firstdf = pd.DataFrame(self._h2binder.bottlenecks2predict(btlnks))

        # Result starts as all zeros with the same shape/columns as the first-level scores.
        resultdf = pd.DataFrame(np.zeros(firstdf.shape))
        resultdf.columns = firstdf.columns

        for targetkey in self._catkeys:

            df = self._predict_second(targetkey, btlnks, firstdf)
            # None means _predict_second produced nothing for this category; keep the zeros.
            if df is not None:
                # Write the returned scores back at the rows listed in 'orgindex'.
                resultdf.loc[df['orgindex'], targetkey] = df[targetkey].values

        return resultdf

    def _predict_bottlenecks_cattupple(self, btlnks):
        """Score ``btlnks``, keep the score frame in ``self._df`` and convert it with ``self._df2classes``.

        NOTE(review): ``_df2classes`` is not defined anywhere in the visible
        notebook; ``predict_files_df`` indexes each returned item with [0] and
        [1], so the result is presumably a sequence of pairs -- confirm.
        """
        df = self._predict_bottlenecks_df(btlnks)
        self._df = df
        return self._df2classes(df)
    
    def predict_arrs(self, arrs):
        """Convert ``arrs`` to bottlenecks via the first-level binder, then classify them."""
        features = self._h2binder.arrs2bottlenecks(arrs)
        result = self._predict_bottlenecks_cattupple(features)
        return result

    def predict_files(self, files):
        """Convert ``files`` to bottlenecks via the first-level binder, then classify them."""
        features = self._h2binder.files2bottlenecks(files)
        result = self._predict_bottlenecks_cattupple(features)
        return result

    def predict_files_df(self, files):
        """Classify ``files`` and return the result as a DataFrame.

        Columns: ``"filepaths"`` (the input files), ``"category"`` (item 0 of
        each prediction tuple) and ``"label"`` (item 1 of each prediction tuple).
        """
        features = self._h2binder.files2bottlenecks(files)
        tups = self._predict_bottlenecks_cattupple(features)

        categories = [pair[0] for pair in tups]
        labels = [pair[1] for pair in tups]
        return pd.DataFrame({"filepaths": files, "category": categories, "label": labels})



In [14]:
from models.modelutils import load_best_model

In [17]:
first = load_best_model("trained_model/model_{}".format(cat))

In [18]:
first.predict(x)

array([[  9.99939442e-03,   9.90000606e-01],
       [  1.30196549e-02,   9.86980259e-01],
       [  7.25162565e-04,   9.99274790e-01],
       [  6.83021406e-03,   9.93169785e-01],
       [  6.39600083e-02,   9.36039984e-01],
       [  3.99376824e-02,   9.60062325e-01],
       [  8.19742978e-02,   9.18025732e-01],
       [  1.39728012e-02,   9.86027241e-01],
       [  9.42717865e-03,   9.90572751e-01],
       [  2.22350620e-02,   9.77764904e-01],
       [  7.79604912e-03,   9.92203951e-01],
       [  1.10087417e-01,   8.89912605e-01],
       [  4.04850245e-02,   9.59515035e-01],
       [  1.80641524e-02,   9.81935799e-01],
       [  1.01254266e-02,   9.89874601e-01],
       [  1.80572248e-03,   9.98194277e-01],
       [  3.81261766e-01,   6.18738234e-01],
       [  3.70556898e-02,   9.62944269e-01],
       [  3.42167281e-02,   9.65783238e-01],
       [  9.40244820e-04,   9.99059737e-01],
       [  2.24162545e-02,   9.77583766e-01],
       [  1.35218806e-03,   9.98647869e-01],
       [  