# Compute similarities between categories using trained classifiers

In this notebook our proposed similarity, $ClassSim$, is computed for the OVR and mutl-class case.

In order to compute $ClassSim$, this notebook requires trained OVR classifiers which are obtained in *train.ipynb* and a trained multi-class classifier which is obtained in *train_multiclass_classifier.ipynb*.

## Set up

In [1]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [2]:
!mkdir -p results

In [3]:
VALID_DF_PATH = "results/valid_df_fgvc.dat"
VALID_SIMILARITY_DICT_PATH="results/valid_sim_df_fgvc.dat"

In [4]:
BASE_MODEL_PATH="trained_model"

In [5]:
from models.modelutils import dir2filedict, split_fdict
import random

Using TensorFlow backend.


Load category information and all of image paths.

In [None]:
trdict = dir2filedict("data_fgvc/train")
valdict = dir2filedict("data_fgvc/valid")
categories = [str(i) for i in range(0, 100)]

In [9]:
valdict['clouds'][0:5]

['data/clouds/0678.jpeg',
 'data/clouds/0701.jpeg',
 'data/clouds/0431.jpeg',
 'data/clouds/0033.jpeg',
 'data/clouds/0290.jpeg']

Here is expected outputs.   
The output may be different if you create image urls yourself or exlude some files for GMM, but all the outputs in {*train.ipynb*, *classifier_similarity.ipynb*, *train_multiclass_classifier.ipynb*, *train_second.ipynb*} must be the same. 

['data_fgvc/valid/0/1319365.jpg',  
 'data_fgvc/valid/0/0062781.jpg',  
 'data_fgvc/valid/0/1042021.jpg',  
 'data_fgvc/valid/0/0602177.jpg',  
 'data_fgvc/valid/0/0817494.jpg']

In [10]:
# target is validation set.
fdict = valdict

## Compute $ClassSim$ using OVR classifiers

Now compute our proposed similarities using trained OVR classifiers in *train.ipynb*.

In [11]:
from models.modelutils import load_best_model_if_exist
import os
from models.processor import DataSet

In [12]:
class ModelBinder:
    def __init__(self, base_model_name, basedir):
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}
        self.verbose = True
        self.ds = DataSet()
        self.chunk_size = 3000
        
    def predict_chunks(self, chunked_paths):
        datas = self.ds.files_to_dataset(chunked_paths)
        models = self._models
        return {key: models[key].predict(datas)[:, 1] for key in models.keys()}
    
    def predict_files(self, flist):
        chunked_paths = self.ds.chunked(flist, self.chunk_size)
        
        models = self._models
        preddictlist =  [self.predict_chunks(chunk) for chunk in chunked_paths]
        preddict = {key:[] for key in models.keys()}
        for onedict in preddictlist:
            for key in models.keys():
                preddict[key].extend(onedict[key])
        return preddict

    def model_path(self, target_key):
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        if target_key in self._models:
            return self._models[target_key]
        self.notify("load {}".format(target_key))
        self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        list(map(self.get_or_load_model, keys))

    def predict(self, files):
        preddict = self.predict_files(files)
        preddict["filepaths"] = files
        return pd.DataFrame(preddict)

    def add_argmax_max(self, df):
        catkeys = list(self._models.keys())
        df['argmax'] = df[df.columns[:len(catkeys)]].idxmax(axis=1)
        df['max'] = df[df.columns[0:len(catkeys)]].max(axis=1)

In [13]:
def ModelBinder_create(base_model_name = "model", basedir = BASE_MODEL_PATH):
    return ModelBinder(base_model_name, basedir)

### Predict scores and store them as DataFrame

In [14]:
ftuppls = [(key, file) for key in fdict.keys() for file in fdict[key]]

In [15]:
labels = [tup[0] for tup in ftuppls]

In [16]:
paths = [tup[1] for tup in ftuppls]

In [17]:
len(labels), len(paths)

(1894, 1894)

In [18]:
binder = ModelBinder_create()

In [19]:
binder.load_all_models(catkeys)

load bay
load beach
load birds
load boeing
load buildings
load city
load clouds
load f-16
load face
load helicopter
load mountain
load ocean
load ships
load sky
load sunrise
load sunset


In [20]:
%%time
df = binder.predict(paths)

CPU times: user 9min 7s, sys: 32.5 s, total: 9min 39s
Wall time: 8min 36s


In [21]:
df['label'] = labels

Save the computed scores as pickle.

In [22]:
df.to_pickle(VALID_DF_PATH)

### Compute $ClassSim$ using computed scores

In [23]:
class SimilarityCalculator:
    def __init__(self, df, thrshold=0.5):
        self.df = df
        self.rmiss = {}
        self.thrshold = thrshold
        self.cats = [key for key in df.columns.values if key != "filepaths" and key !="label"]
        self.catidxs = [list(df.columns).index(key) for key in self.cats]
    
    def combinekey(self, fromkey, tokey):
        return "{0}~{1}".format(fromkey, tokey)
    
    def saveAll(self):
        for key in self.cats:
            self.saveNaxx(key)
            
    def saveNaxx(self, fromkey):
        targetdf = self.df[self.df['label'] == fromkey]
        
        Nab = (targetdf[targetdf.columns[self.catidxs]] > self.thrshold).sum()
        Na = len(targetdf)
        
        Nab_a = Nab/Na
        
        otherkeys = Nab_a.index
        list(map(lambda otherkey: self.add(fromkey, otherkey, Nab_a[otherkey]), otherkeys))
        
    def add(self, fromkey, tokey, val):
        key = self.combinekey(fromkey, tokey)
        self.rmiss.setdefault(key, 0)
        self.rmiss[key] += val
        
    def similarity(self, fromkey, tokey):
        key1 = self.combinekey(fromkey, tokey)
        key2 = self.combinekey(tokey, fromkey)
        return (self.rmiss[key1] + self.rmiss[key2])/2

In [24]:
sim = SimilarityCalculator(df)

In [25]:
sim.saveAll()

In [26]:
keys = sim.cats

The number of all combinations between classes.

In [27]:
import itertools
len(list(itertools.combinations(keys, 2)))

120

In [28]:
simdict = {key1: [sim.similarity(key1, key2) for key2 in keys] for key1 in keys}

In [29]:
simdf = pd.DataFrame(simdict)

In [30]:
simdf.index = keys

In [31]:
simdf

Unnamed: 0,bay,beach,birds,boeing,buildings,city,clouds,f-16,face,helicopter,mountain,ocean,ships,sky,sunrise,sunset
bay,0.829787,0.626241,0.003788,0.024823,0.069321,0.301298,0.0,0.027935,0.007196,0.003546,0.188109,0.319808,0.077143,0.033878,0.055251,0.055866
beach,0.626241,0.921429,0.0,0.010714,0.025922,0.073184,0.014423,0.022959,0.007221,0.029976,0.114174,0.24548,0.02276,0.030357,0.105645,0.094794
birds,0.003788,0.0,0.984848,0.018939,0.003788,0.004237,0.0,0.026515,0.037464,0.008061,0.0,0.045488,0.007576,0.003788,0.012952,0.028313
boeing,0.024823,0.010714,0.018939,0.990099,0.013015,0.035325,0.0,0.258386,0.015899,0.152958,0.011719,0.066897,0.059322,0.013879,0.010753,0.004237
buildings,0.069321,0.025922,0.003788,0.013015,0.991935,0.656096,0.0,0.0,0.00365,0.004274,0.019783,0.025371,0.09158,0.008497,0.025538,0.028841
city,0.301298,0.073184,0.004237,0.035325,0.656096,0.957627,0.0,0.017814,0.029073,0.017022,0.060249,0.055647,0.097458,0.013166,0.040414,0.038136
clouds,0.0,0.014423,0.0,0.0,0.0,0.0,0.961538,0.0,0.0,0.0,0.085337,0.259658,0.0,0.787088,0.116263,0.128015
f-16,0.027935,0.022959,0.026515,0.258386,0.0,0.017814,0.0,0.969388,0.022605,0.187773,0.015625,0.050027,0.125649,0.005102,0.016129,0.029661
face,0.007196,0.007221,0.037464,0.015899,0.00365,0.029073,0.0,0.022605,0.992701,0.007299,0.018761,0.050772,0.015774,0.008114,0.03995,0.024248
helicopter,0.003546,0.029976,0.008061,0.152958,0.004274,0.017022,0.0,0.187773,0.007299,0.991453,0.028078,0.004425,0.097892,0.004464,0.010753,0.004237


Save the similarity results as pickle.

In [32]:
simdf.to_pickle(VALID_SIMILARITY_DICT_PATH)

## Compute $ClassSim$ using multi-class classifier

Next compute our proposed similarities using trained multi-class classifier in *train_multiclass_classifier.ipynb*.

Load multi-class classifier.

In [33]:
from models.modelutils import load_model_from

In [34]:
model = load_model_from("trained_model/multiclass/multiclass.json", "trained_model/multiclass/multiclass.h5")

In [35]:
with open("trained_model/multiclass/multiclass-labels.json", 'r') as f:
    labeldic = json.load(f)

### Predict scores and store them as DataFrame

In [36]:
ds = DataSet()

In [37]:
multidic = {val:key for key,val in labeldic.items()}

In [38]:
multidic

{0: 'bay',
 1: 'beach',
 2: 'birds',
 3: 'boeing',
 4: 'buildings',
 5: 'city',
 6: 'clouds',
 7: 'f-16',
 8: 'face',
 9: 'helicopter',
 10: 'mountain',
 11: 'ocean',
 12: 'ships',
 13: 'sky',
 14: 'sunrise',
 15: 'sunset'}

In [39]:
ftuppls = [(key, file) for key in fdict.keys() for file in fdict[key]]

In [40]:
labels = [tup[0] for tup in ftuppls]

In [41]:
paths = [tup[1] for tup in ftuppls]

In [42]:
len(labels), len(paths)

(1894, 1894)

In [43]:
def predict_flist(flist):
    arr = ds.files_to_dataset(flist)
    scores = model.predict(arr)
    return np.argmax(scores, axis=1)

In [44]:
chunked_paths = ds.chunked(paths, 3000)

res_inds = []

for chunk in chunked_paths:
    inds = predict_flist(chunk)
    res_inds = np.concatenate((res_inds, inds)).astype(int)
    
unordered_df = pd.get_dummies([multidic[ind] for ind in res_inds])
df = unordered_df[catkeys]

In [45]:
df['label'] = labels

Save the computed scores as pickle.

In [46]:
df.to_pickle("results/multi_df.dat")

### Compute $ClassSim$ using computed scores

In [47]:
sim = SimilarityCalculator(df)

In [48]:
sim.saveAll()

In [49]:
keys = sim.cats

In [50]:
simdict = {key1: [sim.similarity(key1, key2) for key2 in keys] for key1 in keys}

In [51]:
simdf = pd.DataFrame(simdict)

In [52]:
simdf.index = keys

In [53]:
simdf

Unnamed: 0,bay,beach,birds,boeing,buildings,city,clouds,f-16,face,helicopter,mountain,ocean,ships,sky,sunrise,sunset
bay,0.617021,0.245719,0.003788,0.0,0.044498,0.122972,0.004808,0.008648,0.007299,0.011366,0.09339,0.086738,0.061486,0.047271,0.019675,0.012712
beach,0.245719,0.614286,0.003788,0.0,0.014747,0.007809,0.0,0.0,0.0,0.0,0.011049,0.039823,0.007143,0.004464,0.008948,0.014286
birds,0.003788,0.003788,0.954545,0.00495,0.0,0.0,0.004808,0.005102,0.011225,0.0,0.007812,0.00885,0.003788,0.003788,0.0,0.004237
boeing,0.0,0.0,0.00495,0.881188,0.00495,0.0,0.0,0.039604,0.0,0.00495,0.0,0.0,0.0,0.00495,0.0,0.0
buildings,0.044498,0.014747,0.0,0.00495,0.887097,0.121651,0.0,0.009134,0.0,0.0,0.003906,0.012882,0.016744,0.004464,0.005376,0.0
city,0.122972,0.007809,0.0,0.0,0.121651,0.550847,0.0,0.005102,0.0,0.008475,0.008144,0.0,0.012712,0.004237,0.0,0.008475
clouds,0.004808,0.0,0.004808,0.0,0.0,0.0,0.701923,0.0,0.0,0.0,0.021334,0.041355,0.0,0.247596,0.004808,0.014423
f-16,0.008648,0.0,0.005102,0.039604,0.009134,0.005102,0.0,0.887755,0.005102,0.038331,0.012915,0.00885,0.012712,0.0,0.0,0.0
face,0.007299,0.0,0.011225,0.0,0.0,0.0,0.0,0.005102,0.956204,0.0,0.007556,0.012499,0.0,0.008114,0.005376,0.008475
helicopter,0.011366,0.0,0.0,0.00495,0.0,0.008475,0.0,0.038331,0.0,0.940171,0.004274,0.0,0.025496,0.0,0.0,0.004237


Save the similarity results as pickle.

In [54]:
simdf.to_pickle("results/multi_sim_df.dat")