# Compute similarities between categories using trained classifiers

In this notebook, our proposed similarity, $ClassSim$, is computed for both the OVR and the multi-class cases.

In order to compute $ClassSim$, this notebook requires trained OVR classifiers which are obtained in *train.ipynb* and a trained multi-class classifier which is obtained in *train_multiclass_classifier.ipynb*.

## Set up

In [1]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Create the output directory in pure Python so the cell also works where
# the `mkdir -p` shell command is unavailable; it is a no-op if it exists.
os.makedirs("results", exist_ok=True)

In [3]:
# Pickle file that receives the per-image OVR score DataFrame
# (written later in this notebook with df.to_pickle).
VALID_DF_PATH = "results/valid_df_fgvc.dat"
# Pickle file that receives the class-by-class similarity DataFrame
# (written with simdf.to_pickle; despite "DICT" in the name it stores a DataFrame).
VALID_SIMILARITY_DICT_PATH="results/valid_sim_df_fgvc.dat"

In [4]:
BASE_MODEL_PATH="trained_model"

In [5]:
from models.modelutils import dir2filedict_sorted, split_fdict
import random

Using TensorFlow backend.


Load the category information and all image paths.

In [6]:
# Per-category file lists for the training and validation splits.
# Indexing with a category key (e.g. valdict['0']) yields a list of image
# paths, as the next cell shows.
trdict = dir2filedict_sorted("data_fgvc/train")
valdict = dir2filedict_sorted("data_fgvc/valid")
# Category keys are the stringified class indices "0" .. "99".
categories = [str(i) for i in range(0, 100)]

In [7]:
valdict['0'][0:5]

['data_fgvc/valid/0/0062781.jpg',
 'data_fgvc/valid/0/0113201.jpg',
 'data_fgvc/valid/0/0450014.jpg',
 'data_fgvc/valid/0/0602177.jpg',
 'data_fgvc/valid/0/0716386.jpg']

Here are the expected outputs.   
The outputs of this cell in {*train.ipynb*, *classifier_similarity.ipynb*, *train_multiclass_classifier.ipynb*, *train_second.ipynb*} must all be the same. 

['data_fgvc/valid/0/0062781.jpg',  
 'data_fgvc/valid/0/0113201.jpg',  
 'data_fgvc/valid/0/0450014.jpg',  
 'data_fgvc/valid/0/0602177.jpg',  
 'data_fgvc/valid/0/0716386.jpg']

In [8]:
# target is validation set.
fdict = valdict

## Compute $ClassSim$ using OVR classifiers

Now compute our proposed similarities using the OVR classifiers trained in *train.ipynb*.

In [9]:
from models.modelutils import load_best_model_if_exist
import os
from models.processor import DataSet

In [10]:
class ModelBinder:
    """Bundle of per-category (one-vs-rest) classifiers sharing one input pipeline.

    Models are loaded on demand from ``basedir`` and cached in ``self._models``
    keyed by category key. Prediction runs every cached model over the same
    files and yields one score column per category key.
    """

    def __init__(self, base_model_name, basedir):
        """Remember where the models live; no model is loaded here.

        Args:
            base_model_name: prefix of every model path (see ``model_path``).
            basedir: directory that contains the per-category models.
        """
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}          # category key -> loaded model
        self.verbose = True        # when True, ``notify`` prints progress
        self.ds = DataSet()
        self.chunk_size = 3000     # number of files handed to the models at once

    def predict_chunks(self, chunked_paths):
        """Score one chunk of file paths with every cached model.

        Returns:
            dict mapping category key -> column 1 of that model's prediction
            output (presumably the positive-class score -- TODO confirm
            against the training notebook).
        """
        datas = self.ds.files_to_dataset(chunked_paths)
        return {key: model.predict(datas)[:, 1] for key, model in self._models.items()}

    def predict_files(self, flist):
        """Score all files in ``flist``, processing ``chunk_size`` files at a time.

        Returns:
            dict mapping category key -> list of scores, in the order of ``flist``.
        """
        preddict = {key: [] for key in self._models}
        # Consume the chunks one by one so the per-chunk results do not all
        # have to be held in a second list before merging.
        for chunk in self.ds.chunked(flist, self.chunk_size):
            for key, scores in self.predict_chunks(chunk).items():
                preddict[key].extend(scores)
        return preddict

    def model_path(self, target_key):
        """Return ``<basedir>/<base_model_name>_<target_key>``."""
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        """Return the cached model for ``target_key``, loading it on first use."""
        if target_key not in self._models:
            self.notify("load {}".format(target_key))
            # NOTE(review): whatever the loader returns is cached as-is; the
            # helper's name suggests it may return a "no model" value when the
            # path is missing -- confirm and handle if so.
            self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        """Print ``msg`` when ``self.verbose`` is set."""
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        """Load (or reuse) the model of every key in ``keys``."""
        for key in keys:
            self.get_or_load_model(key)

    def predict(self, files):
        """Score ``files`` with every cached model.

        Returns:
            DataFrame with one score column per category key plus a
            ``filepaths`` column holding ``files``.
        """
        preddict = self.predict_files(files)
        preddict["filepaths"] = files
        return pd.DataFrame(preddict)

    def add_argmax_max(self, df):
        """Add ``argmax`` (best category key) and ``max`` (its score) to ``df`` in place.

        The score columns are selected by model key rather than by position,
        so the result does not depend on where ``filepaths``/``label`` or any
        other extra column sits in ``df``.
        """
        catkeys = list(self._models.keys())
        scores = df[catkeys]
        df['argmax'] = scores.idxmax(axis=1)
        df['max'] = scores.max(axis=1)

In [28]:
def ModelBinder_create(base_model_name = "modelfgcv", basedir = BASE_MODEL_PATH):
    """Build a ModelBinder using this notebook's default model prefix and directory."""
    binder = ModelBinder(base_model_name=base_model_name, basedir=basedir)
    return binder

### Predict scores and store them as DataFrame

In [29]:
ftuppls = [(key, file) for key in fdict.keys() for file in fdict[key]]

In [30]:
labels = [tup[0] for tup in ftuppls]

In [31]:
paths = [tup[1] for tup in ftuppls]

In [32]:
len(labels), len(paths)

(700, 700)

In [33]:
binder = ModelBinder_create()

In [35]:
catkeys = [str(elem) for elem in range(0,100)]

In [36]:
binder.load_all_models(catkeys)

load 0
load 1
load 2
load 3
load 4
load 5
load 6
load 7
load 8
load 9
load 10
load 11
load 12
load 13
load 14
load 15
load 16
load 17
load 18
load 19
load 20
load 21
load 22
load 23
load 24
load 25
load 26
load 27
load 28
load 29
load 30
load 31
load 32
load 33
load 34
load 35
load 36
load 37
load 38
load 39
load 40
load 41
load 42
load 43
load 44
load 45
load 46
load 47
load 48
load 49
load 50
load 51
load 52
load 53
load 54
load 55
load 56
load 57
load 58
load 59
load 60
load 61
load 62
load 63
load 64
load 65
load 66
load 67
load 68
load 69
load 70
load 71
load 72
load 73
load 74
load 75
load 76
load 77
load 78
load 79
load 80
load 81
load 82
load 83
load 84
load 85
load 86
load 87
load 88
load 89
load 90
load 91
load 92
load 93
load 94
load 95
load 96
load 97
load 98
load 99


In [37]:
%%time
df = binder.predict(paths)

CPU times: user 1h 42s, sys: 2min 24s, total: 1h 3min 6s
Wall time: 1h 3s


In [40]:
df['label'] = labels

Save the computed scores as pickle.

In [41]:
df.to_pickle(VALID_DF_PATH)

In [45]:
df[ df['label'] == '0' ]

Unnamed: 0,0,1,10,11,12,13,14,15,16,17,...,92,93,94,95,96,97,98,99,filepaths,label
238,0.089616,0.017954,0.209178,0.022456,0.034009,0.038931,0.254064,0.392112,0.698947,0.812348,...,0.060931,0.270459,0.313821,0.088379,0.040737,0.390475,0.368633,0.541874,data_fgvc/valid/0/0062781.jpg,0
239,0.031177,0.00579,0.014931,0.070781,0.043547,0.130371,0.07,0.051315,0.299628,0.266218,...,0.191735,0.40742,0.366743,0.141985,0.050744,0.304188,0.221305,0.672944,data_fgvc/valid/0/0113201.jpg,0
240,0.01586,0.163897,0.35588,0.442474,0.285891,0.30357,0.478263,0.519271,0.708889,0.439306,...,0.277683,0.113334,0.181148,0.185438,0.521531,0.430259,0.301127,0.447388,data_fgvc/valid/0/0450014.jpg,0
241,0.999731,0.761334,0.367704,0.692881,0.049585,0.569026,0.58529,0.603518,0.093596,0.604499,...,0.428543,0.343016,0.195761,0.160351,0.283461,0.565337,0.230096,0.569375,data_fgvc/valid/0/0602177.jpg,0
242,0.013903,0.456479,0.574854,0.270044,0.25484,0.313575,0.816059,0.894145,0.477749,0.529812,...,0.469626,0.537903,0.269574,0.152358,0.114862,0.33801,0.378427,0.521545,data_fgvc/valid/0/0716386.jpg,0
243,0.449196,0.565291,0.460743,0.594603,0.141617,0.268219,0.531257,0.586334,0.516807,0.417807,...,0.086205,0.381821,0.381246,0.132773,0.136187,0.684642,0.48096,0.682915,data_fgvc/valid/0/0869722.jpg,0
244,0.998139,0.897536,0.850995,0.044796,0.096523,0.725767,0.527185,0.663421,0.461256,0.782678,...,0.133169,0.423191,0.242352,0.106318,0.199159,0.389018,0.412814,0.577946,data_fgvc/valid/0/1514481.jpg,0


### Compute $ClassSim$ using computed scores

In [46]:
class SimilarityCalculator:
    """Compute the symmetric class similarity from a per-sample score table.

    ``df`` holds one score column per category plus ``filepaths`` and
    ``label`` columns. For each ordered pair (a, b) the calculator stores, in
    ``rmiss["a~b"]``, the fraction of samples labelled ``a`` whose score for
    ``b`` exceeds the threshold. ``similarity(a, b)`` is the mean of the two
    directions.
    """

    def __init__(self, df, thrshold=0.5):
        """
        Args:
            df: score table as described on the class.
            thrshold: a score strictly above this counts as "predicted as
                that category". (Parameter spelling kept for compatibility.)
        """
        self.df = df
        self.rmiss = {}
        self.thrshold = thrshold
        # Every column that is not bookkeeping is treated as a category.
        self.cats = [key for key in df.columns.values if key != "filepaths" and key !="label"]
        # Positional indices of the category columns (kept as a public attribute).
        self.catidxs = [list(df.columns).index(key) for key in self.cats]

    def combinekey(self, fromkey, tokey):
        """Return the ``rmiss`` key for the ordered pair, ``"<from>~<to>"``."""
        return "{0}~{1}".format(fromkey, tokey)

    def saveAll(self):
        """Fill ``rmiss`` for every category; safe to call more than once."""
        for key in self.cats:
            self.saveNaxx(key)

    def saveNaxx(self, fromkey):
        """Store, for samples labelled ``fromkey``, the above-threshold rate per category.

        The values are assigned rather than accumulated, so re-running this
        (e.g. re-executing the notebook cell) does not inflate the ratios.
        If no sample carries the label ``fromkey`` the division below is by
        zero and the stored ratios are not meaningful.
        """
        targetdf = self.df[self.df['label'] == fromkey]

        # Count, per category column, the samples scored above the threshold.
        Nab = (targetdf[self.cats] > self.thrshold).sum()
        Na = len(targetdf)

        Nab_a = Nab / Na

        for tokey, ratio in Nab_a.items():
            self.rmiss[self.combinekey(fromkey, tokey)] = ratio

    def add(self, fromkey, tokey, val):
        """Accumulate ``val`` onto the stored value for ``fromkey`` -> ``tokey``."""
        key = self.combinekey(fromkey, tokey)
        self.rmiss.setdefault(key, 0)
        self.rmiss[key] += val

    def similarity(self, fromkey, tokey):
        """Return the mean of the two directed ratios between the categories.

        Raises:
            KeyError: if the ratios for either direction have not been
                stored yet (call ``saveAll`` first).
        """
        key1 = self.combinekey(fromkey, tokey)
        key2 = self.combinekey(tokey, fromkey)
        return (self.rmiss[key1] + self.rmiss[key2])/2

In [47]:
sim = SimilarityCalculator(df)

In [48]:
sim.saveAll()

In [49]:
keys = sim.cats

The number of all pairwise combinations of classes.

In [50]:
import itertools
len(list(itertools.combinations(keys, 2)))

4950

In [51]:
simdict = {key1: [sim.similarity(key1, key2) for key2 in keys] for key1 in keys}

In [52]:
simdf = pd.DataFrame(simdict)

In [53]:
simdf.index = keys

In [54]:
simdf

Unnamed: 0,0,1,10,11,12,13,14,15,16,17,...,90,91,92,93,94,95,96,97,98,99
0,0.285714,0.285714,0.500000,0.357143,0.142857,0.571429,0.357143,0.500000,0.285714,0.357143,...,0.000000,0.000000,0.000000,0.071429,0.000000,0.000000,0.285714,0.142857,0.071429,0.428571
1,0.285714,0.857143,0.571429,0.000000,0.214286,0.142857,0.428571,0.357143,0.571429,0.428571,...,0.071429,0.000000,0.071429,0.142857,0.000000,0.000000,0.000000,0.214286,0.285714,0.357143
10,0.500000,0.571429,0.714286,0.428571,0.642857,0.642857,0.428571,0.142857,0.500000,0.500000,...,0.071429,0.142857,0.142857,0.142857,0.000000,0.071429,0.142857,0.214286,0.285714,0.214286
11,0.357143,0.000000,0.428571,0.571429,0.428571,0.714286,0.500000,0.071429,0.357143,0.428571,...,0.000000,0.071429,0.000000,0.071429,0.071429,0.071429,0.000000,0.000000,0.071429,0.071429
12,0.142857,0.214286,0.642857,0.428571,0.285714,0.571429,0.142857,0.214286,0.428571,0.214286,...,0.000000,0.071429,0.071429,0.071429,0.071429,0.071429,0.000000,0.000000,0.142857,0.071429
13,0.571429,0.142857,0.642857,0.714286,0.571429,0.714286,0.214286,0.285714,0.428571,0.428571,...,0.071429,0.214286,0.000000,0.071429,0.214286,0.142857,0.214286,0.000000,0.142857,0.142857
14,0.357143,0.428571,0.428571,0.500000,0.142857,0.214286,0.428571,0.785714,0.428571,0.428571,...,0.000000,0.142857,0.000000,0.285714,0.071429,0.285714,0.071429,0.285714,0.357143,0.285714
15,0.500000,0.357143,0.142857,0.071429,0.214286,0.285714,0.785714,0.714286,0.571429,0.642857,...,0.071429,0.000000,0.000000,0.428571,0.000000,0.000000,0.071429,0.357143,0.357143,0.285714
16,0.285714,0.571429,0.500000,0.357143,0.428571,0.428571,0.428571,0.571429,0.714286,0.714286,...,0.000000,0.000000,0.000000,0.500000,0.357143,0.142857,0.000000,0.428571,0.571429,0.214286
17,0.357143,0.428571,0.500000,0.428571,0.214286,0.428571,0.428571,0.642857,0.714286,0.857143,...,0.071429,0.285714,0.000000,0.714286,0.285714,0.000000,0.142857,0.285714,0.428571,0.285714


Save the similarity results as pickle.

In [55]:
simdf.to_pickle(VALID_SIMILARITY_DICT_PATH)

### Evaluation

In [4]:
simdf = pd.read_pickle(VALID_SIMILARITY_DICT_PATH)

In [10]:
simdf["2"][[str(cat) for cat in range(3, 10)]]

3    0.714286
4    0.357143
5    0.785714
6    0.642857
7    0.571429
8    0.642857
9    0.428571
Name: 2, dtype: float64

In [12]:
simdf["2"][simdf["2"]>0.5]

16    0.571429
2     1.000000
23    0.642857
24    0.571429
3     0.714286
5     0.785714
6     0.642857
64    0.642857
7     0.571429
8     0.642857
Name: 2, dtype: float64

In [17]:
len(simdf["50"][simdf["50"]>0.2]),len(simdf["50"][simdf["50"]>0.3]), len(simdf["50"][simdf["50"]>0.4])

(39, 16, 11)

In [30]:
res = [len(simdf[cat][simdf[cat]>0.4]) for cat in map(str, range(0, 100))]

In [31]:
np.mean(res)

18.57

In [33]:
np.mean([len(simdf[cat][simdf[cat]>0.3]) for cat in map(str, range(0, 100))])

25.109999999999999

## Compute $ClassSim$ using multi-class classifier

Next, compute our proposed similarities using the multi-class classifier trained in *train_multiclass_classifier.ipynb*.

Load multi-class classifier.

In [33]:
from models.modelutils import load_model_from

In [34]:
model = load_model_from("trained_model/multiclass/multiclass.json", "trained_model/multiclass/multiclass.h5")

In [35]:
with open("trained_model/multiclass/multiclass-labels.json", 'r') as f:
    labeldic = json.load(f)

### Predict scores and store them as DataFrame

In [36]:
ds = DataSet()

In [37]:
multidic = {val:key for key,val in labeldic.items()}

In [38]:
multidic

{0: 'bay',
 1: 'beach',
 2: 'birds',
 3: 'boeing',
 4: 'buildings',
 5: 'city',
 6: 'clouds',
 7: 'f-16',
 8: 'face',
 9: 'helicopter',
 10: 'mountain',
 11: 'ocean',
 12: 'ships',
 13: 'sky',
 14: 'sunrise',
 15: 'sunset'}

In [39]:
ftuppls = [(key, file) for key in fdict.keys() for file in fdict[key]]

In [40]:
labels = [tup[0] for tup in ftuppls]

In [41]:
paths = [tup[1] for tup in ftuppls]

In [42]:
len(labels), len(paths)

(1894, 1894)

In [43]:
def predict_flist(flist):
    """Return, for each file in ``flist``, the index of the highest-scoring class.

    Uses the notebook-level ``ds`` to load the files and the notebook-level
    multi-class ``model`` to score them.
    """
    return np.argmax(model.predict(ds.files_to_dataset(flist)), axis=1)

In [44]:
chunked_paths = ds.chunked(paths, 3000)

# Predict chunk by chunk, then concatenate once instead of growing the
# array inside the loop.
chunk_inds = [predict_flist(chunk) for chunk in chunked_paths]
res_inds = np.concatenate(chunk_inds).astype(int) if chunk_inds else np.array([], dtype=int)

# One-hot encode the predicted label of every image.
unordered_df = pd.get_dummies([multidic[ind] for ind in res_inds])
# get_dummies only creates columns for labels that were actually predicted,
# so reindex (rather than select) to get one column per category in
# `catkeys` order; never-predicted categories become all-zero columns
# instead of raising KeyError. reindex also returns a new frame, so adding
# the 'label' column afterwards does not write into a slice.
# NOTE(review): `catkeys` is the list defined in the OVR section above; it
# must contain the same label names as `multidic`.
df = unordered_df.reindex(columns=catkeys, fill_value=0)

In [45]:
df['label'] = labels

Save the computed scores as pickle.

In [46]:
df.to_pickle("results/multi_df.dat")

### Compute $ClassSim$ using computed scores

In [47]:
sim = SimilarityCalculator(df)

In [48]:
sim.saveAll()

In [49]:
keys = sim.cats

In [50]:
simdict = {key1: [sim.similarity(key1, key2) for key2 in keys] for key1 in keys}

In [51]:
simdf = pd.DataFrame(simdict)

In [52]:
simdf.index = keys

In [53]:
simdf

Unnamed: 0,bay,beach,birds,boeing,buildings,city,clouds,f-16,face,helicopter,mountain,ocean,ships,sky,sunrise,sunset
bay,0.617021,0.245719,0.003788,0.0,0.044498,0.122972,0.004808,0.008648,0.007299,0.011366,0.09339,0.086738,0.061486,0.047271,0.019675,0.012712
beach,0.245719,0.614286,0.003788,0.0,0.014747,0.007809,0.0,0.0,0.0,0.0,0.011049,0.039823,0.007143,0.004464,0.008948,0.014286
birds,0.003788,0.003788,0.954545,0.00495,0.0,0.0,0.004808,0.005102,0.011225,0.0,0.007812,0.00885,0.003788,0.003788,0.0,0.004237
boeing,0.0,0.0,0.00495,0.881188,0.00495,0.0,0.0,0.039604,0.0,0.00495,0.0,0.0,0.0,0.00495,0.0,0.0
buildings,0.044498,0.014747,0.0,0.00495,0.887097,0.121651,0.0,0.009134,0.0,0.0,0.003906,0.012882,0.016744,0.004464,0.005376,0.0
city,0.122972,0.007809,0.0,0.0,0.121651,0.550847,0.0,0.005102,0.0,0.008475,0.008144,0.0,0.012712,0.004237,0.0,0.008475
clouds,0.004808,0.0,0.004808,0.0,0.0,0.0,0.701923,0.0,0.0,0.0,0.021334,0.041355,0.0,0.247596,0.004808,0.014423
f-16,0.008648,0.0,0.005102,0.039604,0.009134,0.005102,0.0,0.887755,0.005102,0.038331,0.012915,0.00885,0.012712,0.0,0.0,0.0
face,0.007299,0.0,0.011225,0.0,0.0,0.0,0.0,0.005102,0.956204,0.0,0.007556,0.012499,0.0,0.008114,0.005376,0.008475
helicopter,0.011366,0.0,0.0,0.00495,0.0,0.008475,0.0,0.038331,0.0,0.940171,0.004274,0.0,0.025496,0.0,0.0,0.004237


Save the similarity results as pickle.

In [54]:
simdf.to_pickle("results/multi_sim_df.dat")