# Calculate similarity between category based on 1 vs all classifier


## 1. Set up

In [1]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

In [2]:
!mkdir -p results

In [3]:
VALID_DF_PATH = "results/valid_df.dat"
VALID_SIMILARITY_DICT_PATH="results/valid_sim_df.dat"

In [4]:
BASE_MODEL_PATH="trained_model"

In [5]:
from models.modelutils import dir2filedict, split_fdict
import random

Using TensorFlow backend.


In [6]:
fdict = dir2filedict("data")

In [7]:
catkeys = sorted(fdict.keys())

In [8]:
trdict, testdict = split_fdict(fdict, test_size=0.2, random_state = 123)
trdict, valdict = split_fdict(trdict, test_size=0.2, random_state = 456)

In [9]:
valdict['clouds'][0:5]

['data/clouds/0678.jpeg',
 'data/clouds/0701.jpeg',
 'data/clouds/0431.jpeg',
 'data/clouds/0033.jpeg',
 'data/clouds/0290.jpeg']

In [10]:
# target is validation set.
fdict = valdict

## Calc similarity and store df

In [11]:
from models.modelutils import load_best_model_if_exist
import os
from models.processor import DataSet

In [13]:
class ModelBinder:
    def __init__(self, base_model_name, basedir):
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}
        self.verbose = True
        self.ds = DataSet()
        self.chunk_size = 3000
    def predict_chunks(self, chunked_paths):
        datas = self.ds.files_to_dataset(chunked_paths)
        models = self._models
        return {key: models[key].predict(datas)[:, 1] for key in models.keys()}
    def predict_files(self, flist):
        chunked_paths = self.ds.chunked(flist, self.chunk_size)
        
        models = self._models
        preddictlist =  [self.predict_chunks(chunk) for chunk in chunked_paths]
        preddict = {key:[] for key in models.keys()}
        for onedict in preddictlist:
            for key in models.keys():
                preddict[key].extend(onedict[key])
        return preddict

    def model_path(self, target_key):
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        if target_key in self._models:
            return self._models[target_key]
        self.notify("load {}".format(target_key))
        self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        list(map(self.get_or_load_model, keys))


    def predict(self, files):
        preddict = self.predict_files(files)
        preddict["filepaths"] = files
        return pd.DataFrame(preddict)


    def add_argmax_max(self, df):
        catkeys = list(self._models.keys())
        df['argmax'] = df[df.columns[:len(catkeys)]].idxmax(axis=1)
        df['max'] = df[df.columns[0:len(catkeys)]].max(axis=1)



In [14]:

def ModelBinder_create(base_model_name = "model", basedir = BASE_MODEL_PATH):
    return ModelBinder(base_model_name, basedir)

## Predict score and store to df

In [15]:
ftuppls = [(key, file) for key in fdict.keys() for file in fdict[key]]

In [16]:
labels = [tup[0] for tup in ftuppls]

In [17]:
paths = [tup[1] for tup in ftuppls]

In [18]:
len(labels), len(paths)

(1894, 1894)

In [19]:
binder = ModelBinder_create()

In [20]:
binder.load_all_models(catkeys)

load bay
load beach
load birds
load boeing
load buildings
load city
load clouds
load f-16
load face
load helicopter
load mountain
load ocean
load ships
load sky
load sunrise
load sunset


In [21]:
# binder._models = ens._models

In [22]:
%%time
df = binder.predict(paths)

  'to RGBA images')
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data),

CPU times: user 9min 10s, sys: 28.4 s, total: 9min 38s
Wall time: 8min 38s


In [23]:
df['label'] = labels

In [24]:
df.to_pickle(VALID_DF_PATH)

## Calculate similarity from score df

In [25]:
class SimilarityCalculator:
    def __init__(self, df, thrshold=0.5):
        self.df = df
        self.rmiss = {}
        self.thrshold = thrshold
        self.cats = [key for key in df.columns.values if key != "filepaths" and key !="label"]
        self.catidxs = [list(df.columns).index(key) for key in self.cats]
    def combinekey(self, fromkey, tokey):
        return "{0}~{1}".format(fromkey, tokey)
    def saveAll(self):
        for key in self.cats:
            self.saveNaxx(key)
    def saveNaxx(self, fromkey):
        targetdf = self.df[self.df['label'] == fromkey]
        
        Nab = (targetdf[targetdf.columns[self.catidxs]] > self.thrshold).sum()
        Na = len(targetdf)
        
        Nab_a = Nab/Na
        
        otherkeys = Nab_a.index
        list(map(lambda otherkey: self.add(fromkey, otherkey, Nab_a[otherkey]), otherkeys))

        
    def add(self, fromkey, tokey, val):
        key = self.combinekey(fromkey, tokey)
        self.rmiss.setdefault(key, 0)
        self.rmiss[key] += val
    def similarity(self, fromkey, tokey):
        key1 = self.combinekey(fromkey, tokey)
        key2 = self.combinekey(tokey, fromkey)
        return (self.rmiss[key1] + self.rmiss[key2])/2
        


In [26]:
sim = SimilarityCalculator(df)

In [27]:
sim.saveAll()

In [28]:
# keys = df.columns[:-2]
keys = sim.cats

In [29]:
import itertools

In [30]:
len(list(itertools.combinations(keys, 2)))

120

In [31]:
simdict = {key1: [sim.similarity(key1, key2) for key2 in keys] for key1 in keys}

In [32]:
simdf = pd.DataFrame(simdict)

In [33]:
simdf.index = keys

In [34]:
simdf.to_pickle(VALID_SIMILARITY_DICT_PATH)

In [35]:
simdf

Unnamed: 0,bay,beach,birds,boeing,buildings,city,clouds,f-16,face,helicopter,mountain,ocean,ships,sky,sunrise,sunset
bay,0.829787,0.626241,0.003788,0.024823,0.069321,0.301298,0.0,0.027935,0.007196,0.003546,0.188109,0.319808,0.077143,0.033878,0.055251,0.055866
beach,0.626241,0.921429,0.0,0.010714,0.025922,0.073184,0.014423,0.022959,0.007221,0.029976,0.114174,0.24548,0.02276,0.030357,0.105645,0.094794
birds,0.003788,0.0,0.984848,0.018939,0.003788,0.004237,0.0,0.026515,0.037464,0.008061,0.0,0.045488,0.007576,0.003788,0.012952,0.028313
boeing,0.024823,0.010714,0.018939,0.990099,0.013015,0.035325,0.0,0.258386,0.015899,0.152958,0.011719,0.066897,0.059322,0.013879,0.010753,0.004237
buildings,0.069321,0.025922,0.003788,0.013015,0.991935,0.656096,0.0,0.0,0.00365,0.004274,0.019783,0.025371,0.09158,0.008497,0.025538,0.028841
city,0.301298,0.073184,0.004237,0.035325,0.656096,0.957627,0.0,0.017814,0.029073,0.017022,0.060249,0.055647,0.097458,0.013166,0.040414,0.038136
clouds,0.0,0.014423,0.0,0.0,0.0,0.0,0.961538,0.0,0.0,0.0,0.085337,0.259658,0.0,0.787088,0.116263,0.128015
f-16,0.027935,0.022959,0.026515,0.258386,0.0,0.017814,0.0,0.969388,0.022605,0.187773,0.015625,0.050027,0.125649,0.005102,0.016129,0.029661
face,0.007196,0.007221,0.037464,0.015899,0.00365,0.029073,0.0,0.022605,0.992701,0.007299,0.018761,0.050772,0.015774,0.008114,0.03995,0.024248
helicopter,0.003546,0.029976,0.008061,0.152958,0.004274,0.017022,0.0,0.187773,0.007299,0.991453,0.028078,0.004425,0.097892,0.004464,0.010753,0.004237


### Calculate by multiclass

In [13]:
from models.modelutils import load_model_from

In [17]:
model = load_model_from("trained_model/multiclass-classification.json", "trained_model/multiclass-classification.h5")

In [18]:
ds = DataSet()

In [19]:
arr = ds.files_to_dataset(fdict['clouds'][0:50])

In [20]:
arr.shape

(50, 224, 224, 3)

In [21]:
model.predict(arr)

array([[  4.96721070e-04,   5.19526875e-05,   1.10543424e-05,
          8.45339002e-07,   1.01411715e-05,   5.92178112e-05,
          9.94535208e-01,   3.62609171e-06,   1.21139183e-05,
          1.46836825e-07,   8.94523982e-04,   1.02049555e-04,
          4.53352477e-05,   3.20550846e-03,   4.28635656e-04,
          1.42955148e-04],
       [  7.01853889e-04,   2.51361023e-04,   4.54426936e-06,
          5.18442248e-05,   2.27106102e-06,   1.45264203e-04,
          1.81485340e-01,   8.32003367e-04,   2.38061584e-05,
          1.00907178e-04,   3.56462406e-04,   3.18455452e-04,
          7.61086121e-04,   8.13844204e-01,   5.26620133e-04,
          5.93939738e-04],
       [  5.66092283e-02,   9.35878884e-03,   7.89730530e-06,
          3.84489031e-05,   7.92378523e-06,   8.27737531e-05,
          7.75388956e-01,   5.52000012e-04,   7.91174170e-06,
          3.45515495e-04,   9.53301322e-03,   6.49301056e-03,
          6.03929046e-04,   1.37966856e-01,   1.86394562e-03,
          1.1399

In [23]:
with open("trained_model/multiclass-classification-labels.json", 'r') as f:
    labeldic = json.load(f)

In [26]:
multidic = {val:key for key,val in labeldic.items()}

In [27]:
multidic

{0: 'bay',
 1: 'beach',
 2: 'birds',
 3: 'boeing',
 4: 'buildings',
 5: 'city',
 6: 'clouds',
 7: 'f-16',
 8: 'face',
 9: 'helicopter',
 10: 'mountain',
 11: 'ocean',
 12: 'ships',
 13: 'sky',
 14: 'sunrise',
 15: 'sunset'}

In [28]:
tmp= Out[21]

In [32]:
np.argmax(tmp, axis=1)

array([ 6, 13,  6,  6,  6,  6, 13,  6,  6,  6, 13,  6, 14,  6,  6, 11,  6,
       15,  6,  6,  6,  6,  6,  6, 13, 13, 13,  6,  6,  6,  6, 13, 10,  6,
        6,  6,  6,  6, 15,  6,  6, 13, 15,  6, 13,  6, 11,  6, 13,  6])

In [33]:
tmp.shape

(50, 16)

In [34]:
len(np.argmax(tmp, axis=1))

50

In [53]:
res = pd.DataFrame(np.zeros((50, 16)))

In [62]:
pd.get_dummies([multidic[ind] for ind in np.argmax(tmp, axis=1)])

Unnamed: 0,clouds,mountain,ocean,sky,sunrise,sunset
0,1,0,0,0,0,0
1,0,0,0,1,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
5,1,0,0,0,0,0
6,0,0,0,1,0,0
7,1,0,0,0,0,0
8,1,0,0,0,0,0
9,1,0,0,0,0,0


In [63]:
ftuppls = [(key, file) for key in fdict.keys() for file in fdict[key]]

In [64]:
labels = [tup[0] for tup in ftuppls]

In [65]:
paths = [tup[1] for tup in ftuppls]

In [66]:
len(labels), len(paths)

(1894, 1894)

In [80]:
def predict_flist(flist):
    arr = ds.files_to_dataset(flist)
    scores = model.predict(arr)
    return np.argmax(scores, axis=1)

In [None]:
chunked_paths = ds.chunked(paths, 3000)

res_inds = []

for chunk in chunked_paths:
    inds = predict_flist(chunk)
    res_inds = np.concatenate((res_inds, inds)).astype(int)
    
unordered_df = pd.get_dummies([multidic[ind] for ind in res_inds])
df = unordered_df[catkeys]




In [None]:
df['label'] = labels

In [None]:
df.to_pickle("results/multi_df.dat")

In [27]:
inds[0:5]

NameError: name 'inds' is not defined

In [81]:
predict_flist(fdict['clouds'][0:50])

array([ 6, 13,  6,  6,  6,  6, 13,  6,  6,  6, 13,  6, 14,  6,  6, 11,  6,
       15,  6,  6,  6,  6,  6,  6, 13, 13, 13,  6,  6,  6,  6, 13, 10,  6,
        6,  6,  6,  6, 15,  6,  6, 13, 15,  6, 13,  6, 11,  6, 13,  6])

In [19]:
arr = ds.files_to_dataset(fdict['clouds'][0:50])

In [20]:
arr.shape

(50, 224, 224, 3)

In [21]:
scores = model.predict(arr)

array([[  4.96721070e-04,   5.19526875e-05,   1.10543424e-05,
          8.45339002e-07,   1.01411715e-05,   5.92178112e-05,
          9.94535208e-01,   3.62609171e-06,   1.21139183e-05,
          1.46836825e-07,   8.94523982e-04,   1.02049555e-04,
          4.53352477e-05,   3.20550846e-03,   4.28635656e-04,
          1.42955148e-04],
       [  7.01853889e-04,   2.51361023e-04,   4.54426936e-06,
          5.18442248e-05,   2.27106102e-06,   1.45264203e-04,
          1.81485340e-01,   8.32003367e-04,   2.38061584e-05,
          1.00907178e-04,   3.56462406e-04,   3.18455452e-04,
          7.61086121e-04,   8.13844204e-01,   5.26620133e-04,
          5.93939738e-04],
       [  5.66092283e-02,   9.35878884e-03,   7.89730530e-06,
          3.84489031e-05,   7.92378523e-06,   8.27737531e-05,
          7.75388956e-01,   5.52000012e-04,   7.91174170e-06,
          3.45515495e-04,   9.53301322e-03,   6.49301056e-03,
          6.03929046e-04,   1.37966856e-01,   1.86394562e-03,
          1.1399

In [32]:
np.argmax(scores, axis=1)

array([ 6, 13,  6,  6,  6,  6, 13,  6,  6,  6, 13,  6, 14,  6,  6, 11,  6,
       15,  6,  6,  6,  6,  6,  6, 13, 13, 13,  6,  6,  6,  6, 13, 10,  6,
        6,  6,  6,  6, 15,  6,  6, 13, 15,  6, 13,  6, 11,  6, 13,  6])

In [74]:
a = np.argmax(tmp, axis=1)

In [68]:
b = np.array([100, 100])

In [75]:
np.concatenate((a, b))

array([  6,  13,   6,   6,   6,   6,  13,   6,   6,   6,  13,   6,  14,
         6,   6,  11,   6,  15,   6,   6,   6,   6,   6,   6,  13,  13,
        13,   6,   6,   6,   6,  13,  10,   6,   6,   6,   6,   6,  15,
         6,   6,  13,  15,   6,  13,   6,  11,   6,  13,   6, 100, 100])

In [79]:
np.concatenate(([], a)).astype(int)

array([ 6, 13,  6,  6,  6,  6, 13,  6,  6,  6, 13,  6, 14,  6,  6, 11,  6,
       15,  6,  6,  6,  6,  6,  6, 13, 13, 13,  6,  6,  6,  6, 13, 10,  6,
        6,  6,  6,  6, 15,  6,  6, 13, 15,  6, 13,  6, 11,  6, 13,  6])

In [33]:
tmp.shape

(50, 16)

In [34]:
len(np.argmax(tmp, axis=1))

50

In [53]:
res = pd.DataFrame(np.zeros((50, 16)))

In [82]:
df = pd.get_dummies([multidic[ind] for ind in np.argmax(tmp, axis=1)])

In [83]:
df[0:5]

Unnamed: 0,clouds,mountain,ocean,sky,sunrise,sunset
0,1,0,0,0,0,0
1,0,0,0,1,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0


In [86]:
df[['sunrise', 'sunset', 'sky']]

Unnamed: 0,sunrise,sunset,sky
0,0,0,0
1,0,0,1
2,0,0,0
3,0,0,0
4,0,0,0
5,0,0,0
6,0,0,1
7,0,0,0
8,0,0,0
9,0,0,0


In [54]:
res.iloc[:, np.argmax(tmp, axis=1)]

Unnamed: 0,6,13,6.1,6.2,6.3,6.4,13.1,6.5,6.6,6.7,...,6.8,13.2,15,6.9,13.3,6.10,11,6.11,13.4,6.12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
res

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
class ModelBinder:
    def __init__(self, base_model_name, basedir):
        )        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}
        self.verbose = True
        self.ds = DataSet()
        self.chunk_size = 3000
    def predict_chunks(self, chunked_paths):
        datas = self.ds.files_to_dataset(chunked_paths)
        models = self._models
        return {key: models[key].predict(datas)[:, 1] for key in models.keys()}
    def predict_files(self, flist):
        chunked_paths = self.ds.chunked(flist, self.chunk_size)
        
        models = self._models
        preddictlist =  [self.predict_chunks(chunk) for chunk in chunked_paths]
        preddict = {key:[] for key in models.keys()}
        for onedict in preddictlist:
            for key in models.keys():
                preddict[key].extend(onedict[key])
        return preddict

    def model_path(self, target_key):
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        if target_key in self._models:
            return self._models[target_key]
        self.notify("load {}".format(target_key))
        self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        list(map(self.get_or_load_model, keys))


    def predict(self, files):
        preddict = self.predict_files(files)
        preddict["filepaths"] = files
        return pd.DataFrame(preddict)


    def add_argmax_max(self, df):
        catkeys = list(self._models.keys())
        df['argmax'] = df[df.columns[:len(catkeys)]].idxmax(axis=1)
        df['max'] = df[df.columns[0:len(catkeys)]].max(axis=1)



SyntaxError: invalid syntax (<ipython-input-36-4e76aada0afc>, line 3)