# Calculate similarity between categories based on one-vs-all classifiers


## 1. Set up

In [1]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

In [2]:
!mkdir -p results

In [3]:
# Pickle file that receives the per-image score DataFrame (`df`) for the validation set.
VALID_DF_PATH = "results/valid_df.dat"
# Pickle file that receives the category-by-category similarity DataFrame (`simdf`).
VALID_SIMILARITY_DICT_PATH="results/valid_sim_df.dat"

In [4]:
# Directory holding the trained models; used as the default `basedir` of ModelBinder_create.
BASE_MODEL_PATH="trained_model"

In [5]:
from models.modelutils import dir2filedict, split_fdict
import random

Using TensorFlow backend.


In [6]:
# Index the files under "data" by category.
# The result is used below as a dict mapping category name -> list of file paths.
fdict = dir2filedict("data")

In [8]:
# Category names in sorted order; one model per name is loaded further down.
catkeys = sorted(fdict.keys())

In [9]:
# Two-stage split: first hold out test_size=0.2 of all files as the test set,
# then hold out test_size=0.2 of the remaining training files as the validation set.
# The random_state values are fixed, presumably so reruns reproduce the same partitions.
# NOTE(review): the exact split semantics live in models.modelutils.split_fdict -- confirm there.
trdict, testdict = split_fdict(fdict, test_size=0.2, random_state = 123)
trdict, valdict = split_fdict(trdict, test_size=0.2, random_state = 456)

In [10]:
valdict['clouds'][0:5]

['data/clouds/0678.jpeg',
 'data/clouds/0701.jpeg',
 'data/clouds/0431.jpeg',
 'data/clouds/0033.jpeg',
 'data/clouds/0290.jpeg']

In [11]:
# From here on the validation split is the evaluation target.
# Note: this rebinds `fdict`; `catkeys` above was computed from the full file dict.
fdict = valdict

## Calculate similarity and store the DataFrame

In [12]:
from models.modelutils import load_best_model_if_exist
import os
from models.processor import DataSet

In [13]:
class ModelBinder:
    """Bundle of per-category models that are scored together.

    Models are loaded on demand from ``<basedir>/<base_model_name>_<key>`` and
    cached in ``_models``. ``predict`` scores a list of files with every
    cached model and returns one score column per model key.
    """

    def __init__(self, base_model_name, basedir):
        """Remember where the models live; no model is loaded here.

        Args:
            base_model_name: Prefix shared by all model names.
            basedir: Directory that contains the models.
        """
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}  # model key -> loaded model
        self.verbose = True  # when True, notify() prints its message
        self.ds = DataSet()
        # Passed to ds.chunked(); presumably the number of files per chunk -- TODO confirm.
        self.chunk_size = 3000

    def predict_chunks(self, chunked_paths):
        """Score one chunk of files with every loaded model.

        Args:
            chunked_paths: File paths of a single chunk.

        Returns:
            dict mapping each model key to column 1 of that model's
            ``predict`` output for this chunk.
        """
        datas = self.ds.files_to_dataset(chunked_paths)
        # NOTE(review): column 1 is presumably the positive-class score of the
        # one-vs-all classifier -- confirm against the model definition.
        return {key: model.predict(datas)[:, 1]
                for key, model in self._models.items()}

    def predict_files(self, flist):
        """Score all files chunk by chunk.

        Args:
            flist: File paths to score.

        Returns:
            dict mapping each model key to the list of scores, in chunk order.
        """
        preddict = {key: [] for key in self._models}
        # Merge each chunk's result as soon as it is available.
        for chunk in self.ds.chunked(flist, self.chunk_size):
            for key, scores in self.predict_chunks(chunk).items():
                preddict[key].extend(scores)
        return preddict

    def model_path(self, target_key):
        """Return ``<basedir>/<base_model_name>_<target_key>``."""
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        """Return the cached model for ``target_key``, loading it on first use."""
        if target_key not in self._models:
            self.notify("load {}".format(target_key))
            # NOTE(review): whatever load_best_model_if_exist returns is cached
            # as-is; confirm what it returns when no model exists at the path.
            self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        """Print ``msg`` when ``verbose`` is set."""
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        """Load (or reuse) the model of every key in ``keys``."""
        for key in keys:
            self.get_or_load_model(key)

    def predict(self, files):
        """Score ``files`` with every loaded model.

        Returns:
            DataFrame with one score column per model key plus a
            ``filepaths`` column holding ``files``.
        """
        preddict = self.predict_files(files)
        preddict["filepaths"] = files
        return pd.DataFrame(preddict)

    def add_argmax_max(self, df):
        """Add ``argmax`` and ``max`` columns to ``df`` in place.

        ``argmax`` is the model key with the highest score in each row and
        ``max`` is that score. The score columns are selected by model key
        rather than by position, so extra columns such as ``filepaths`` or
        ``label`` and the column order of ``df`` do not affect the result.

        Args:
            df: Score DataFrame containing one column per loaded model key.

        Raises:
            KeyError: If ``df`` lacks a column for one of the loaded models.
        """
        catkeys = list(self._models.keys())
        scores = df[catkeys]
        df['argmax'] = scores.idxmax(axis=1)
        df['max'] = scores.max(axis=1)



In [14]:

def ModelBinder_create(base_model_name="model", basedir=BASE_MODEL_PATH):
    """Build a ModelBinder, defaulting to the "model" prefix and BASE_MODEL_PATH."""
    binder = ModelBinder(base_model_name=base_model_name, basedir=basedir)
    return binder

## Predict scores and store them in a DataFrame

In [15]:
# Flatten the category -> files mapping into (category, file path) pairs.
ftuppls = [(label, path) for label, files in fdict.items() for path in files]

In [16]:
# Category of every (category, file path) pair, in pair order.
labels = [label for label, _ in ftuppls]

In [17]:
# File path of every (category, file path) pair, in pair order.
paths = [path for _, path in ftuppls]

In [18]:
len(labels), len(paths)

(1894, 1894)

In [19]:
binder = ModelBinder_create()

In [20]:
binder.load_all_models(catkeys)

load bay
load beach
load birds
load boeing
load buildings
load city
load clouds
load f-16
load face
load helicopter
load mountain
load ocean
load ships
load sky
load sunrise
load sunset


In [21]:
# binder._models = ens._models

In [22]:
%%time
df = binder.predict(paths)

  'to RGBA images')
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data),

CPU times: user 9min 10s, sys: 28.4 s, total: 9min 38s
Wall time: 8min 38s


In [23]:
# Attach the true category of each row; `labels` and `paths` come from the same
# list of pairs, so they are in the same order as the rows predicted from `paths`.
df['label'] = labels

In [24]:
df.to_pickle(VALID_DF_PATH)

## Calculate similarity from the score DataFrame

In [25]:
class SimilarityCalculator:
    """Category similarity derived from per-category classifier scores.

    For every pair of categories (A, B) the calculator stores the fraction of
    rows labelled A whose B score exceeds the threshold. The similarity of
    two categories is the mean of the two directional fractions.
    """

    def __init__(self, df, thrshold=0.5):
        """Set up the calculator; no rates are computed until ``saveAll``.

        Args:
            df: Score DataFrame with a ``label`` column and one score column
                per category; a ``filepaths`` column, if present, is ignored.
            thrshold: A score strictly greater than this value counts as a
                detection of the category.
        """
        self.df = df
        self.rmiss = {}  # "from~to" -> directional rate
        self.thrshold = thrshold
        # Every column except the bookkeeping ones is a category score column.
        self.cats = [key for key in df.columns.values if key != "filepaths" and key != "label"]
        self.catidxs = [list(df.columns).index(key) for key in self.cats]

    def combinekey(self, fromkey, tokey):
        """Return the ``rmiss`` key for the ordered pair (fromkey, tokey)."""
        return "{0}~{1}".format(fromkey, tokey)

    def saveAll(self):
        """Compute and store the directional rates for every category."""
        for key in self.cats:
            self.saveNaxx(key)

    def saveNaxx(self, fromkey):
        """Store, for rows labelled ``fromkey``, the detection rate of each category.

        The rate for category B is (number of ``fromkey`` rows whose B score
        exceeds the threshold) / (number of ``fromkey`` rows). Rates are
        assigned rather than accumulated, so calling this method (or
        ``saveAll``) again on the same object does not inflate the values.
        """
        targetdf = self.df[self.df['label'] == fromkey]

        Nab = (targetdf[self.cats] > self.thrshold).sum()
        Na = len(targetdf)
        Nab_a = Nab / Na

        for otherkey, rate in Nab_a.items():
            self.rmiss[self.combinekey(fromkey, otherkey)] = rate

    def add(self, fromkey, tokey, val):
        """Add ``val`` to the stored value of the pair (fromkey, tokey), starting from 0."""
        key = self.combinekey(fromkey, tokey)
        self.rmiss.setdefault(key, 0)
        self.rmiss[key] += val

    def similarity(self, fromkey, tokey):
        """Return the mean of the two directional rates between the categories.

        Raises:
            KeyError: If the rates for either direction have not been stored.
        """
        key1 = self.combinekey(fromkey, tokey)
        key2 = self.combinekey(tokey, fromkey)
        return (self.rmiss[key1] + self.rmiss[key2]) / 2
        


In [26]:
sim = SimilarityCalculator(df)

In [27]:
sim.saveAll()

In [28]:
# keys = df.columns[:-2]
keys = sim.cats

In [29]:
import itertools

In [30]:
len(list(itertools.combinations(keys, 2)))

120

In [31]:
# Full pairwise table: simdict[key1][i] is the similarity between key1 and keys[i].
# similarity() averages the two directional rates, so the table is symmetric.
simdict = {key1: [sim.similarity(key1, key2) for key2 in keys] for key1 in keys}

In [32]:
simdf = pd.DataFrame(simdict)

In [33]:
# Label the rows with the category names so rows and columns use the same keys.
simdf.index = keys

In [34]:
simdf.to_pickle(VALID_SIMILARITY_DICT_PATH)

In [35]:
simdf

Unnamed: 0,bay,beach,birds,boeing,buildings,city,clouds,f-16,face,helicopter,mountain,ocean,ships,sky,sunrise,sunset
bay,0.829787,0.626241,0.003788,0.024823,0.069321,0.301298,0.0,0.027935,0.007196,0.003546,0.188109,0.319808,0.077143,0.033878,0.055251,0.055866
beach,0.626241,0.921429,0.0,0.010714,0.025922,0.073184,0.014423,0.022959,0.007221,0.029976,0.114174,0.24548,0.02276,0.030357,0.105645,0.094794
birds,0.003788,0.0,0.984848,0.018939,0.003788,0.004237,0.0,0.026515,0.037464,0.008061,0.0,0.045488,0.007576,0.003788,0.012952,0.028313
boeing,0.024823,0.010714,0.018939,0.990099,0.013015,0.035325,0.0,0.258386,0.015899,0.152958,0.011719,0.066897,0.059322,0.013879,0.010753,0.004237
buildings,0.069321,0.025922,0.003788,0.013015,0.991935,0.656096,0.0,0.0,0.00365,0.004274,0.019783,0.025371,0.09158,0.008497,0.025538,0.028841
city,0.301298,0.073184,0.004237,0.035325,0.656096,0.957627,0.0,0.017814,0.029073,0.017022,0.060249,0.055647,0.097458,0.013166,0.040414,0.038136
clouds,0.0,0.014423,0.0,0.0,0.0,0.0,0.961538,0.0,0.0,0.0,0.085337,0.259658,0.0,0.787088,0.116263,0.128015
f-16,0.027935,0.022959,0.026515,0.258386,0.0,0.017814,0.0,0.969388,0.022605,0.187773,0.015625,0.050027,0.125649,0.005102,0.016129,0.029661
face,0.007196,0.007221,0.037464,0.015899,0.00365,0.029073,0.0,0.022605,0.992701,0.007299,0.018761,0.050772,0.015774,0.008114,0.03995,0.024248
helicopter,0.003546,0.029976,0.008061,0.152958,0.004274,0.017022,0.0,0.187773,0.007299,0.991453,0.028078,0.004425,0.097892,0.004464,0.010753,0.004237
