# Calculate similarity between categories based on one-vs-all classifiers


## 1. Set up

In [1]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

In [2]:
!mkdir -p results

In [3]:
VALID_DF_PATH = "results/valid_df.dat"
VALID_SIMILARITY_DICT_PATH="results/valid_sim_df.dat"

In [4]:
BASE_MODEL_PATH="trained_model"

In [5]:
from models.modelutils import dir2filedict, split_fdict
import random

Using TensorFlow backend.


In [6]:
fdict_all = dir2filedict("data")

In [7]:
catkeys = sorted(fdict_all.keys())

In [8]:
random.seed(123)
trdict, valdict = split_fdict(fdict_all)

In [9]:
# The validation set is the target of this analysis.
fdict = valdict

## Calc similarity and store df

In [10]:
from models.modelutils import load_best_model_if_exist
import os
from models.processor import DataSet

In [11]:
class ModelBinder:
    """Hold one model per category key and score files with all of them.

    Models are loaded on demand through ``load_best_model_if_exist`` and
    cached in ``self._models`` keyed by category. File paths are split with
    ``DataSet.chunked`` and converted with ``DataSet.files_to_dataset``
    before being handed to every cached model.
    """

    def __init__(self, base_model_name, basedir):
        """Create a binder with no models loaded yet.

        Args:
            base_model_name: Prefix used to build each per-category model name.
            basedir: Directory that the per-category model paths are joined to.
        """
        self.base_model_name = base_model_name
        self.basedir = basedir
        # category key -> loaded model, filled by get_or_load_model().
        self._models = {}
        self.verbose = True
        self.ds = DataSet()
        # Chunk size passed to DataSet.chunked() when predicting.
        self.chunk_size = 3000

    def predict_chunks(self, chunked_paths):
        """Score one chunk of file paths with every cached model.

        Returns:
            dict mapping category key to column 1 of that model's
            ``predict`` output (presumably the positive-class score of the
            one-vs-all classifier -- confirm against the model definition).
        """
        datas = self.ds.files_to_dataset(chunked_paths)
        return {key: model.predict(datas)[:, 1]
                for key, model in self._models.items()}

    def predict_files(self, flist):
        """Score all files in ``flist`` chunk by chunk.

        Returns:
            dict mapping category key to a flat list of scores, in the
            order the chunks were produced.
        """
        preddict = {key: [] for key in self._models}
        for chunk in self.ds.chunked(flist, self.chunk_size):
            for key, scores in self.predict_chunks(chunk).items():
                preddict[key].extend(scores)
        return preddict

    def model_path(self, target_key):
        """Return ``<basedir>/<base_model_name>_<target_key>``."""
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        """Return the cached model for ``target_key``, loading it on first use."""
        if target_key not in self._models:
            self.notify("load {}".format(target_key))
            # NOTE(review): whatever load_best_model_if_exist returns is
            # cached as-is; confirm what it returns when no model exists.
            self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        """Print ``msg`` when ``self.verbose`` is truthy."""
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        """Load (or reuse) the model of every key in ``keys``."""
        for key in keys:
            self.get_or_load_model(key)

    def predict(self, files):
        """Score ``files`` and return a DataFrame.

        The frame has one score column per loaded model plus a
        ``filepaths`` column holding ``files``.
        """
        preddict = self.predict_files(files)
        preddict["filepaths"] = files
        return pd.DataFrame(preddict)

    def add_argmax_max(self, df):
        """Add ``argmax`` and ``max`` columns to ``df`` in place.

        ``argmax`` is the name of the highest-scoring model column of each
        row and ``max`` is that score. Score columns are selected by model
        key rather than by position, so the result does not depend on where
        ``filepaths`` or other extra columns sit in the frame.
        """
        catkeys = list(self._models)
        scores = df[catkeys]
        df['argmax'] = scores.idxmax(axis=1)
        df['max'] = scores.max(axis=1)



In [12]:

def ModelBinder_create(base_model_name="model", basedir=BASE_MODEL_PATH):
    """Build a ``ModelBinder`` using the notebook's default model name and directory."""
    binder = ModelBinder(base_model_name, basedir)
    return binder

## Predict score and store to df

In [13]:
ftuppls = [(key, file) for key in fdict.keys() for file in fdict[key]]

In [14]:
labels = [tup[0] for tup in ftuppls]

In [15]:
paths = [tup[1] for tup in ftuppls]

In [16]:
len(labels), len(paths)

(1187, 1187)

In [17]:
binder = ModelBinder_create()

In [21]:
binder.load_all_models(catkeys)

load bay
load beach
load birds
load boeing
load buildings
load city
load clouds
load f-16
load face
load helicopter
load mountain
load ocean
load ships
load sky
load sunrise
load sunset


In [52]:
# binder._models = ens._models

In [22]:
%%time
df = binder.predict(paths)

CPU times: user 5min 9s, sys: 15 s, total: 5min 24s
Wall time: 7min 26s


In [23]:
df['label'] = labels

In [24]:
df.to_pickle(VALID_DF_PATH)

## Calculate similarity from score df

In [18]:
class SimilarityCalculator:
    """Derive pairwise category similarity from a score DataFrame.

    ``df`` must contain a ``label`` column; every column other than
    ``filepaths`` and ``label`` is treated as a category score column.
    For a label A and a score column B, the stored ratio "A~B" is
    (number of A-labelled rows whose B score exceeds the threshold)
    divided by (number of A-labelled rows).
    """

    def __init__(self, df, thrshold=0.5):
        """Remember the frame and the score threshold.

        Args:
            df: Score DataFrame as described on the class.
            thrshold: Score above which a row counts as a hit for a column.
                (The spelling is kept because it is part of the interface.)
        """
        self.df = df
        # "from~to" -> ratio, filled by saveNaxx()/saveAll().
        self.rmiss = {}
        self.thrshold = thrshold
        self.cats = [key for key in df.columns.values if key != "filepaths" and key != "label"]
        # Positional indexes of the score columns; kept for existing readers.
        self.catidxs = [list(df.columns).index(key) for key in self.cats]

    def combinekey(self, fromkey, tokey):
        """Return the ``rmiss`` key for the directed pair, ``"from~to"``."""
        return "{0}~{1}".format(fromkey, tokey)

    def saveAll(self):
        """Compute and store the ratios for every category in ``self.cats``."""
        for key in self.cats:
            self.saveNaxx(key)

    def saveNaxx(self, fromkey):
        """Store the hit ratio of ``fromkey``-labelled rows for every score column.

        Ratios are assigned rather than accumulated, so calling this method
        (or ``saveAll``) more than once leaves the same values instead of
        inflating them past 1. If no row carries ``fromkey`` the division is
        0/0 and the stored ratios are NaN.
        """
        targetdf = self.df[self.df['label'] == fromkey]
        # Per score column: how many of these rows exceed the threshold.
        hits = (targetdf[self.cats] > self.thrshold).sum()
        ratios = hits / len(targetdf)
        for tokey, ratio in ratios.items():
            self.rmiss[self.combinekey(fromkey, tokey)] = ratio

    def add(self, fromkey, tokey, val):
        """Accumulate ``val`` onto the stored value of the directed pair."""
        key = self.combinekey(fromkey, tokey)
        self.rmiss.setdefault(key, 0)
        self.rmiss[key] += val

    def similarity(self, fromkey, tokey):
        """Return the mean of the two directed ratios of the pair.

        Raises:
            KeyError: if either direction has not been stored yet.
        """
        key1 = self.combinekey(fromkey, tokey)
        key2 = self.combinekey(tokey, fromkey)
        return (self.rmiss[key1] + self.rmiss[key2]) / 2
        


In [20]:
sim = SimilarityCalculator(df)

In [39]:
sim.saveAll()

In [40]:
# keys = df.columns[:-2]
keys = sim.cats

In [41]:
import itertools

In [42]:
len(list(itertools.combinations(keys, 2)))

120

In [43]:
simdict = {key1: [sim.similarity(key1, key2) for key2 in keys] for key1 in keys}

In [44]:
simdf = pd.DataFrame(simdict)

In [45]:
simdf.index = keys

In [46]:
simdf.to_pickle(VALID_SIMILARITY_DICT_PATH)

In [47]:
simdf

Unnamed: 0,bay,beach,birds,boeing,buildings,city,clouds,f-16,face,helicopter,mountain,ocean,ships,sky,sunrise,sunset
bay,0.852273,0.560345,0.005682,0.0,0.029866,0.226044,0.0,0.0,0.011364,0.006757,0.172159,0.149968,0.013514,0.0,0.034483,0.017045
beach,0.560345,0.873563,0.005747,0.0,0.011494,0.047996,0.015385,0.0,0.0,0.006757,0.067744,0.238384,0.018251,0.014286,0.017241,0.08279
birds,0.005682,0.005747,1.0,0.0,0.0,0.006757,0.007692,0.0,0.011628,0.0,0.0,0.048278,0.0,0.0,0.0,0.006757
boeing,0.0,0.0,0.0,0.952381,0.007937,0.015873,0.0,0.179287,0.0,0.055234,0.0,0.043148,0.037323,0.0,0.0,0.0
buildings,0.029866,0.011494,0.0,0.007937,0.961538,0.564103,0.0,0.0,0.005814,0.013514,0.0125,0.007042,0.027027,0.007143,0.017241,0.013167
city,0.226044,0.047996,0.006757,0.015873,0.564103,0.905405,0.0,0.016393,0.006757,0.013514,0.06402,0.03464,0.033784,0.0,0.054753,0.033784
clouds,0.0,0.015385,0.007692,0.0,0.0,0.0,0.953846,0.0,0.0,0.0,0.050962,0.137703,0.0,0.722527,0.091114,0.092516
f-16,0.0,0.0,0.0,0.179287,0.0,0.016393,0.0,0.967213,0.008197,0.072331,0.00625,0.007042,0.061254,0.0,0.0,0.0
face,0.011364,0.0,0.011628,0.0,0.005814,0.006757,0.0,0.008197,0.976744,0.013514,0.00625,0.032755,0.005814,0.014286,0.014435,0.005814
helicopter,0.006757,0.006757,0.0,0.055234,0.013514,0.013514,0.0,0.072331,0.013514,0.959459,0.031757,0.006757,0.027027,0.007143,0.0,0.006757


# Below here is obsolete. Remove later.

## Send result to S3

In [None]:


DIST_FILE=''
DIST_DIR=''


In [None]:
DIST_PATH=os.path.join(DIST_DIR, DIST_FILE)

In [None]:
os.link(VALID_DISTANCE_DICT_PATH, DIST_PATH)

In [None]:
S3PATH = "" + DIST_PATH

In [None]:
!aws s3 cp  {DIST_PATH} {S3PATH} 

# Misc evaluation

In [None]:
from visualize import plot_image_list
def plot_cat(df, cat, thrld, origin=0):
    """Show the rows whose ``cat`` score exceeds ``thrld``, best first.

    Starting at position ``origin`` of the ranked rows, prints ten lines of
    up to five parent-directory names of the file paths, then ten lines of
    the matching scores with four decimals, and finally passes up to fifty
    file paths to ``plot_image_list``.
    """
    ranked = df[df[cat] > thrld].sort_values(by=cat, ascending=False)
    ranked_paths = ranked['filepaths'].values
    ranked_scores = ranked[cat].values
    row_starts = [origin + offset for offset in range(0, 50, 5)]

    for start in row_starts:
        dir_names = [os.path.basename(os.path.dirname(path))
                     for path in ranked_paths[start:start + 5]]
        print(",".join(dir_names))

    for start in row_starts:
        formatted = ["{0:.4f}".format(score) for score in ranked_scores[start:start + 5]]
        print(",".join(formatted))

    plot_image_list(ranked["filepaths"].values[origin:(origin + 50)])

In [None]:
#plot_cat(df, '41', 0.1)

### load df from file

In [None]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

In [None]:
distdf = pd.read_pickle(VALID_DISTANCE_DICT_PATH)

In [None]:
CATEGORY_DICT_CSV=os.path.join(BASE_DATA_DIR, "")

In [None]:
from category import load_category_dict
catdict = load_category_dict(CATEGORY_DICT_CSV)

In [None]:
def listdist(targetkey):
    """Return the first 20 entries of ``distdf[targetkey]``, sorted descending.

    Prints the first 20 raw index keys, then relabels the index with
    ``vc.name`` before slicing.
    """
    ranked = distdf[targetkey].sort_values(ascending=False)
    print(ranked.index[0:20])
    readable_index = [vc.name(key) for key in ranked.index]
    ranked.index = readable_index
    return ranked[0:20]

In [None]:
def listdist15(targetkey):
    """Print every entry of ``distdf[targetkey]`` greater than 1.5, largest first.

    Each line shows the index key, the value with four decimals and the
    name ``vc.name`` returns for that key.
    """
    ranked = distdf[targetkey].sort_values(ascending=False)
    above = ranked[ranked > 1.5]
    line_format = "{0}:   {2:.4f} {1}"
    for key in above.index:
        print(line_format.format(key, vc.name(key), above[key]))


In [None]:
listdist15('76')

In [None]:
listdist('110')

In [None]:
name2id = {catdict[key]:key for key in catdict.keys()}

In [None]:
name2id['']

In [None]:
vc.name("118_155")

In [None]:
listdist('118_155')

In [None]:
def catlist2names(catlist, catdictst):
    """Turn each group of category keys into one "+"-joined name string.

    Args:
        catlist: Iterable of groups, each an iterable of category keys.
        catdictst: Mapping from category key to its name.

    Returns:
        A list with one string per group, in the order of ``catlist``.
    """
    names = []
    for cats in catlist:
        names.append("+".join(catdictst[cat] for cat in cats))
    return names

In [None]:
from category import merge_categories_above

In [None]:
catdictst = {key:vc.name(key) for key in vc.keys()}

In [None]:
catlist = merge_categories_above(distdf, 1.89)

In [None]:
len(catlist)

In [None]:
catlist2names(catlist, catdictst)

In [None]:
catlist19 = merge_categories_above(distdf, 1.9)

In [None]:
len(catlist19)

In [None]:
catlist2names(catlist19, catdictst)

In [None]:
HIGH_CATEGORY_PATH="trained_model/high_cats19.dat"

In [None]:
with open(HIGH_CATEGORY_PATH, "wb") as f:
    pickle.dump(catlist19, f)

In [None]:
catlist19[0:20]

### Choose lonely subcategory

In [None]:
catlist = merge_categories_above(distdf, 1.8)

In [None]:
len(catlist)

In [None]:
catlist2names(catlist, catdictst)

In [None]:
singlelist = [cat for cat in catlist if len(cat) == 1]

In [None]:
len(singlelist)

In [None]:
distdf['176']['176']

In [None]:
[(cat[0], catdictst[cat[0]]) for cat in singlelist]

In [None]:
def listdist(targetkey):
    """Return the first 20 entries of ``distdf[targetkey]``, sorted descending.

    The index is relabelled with ``vc.name`` before slicing.
    """
    ranked = distdf[targetkey].sort_values(ascending=False)
    readable_index = [vc.name(key) for key in ranked.index]
    ranked.index = readable_index
    return ranked[0:20]

In [None]:
listdist("10")

In [None]:
listdist("101")

In [None]:
listdist("102")

In [None]:
listdist("104")

In [None]:
listdist("135")

In [None]:
listdist("176")

In [None]:
listdist("0_128")

In [None]:
listdist("107")

In [None]:
listdist("171")

In [None]:
listdist("93")

In [None]:
listdist("33")

In [None]:
listdist("108_72")

In [None]:
def listdistgt(targetkey, val):
    """Return the entries of ``distdf[targetkey]`` that are >= ``val``.

    The result is sorted descending and its index is relabelled with
    ``vc.name``.
    """
    column = distdf[targetkey]
    selected = column[column >= val].sort_values(ascending=False)
    selected.index = [vc.name(key) for key in selected.index]
    return selected


In [None]:
listdistgt('33', 1.5)

In [None]:
listdistgt('93', 1.2)

In [None]:
distdf['93'][distdf['93'] >= 1.5].index

In [None]:
listdistgt("108_72", 1.5)

In [None]:
distdf['108_72'][distdf['108_72'] >= 1.5].index

In [None]:
TRAIN_DATA_DIR_FOR_1VSALL= os.path.normpath(os.path.join(DATA_DIR, ''))


In [None]:
def count_train_files(targetcat):
    """Count the ``*.jpg`` files in the ``train/<targetcat>`` directory.

    The directory is resolved under ``TRAIN_DATA_DIR_FOR_1VSALL``.
    """
    category_dir = os.path.join(TRAIN_DATA_DIR_FOR_1VSALL, "train", targetcat)
    pattern = category_dir + "/*.jpg"
    return len(glob.glob(pattern))

In [None]:
count_train_files("135")

In [None]:
# Virtual categories are not handled, so a category with an underscore in its name just returns 0.
[(cat[0], catdictst[cat[0]], distdf[cat[0]][cat[0]], count_train_files(cat[0])) for cat in singlelist]

In [None]:
tups = [(cat[0], catdictst[cat[0]], distdf[cat[0]][cat[0]], count_train_files(cat[0])) for cat in singlelist]

In [None]:
from operator import itemgetter

In [None]:
sorted(tups, key=itemgetter(2))

In [None]:
os.path.join(BASE_DATA_DIR, "train")

# Trial and error

In [None]:
sim.distance(targetkey, tokey)

In [None]:
dists = {}

In [None]:
dists = {"{0}~{1}".format(*pair): sim.distance(pair[0], pair[1])+sim.distance(pair[1], pair[0]) for pair in itertools.combinations(keys, 2)}

In [None]:
list(dists.keys())[0:3]

In [None]:
#with open(VALID_DISTANCE_DICT_PATH, mode='wb') as f:
#    pickle.dump(dists, f)

In [None]:
dists['109~93']

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
plt.hist(list(dists.values()), bins=10)

In [None]:
tmp = pd.DataFrame({"a":[1, 2, 3], "b":[4, 5, 6]})
tmp

In [None]:
tmp.index = ["d", "e", "f"]

In [None]:
tmp

In [None]:
dists = {"{0}".format(*pair): sim.distance(pair[0], pair[1])+sim.distance(pair[1], pair[0]) for pair in itertools.combinations(keys, 2)}

In [None]:
dist = distdf["93"].sort_values(ascending=False)

In [None]:
dist.index = [vc.name(idx) for idx in dist.index]

In [None]:
dist

In [None]:
genodist = distdf["109"].sort_values(ascending=False)

In [None]:
genodist.index = [vc.name(idx) for idx in genodist.index]

In [None]:
genodist