# Calculate similarity between categories based on one-vs-all classifiers


## 1. Set up

In [2]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

In [3]:
# Pickle file that receives the per-file score DataFrame (written with df.to_pickle further down).
VALID_DF_PATH = "trained_model/valid_df.dat"
# Pickle file that receives the category-by-category similarity DataFrame (distdf.to_pickle).
VALID_DISTANCE_DICT_PATH="trained_model/valid_sim_df.dat"

### Duplicated from train.ipynb

In [4]:
BASE_MODEL_PATH="trained_model"

In [5]:
TRAIN_VALID_RATIO=0.9

In [33]:
def dir2filedict(basedir):
    """Group the paths matching ``<basedir>/*/*`` by their parent folder name.

    Returns a dict whose keys are the names of the directories directly
    containing each matched path (the category) and whose values are the
    lists of matched paths, in the order glob yields them.
    """
    pattern = "{}/*/*".format(basedir)
    by_category = {}
    for path in glob.iglob(pattern, recursive=True):
        parent_dir, _ = os.path.split(path)
        category = os.path.basename(parent_dir)
        if category not in by_category:
            by_category[category] = []
        by_category[category].append(path)
    return by_category

In [34]:
import random
def split_train_valid(input_paths, ratio= TRAIN_VALID_RATIO):
    """Shuffle *input_paths* and cut them into a (train, valid) pair of lists.

    The paths are sorted before shuffling so that, for a given state of the
    global ``random`` generator, the outcome does not depend on the order in
    which the caller supplied them. The first ``int(len * ratio)`` shuffled
    paths form the training list; the remainder form the validation list.
    """
    shuffled = list(input_paths)
    shuffled.sort()
    random.shuffle(shuffled)
    cut = int(len(shuffled) * ratio)
    train_part = shuffled[:cut]
    valid_part = shuffled[cut:]
    return train_part, valid_part

def split_fdict(fdict):
    """Split every category's file list into training and validation parts.

    Categories are visited in sorted key order so the sequence of random
    shuffles is the same on every run. Returns ``(train_dict, valid_dict)``,
    both keyed by category.
    """
    train_by_cat, valid_by_cat = {}, {}
    for category in sorted(fdict):
        train_files, valid_files = split_train_valid(fdict[category])
        train_by_cat[category] = train_files
        valid_by_cat[category] = valid_files
    return train_by_cat, valid_by_cat

In [35]:
fdict_all = dir2filedict("data")

In [36]:
catkeys = sorted(fdict_all.keys())

In [37]:
# Seed the global RNG first: split_train_valid shuffles with random.shuffle,
# so a fixed seed makes the train/validation split reproducible.
# NOTE(review): presumably this must match the seed used in train.ipynb so the
# validation files here are the ones held out during training -- confirm.
random.seed(123)
trdict, valdict = split_fdict(fdict_all)

In [38]:
# target is validation set.
fdict = valdict

## Calc similarity and store df

In [12]:
# Copied from model_binder.py for development purposes.

from models.modelutils import load_best_model_if_exist
import os
from models.processor import DataSet

Using TensorFlow backend.


In [49]:
class ModelBinder:
    """Bundle of per-category models that are evaluated together.

    One model per category key is loaded from
    ``<basedir>/<base_model_name>_<key>`` and cached in ``self._models``.
    Prediction runs every cached model on the same files and collects the
    scores in one column per category.
    """

    def __init__(self, base_model_name, basedir):
        """Remember where models live; no model is loaded until requested."""
        self.base_model_name = base_model_name
        self.basedir = basedir
        self._models = {}  # category key -> loaded model
        self.verbose = True  # when True, notify() prints its messages
        self.ds = DataSet()
        self.chunk_size = 3000  # chunk size handed to self.ds.chunked()

    def predict_chunks(self, chunked_paths):
        """Score one chunk of files with every loaded model.

        Returns a dict mapping each model key to column 1 of that model's
        ``predict`` output (presumably the positive-class score of the
        one-vs-all classifier -- TODO confirm against the model definition).
        """
        datas = self.ds.files_to_dataset(chunked_paths)
        return {key: model.predict(datas)[:, 1] for key, model in self._models.items()}

    def predict_files(self, flist):
        """Score *flist* chunk by chunk and concatenate the per-model scores.

        Returns a dict mapping each loaded model key to a flat list of
        scores, in the order the chunks were produced by ``self.ds.chunked``.
        """
        preddict = {key: [] for key in self._models}
        # Merge each chunk as soon as it is scored instead of keeping every
        # per-chunk dict alive until the end.
        for chunk in self.ds.chunked(flist, self.chunk_size):
            chunk_scores = self.predict_chunks(chunk)
            for key, scores in preddict.items():
                scores.extend(chunk_scores[key])
        return preddict

    def model_path(self, target_key):
        """Return the path ``<basedir>/<base_model_name>_<target_key>``."""
        return os.path.join(self.basedir, "{}_{}".format(self.base_model_name, target_key))

    def get_or_load_model(self, target_key):
        """Return the cached model for *target_key*, loading it on first use."""
        if target_key in self._models:
            return self._models[target_key]
        self.notify("load {}".format(target_key))
        self._models[target_key] = load_best_model_if_exist(self.model_path(target_key))
        return self._models[target_key]

    def notify(self, msg):
        """Print *msg* when ``self.verbose`` is set."""
        if self.verbose:
            print(msg)

    def load_all_models(self, keys):
        """Load (or reuse) the model of every key in *keys*."""
        # A plain loop: the calls are made for their caching side effect only.
        for key in keys:
            self.get_or_load_model(key)

    def predict(self, files):
        """Return a DataFrame with one score column per model plus ``filepaths``."""
        preddict = self.predict_files(files)
        preddict["filepaths"] = files
        return pd.DataFrame(preddict)

    def add_argmax_max(self, df):
        """Add ``argmax`` (best-scoring category) and ``max`` (its score) to *df*.

        The score columns are selected by model key rather than by position,
        so the result does not depend on where the score columns sit in *df*.
        *df* is modified in place; nothing is returned.
        """
        catkeys = list(self._models.keys())
        # Take the score view once, before any new column is appended.
        scores = df[catkeys]
        df['argmax'] = scores.idxmax(axis=1)
        df['max'] = scores.max(axis=1)



In [50]:

def ModelBinder_create(base_model_name = "model", basedir = BASE_MODEL_PATH):
    """Factory returning a ModelBinder for *base_model_name* stored under *basedir*."""
    binder = ModelBinder(base_model_name, basedir)
    return binder

## Predict score and store to df

In [40]:
# Flatten the validation dict into (category, filepath) pairs so labels and
# paths can be pulled out as two parallel lists.
ftuppls = [(key, file) for key in fdict.keys() for file in fdict[key]]

In [41]:
labels = [tup[0] for tup in ftuppls]

In [42]:
paths = [tup[1] for tup in ftuppls]

In [43]:
len(labels), len(paths)

(1186, 1186)

In [51]:
binder = ModelBinder_create()

In [23]:
binder.load_all_models(catkeys)

load bay
load beach
load birds
load boeing
load buildings
load city
load clouds
load data
load f-16
load face
load helicopter
load mountain
load ocean
load ships
load sky
load sunrise
load sunset


In [52]:
# binder._models = ens._models

In [53]:
%%time
df = binder.predict(paths)

CPU times: user 3min 57s, sys: 12.3 s, total: 4min 10s
Wall time: 4min 52s


In [54]:
df['label'] = labels

In [55]:
df.to_pickle(VALID_DF_PATH)

## Calculate similarity from score df

In [92]:
class SimilarityCalculator:
    """Derive pairwise category similarity from one-vs-all score columns.

    *df* must hold a ``"label"`` column and one score column per category;
    every column other than ``"filepaths"`` and ``"label"`` is treated as a
    category. For each ordered pair ``(a, b)``, ``rmiss["a~b"]`` holds the
    fraction of rows labelled ``a`` whose score in column ``b`` exceeds the
    threshold.
    """

    def __init__(self, df, thrshold=0.5):
        """Record *df* and the score threshold; nothing is computed yet."""
        self.df = df
        self.rmiss = {}  # "from~to" -> fraction of `from` rows scored above threshold by `to`
        self.thrshold = thrshold
        self.cats = [key for key in df.columns.values if key != "filepaths" and key != "label"]
        columns = list(df.columns)
        # Positional indexes of the category columns, aligned with self.cats.
        self.catidxs = [columns.index(key) for key in self.cats]

    def combinekey(self, fromkey, tokey):
        """Return the ``rmiss`` key for the ordered pair: ``"<from>~<to>"``."""
        return "{0}~{1}".format(fromkey, tokey)

    def saveAll(self):
        """Compute and store the ratios for every category."""
        for key in self.cats:
            self.saveNaxx(key)

    def saveNaxx(self, fromkey):
        """Store, for rows labelled *fromkey*, the hit ratio of every category.

        The ratio for ``(fromkey, other)`` is the number of those rows whose
        ``other`` score is above the threshold, divided by the number of rows
        labelled *fromkey*. If no row carries the label, the division is 0/0
        and pandas yields NaN ratios for that category.

        Each ratio is a complete value, so it is assigned rather than
        accumulated: calling this method (or ``saveAll``) again recomputes
        the entries instead of doubling them.
        """
        targetdf = self.df[self.df['label'] == fromkey]

        Nab = (targetdf[targetdf.columns[self.catidxs]] > self.thrshold).sum()
        Na = len(targetdf)

        Nab_a = Nab / Na

        for otherkey, ratio in Nab_a.items():
            self.rmiss[self.combinekey(fromkey, otherkey)] = ratio

    def add(self, fromkey, tokey, val):
        """Accumulate *val* onto the entry for the ordered pair (starts at 0)."""
        key = self.combinekey(fromkey, tokey)
        self.rmiss.setdefault(key, 0)
        self.rmiss[key] += val

    def distance(self, fromkey, tokey):
        """Return the symmetric score ``rmiss[from~to] + rmiss[to~from]``.

        Despite the name, a larger value means the two categories are hit
        together more often. For ``fromkey == tokey`` the same entry is
        summed twice. Raises KeyError if either entry has not been stored.
        """
        key1 = self.combinekey(fromkey, tokey)
        key2 = self.combinekey(tokey, fromkey)
        return self.rmiss[key1] + self.rmiss[key2]
        


In [93]:
sim = SimilarityCalculator(df)

In [94]:
sim.saveAll()

In [96]:
# keys = df.columns[:-2]
keys = sim.cats

In [97]:
import itertools

In [98]:
len(list(itertools.combinations(keys, 2)))

120

In [99]:
# One column per category: column key1 lists sim.distance(key1, key2) for every
# key2, in the order of `keys` (the same order is used for the row index below).
distsdict = {key1: [sim.distance(key1, key2) for key2 in keys] for key1 in keys}

In [100]:
distdf = pd.DataFrame(distsdict)

In [101]:
distdf.index = keys

In [102]:
distdf.to_pickle(VALID_DISTANCE_DICT_PATH)

In [103]:
distdf

Unnamed: 0,bay,beach,birds,boeing,buildings,city,clouds,f-16,face,helicopter,mountain,ocean,ships,sky,sunrise,sunset
bay,1.704545,1.222962,0.011364,0.011364,0.071096,0.486179,0.022727,0.0,0.022727,0.013514,0.355682,0.299936,0.038391,0.011364,0.068966,0.011364
beach,1.222962,1.747126,0.011494,0.027367,0.037135,0.168997,0.0,0.0,0.011494,0.013514,0.133477,0.504938,0.036502,0.011494,0.103448,0.127058
birds,0.011364,0.011494,2.0,0.0,0.0,0.013514,0.015385,0.0,0.070188,0.0,0.012048,0.124724,0.013514,0.012048,0.017241,0.013514
boeing,0.011364,0.027367,0.0,1.873016,0.031746,0.031746,0.0,0.34166,0.0,0.110468,0.0,0.086296,0.090519,0.015873,0.0,0.0
buildings,0.071096,0.037135,0.0,0.031746,1.948718,1.208593,0.0,0.016393,0.011628,0.027027,0.088462,0.014085,0.106029,0.027106,0.034483,0.026334
city,0.486179,0.168997,0.013514,0.031746,1.208593,1.891892,0.015385,0.032787,0.061911,0.027027,0.259122,0.09745,0.162162,0.040541,0.109506,0.054054
clouds,0.022727,0.0,0.015385,0.0,0.0,0.015385,1.969231,0.0,0.015385,0.0,0.101923,0.289491,0.015385,1.732967,0.233952,0.196674
f-16,0.0,0.0,0.0,0.34166,0.016393,0.032787,0.0,1.934426,0.016393,0.198715,0.045287,0.014085,0.215109,0.014286,0.0,0.0
face,0.022727,0.011494,0.070188,0.0,0.011628,0.061911,0.015385,0.016393,1.953488,0.027027,0.0125,0.051425,0.013514,0.051827,0.028869,0.023256
helicopter,0.013514,0.013514,0.0,0.110468,0.027027,0.027027,0.0,0.198715,0.027027,1.918919,0.077027,0.013514,0.081081,0.014286,0.0,0.013514


## Send result to S3

In [None]:


DIST_FILE=''
DIST_DIR=''


In [None]:
DIST_PATH=os.path.join(DIST_DIR, DIST_FILE)

In [None]:
os.link(VALID_DISTANCE_DICT_PATH, DIST_PATH)

In [None]:
S3PATH = "" + DIST_PATH

In [None]:
!aws s3 cp  {DIST_PATH} {S3PATH} 

# Misc evaluation

In [None]:
from visualize import plot_image_list
def plot_cat(df, cat, thrld, origin=0):
    """Show the files scoring above *thrld* for *cat*, best first.

    Starting at position *origin* of the descending ranking, prints ten
    comma-separated rows of five parent-directory names, then ten rows of
    the matching scores (four decimals), and finally passes the same window
    of up to 50 file paths to ``plot_image_list``.
    """
    ranked = df[df[cat] > thrld].sort_values(by=cat, ascending=False)
    row_starts = range(origin, origin + 50, 5)
    for start in row_starts:
        row_paths = ranked['filepaths'].values[start:start + 5]
        print(",".join(os.path.basename(os.path.dirname(path)) for path in row_paths))
    for start in row_starts:
        row_scores = ranked[cat].values[start:start + 5]
        print(",".join("{0:.4f}".format(score) for score in row_scores))
    plot_image_list(ranked["filepaths"].values[origin:origin + 50])

In [None]:
#plot_cat(df, '41', 0.1)

### load df from file

In [None]:
import os
import json
import glob
import scipy
import numpy as np

import pandas as pd
import glob

In [None]:
distdf = pd.read_pickle(VALID_DISTANCE_DICT_PATH)

In [None]:
CATEGORY_DICT_CSV=os.path.join(BASE_DATA_DIR, "")

In [None]:
from category import load_category_dict
catdict = load_category_dict(CATEGORY_DICT_CSV)

In [None]:
def listdist(targetkey):
    """Return the 20 largest entries of ``distdf[targetkey]`` with readable names.

    The raw keys of those 20 entries are printed first; the returned Series
    is re-indexed with ``vc.name(key)``.
    """
    ranked = distdf[targetkey].sort_values(ascending=False)
    print(ranked.index[:20])
    # (an earlier variant looked the names up in catdict by int(key) instead)
    ranked.index = list(map(vc.name, ranked.index))
    return ranked[:20]

In [None]:
def listdist15(targetkey):
    """Print every entry of ``distdf[targetkey]`` above 1.5, largest first.

    Each line shows the raw key, the value with four decimals, and the
    readable name from ``vc.name``.
    """
    ranked = distdf[targetkey].sort_values(ascending=False)
    strong = ranked[ranked > 1.5]

    for key, value in strong.items():
        print("{0}:   {2:.4f} {1}".format(key, vc.name(key), value))


In [None]:
listdist15('76')

In [None]:
listdist('110')

In [None]:
name2id = {catdict[key]:key for key in catdict.keys()}

In [None]:
name2id['']

In [None]:
vc.name("118_155")

In [None]:
listdist('118_155')

In [None]:
def catlist2names(catlist, catdictst):
    """Render each group of category keys as its names joined with ``"+"``.

    *catlist* is an iterable of key groups; *catdictst* maps a key to its
    display name. Returns one string per group, in order.
    """
    names = []
    for cats in catlist:
        names.append("+".join(catdictst[cat] for cat in cats))
    return names

In [None]:
from category import merge_categories_above

In [None]:
catdictst = {key:vc.name(key) for key in vc.keys()}

In [None]:
catlist = merge_categories_above(distdf, 1.89)

In [None]:
len(catlist)

In [None]:
catlist2names(catlist, catdictst)

In [None]:
catlist19 = merge_categories_above(distdf, 1.9)

In [None]:
len(catlist19)

In [None]:
catlist2names(catlist19, catdictst)

In [None]:
HIGH_CATEGORY_PATH="trained_model/high_cats19.dat"

In [None]:
# pickle is not imported by any import cell of this notebook, so bring it in
# here; without it this cell fails with NameError.
import pickle

# Persist the category groups merged at threshold 1.9.
with open(HIGH_CATEGORY_PATH, "wb") as f:
    pickle.dump(catlist19, f)

In [None]:
catlist19[0:20]

### Choose lonely subcategory

In [None]:
catlist = merge_categories_above(distdf, 1.8)

In [None]:
len(catlist)

In [None]:
catlist2names(catlist, catdictst)

In [None]:
singlelist = [cat for cat in catlist if len(cat) == 1]

In [None]:
len(singlelist)

In [None]:
distdf['176']['176']

In [None]:
[(cat[0], catdictst[cat[0]]) for cat in singlelist]

In [None]:
def listdist(targetkey):
    """Return the 20 largest entries of ``distdf[targetkey]``, indexed by ``vc.name``."""
    ranked = distdf[targetkey].sort_values(ascending=False)
    ranked.index = list(map(vc.name, ranked.index))
    return ranked[:20]

In [None]:
listdist("10")

In [None]:
listdist("101")

In [None]:
listdist("102")

In [None]:
listdist("104")

In [None]:
listdist("135")

In [None]:
listdist("176")

In [None]:
listdist("0_128")

In [None]:
listdist("107")

In [None]:
listdist("171")

In [None]:
listdist("93")

In [None]:
listdist("33")

In [None]:
listdist("108_72")

In [None]:
def listdistgt(targetkey, val):
    """Return the entries of ``distdf[targetkey]`` that are >= *val*, largest first.

    The returned Series is re-indexed with ``vc.name(key)``.
    """
    column = distdf[targetkey]
    selected = column[distdf[targetkey] >= val]
    ranked = selected.sort_values(ascending=False)
    ranked.index = list(map(vc.name, ranked.index))
    return ranked


In [None]:
listdistgt('33', 1.5)

In [None]:
listdistgt('93', 1.2)

In [None]:
distdf['93'][distdf['93'] >= 1.5].index

In [None]:
listdistgt("108_72", 1.5)

In [None]:
distdf['108_72'][distdf['108_72'] >= 1.5].index

In [None]:
TRAIN_DATA_DIR_FOR_1VSALL= os.path.normpath(os.path.join(DATA_DIR, ''))


In [None]:
def count_train_files(targetcat):
    """Count the ``*.jpg`` files in ``<TRAIN_DATA_DIR_FOR_1VSALL>/train/<targetcat>``."""
    category_dir = os.path.join(TRAIN_DATA_DIR_FOR_1VSALL, "train", targetcat)
    matches = glob.glob(category_dir + "/*.jpg")
    return len(matches)

In [None]:
count_train_files("135")

In [None]:
# Virtual categories (keys containing an underscore) are not handled, so count_train_files just returns 0 for them.
[(cat[0], catdictst[cat[0]], distdf[cat[0]][cat[0]], count_train_files(cat[0])) for cat in singlelist]

In [None]:
tups = [(cat[0], catdictst[cat[0]], distdf[cat[0]][cat[0]], count_train_files(cat[0])) for cat in singlelist]

In [None]:
from operator import itemgetter

In [None]:
sorted(tups, key=itemgetter(2))

In [None]:
os.path.join(BASE_DATA_DIR, "train")

# Trial and error

In [None]:
sim.distance(targetkey, tokey)

In [None]:
dists = {}

In [None]:
# Map "key1~key2" to a score for every unordered pair of categories.
# NOTE(review): sim.distance(a, b) already adds both directions
# (rmiss[a~b] + rmiss[b~a]), so adding distance(b, a) makes each value exactly
# twice sim.distance(a, b) -- confirm whether the doubling is intended.
dists = {"{0}~{1}".format(*pair): sim.distance(pair[0], pair[1])+sim.distance(pair[1], pair[0]) for pair in itertools.combinations(keys, 2)}

In [None]:
list(dists.keys())[0:3]

In [None]:
#with open(VALID_DISTANCE_DICT_PATH, mode='wb') as f:
#    pickle.dump(dists, f)

In [None]:
dists['109~93']

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
plt.hist(list(dists.values()), bins=10)

In [None]:
tmp = pd.DataFrame({"a":[1, 2, 3], "b":[4, 5, 6]})
tmp

In [None]:
tmp.index = ["d", "e", "f"]

In [None]:
tmp

In [None]:
# NOTE(review): the key "{0}" uses only the first element of each pair, so all
# pairs sharing that first category collide and only the last one survives.
# The value is also twice sim.distance(a, b), because distance is symmetric.
dists = {"{0}".format(*pair): sim.distance(pair[0], pair[1])+sim.distance(pair[1], pair[0]) for pair in itertools.combinations(keys, 2)}

In [None]:
dist = distdf["93"].sort_values(ascending=False)

In [None]:
dist.index = [vc.name(idx) for idx in dist.index]

In [None]:
dist

In [None]:
genodist = distdf["109"].sort_values(ascending=False)

In [None]:
genodist.index = [vc.name(idx) for idx in genodist.index]

In [None]:
genodist