In [None]:
# Colour palettes used by the plots in this notebook. Each key maps to five
# hex colours ordered from the lightest shade (index 0) to the darkest (index 4).
CLR = {
    'blue': ['#e0f3ff', '#aadeff', '#2bb1ff', '#15587f', '#0b2c40'],
    'gold': ['#fff3dc', '#ffebc7', '#ffddab', '#b59d79', '#5C4938'],
    'red':  ['#ffd8e8', '#ff9db6', '#ff3e72', '#6B404C', '#521424'],
}

In [None]:
import ungol.analyze as ua
import ungol.training as ut

import h5py
import attr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as plt_patches
from tqdm import tqdm as _tqdm

import re
import pickle
import pathlib
import functools
import collections
import multiprocessing as mp

from typing import Tuple


# Notebook-wide progress bar: the imported `_tqdm` pre-configured with
# ncols=80 and leave=True, so call sites only pass the iterable.
tqdm = functools.partial(_tqdm, ncols=80, leave=True)

In [None]:
@attr.s
class Experiment:
    """Description of one experiment directory, as built by `load_experiments`."""
    
    # experiment name; `load_experiments` uses the directory name
    name: str = attr.ib()
    # path of the experiment's 'hamming-dists.h5' file, stored as a string
    h5: str = attr.ib()
    # `dimensions.components` value read from the experiment's config
    codebooks: int = attr.ib()


def load_experiments(parent: str):
    """Build one `Experiment` per sub-directory of `parent`.

    For every directory directly below `parent` the file 'embcompr.conf' is
    parsed with `ut.Config.create`; the section named like the directory
    supplies `dimensions.components` (stored as `codebooks`). The path to
    'hamming-dists.h5' inside the directory is recorded as a string.
    Plain files below `parent` are ignored.

    A short summary (count, then name and h5 path per experiment) is printed.

    Returns:
        list of `Experiment` instances, in directory iteration order.
    """
    found = []

    for exp_dir in pathlib.Path(parent).iterdir():
        if not exp_dir.is_dir():
            continue

        exp_name = exp_dir.name
        h5_path = str(exp_dir / 'hamming-dists.h5')

        # the config yields (section name, section) pairs; pick our own section
        sections = ut.Config.create(str(exp_dir / 'embcompr.conf'), None)
        by_name = {key: section for key, section in sections}
        exp_conf = by_name[exp_name]

        found.append(Experiment(
            name=exp_name,
            h5=h5_path,
            codebooks=exp_conf.dimensions.components))

    print('preparing {} experiments'.format(len(found)))
    for item in found:
        print(item.name, str(item.h5))

    return found

# Distance Distribution

In [None]:
def _plot_distcount(histogram: np.ndarray,
                    bin_edges: np.ndarray,
                    label_fmt: str,
                    title: str,
                    subtitle: str = None,
                    display: bool = True,
                    out_file: pathlib.Path = None):
    """Draw a distance histogram as a filled line plot.

    The legend lists three values taken from `bin_edges`: the first edge
    ('min'), the edge at the position of the largest histogram count ('peak')
    and the last edge ('max'). Dashed vertical lines mark min and max; the
    peak only appears in the legend.

    Args:
        histogram: counts per bin; plotted against `bin_edges`, so both must
            have the same length.
        bin_edges: x position for every entry of `histogram`.
        label_fmt: format string taking two positional fields, the marker
            name and its value (e.g. '{}: ~{:.3f}').
        title: axes title.
        subtitle: optional figure-level title.
        display: show the figure via `plt.show()`.
        out_file: path *without* extension; when given, '<out_file>.png' and
            '<out_file>.svg' are written and missing parent directories are
            created.

    The figure is closed before returning so that repeated calls do not pile
    up open figures.
    """
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_xlabel('Distance')
    ax.set_ylabel('Count')

    vline_style = dict(ls='dashed', color='black', linewidth=1)
    patch_style = dict(color=CLR['blue'][3])

    # (legend name, value, draw a vertical marker line?)
    markers = (
        ('min', bin_edges[0], True),
        ('peak', bin_edges[histogram.argmax()], False),
        ('max', bin_edges[-1], True),
    )

    patches = []
    for name, value, draw_line in markers:
        if draw_line:
            ax.axvline(value, ymax=0.4, **vline_style)
        # callers' formats already append ':' to the name, so the name itself
        # must not carry one (the former 'peak:' rendered as 'peak::')
        patches.append(plt_patches.Patch(label=label_fmt.format(name, value), **patch_style))

    # bottom line
    ax.axhline(0, color=CLR['blue'][2], linewidth=1)

    # line plot
    ax.fill_between(bin_edges, histogram, color=CLR['blue'][0])
    ax.plot(bin_edges, histogram, color=CLR['blue'][3], linewidth=0.5)

    if subtitle is not None:
        fig.suptitle(subtitle, fontsize=10)

    ax.legend(handles=patches)

    if display:
        # plt.show() takes no figure argument; passing one positionally is
        # rejected by current Matplotlib releases
        plt.show()

    if out_file is not None:
        out_file = pathlib.Path(out_file)
        out_file.parent.mkdir(parents=True, exist_ok=True)
        for suffix in ('.png', '.svg'):
            target = str(out_file) + suffix
            print('saving to', target)
            fig.savefig(target)

    plt.close(fig)


def plot_distcounts(fname: str,
                    label_fmt: str,
                    title: str,
                    subtitle: str = None,
                    ranges: Tuple[int] = None,
                    display: bool = True,
                    out_file: pathlib.Path = None):
    """Plot the global distance histogram of an HDF5 file plus one per k-NN range.

    The file must contain a 'histogram' dataset with a 'bin_edges' attribute.
    For every k in `ranges` the dataset 'histogram_<k>-NN' is used; when it is
    missing it is computed from columns 1..k of the 'dists' dataset and written
    back to the file (which is why the file is opened in 'r+' mode).

    Args:
        fname: path of the HDF5 file.
        label_fmt: legend format string, forwarded to `_plot_distcount`.
        title: plot title; ' for <k>-NN' is appended for the k-NN plots.
        subtitle: optional figure-level title.
        ranges: the k values to plot; None or empty plots only the global
            histogram.
        display: forwarded to `_plot_distcount`.
        out_file: output path without extension; k-NN plots get the suffix
            '-<k>-nn' appended to the file name.
    """
    print('reading', fname)

    # (histogram, bin_edges, k) triples; k == 0 marks the global histogram
    data = []

    # the context manager closes the file even if reading or computing fails
    with h5py.File(fname, mode='r+') as fd:
        raw_histogram = fd['histogram'][1:]  # the first bin is skipped
        raw_bin_edges = fd['histogram'].attrs['bin_edges']

        print('reading global histogram')
        nonzero = raw_histogram.nonzero()[0]
        start, end = int(nonzero[0]), int(nonzero[-1])
        # NOTE(review): the slice stops before index `end`, so the last
        # non-empty bin is not part of the plotted data - confirm that this
        # is intended before changing it (cached k-NN histograms depend on
        # the resulting range).
        histogram, bin_edges = raw_histogram[start:end], raw_bin_edges[start:end]
        data.append((histogram, bin_edges, 0))

        print('building k-NN histograms')
        # |Vocabulary| * k * 4 Byte
        # 4e5 words with 4e3 nearest neighbours: 6.4GB
        for k in (ranges or ()):
            key = 'histogram_{}-NN'.format(k)
            if key not in fd:
                print('  calculating {}-NN histogram'.format(k))
                vmin, vmax = bin_edges[0], bin_edges[-1]
                bins = histogram.shape[0]
                h, b = np.histogram(fd['dists'][:, 1:k + 1].flat, bins=bins, range=(vmin, vmax))

                ds = fd.create_dataset(key, data=h)
                ds.attrs['bin_edges'] = b

            else:
                h, b = fd[key][:], fd[key].attrs['bin_edges']

            # np.histogram returns one more edge than bins; drop the last edge
            data.append((h, b[:-1], k))

    # everything in `data` is an in-memory array, the file is no longer needed
    for histogram, bin_edges, k in data:
        plot_title = title
        target = out_file

        if k != 0:
            plot_title += ' for {}-NN'.format(k)
            if out_file:
                base = pathlib.Path(out_file)
                target = base.with_name(base.name + '-{}-nn'.format(k))

        _plot_distcount(histogram,
                        bin_edges,
                        label_fmt,
                        plot_title,
                        subtitle=subtitle,
                        display=display,
                        out_file=target)

## Hamming Space (Embedding Codes)

In [None]:
def _plot_hamming_counts(arr, bin_edges, display: bool = True, out_file: pathlib.Path = None):
    """Plot Hamming distance counts as a simple line plot.

    Args:
        arr: counts per bin.
        bin_edges: bin edges with one more entry than `arr`; the last edge is
            dropped for plotting.
        display: show the figure via `plt.show()`. This used to be read from
            an undefined name, which only resolved inside IPython (to its
            `display` function) and was therefore always truthy there.
        out_file: optional path without extension; when given, '.png' and
            '.svg' files are written and missing parent directories created.

    No legend is drawn because this plot has no legend entries. The figure is
    closed before returning.
    """
    fig, ax = plt.subplots()
    ax.set_title('hamming stuff')
    ax.set_xlabel('Distance')
    ax.set_ylabel('Count')

    ax.plot(bin_edges[:-1], arr)

    if display:
        # plt.show() takes no figure argument
        plt.show()

    if out_file is not None:
        out_file = pathlib.Path(out_file)
        out_file.parent.mkdir(parents=True, exist_ok=True)
        for suffix in ('.png', '.svg'):
            target = str(out_file) + suffix
            print('saving to', target)
            fig.savefig(target)

    plt.close(fig)
    
    
def plot_hamming(fname: str, ranges: Tuple[int]):
    """Plot the k-NN Hamming distance histogram for every k in `ranges`.

    Histograms are cached in the HDF5 file under 'histogram_<k>-NN' (with a
    'bin_edges' attribute); a missing one is computed from columns 1..k of the
    'dists' dataset and stored. Each plot is shown and saved below an 'images'
    directory next to `fname` as 'hamming-dist-<k>-nn' (.png and .svg).

    The plot title is fixed to the GLOVE / 256 Bit setting.

    Args:
        fname: path of the HDF5 file; opened in 'r+' mode since missing
            histograms are written back.
        ranges: the k values to plot.
    """
    image_dir = pathlib.Path(fname).parents[0] / 'images'

    # the context manager makes sure the file is closed again, also on errors
    with h5py.File(fname, 'r+') as fd:
        bins = fd['histogram'].shape[0]
        # NOTE(review): splitting (0, bins + 1) into `bins` bins gives a bin
        # width slightly above 1 - confirm this is the intended binning.
        rg = (0, bins + 1)

        for k in ranges:
            key = 'histogram_{}-NN'.format(k)
            if key not in fd:
                hist, bin_edges = np.histogram(fd['dists'][:, 1:k + 1], bins=bins, range=rg)
                fd[key] = hist
                fd[key].attrs['bin_edges'] = bin_edges
            else:
                hist, bin_edges = fd[key][:], fd[key].attrs['bin_edges']

            title = 'Hamming Distance Distribution (GLOVE) for {} NN - 256 Bit'.format(k)
            out_file = image_dir / 'hamming-dist-{}-nn'.format(k)
            _plot_distcount(hist, bin_edges[:-1], '{}: {:3.0f}', title, out_file=out_file)

# Correlation of Distance Metrics

In [None]:
# HDF5 store that holds the two frames read below
fname = '../opt/correlation/euclidean-256x2/corr-100.h5'

# frames stored under the 'ref/common' and 'cmp/common' keys
# (df_cmp is loaded but not used within this cell)
df_ref = pd.read_hdf(fname, 'ref/common')
df_cmp = pd.read_hdf(fname, 'cmp/common')

# df_ref.iloc[:, :10000][:1].dropna(axis=1)
# cell output: per row, the number of non-NA entries among the first 10000 columns
df_ref.iloc[:, :10000].count(axis=1)

# print(df_ref.shape)
# print(df_cmp.shape)

# counts = df_ref.iloc[:, :10000].count(axis=1)

# print('common nearest neighbours')
# plt.plot(counts.index, counts, color=CLR['blue'][2])
# plt.savefig('../opt/cnn-euclidean-256x2-corr-100-10k.svg')


## Play the organ

In [None]:
# k values for the k-NN histograms; larger k needs considerably more memory
# ranges = (10, 100, 1000, 4000, )  # consider ram ;)
# ranges = (10, 100, 1000, )
# ranges = (10, 100, )
ranges = (10, )
kwargs = dict(display=True)
# write the plots to an 'images' directory next to the input files
save = True

# f_exp = '../opt/experiments/binary'
# f_vocab = '../opt/embeddings/glove/vocabulary.pickle'

# ---  distcounts

# sorted() makes the processing order reproducible between runs
for path in sorted(pathlib.Path('..').glob('opt/neighbours/*.h5')):
    # file names are expected to look like '<embedding>-<measure>.h5';
    # anything else is skipped instead of aborting the whole loop
    name_parts = path.stem.split('-')
    if len(name_parts) != 2:
        print('skipping {}: file name is not of the form <embedding>-<measure>.h5'.format(path))
        continue
    embedding, measure = name_parts

    title = '{} Distance Distribution ({})'.format(measure.capitalize(), embedding.upper())
    args = str(path), '{}: ~{:.3f}', title

    if save:
        out_file = path.parents[0] / 'images' / '{}-{}'.format(embedding, measure)
        new_kwargs = {'out_file': out_file, **kwargs}
    else:
        new_kwargs = kwargs

    print('-' * 60, '\n', title)
    plot_distcounts(*args, ranges=ranges, **new_kwargs)


# args = '../opt/neighbours/euclidean-new.h5', '{}: ~{:.3f}', 'Euclidean Distance Distribution'
# args = '../opt/neighbours/glove-cosine.h5', '{}: ~{:.3f}', 'Cosine Distance Distribution'
# plot_distcounts(*args, ranges=ranges, **kwargs)



In [None]:
# Plot (and cache, where missing) the k-NN Hamming distance histograms of the
# glove-256x2 experiment for k = 10, 100, 1000 and 4000.
plot_hamming('../opt/experiments/binary/glove-256x2/hamming-dists.h5', ranges=(10, 100, 1000, 4000, ))