In [None]:
# import pdb; pdb.set_trace()

# Colour palette shared by the plots in this notebook. Each colour name maps
# to five hex shades ordered from lightest (index 0) to darkest (index 4).
# The plotting code below picks shades by index (e.g. [2] for scatter
# points, [1] for bar fill and [3] for bar edges).
CLR = {
    'blue': ['#e0f3ff', '#aadeff', '#2bb1ff', '#15587f', '#0b2c40'],
    'gold': ['#fff3dc', '#ffebc7', '#ffddab', '#b59d79', '#5C4938'],
    'red':  ['#ffd8e8', '#ff9db6', '#ff3e72', '#6B404C', '#521424'],
}

# Model

In [None]:
import ungol.embcompr as ue

import h5py
import torch
import numpy as np
from tabulate import tabulate
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt

from sklearn import decomposition
from sklearn import manifold

import pathlib
from typing import Tuple

## Codebooks

### Cluster analysis

In [None]:
def cluster_distances(compressor: ue.Compressor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Measure how tight each codebook is and how far the codebooks lie apart.

    The decoder components are viewed as M codebooks of K vectors with
    E dimensions each. The "cluster center" of a codebook is the mean of
    its K vectors.

    Only the FIRST vector of every codebook is compared against its
    center. This relies on the assumption that all vectors of a codebook
    are equidistant to the center; the function does not verify it.

    Returns:
        -- vector distances: (M, ) shaped tensor, euclidean distance of
           the first vector of each codebook to that codebook's center
        -- cluster distances: (M, M) shaped tensor, euclidean distance
           between every pair of cluster centers (zero on the diagonal)

    Both tensors are detached from the autograd graph.

    """
    shape = compressor.dimensions
    n_books, n_codes, n_dims = shape.components, shape.codelength, shape.embedding
    books = compressor.decoder.components.view(n_books, n_codes, n_dims)

    # center of every codebook -> (M, E)
    centroids = books.sum(dim=1) / n_codes
    assert centroids.shape[0] == n_books

    # distance of the first codebook vector to its center -> (M, )
    first_vectors = books[:, 0, :]
    radii = (centroids - first_vectors).abs().norm(dim=-1)
    assert radii.shape[0] == n_books

    # pairwise center-to-center distances via broadcasting:
    # (M, 1, E) - (1, M, E) -> (M, M, E) -> (M, M)
    offsets = centroids.unsqueeze(1) - centroids.unsqueeze(0)
    spread = offsets.abs().norm(dim=-1)

    n_rows, n_cols = spread.shape
    assert n_rows == n_cols
    # every center has distance zero to itself
    assert spread.diag().sum().item() == 0

    return radii.detach(), spread.detach()


def agg_cluster_distances(
        compressor: ue.Compressor,
        cutoff: int = 10,
        display: bool = True,
        save: bool = False,
        path: pathlib.Path = None):
    """
    Build a plain-text report of the cluster statistics of one compressor.

    The report consists of a title line, the mean/std of the vector and
    cluster distances (see cluster_distances) and a table with one row
    per codebook.

    Args:
        compressor: model whose codebooks are analysed
        cutoff: maximum number of codebook rows printed in the table
        display: unused; accepted so the same option dict can be passed
            to all report helpers
        save: unused; see display
        path: location of the model file. The title is built from
            path.parts[-3] and path.name. If omitted, the notebook-level
            variable `glob` is used instead (legacy behaviour, kept so
            existing calls continue to work).

    Returns:
        The report as a single string.

    Raises:
        ValueError: if no path is given and no global `glob` exists.

    """
    if path is None:
        # legacy fallback: earlier versions read the loop variable of the
        # calling cell directly
        path = globals().get('glob')
    if path is None:
        raise ValueError(
            'agg_cluster_distances: pass `path` (or define a global `glob`) '
            'to label the report')

    M = compressor.dimensions.components
    vec_dists, cluster_dists = cluster_distances(compressor)

    # hand plain python floats to tabulate instead of 0-dim tensors
    # NOTE: the per-codebook mean includes the zero distance of a cluster
    # center to itself
    aggregated_data = list(zip(
        range(1, M + 1),
        vec_dists.tolist(),
        cluster_dists.mean(dim=-1).tolist()))

    title = ''.join(['\n\n', '-' * 60, '\n', path.parts[-3], ' (', path.name, ')'])
    sbuf = [title, 'mean distance:']

    sbuf.append('  vectors: {:2.3f} (std={:2.3f})'.format(
        vec_dists.mean().item(),
        vec_dists.std().item(), ))

    sbuf.append('  cluster: {:2.3f} (std={:2.3f})'.format(
        cluster_dists.mean().item(),
        cluster_dists.std().item(), ))

    sbuf.append('')

    sbuf.append(tabulate(
        aggregated_data[:min(cutoff, M)],
        headers=('codebook', 'vector dist', 'mean cluster dist')))

    return '\n'.join(sbuf)

In [None]:
def tsne_scatter(compressor: ue.Compressor, path: pathlib.Path, display: bool = True, save: bool = False):
    """
    Project all codebook vectors to 2D with t-SNE and draw a scatter plot.

    Points are coloured per codebook: every run of K consecutive rows of
    the decoder components gets one colour, cycling through blue, gold
    and red when there are more than three codebooks.
    (Assumes the rows of compressor.decoder.components are grouped by
    codebook, as the (M, K, E) view in cluster_distances suggests.)

    Args:
        compressor: model whose codebook vectors are plotted
        path: location of the model file; its grandparent directory name
            is used for the plot title and the images are stored in
            <grandparent>/images/codebooks_<suffix>.{png,svg}, where
            <suffix> is the part of the file stem after the last '-'
        display: show the figure
        save: write the figure to disk as PNG and SVG

    """
    K = compressor.dimensions.codelength

    codebooks = compressor.decoder.components.detach().numpy()

    # one colour per codebook (block of K rows); the palette is cycled so
    # any number of codebooks can be drawn
    palette = [CLR['blue'][2], CLR['gold'][2], CLR['red'][2]]
    cmap = [palette[(row // K) % len(palette)] for row in range(codebooks.shape[0])]

    # gesundheit
    sne = manifold.TSNE(n_components=2)
    arr = sne.fit_transform(codebooks)

    # plot
    fig = plt.figure()
    ax = fig.add_subplot(111)

    folder = path.parents[0]
    ax.set_title('Codebook Vectors ({})'.format(folder.parents[0].parts[-1].replace('-', '/')))
    ax.scatter(arr[:, 0], arr[:, 1], color=cmap, marker='.', alpha=0.5)

    if save:
        img_folder = folder.parents[0] / 'images'
        img_folder.mkdir(exist_ok=True)

        fname = str(img_folder / ('codebooks_' + path.stem.split('-')[-1]))
        fig.savefig(fname + '.png')
        fig.savefig(fname + '.svg')

    if display:
        # plt.show() takes no figure; its only (keyword) argument is `block`
        plt.show()

    plt.close(fig)

## Play the organ

In [None]:
# Deliberate stop sign: this assertion always fails, so a "Run All" halts
# here. The cells below load models and write files and are presumably
# meant to be started by hand.
assert False, '"run all" shall not pass!'

### Print tables and t-SNE plots

In [None]:
# Options forwarded to both report helpers: show nothing inline, but
# write the plots and the text summary to disk.
options = dict(
    display=False,
    save=True, )

experiment = 'tau'
embedding = 'glove'

# ---

# Glob for all stored compressor models of this experiment/embedding pair.
f_base = '../opt/experiments'
f_glob = experiment + '/' + embedding + '-*/compressor/model-*.torch'

reps = []
globs = list(pathlib.Path(f_base).glob(f_glob))
# NOTE: the loop variable has to be named `glob`; agg_cluster_distances
# reads this module-level name to build the title of its report.
for glob in tqdm(globs):
    model = ue.Compressor.load(str(glob.parents[0]), glob.name, torch.device('cpu'))
    rep = agg_cluster_distances(model, cutoff=1000, **options)
    reps.append(rep)
    tsne_scatter(model, glob, **options)

# One text document with the reports of all models.
summary = '\n'.join(reps)

if options['display']:
    print(summary)

if options['save']:
    # parents[2] drops the three trailing pattern components of f_glob,
    # i.e. the summary ends up in <f_base>/<experiment>/.
    fname = (pathlib.Path(f_base) / f_glob).parents[2] / (embedding + '.clusters.txt')
    with fname.open(mode='w') as fd:
        fd.write(summary)

print('done')

### Print bar plot with means

In [None]:
def _create_embedding(endpoint: str, fname: str):
    """
    Download an embedding from an ember service and cache it as HDF5.

    The vectors are requested chunk-wise from the client and written to
    the dataset 'embedding' of a new file `fname`. The dataset is created
    with a fixed width of 300 columns.
    (Presumably gen_chunks() yields the rows in vocabulary order in
    chunks of at most chunk_size rows -- TODO confirm against ember.)

    Args:
        endpoint: address handed to ember.client.EmberClient
        fname: target path of the HDF5 file (overwritten if it exists)

    Returns:
        The complete embedding as read back from the dataset.

    If anything fails while writing, the incomplete file is removed
    again (otherwise get_embedding would load the truncated data on the
    next call) and the client connection is closed in every case.

    """
    # imported lazily so the notebook also works without ember installed
    # as long as the cached file exists
    import ember.client as ec

    chunk_size = 8192
    client = ec.EmberClient(endpoint, chunk_size)

    try:
        with h5py.File(fname, 'w') as fd:
            vocab = client.get_vocab()

            ds = fd.create_dataset('embedding', shape=(len(vocab), 300))

            # ceil division: number of chunks needed for the vocabulary
            n_chunks = -(-len(vocab) // chunk_size)
            for i, chunk in tqdm(enumerate(client.gen_chunks()), total=n_chunks):
                lower = i * chunk_size
                upper = min(ds.shape[0], lower + chunk_size)
                ds[lower:upper] = chunk

            # read while the file is still open
            data = ds[:]

    except BaseException:
        # do not leave a half-written cache file behind
        partial = pathlib.Path(fname)
        if partial.exists():
            partial.unlink()
        raise

    finally:
        client.close()

    return data

    
def get_embedding(endpoint: str, folder: str, h5file: str):
    """
    Return the embedding matrix, using the HDF5 file as a cache.

    The dataset 'embedding' is read from <folder>/<h5file>. If opening or
    reading the file raises an OSError, the embedding is created via
    _create_embedding (which writes that file) and returned instead.

    Args:
        endpoint: service address, only used when the file must be created
        folder: directory of the HDF5 file
        h5file: name of the HDF5 file

    """
    h5path = str(pathlib.Path(folder) / h5file)

    try:
        with h5py.File(h5path, 'r') as handle:
            return handle['embedding'][:]
    except OSError:
        # cache miss: build the file from the remote service
        return _create_embedding(endpoint, h5path)

In [None]:
def plot_norms(model_args, emb_args, display: bool = True, save: bool = False):
    """
    Draw a bar plot comparing vector norms with cluster distances.

    Four bars are drawn: mean norm of the original embedding vectors,
    mean norm of the codebook vectors, mean distance between cluster
    centers and mean distance of a codebook vector to its cluster center
    (the latter two come from cluster_distances). The error bars span
    half a standard deviation.

    Args:
        model_args: (folder, file name) of the compressor model, passed
            on to ue.Compressor.load
        emb_args: (endpoint, folder, h5file) passed on to get_embedding
        display: show the figure
        save: write the figure as PNG and SVG to
            <model folder>/../images/norms_<stem>, where <stem> is taken
            from the model file name (text between the first '-' and the
            following '.')

    """
    folder = pathlib.Path(model_args[0])

    # embeddings
    emb_data = get_embedding(*emb_args)
    emb_norms = np.linalg.norm(emb_data, axis=1)

    # clusters
    compressor = ue.Compressor.load(*model_args, torch.device('cpu'))
    vec_dists, cluster_dists = cluster_distances(compressor)

    # codebooks: one row per codebook vector; the width is taken from the
    # model instead of being hard-coded
    E = compressor.dimensions.embedding
    codebook_data = compressor.decoder.components.view(-1, E).detach().numpy()
    codebook_norms = np.linalg.norm(codebook_data, axis=1)

    # plot
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.set_title('Distances and Vector Norms ({})'.format(
        folder.parents[0].parts[-1].replace('-', '/')))

    labels = [
        'embedding\nnorm',
        'codebook\nnorm',
        'cluster\ndistance',
        'cluster radius', ]

    # plain floats, so matplotlib does not get a mix of numpy and torch scalars
    heights = [
        float(emb_norms.mean()),
        float(codebook_norms.mean()),
        cluster_dists.mean().item(),
        vec_dists.mean().item(), ]

    yerrs = [
        float(emb_norms.std()) / 2,
        float(codebook_norms.std()) / 2,
        cluster_dists.std().item() / 2,
        vec_dists.std().item() / 2, ]

    color = 'blue'
    ax.bar(
        range(len(heights)), heights, yerr=yerrs, tick_label=labels,
        color=CLR[color][1], edgecolor=CLR[color][3], ecolor=CLR[color][3],
        linewidth=1, )

    if save:
        stem = model_args[1].split('-')[1].split('.')[0]

        # make sure the target folder exists (as tsne_scatter does)
        img_folder = folder.parents[0] / 'images'
        img_folder.mkdir(exist_ok=True)

        fname = str(img_folder / ('norms_' + stem))
        fig.savefig(fname + '.png')
        fig.savefig(fname + '.svg')

    if display:
        # plt.show() takes no figure; its only (keyword) argument is `block`
        plt.show()

    plt.close(fig)
    

# Arguments for get_embedding: service endpoint, cache folder and HDF5
# file name of the reference embedding. Exactly one line is active; the
# commented-out lines select the fastText embeddings instead.
# emb_args = 'tcp://localhost:8124', '../opt/embeddings/fasttext', 'wiki.en.400k.h5'
# emb_args = 'tcp://localhost:8125', '../opt/embeddings/fasttext', 'wiki.de.400k.h5'
emb_args = 'tcp://localhost:8126', '../opt/embeddings/glove', 'glove.6B.300d.h5'

experiment = 'tau'
embedding = 'glove'

# All stored compressor models of this experiment/embedding pair.
path = pathlib.Path('../opt/experiments/' + experiment)
globs = list(path.glob(embedding + '-*/compressor/*torch'))

# One bar plot per model, written to disk only.
for glob in tqdm(globs):
    model_args = str(glob.parents[0]), glob.name
    plot_norms(model_args, emb_args, display=False, save=True)

print('done')