# Embedding Reconstruction

Create embeddings readable by ungol.common.embed from the
reconstruction produced by a given embcompr model. This notebook relies
heavily on the file naming conventions described in the comments below.

In [None]:
from ungol.models import embcompr
import ungol.common.embed as uce

import h5py
import torch
import numpy as np
from tqdm import tqdm_notebook as tqdm

import pickle
import pathlib
from collections import namedtuple

In [None]:
# Device handed to Compressor.load. create_embed calls .numpy() on the model
# output without moving it first, which only works for tensors on the CPU.
DEV = torch.device('cpu')

In [None]:
def create_embed(embed: uce.Embed, compr: embcompr.Compressor) -> np.ndarray:
    """Run every embedding through the compressor and stack the results.

    The embeddings are read in chunks of BATCH_SIZE rows via
    ``embed.chunks``, each chunk is converted to a tensor, passed through
    ``compr`` and converted back to a numpy array.

    Args:
        embed: source embedding space; must provide ``chunks(size=...)``,
            ``__len__`` and ``shape``.
        compr: callable model mapping a tensor of embeddings to a tensor
            of reconstructions.

    Returns:
        The vertically stacked reconstructions.

    Raises:
        AssertionError: if the stacked result does not have the same
            shape as ``embed``.
    """
    BATCH_SIZE = 128

    # Ceil division so that a final, shorter chunk is counted as well;
    # floor division made the progress bar overrun its total.
    # (assumes chunks() yields a trailing partial chunk -- the shape
    # assertion below can only hold in that case)
    n_batches = (len(embed) + BATCH_SIZE - 1) // BATCH_SIZE
    bar = tqdm(embed.chunks(size=BATCH_SIZE), total=n_batches)

    # Pure inference: do not record an autograd graph for each batch.
    with torch.no_grad():
        Y = [compr(torch.from_numpy(x)).detach().numpy() for x in bar]

    Y = np.vstack(Y)
    assert Y.shape == embed.shape, f'{Y.shape} != {embed.shape}'
    return Y

In [None]:
# use naming convention as given in ../conf/embcompr.<DATASET>.conf

# opt/bow/sick
# ├── mbow.cosine-dist.h5
# ├── mbow.embedding.h5
# ├── mbow.sentences.txt
# ├── mbow.vocab.pickle
# ├── sent2vec.cosine-dist.h5
# ├── sent2vec.embedding.h5
# ├── sent2vec.sentences.txt
# ├── sent2vec.vocab.pickle
# ...


# Identifies one trained model: the data set it was trained on, the
# sentence reduction (file prefix such as mbow or sent2vec) and the
# code size in bits.
Params = namedtuple('Params', ['dataset', 'redux', 'bits'])


def _get_model_path(p: Params):
    """Return the model directory ../opt/current/<dataset>.<redux>-<bits>."""
    dirname = f'{p.dataset}.{p.redux}-{p.bits}'
    return pathlib.Path('../opt/current').joinpath(dirname)


def load(p: Params):
    """Load the source embeddings and the trained compressor for ``p``.

    Embeddings and vocabulary are read from
    ``../opt/bow/<dataset>/<redux>.{embedding.h5,vocab.pickle}``, the
    compressor from ``<model dir>/compressor/model.torch`` where the
    model directory is given by ``_get_model_path``.

    Returns:
        A tuple ``(embed_in, compr, name)``: the embedding space, the
        compressor switched to eval mode, and the stem of the resolved
        model file.
    """
    source_dir = pathlib.Path('../opt/bow') / p.dataset
    print(f'loading embeddings from {source_dir}')

    conf = uce.Config(
        provider='h5py',
        file_name=str(source_dir / f'{p.redux}.embedding.h5'),
        vocabulary=str(source_dir / f'{p.redux}.vocab.pickle'),
    )
    embed_in = uce.create(conf)
    print(f'obtained embedding space of shape {embed_in.shape}')

    model_dir = _get_model_path(p)
    print(f'load model from {model_dir}')

    compressor_dir = model_dir / 'compressor'
    compr = embcompr.Compressor.load(str(compressor_dir), 'model.torch', DEV)
    compr.eval()

    # resolve() follows symbolic links, so the stem is that of the file
    # model.torch actually points to (presumably a symlink -- TODO confirm).
    name = (compressor_dir / 'model.torch').resolve().stem

    print(f'models real name is {name} (version={compr.version})')
    return embed_in, compr, name


def save(Y: np.ndarray, p: Params, f_model: str, embed: uce.Embed):
    """Write the reconstructed embeddings and the vocabulary to disk.

    Both files are placed in the model directory of ``p``:

    - ``recon.<f_model>.embedding.h5`` holding ``Y`` as the dataset
      ``embedding``
    - ``recon.<f_model>.vocab.pickle`` holding the pickled ``embed.vocab``

    Existing files of the same name are overwritten.
    """
    out_dir = _get_model_path(p)
    prefix = f'recon.{f_model}'

    embedding_file = out_dir / f'{prefix}.embedding.h5'
    print(f'writing {embedding_file}')
    with h5py.File(embedding_file, mode='w') as h5_out:
        h5_out.create_dataset('embedding', data=Y)

    vocab_file = out_dir / f'{prefix}.vocab.pickle'
    print(f'writing {vocab_file}')
    with vocab_file.open(mode='wb') as pickle_out:
        pickle.dump(embed.vocab, pickle_out)

In [None]:
# embed_in, compr, name = load(Params(dataset='sick', redux='mbow', bits=256))

def run(p: Params):
    """Reconstruct and persist the embeddings of the model given by ``p``.

    Runs the three stages load -> create_embed -> save and prints a
    short status line before each of them.
    """
    print('loading...')
    source_embed, compressor, model_name = load(p)

    print('tranforming...')
    reconstruction = create_embed(source_embed, compressor)

    print('saving...')
    save(reconstruction, p, model_name, source_embed)

    print('done')

# p = Params(dataset='sick', redux='mbow', bits=256)
# run(p)

In [None]:
def parse_model_dir_name(name: str):
    """Split a model directory name ``<dataset>.<redux>-<bits>``.

    Returns the tuple ``(dataset, redux, bits)`` of strings, or ``None``
    if ``name`` does not consist of exactly one ``.`` followed by a
    suffix with exactly one ``-`` and three non-empty parts.
    """
    parts = name.split('.')
    if len(parts) != 2:
        return None

    dataset, suffix = parts
    fields = suffix.split('-')
    if len(fields) != 2:
        return None

    redux, bits = fields
    if not (dataset and redux and bits):
        return None

    return dataset, redux, bits


# Reconstruct the embeddings of every model directory found below
# ../opt/current. Sorted so that the processing order is reproducible.
for model_dir in sorted(pathlib.Path('../opt/current/').glob('*.*-*')):
    parsed = parse_model_dir_name(model_dir.name)

    # The glob pattern also matches plain files and names with additional
    # dots or dashes; skip those instead of aborting the whole batch.
    if parsed is None or not model_dir.is_dir():
        print(f'\nskipping {model_dir}\n')
        continue

    dataset, redux, bits = parsed
    p = Params(dataset=dataset, redux=redux, bits=bits)

    print(f'\n{p}\n')
    run(p)