# SentEval Preparation

In [1]:
import torch
import numpy as np

from ungol.common import embed as uce
from ungol.sentemb import redux as usr
from ungol.models import embcodr
from ungol.models import embcompr
from ungol.similarity import rhwmd

import pathlib

In [2]:
BITS = 256
dev = torch.device('cpu')
f_mod = 'model.torch'

# for redux.Sent2Vec
f_exp = '../opt/current/sick.sent2vec-512/compressor'
f_sent2vec = '../opt/lib/sent2vec/wiki_bigrams.bin'

# for all redux.Embed implementations
# f_exp = '../opt/current/sick.mbow-512/compressor'
f_embed = '../opt/embed/'
f_embed_h5 = 'glove-840b.2m.300d.h5'
f_embed_vocab = 'glove-840b.2m.300d.vocab.pickle'

### Load the model

Do not forget to invoke `model.eval()`! Otherwise activations are non-binary.

In [3]:
compr = embcompr.Compressor.load(f_exp, f_mod, dev)
compr.eval()
print(compr)

Neural Compressor Version 0.2
Compressor(
  (_encoder): Encoder(
    (_gumbel): Gumbel()
    (_layer): Sequential(
      (fcl1): Linear(in_features=700, out_features=512, bias=True)
      (fcl1-a): Tanh()
      (fcl2): Linear(in_features=512, out_features=1024, bias=True)
      (fcl2-a): Softplus(beta=1, threshold=20)
    )
  )
  (_decoder): QuantizationDecoder()
)


### Load the embeddings

In [4]:
p_embed = pathlib.Path(f_embed)

embed = uce.create(uce.Config(
    provider='h5py',
    file_name=str(p_embed / f_embed_h5),
    vocabulary=str(p_embed / f_embed_vocab),
))

print(embed)

ungol embedding provider
  2196016 words
  300 dimensions


### Transform some example sentences

In [5]:
# usually the result of some tokenization operation
tokens = (
    'that rug really tied the room together unbekannteswortasd .'.split(' '),
    'there is a carpet in my house'.split(' '),
    'yeah well you know thats just like your opinion man .'.split(' '),
)

# select sentence embedding method
# redux = usr.BoW(embed)
# redux = usr.MBoW(embed)
redux = usr.Sent2Vec(f_sent2vec)

# transform tokens to real-valued sentence vectors
reduced = torch.Tensor([redux.do(toks) for toks in tokens])
print('\nsentence embeddings:\n', [vec.shape for vec in reduced])

# create non-binary code representation
codes = embcodr.create_codes(compr, reduced, BITS)
print('\nnon-binary codes:\n', codes)

# transform these codes to "real" hash codes
hashes = embcodr.create_hashes(codes).astype(np.uint8)
print('\nhash codes:\n', hashes)


sentence embeddings:
 [torch.Size([1, 700]), torch.Size([1, 700]), torch.Size([1, 700])]



non-binary codes:
 [[0 1 0 ... 1 0 0]
 [0 0 0 ... 1 0 1]
 [0 1 1 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 [0 1 1 ... 1 1 0]
 [0 0 0 ... 1 1 1]]

hash codes:
 [[ 69  41 148 214  68 224 212  91 144 153  55  96 158 114  31  83 189  44
  108 217 189  16  24 228 232 250  59 180 135 222  59 140]
 [ 11 154 241 209  23  90 236 199 151  89 120  97  60  86 182  47   1 116
  210  55 129  77  21 244  53 135  89  10 227 125  20 229]
 [117  41 158 190  84  98 220  73 148  17   7  67 136 246  95  87 170   7
   93 217 167  22 152  76 170 126 107 117 197 247 123  79]
 [  9 170 243 216 197 114  32 227 157  73  90  98  48  88 166  46   4  54
  242 175 233 123  13 244  87 187 121   2  79  93 180 239]
 [102 175  30 151 228 227 103  75 200   9   7 105 140 242  30 151  56  46
  175 223 181  22 152 100 232 247  35  36 199 246 219 174]
 [  9  59 143 144  86 254  98 227 213  71  99  64  60  84 246 167 161  52
  212 108 233 251  18 117  83 183 124  82 103  69 181 199]]


### Calculate the hamming distance

In [6]:
print(' '.join(tokens[0]))

fmt = '{:1d} | {:.05f} | {}'

for i, (s1, s2) in enumerate(((hashes[0], hashes[1]), (hashes[0], hashes[2]))):
    # this interface might change in the future (rhwmd is the wrong name!)
    print(fmt.format(i, rhwmd.hamming(s1, s2), ' '.join(tokens[i + 1])))

that rug really tied the room together unbekannteswortasd .
0 | 0.49609 | there is a carpet in my house
1 | 0.26172 | yeah well you know thats just like your opinion man .
