In [1]:
import h5py
import numpy as np

import pathlib

# Number of bits per byte; used to group the bit columns of each code into bytes.
S_BYTE = 8

In [8]:
# Location of the trained code map, relative to this notebook.
experiment = 'fasttext.de-256x2'
fname = 'codemap.model-2000.h5'

basepath = pathlib.Path('../opt/experiments/binary/') / experiment

# Only the first 10 rows of the 'codes' dataset are needed here. Slicing an
# h5py dataset copies the selection into an in-memory numpy array, so the file
# is closed as soon as the slice is read instead of leaking an open handle
# for the lifetime of the kernel.
with h5py.File(str(basepath / fname), 'r') as fd:
    codemap = fd['codes'][:10]

print('loaded codemap contains', codemap.dtype, 'encoded numbers')

c_words, c_bits = codemap.shape

# Each row must split evenly into whole bytes; verify this *before* deriving
# c_bytes so a truncating floor division cannot go unnoticed.
assert c_bits % S_BYTE == 0, c_bits % S_BYTE
c_bytes = c_bits // S_BYTE

# WORDS x BYTES x BITS
# (n, c_bits) -> (n, c_bytes, S_BYTE)
raw = codemap.reshape(c_words, c_bytes, S_BYTE)

# Place values of the bits within one byte, most significant bit first
# ([128, 64, 32, ..., 1]), repeated so there is one weight per bit of the
# whole chunk.
_potency_byte = np.flip(2 ** np.arange(S_BYTE, dtype=np.uint))  # (S_BYTE, )
_potency_flat = np.tile(_potency_byte, c_bytes * c_words)  # (n * c_bits, )

# Weight every bit by its place value and sum per byte to get the integer
# value of each byte.
# NOTE(review): this assumes every entry of `codemap` is 0 or 1 -- confirm.
potency = _potency_flat.reshape(*raw.shape)
byte_arr = (raw * potency).sum(axis=2)  # (n, c_bytes)

print('np byte array\n', byte_arr.shape)

# Build the byte string from plain Python ints. Passing the ndarray directly
# to bytes() would instead dump its raw memory buffer (several bytes per
# element for a wide integer dtype) rather than one byte per value.
byte_str1 = bytes(byte_arr[0].tolist())
# NOTE(review): identical to byte_str1 and unused in this cell; kept in case a
# later cell reads it.
byte_str2 = bytes(byte_arr[0].tolist())

print('example bytes array', byte_arr[0])
print('example byte string', byte_str1)



loaded codemap contains uint64 encoded numbers
np byte array
 (10, 32)
example bytes array [168  36 176  10 159  20 225 140  83 150 194 129  49 206 172 183  74  98
 177 196 140 135  87  99  90 236  80 119  55 207  62  92]
example byte string b'\xa8$\xb0\n\x9f\x14\xe1\x8cS\x96\xc2\x811\xce\xac\xb7Jb\xb1\xc4\x8c\x87WcZ\xecPw7\xcf>\\'
