In [1]:
from pathlib import Path
from tqdm import tqdm
import numpy as np
import zarr
from scipy.sparse import csr_matrix, save_npz, load_npz

In [2]:
# Working directory for this notebook: the cells below glob it for the
# `*.binary.*.txt` embedding dumps and for the `.npz` matrices, which are
# written next to their source `.txt` files.
# NOTE(review): the recorded cell outputs show paths under `..\output`, so this
# value appears to have been edited after the last run -- confirm it points at
# the same directory.
data_dir = Path('D:/data/output')

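# Save

Load every `.npz` matrix found in `data_dir` and store it in the matching zarr dataset. The `.npz` files are produced by the **Convert** section further down, so run those cells first.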

In [4]:
# Copy every sparse `.npz` matrix in `data_dir` into its zarr dataset.
#
# File names are split on "." and read positionally, e.g.
# `docs.eval.zarr.fse.binary.128.npz`:
#   parts[0:3] -> dataset directory name  ("docs.eval.zarr")
#   parts[3]   -> source embedding        ("fse")
#   parts[5]   -> number of bits          ("128")
# A file with fewer than six dot-separated parts raises IndexError.

# Root directory holding the zarr datasets; it does not change per file, so it
# is built once outside the loop.
dset_root = Path("D:/data/msmarco-passages")

for f in data_dir.glob("*.npz"):
    fname_parts = f.name.split(".")
    dset_name, src_emb, nbits = (
        ".".join(fname_parts[:3]),
        fname_parts[3],
        fname_parts[5],
    )
    emb_path = f"binary/{src_emb}_{nbits}"
    print(dset_name, '\t', emb_path)

    # save
    emb = load_npz(f)
    dset_path = dset_root / dset_name
    dset = zarr.open(str(dset_path))
    # `toarray()` yields a plain ndarray; `todense()` would return the legacy
    # `np.matrix` type with the same contents.
    # chunks=(64, None): 64 rows per chunk. NOTE(review): `None` presumably
    # means a single chunk along the second axis -- confirm for the zarr
    # version in use.
    dset.array(emb_path, data=emb.toarray(), chunks=(64, None), overwrite=True)

docs.eval.zarr 	 binary/fse_128
docs.eval.zarr 	 binary/fse_256
queries.eval.zarr 	 binary/bert_128
queries.eval.zarr 	 binary/bert_256
queries.eval.zarr 	 binary/fse_128
queries.eval.zarr 	 binary/fse_256


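# Convert

Parse the `*.binary.*.txt` dumps in `data_dir` into boolean sparse matrices and save each one as an `.npz` file next to its source.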

In [3]:
def parse_line(line):
    """Parse one data line of a binary-embedding text dump.

    The line is split on whitespace: the first token is kept as the row
    identifier and every remaining token is read as an integer that
    contributes 64 bits to the result.

    Args:
        line: A text line of the form ``"<uid> <int> <int> ..."``.

    Returns:
        A ``(uid, bits)`` tuple. ``uid`` is the first token as a string.
        ``bits`` is a 1-D boolean NumPy array holding, for each integer in
        order, its zero-padded 64-character binary form with the
        least-significant bit first.

    Raises:
        ValueError: If the line is blank, a token is not an integer, or an
            integer is negative (its minus sign cannot be read as a bit).
    """
    uid, *ints = line.split()
    # Each integer is rendered as a 64-character binary string and reversed so
    # that the smaller (least-significant) bit comes first.
    binary_str = "".join(f"{int(token):064b}"[::-1] for token in ints)
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` names the same
    # dtype and works on every NumPy version.
    binary_array = np.array([int(ch) for ch in binary_str], dtype=bool)
    return uid, binary_array

In [4]:
# Convert every `*.binary.*.txt` dump in `data_dir` into a boolean CSR matrix
# saved as `.npz` next to the source file.
for fpath in data_dir.glob("*.binary.*.txt"):
    print(f"Processing {fpath}")
    rows = []
    with fpath.open("r", encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f)):
            if i == 0:
                # The first line is a header with two fields: row count and
                # bit width. It is only reported, not used for parsing.
                nrows, nbits = line.split()
                print(f"{nrows} rows, {nbits} bits")
                continue
            # Every other line is one row; the identifier is discarded and
            # only the bit vector is kept.
            _, arr = parse_line(line)
            rows.append(arr)

    # save
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype.
    mat = csr_matrix(rows, dtype=bool)
    # Replaces the trailing `.txt` with `.npz`, keeping the rest of the name.
    out_path = fpath.with_suffix('.npz')
    save_npz(out_path, mat)

2848it [00:00, 28473.10it/s]

Processing ..\output\docs.eval.zarr.fse.binary.128.txt
3823977 rows, 128 bits


3823978it [02:09, 29481.36it/s]


Processing ..\output\docs.eval.zarr.fse.binary.256.txt


3235it [00:00, 16129.87it/s]

3823977 rows, 256 bits


3823978it [03:58, 16017.53it/s]
2683it [00:00, 26823.95it/s]

Processing ..\output\queries.eval.zarr.bert.binary.128.txt
6980 rows, 128 bits


6981it [00:00, 28029.72it/s]
1630it [00:00, 16296.36it/s]

Processing ..\output\queries.eval.zarr.bert.binary.256.txt
6980 rows, 256 bits


6981it [00:00, 15772.49it/s]
3039it [00:00, 30230.02it/s]

Processing ..\output\queries.eval.zarr.fse.binary.128.txt
6980 rows, 128 bits


6981it [00:00, 30018.16it/s]
1633it [00:00, 16326.32it/s]

Processing ..\output\queries.eval.zarr.fse.binary.256.txt
6980 rows, 256 bits


6981it [00:00, 16156.06it/s]
