## FAISS

In [8]:
from pathlib import Path

import h5py
import numpy as np
import torch

# Root directory of the encoded AmazonCat-13K data; "~" is expanded to the
# current user's home directory.
DATA_DIR = Path("~/data/AmazonCat-13K").expanduser()
# parents=True also creates a missing "~/data"; without it mkdir raises
# FileNotFoundError on a machine where the parent directory does not exist yet.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# One id per CUDA device visible to torch.
DEVICE_IDS = list(range(torch.cuda.device_count()))
print(f"Devices: {DEVICE_IDS}")

Devices: [0, 1]


In [9]:
# Open the encoded dataset read-only; the handle is not closed in this cell.
data_file = h5py.File(DATA_DIR / "amazoncat_encoded.hdf5", mode="r")
# Last expression of the cell: displays the names stored under the "train" group.
data_file["train"].keys()

<KeysViewHDF5 ['embedding', 'index', 'label']>

In [3]:
# Read from `data_file`, the HDF5 handle opened in the previous cell. The name
# `f` used here before is not defined in the visible cells and raises NameError
# when the notebook is run top to bottom.
train_embed = data_file["train"]["embedding"]
train_index = data_file["train"]["index"]
train_label = data_file["train"]["label"]

test_embed = data_file["test"]["embedding"]
test_index = data_file["test"]["index"]
test_label = data_file["test"]["label"]

# Printing an h5py dataset shows its shape and dtype (see the cell output).
print(train_embed)
print(test_embed)

<HDF5 dataset "embedding": shape (215469705, 768), type "<f2">
<HDF5 dataset "embedding": shape (55523476, 768), type "<f2">


In [None]:
from mp_helper import load_data

print("loading data...")

# Number of rows requested: the first dimension of the training embeddings.
n = train_embed.shape[0]
# DATA_DIR is the constant defined in the first cell; the lowercase `data_dir`
# used here before is not defined and raises NameError.
collection, runtime = load_data(DATA_DIR / "amazoncat_encoded.hdf5", n)

# NOTE(review): `runtime` is presumably in seconds, matching the label below —
# confirm against mp_helper.load_data.
print(f"Time {runtime:.2f}sec")
print(collection.shape)

loading data...


In [None]:
import sys


def GB(x):
    """Return the size of *x* in GiB (bytes / 1024**3).

    Objects that expose ``nbytes`` (NumPy arrays, memoryviews, ...) report the
    size of their data buffer, which stays correct for views.
    ``sys.getsizeof`` is shallow and, for a view, counts only the small
    wrapper object. Any other object falls back to ``sys.getsizeof``.
    """
    size_in_bytes = getattr(x, "nbytes", None)
    if size_in_bytes is None:
        size_in_bytes = sys.getsizeof(x)
    return size_in_bytes / (1024 * 1024 * 1024)
# Size of the loaded `collection` in GiB as reported by GB (shown as the cell output).
GB(collection)

In [None]:
import faiss
import time
from helpers import flattened_to_batched

# IVF-PQ hyper-parameters.
d = train_embed.shape[-1]  # vector dimensionality (last axis of the embeddings)
n_centroids = 10_000  # number of inverted lists (coarse k-means clusters)
code_size = 16  # number of PQ sub-quantizers per vector
n_bits = 8  # bits used to encode each sub-quantizer index

# Flat L2 index used as the coarse quantizer that assigns vectors to lists.
coarse_quantizer = faiss.IndexFlatL2(d)
# Pass n_bits rather than a duplicated literal so the setting above takes effect.
index = faiss.IndexIVFPQ(coarse_quantizer, d, n_centroids, code_size, n_bits)
index.nprobe = 10  # inverted lists visited per query at search time

# perf_counter is monotonic, so the measured interval is unaffected by
# system clock adjustments during a long training run.
start = time.perf_counter()
index.train(collection)
end = time.perf_counter()

print(f"Time {end - start:.2f}sec")

In [None]:
# `index` is a thin Python wrapper around a C++ object, so measuring the wrapper
# itself says nothing about the index's size. Measure the serialized index
# (a uint8 NumPy array) instead.
GB(faiss.serialize_index(index))