In [2]:
import nanopq
import numpy as np
import networkx as nx
from collections import defaultdict
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_blobs
import sys
import faiss
import cython

In [3]:
N, D = 1000000, 256

CLUSTERS_NUMBER = 10000  # число групп

np.random.seed(0)
centers = np.random.randint(-2000, 2000, size=(CLUSTERS_NUMBER, D))  # создает K центров D-размерности, расположенных случайно

vectors, y = make_blobs(n_samples=N, n_features=D, centers=centers, random_state=0)
vectors = vectors.astype(np.float32)

vectors_base, train_y = vectors[:N-50000], y[:N-50000]
queries, test_y = vectors[N-50000:], y[N-50000:]

In [4]:
   
def generate_graph(vectors, k_nearest):
    index = faiss.IndexFlatL2(vectors.shape[1])  # длина вектора
    index.add(vectors.astype('float32'))
    _, indices = index.search(vectors.astype('float32'), k_nearest)
    G = nx.Graph()
    for i in tqdm(range(len(vectors)), total=len(vectors)):
        for index in indices[i]:
            if index != i:
                G.add_edge(i, index)
    return G


k = 35    # количество ближайших соседей для связывания вершин


# генерируем граф
G = generate_graph(vectors_base, k)


100%|██████████| 950000/950000 [00:41<00:00, 23043.10it/s]


In [5]:
G_nodes = np.array(list(G.nodes()))
G_edges = {}
for node in G_nodes:
    G_edges[node] = list(G.edges(node))
G_nodes = G_nodes.astype(np.int32)

In [6]:
pq = nanopq.PQ(M=2, Ks=128, verbose=False)

pq.fit(vectors_base)
X_code = pq.encode(vectors_base)

In [7]:
indexes_map = defaultdict(list)

for i in range(len(X_code)):
    indexes_map[(X_code[i][0], X_code[i][1])].append(i)

In [8]:
def find_nearest(query, query_mi, G_edges=G_edges, G_nodes=G_nodes, indexes_map=indexes_map, vectors=vectors_base):

    query_mi_neighbours = indexes_map[(query_mi[0][0], query_mi[0][1])]
    
    if len(query_mi_neighbours) > 0:
        best_node = query_mi_neighbours[0]
        best_dist = np.linalg.norm(vectors[best_node] - query)
        for best_candidate in query_mi_neighbours:
            dist = np.linalg.norm(vectors[best_candidate] - query)
            if dist < best_dist:
                best_node = best_candidate
                best_dist = dist
    else:
        best_node = np.random.choice(G.nodes())
        best_dist = np.linalg.norm(vectors[best_node] - query)
    #return best_dist, best_node
    queue = []
    queue.append(best_node)
    was = set()
    was.add(best_node)
    while len(queue) > 0:
        node = queue.pop(0)
        for edge in G_edges[node]:
            dst = edge[1]
            if dst in was:
                continue
            was.add(dst)
            dist = np.linalg.norm(vectors[dst] - query)
            if dist < best_dist:
                queue.append(dst)
                best_node = dst
                best_dist = dist
    return best_dist, best_node


In [28]:
index = faiss.IndexHNSWFlat(D, 32)
index.hnsw.efConstruction = 40

index.train(vectors_base)
print(index.ntotal)   # 0
index.add(vectors_base)
print(index.ntotal)   # 1000000

0
950000


In [25]:
import numpy as np
from find_nearest import find_nearest


In [10]:

indexes_map = dict(indexes_map)

In [29]:
import time

dist_better_my = 0
dist_better_hnsw = 0
dist_equal = 0

gt_good_my = 0
gt_good_hnsw = 0
gt_good_both = 0

result_my = []
result_hnsw = []

start = time.time()

indexes_map = dict(indexes_map)

for test_number in tqdm(range(len(queries))):
    query = np.array([queries[test_number]])
    query_mi = pq.encode(query).astype(np.int32)
    d1, i1 = find_nearest(np.array(queries[test_number]), query_mi, G_edges, G_nodes, indexes_map, vectors_base)
    result_my.append([d1, i1])

end = time.time()

print("my time:", end-start)
start = time.time()
for test_number in range(len(queries)):
    d2, i2 = index.search(np.array([queries[test_number]]), 1)
    #d2 = np.linalg.norm(vectors_base[i2[0][0]] - queries[test_number])
    result_hnsw.append([d2, i2[0][0]])
end = time.time()
print("hnsw time:", end-start)

for test_number in range(len(queries)):
    d1, i1 = result_my[test_number][0], result_my[test_number][1]
    
    d2, i2 = result_hnsw[test_number][0], result_hnsw[test_number][1]

    #gt_good_my += gt[test_number][0] == i1 and gt[test_number][0] != i2
    #gt_good_hnsw += gt[test_number][0] != i1 and gt[test_number][0] == i2
    #gt_good_both += gt[test_number][0] == i1 and gt[test_number][0] == i2
    
    dist_better_my += d1 < d2
    dist_equal += d1 == d2
    dist_better_hnsw += d1 > d2
print(dist_better_my, dist_equal, dist_better_hnsw)
print(gt_good_my, gt_good_both, gt_good_hnsw)

100%|██████████| 50000/50000 [00:06<00:00, 7418.39it/s]


my time: 6.743005275726318
hnsw time: 13.462644338607788
[[16337]] [[3531]] [[30132]]
0 0 0


In [12]:
quantizer = faiss.IndexFlatL2(D)  # Векторный квантайзер, используемый для кластеризации
index = faiss.IndexIVFFlat(quantizer, D, 10000, faiss.METRIC_L2)

# Тренируем индекс на базе данных
index.train(vectors_base)
print("Тренировка завершена")

# Добавляем данные в индекс
index.add(vectors_base)
print("Количество векторов в индексе:", index.ntotal)

# Устанавливаем параметр поиска
index.nprobe = 10  # Количество кластеров, которые будут проверяться при поиске



Тренировка завершена
Количество векторов в индексе: 950000


In [13]:
import time

dist_better_my = 0
dist_better_hnsw = 0
dist_equal = 0

gt_good_my = 0
gt_good_hnsw = 0
gt_good_both = 0

result_my = []
result_hnsw = []

start = time.time()

indexes_map = dict(indexes_map)

for test_number in tqdm(range(len(queries))):
    query = np.array([queries[test_number]])
    query_mi = pq.encode(query).astype(np.int32)
    d1, i1 = find_nearest(np.array(queries[test_number]), query_mi, G_edges, G_nodes, indexes_map, vectors_base)
    result_my.append([d1, i1])

end = time.time()

print("my time:", end-start)
start = time.time()
for test_number in range(len(queries)):
    d2, i2 = index.search(np.array([queries[test_number]]), 1)
    d2 = np.linalg.norm(vectors_base[i2[0][0]] - queries[test_number])
    result_hnsw.append([d2, i2[0][0]])
end = time.time()
print("hnsw time:", end-start)

for test_number in range(len(queries)):
    d1, i1 = result_my[test_number][0], result_my[test_number][1]
    
    d2, i2 = result_hnsw[test_number][0], result_hnsw[test_number][1]

    #gt_good_my += gt[test_number][0] == i1 and gt[test_number][0] != i2
    #gt_good_hnsw += gt[test_number][0] != i1 and gt[test_number][0] == i2
    #gt_good_both += gt[test_number][0] == i1 and gt[test_number][0] == i2
    
    dist_better_my += d1 < d2
    dist_equal += d1 == d2
    dist_better_hnsw += d1 > d2
print(dist_better_my, dist_equal, dist_better_hnsw)
print(gt_good_my, gt_good_both, gt_good_hnsw)

100%|██████████| 50000/50000 [00:23<00:00, 2095.17it/s]


my time: 23.865410804748535
hnsw time: 41.26941967010498
0 49999 1
0 0 0


In [23]:
nlist = 100  # Количество центроидов для кластеризации
m = 16  # Число подпространств для PQ
bits = 8  # Число бит на подпространство (чаще всего 8 бит)

# Создаем индекс IVFPQ
quantizer = faiss.IndexFlatL2(D)  # Векторный квантайзер, используемый для кластеризации
index = faiss.IndexIVFPQ(quantizer, D, nlist, m, bits)

# Тренируем индекс на базе данных
index.train(vectors_base)
print("Тренировка завершена")

# Добавляем данные в индекс
index.add(vectors_base)
print("Количество векторов в индексе:", index.ntotal)

# Устанавливаем параметр поиска
index.nprobe = 20  # Количество кластеров, которые будут проверяться при поиске



Тренировка завершена
Количество векторов в индексе: 950000


In [24]:
import time

dist_better_my = 0
dist_better_hnsw = 0
dist_equal = 0

gt_good_my = 0
gt_good_hnsw = 0
gt_good_both = 0

result_my = []
result_hnsw = []

start = time.time()

indexes_map = dict(indexes_map)

for test_number in tqdm(range(len(queries))):
    query = np.array([queries[test_number]])
    query_mi = pq.encode(query).astype(np.int32)
    d1, i1 = find_nearest(np.array(queries[test_number]), query_mi, G_edges, G_nodes, indexes_map, vectors_base)
    result_my.append([d1, i1])

end = time.time()

print("my time:", end-start)
start = time.time()
for test_number in range(len(queries)):
    d2, i2 = index.search(np.array([queries[test_number]]), 1)
    d2 = np.linalg.norm(vectors_base[i2[0][0]] - queries[test_number])
    result_hnsw.append([d2, i2[0][0]])
end = time.time()
print("hnsw time:", end-start)

for test_number in range(len(queries)):
    d1, i1 = result_my[test_number][0], result_my[test_number][1]
    
    d2, i2 = result_hnsw[test_number][0], result_hnsw[test_number][1]

    #gt_good_my += gt[test_number][0] == i1 and gt[test_number][0] != i2
    #gt_good_hnsw += gt[test_number][0] != i1 and gt[test_number][0] == i2
    #gt_good_both += gt[test_number][0] == i1 and gt[test_number][0] == i2
    
    dist_better_my += d1 < d2
    dist_equal += d1 == d2
    dist_better_hnsw += d1 > d2
print(dist_better_my, dist_equal, dist_better_hnsw)
print(gt_good_my, gt_good_both, gt_good_hnsw)

100%|██████████| 50000/50000 [00:28<00:00, 1773.40it/s]


my time: 28.19642925262451
hnsw time: 65.74541878700256
49485 515 0
0 0 0
