In [22]:
#!conda install -y faiss-gpu -c pytorch
from scipy.spatial.distance import cosine
import numpy as np
import os
from sentence_transformers import SentenceTransformer
import psycopg2
from psycopg2 import extras
import time

d=768
HOST = '54.196.150.193'
USER = 'postgres'
PASS = os.environ.get('PGPASS')
PGSSLROOTCERT = os.environ.get('PGSSLROOTCERT')
if PASS == None or PGSSLROOTCERT == None:
    print("Please set PG_PASS and PGSSLROOTCERT env variable")
    raise SystemExit()
DBNAME = 'fever'
POSTGRES_DSN = f'''dbname='fever' user='{USER}' host='{HOST}' password='{PASS}' '''

In [2]:
class ResultHeap:
    """ Combine query results from a sliced dataset """

    def __init__(self, nq, k):
        " nq: number of query vectors, k: number of results per query "
        self.I = np.zeros((nq, k), dtype='int64')
        self.D = np.zeros((nq, k), dtype='float32')
        self.nq, self.k = nq, k
        heaps = faiss.float_maxheap_array_t()
        heaps.k = k
        heaps.nh = nq
        heaps.val = faiss.swig_ptr(self.D)
        heaps.ids = faiss.swig_ptr(self.I)
        heaps.heapify()
        self.heaps = heaps

    def add_batch_result(self, D, I, i0):
        assert D.shape == (self.nq, self.k)
        assert I.shape == (self.nq, self.k)
        I += i0
        self.heaps.addn_with_ids(
            self.k, faiss.swig_ptr(D),
            faiss.swig_ptr(I), self.k)

    def finalize(self):
        self.heaps.reorder()

In [3]:
def search_knn(xq, xb, k, distance_type=faiss.METRIC_L2): 
    """ wrapper around the faiss knn functions without index """
    nq, d = xq.shape
    nb, d2 = xb.shape
    assert d == d2
    
    I = np.empty((nq, k), dtype='int64')
    D = np.empty((nq, k), dtype='float32')
    
    if distance_type == faiss.METRIC_L2: 
        heaps = faiss.float_maxheap_array_t()
        heaps.k = k
        heaps.nh = nq
        heaps.val = faiss.swig_ptr(D)
        heaps.ids = faiss.swig_ptr(I)
        faiss.knn_L2sqr(
            faiss.swig_ptr(xq), faiss.swig_ptr(xb), 
            d, nq, nb, heaps
        )
    elif distance_type == faiss.METRIC_INNER_PRODUCT: 
        heaps = faiss.float_minheap_array_t()
        heaps.k = k
        heaps.nh = nq
        heaps.val = faiss.swig_ptr(D)
        heaps.ids = faiss.swig_ptr(I)
        faiss.knn_inner_product(
            faiss.swig_ptr(xq), faiss.swig_ptr(xb), 
            d, nq, nb, heaps
        )
    return D, I 

NameError: name 'faiss' is not defined

In [8]:
def search_raw_array_pytorch(res, xb, xq, k, D=None, I=None,
                             metric=faiss.METRIC_L2):
    assert xb.device == xq.device

    nq, d = xq.size()
    if xq.is_contiguous():
        xq_row_major = True
    elif xq.t().is_contiguous():
        xq = xq.t()    # I initially wrote xq:t(), Lua is still haunting me :-)
        xq_row_major = False
    else:
        raise TypeError('matrix should be row or column-major')

    xq_ptr = swig_ptr_from_FloatTensor(xq)

    nb, d2 = xb.size()
    assert d2 == d
    if xb.is_contiguous():
        xb_row_major = True
    elif xb.t().is_contiguous():
        xb = xb.t()
        xb_row_major = False
    else:
        raise TypeError('matrix should be row or column-major')
    xb_ptr = swig_ptr_from_FloatTensor(xb)

    if D is None:
        D = torch.empty(nq, k, device=xb.device, dtype=torch.float32)
    else:
        assert D.shape == (nq, k)
        assert D.device == xb.device

    if I is None:
        I = torch.empty(nq, k, device=xb.device, dtype=torch.int64)
    else:
        assert I.shape == (nq, k)
        assert I.device == xb.device

    D_ptr = swig_ptr_from_FloatTensor(D)
    I_ptr = swig_ptr_from_LongTensor(I)

    faiss.bruteForceKnn(res, metric,
                        xb_ptr, xb_row_major, nb,
                        xq_ptr, xq_row_major, nq,
                        d, k, D_ptr, I_ptr)

    return D, I

In [25]:
model = SentenceTransformer('./fever-model/')
conn = psycopg2.connect(POSTGRES_DSN)
cur = conn.cursor()
print('getting from postgres')
cur.execute('''
SELECT id, claim FROM sets.claims 
WHERE is_test_set AND verifiable
''')
res = cur.fetchall()
claim_ids, claims = zip(*res)
query_vectors = model.encode(claims, batch_size=32, show_progress_bar=True)

getting from postgres


In [10]:
def swig_ptr_from_FloatTensor(x):
    assert x.is_contiguous()
    assert x.dtype == torch.float32
    return faiss.cast_integer_to_float_ptr(
        x.storage().data_ptr() + x.storage_offset() * 4)

def swig_ptr_from_LongTensor(x):
    assert x.is_contiguous()
    assert x.dtype == torch.int64, 'dtype=%s' % x.dtype
    return faiss.cast_integer_to_long_ptr(
        x.storage().data_ptr() + x.storage_offset() * 8)

In [39]:
k = 100
import faiss
import torch

res = faiss.StandardGpuResources()
res.setDefaultNullStreamAllDevices()
query_vectors = np.array(query_vectors)
results = ResultHeap(len(query_vectors), k)
start = time.time()
xq_t = torch.from_numpy(query_vectors).cuda()

import glob
for file in glob.glob('./fever-embs/*.npy'):
    arr = np.load(file)
    ids = arr[:, 0].astype(np.int64)
    vectors = np.ascontiguousarray(arr[:, 1:].astype(np.float32))
    xb_t = torch.from_numpy(vectors).cuda()
    D, I = search_raw_array_pytorch(res, xb_t, xq_t, k)
    I = I.cpu().numpy()
    I = np.ascontiguousarray(ids[I])
    results.add_batch_result(D.cpu().numpy(), I , 0)
    print(f'id: {i0} rate: {i0/(time.time() - start)} per second')
results.finalize()

id: 5396096 rate: 3356661.9277707604 per second
id: 5396096 rate: 578878.6751916535 per second
id: 5396096 rate: 334943.72122431686 per second
id: 5396096 rate: 224512.5100982765 per second
id: 5396096 rate: 174543.4830695143 per second
id: 5396096 rate: 139417.42379495435 per second
id: 5396096 rate: 119263.85486368318 per second
id: 5396096 rate: 100591.02669696555 per second
id: 5396096 rate: 88611.10875752025 per second
id: 5396096 rate: 79675.53612302964 per second
id: 5396096 rate: 71978.7402600271 per second
id: 5396096 rate: 66065.22132283906 per second
id: 5396096 rate: 60448.92514482267 per second
id: 5396096 rate: 55759.14861954831 per second
id: 5396096 rate: 52239.31387624846 per second
id: 5396096 rate: 49053.41797575435 per second
id: 5396096 rate: 45982.28167145725 per second
id: 5396096 rate: 43360.889504654304 per second
id: 5396096 rate: 40572.20296014832 per second
id: 5396096 rate: 38418.69396491631 per second
id: 5396096 rate: 36450.79419099415 per second
id: 5396

In [35]:
from psycopg2.extensions import register_adapter, AsIs
psycopg2.extensions.register_adapter(np.int64, psycopg2._psycopg.AsIs)

In [40]:
template = "INSERT INTO test.vector_benchmark (claim_id, article_ids) VALUES %s"
buffer = []
conn = psycopg2.connect(POSTGRES_DSN)
cur = conn.cursor()
for i, (claim_id, res) in enumerate(zip(claim_ids, results.I)):
    buffer.append((claim_id, list(res)))
    if (i+1) % 5000 == 0:
        extras.execute_values(cur, template, buffer)
        conn.commit()
        buffer = []
extras.execute_values(cur,template,buffer)
buffer = []
conn.commit()

In [9]:
index = faiss.index_factory(d, "IDMap,Flat") # no compression/ loss
# index = faiss.index_factory(d, "PCAR64,IVF65536_HNSW32,SQ8")

In [12]:
xb = np.ascontiguousarray(xb)

In [13]:
index.train(xb)
index.add(xb)
arr = None
query = xb[:5]

KeyboardInterrupt: 

In [7]:
k = 4                          # we want to see 4 nearest neighbors
D, I = index.search(query, k) # sanity check
print(I)
print(D)

[[    0  6311 72709 18162]
 [    1 59766 58111 52040]
 [    2 31479 58959  9319]
 [    3 35738 64225 74826]
 [    4 96100 29453 16855]]
[[ 63.39306   64.44453   65.00524   65.72736 ]
 [ 43.355114  43.770733  43.770733  43.770733]
 [146.49068  172.00456  172.46603  175.20573 ]
 [173.89133  185.32256  193.85425  193.94987 ]
 [ 79.9476    83.01735   83.22531   83.37677 ]]
