In [1]:
from pathlib import Path

import numpy as np
import torch

DATA_DIR = Path("~/data/TREC").expanduser()
DATA_DIR.mkdir(exist_ok=True)

DEVICE_IDS = list(range(torch.cuda.device_count()))
print(f"Devices: {DEVICE_IDS}")

Devices: [0, 1]


In [2]:
from datasets import load_dataset

dataset = load_dataset("trec")

Found cached dataset trec (/home/IAIS/hiser/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
dataset["train"][0].keys()

dict_keys(['text', 'coarse_label', 'fine_label'])

In [4]:
from tqdm.notebook import tqdm

def encode_dataset(dataset, model, tokenizer):
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=512)
    
    index, embeddings = [], []
    for i, batch in enumerate(tqdm(dataloader)):
        input_dict = tokenizer(batch["text"], return_tensors="pt", padding="max_length", truncation=True)
        
        with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.float16):
            embedding = model(**input_dict).cpu().numpy()
        
        attention = input_dict["attention_mask"].bool()
        embedding = embedding[attention]
        indices = np.where(attention)[0] + dataloader.batch_size * i
        
        embeddings.append(embedding)
        index.append(indices)
    
    embeddings = np.concatenate(embeddings)
    index = np.concatenate(index)
    
    return embeddings, index

In [5]:
from transformers import AutoTokenizer
from helpers import ColBERT

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = ColBERT.from_pretrained("sebastian-hofstaetter/colberter-128-32-msmarco")
net = torch.nn.DataParallel(model.cuda(DEVICE_IDS[0]), DEVICE_IDS)

2023-01-12 17:15:10.654535: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-12 17:15:10.812368: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-12 17:15:11.700907: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-12 17:15:11.700996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [6]:
train_embed, train_index = encode_dataset(dataset["train"], net, tokenizer)
test_embed, test_index = encode_dataset(dataset["test"], net, tokenizer)
print(train_embed.shape)
print(test_embed.shape)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

(73442, 32)
(5162, 32)


In [7]:
import faiss
import time

d = train_embed.shape[-1]
n_centroids = 1_000
code_size = 16
n_bits = 8

coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, n_centroids, code_size, 8)
index.nprobe = 10

start = time.time()
index.train(train_embed)
index.add(train_embed)
end = time.time()

print(f"Time {end - start:.2f}sec")

Time 8.02sec


In [8]:
import helpers as h

import importlib
importlib.reload(h)

def candidate_generation(I: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    candidates = np.unique(train_index[I])
    candidate_index = train_index[np.isin(train_index, candidates)]
    candidate_embed = train_embed[candidate_index]

    return h.flattened_to_batched(train_embed[candidate_index], candidate_index, return_att_mask=True)

In [9]:
start = time.time()
_, I = index.search(test_embed, k=50)
end = time.time()

print(f"Time {end - start:.2f}sec")

document_vecs, document_mask = candidate_generation(I)
query_vecs, query_mask = h.flattened_to_batched(test_embed, test_index, padding=document_vecs.shape[1], return_att_mask=True)
# convert to torch tensors
document_vecs, document_mask = torch.from_numpy(document_vecs), torch.from_numpy(document_mask)
query_vecs, query_mask = torch.from_numpy(query_vecs), torch.from_numpy(query_mask)

print(document_vecs.shape)
print(query_vecs.shape)

Time 0.02sec
torch.Size([5452, 41, 32])
torch.Size([500, 41, 32])


In [10]:
# create array that holds all possible index pair combinations
query_ind = np.arange(query_vecs.shape[0])
doc_ind = np.arange(document_vecs.shape[0])
index_pairs = np.array(np.meshgrid(query_ind, doc_ind)).T.reshape(-1, 2)
print(f"#pairs: {index_pairs.shape[0]}")

#pairs: 2726000


In [11]:
scores = model.forward_aggregation(
    query_vecs[index_pairs[:, 0]],
    document_vecs[index_pairs[:, 1]],
    query_mask[index_pairs[:, 0]],
    document_mask[index_pairs[:, 1]]
)
scores = scores.reshape(query_vecs.shape[0], document_vecs.shape[0])
scores.shape

torch.Size([500, 5452])

In [12]:
v, i = scores.max(1)
# real_i = np.unique(train_index[I])[i]
real_i = i

In [13]:
labels = np.array(dataset["train"]["coarse_label"])
y_pred = labels[real_i]
y_pred.shape

(500,)

In [14]:
y_true = np.array(dataset["test"]["coarse_label"])
y_true.shape

(500,)

In [17]:
(y_pred == y_true).sum() / y_true.shape[0]

0.188