### Load Dataset & Model (uses a pre-built index)

In [1]:
import torch
import evaluate
import datasets
from ettcl.searching import ColBERTSearcher, SearcherConfig
from ettcl.encoding import ColBERTEncoder
from ettcl.modeling import ColBERTModel, ColBERTTokenizer, ColBERTTokenizerConfig

# --- Experiment configuration (read by the cells below) ---

# Hugging Face dataset identifier and the column holding the class labels.
dataset = "trec"
label_column = "fine_label"

# Checkpoint used to initialise the encoder, and the on-disk index that is searched.
model_path = "bert-base-uncased"
index_path = "../indexes/bert-base-uncased.2bits"

In [2]:
# Tokenizer: built from the same checkpoint name as the model, with query_maxlen=32.
tok_config = ColBERTTokenizerConfig(query_maxlen=32)
tokenizer = ColBERTTokenizer(model_path, tok_config)

# Model weights are loaded from `model_path`.
model = ColBERTModel.from_pretrained(model_path)

# The encoder bundles model + tokenizer; it is handed to the searcher further down.
encoder = ColBERTEncoder(model, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing ColBERTModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing ColBERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ColBERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Load the train and test splits of the configured dataset (train first, then test).
train_dataset, test_dataset = (
    datasets.load_dataset(dataset, split=split_name)
    for split_name in ("train", "test")
)

# Return the label column as PyTorch tensors on both splits.
train_dataset.set_format(type="pt", columns=[label_column])
test_dataset.set_format(type="pt", columns=[label_column])

Found cached dataset trec (/home/IAIS/hiser/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)
Found cached dataset trec (/home/IAIS/hiser/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


#### Evaluation

In [4]:
# Number of nearest neighbours retrieved per test query.
k = 1

# Metric used to score the predicted labels against the test labels.
accuracy_metric = evaluate.load("accuracy")

# Searcher over the index stored at `index_path`, encoding queries with `encoder`.
searcher_config = SearcherConfig(ncells=16)
searcher = ColBERTSearcher(index_path, encoder, searcher_config)

[Jun 17, 14:57:11] #> Loading codec...
[Jun 17, 14:57:11] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Jun 17, 14:57:11] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Jun 17, 14:57:13] #> Loading IVF...
[Jun 17, 14:57:13] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 1503.33it/s]

[Jun 17, 14:57:13] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 79.01it/s]


In [5]:
# Retrieve the top-k index entries (and their scores) for every test question.
match_indices, match_scores = searcher.search(
    test_dataset["text"],
    k=k,
    return_tensors="pt",
)

# Sanity check: per-query result shapes, shown for the query at position 1.
(match_indices[1].shape, match_scores[1].shape)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

torch.Size([500, 32, 768])


(torch.Size([1]), torch.Size([1]))

In [7]:
# kNN classification by majority vote: for each test query, look up the labels of
# its retrieved training examples and take the most frequent one (torch.mode).
#
# The label column is read once up front. Accessing `train_dataset[label_column]`
# inside the comprehension re-reads the whole column for every test query, which
# is loop-invariant work.
train_labels = train_dataset[label_column]

# One 0-dim tensor per test query, in the same order as `match_indices`.
y_pred = [torch.mode(train_labels[indices]).values for indices in match_indices]

# Last expression of the cell: the accuracy dict is displayed as the cell output.
accuracy_metric.compute(predictions=y_pred, references=test_dataset[label_column])

{'accuracy': 0.732}