# Pyserini integration

This notebook uses Pyserini to retrieve the top 1000 documents with BM25 and then re-ranks them with ColBERT.

# Loading indexes of MsMARCO-PT

In [1]:
from pyserini.index import IndexReader
from pyserini.analysis import  get_lucene_analyzer

In [2]:
# Open the MsMARCO-PT index for inspection.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
index_reader = IndexReader("/home/kenzosaki/mestrado/data/pt_msmarco/indexes")

In [3]:
# Display index-level statistics (term and document counts) as the cell output.
index_reader.stats()

{'total_terms': 335050083,
 'documents': 8829003,
 'non_empty_documents': 8829003,
 'unique_terms': 1824588}

# Preparing the BM25 retrieval

In [18]:
from pyserini.search import SimpleSearcher
from typing import List

In [5]:
# Build a searcher over the same index directory that was opened with IndexReader.
# NOTE(review): absolute, machine-specific path duplicated from the IndexReader cell.
searcher = SimpleSearcher('/home/kenzosaki/mestrado/data/pt_msmarco/indexes')
searcher.set_analyzer(get_lucene_analyzer("pt")) # analyzer for Portuguese text

In [22]:
def get_topk_from_query(searcher: SimpleSearcher, query: str, k: int = 1000) -> List[str]:
    """
    Return the contents of the top-k hits for a query.

    Args:
        searcher: Object whose ``search(query, k)`` method yields hits that
            expose a ``contents`` attribute.
        query: Query text forwarded unchanged to ``searcher.search``.
        k: Number of hits requested from the searcher (default 1000).

    Returns:
        The ``contents`` of every hit, in the order the searcher returned them.
    """
    documents: List[str] = []
    for hit in searcher.search(query, k):
        documents.append(hit.contents)
    return documents

In [23]:
# Example query in Portuguese (roughly: "what can a urine test detect").
query = "o que pode detectar o exame de urina"
# Fetch the candidate documents for the query using the default k=1000.
hits = get_topk_from_query(searcher, query)

In [24]:
# Sanity check: inspect the type of the first retrieved item.
type(hits[0])

str

# Loading ColBERT

In [38]:
from colbert.modeling.inference import ModelInference
from colbert.modeling.colbert import ColBERT
from colbert.utils.utils import load_checkpoint
import torch

In [33]:
# Settings expanded as keyword arguments into ColBERT.from_pretrained by
# load_colbert_from_checkpoint; per the original notes these are the values
# used for training.
PARAMETERS = dict(
    model_name="neuralmind/bert-base-portuguese-cased",
    query_maxlen=32,
    doc_maxlen=180,
    dim=128,
    similarity_metric="cosine",
    mask_punctuation=True,
)

In [39]:
def load_colbert_from_checkpoint(path_to_checkpoint: str, device: torch.device):
    """
    Load a ColBERT model from a checkpoint.

    The model is first instantiated with ``ColBERT.from_pretrained`` using the
    module-level ``PARAMETERS``, moved to ``device``, populated via
    ``load_checkpoint`` and finally switched to evaluation mode.

    Args:
        path_to_checkpoint: Path of the checkpoint file handed to ``load_checkpoint``.
        device: Torch device the model is moved to before the checkpoint is loaded.

    Returns:
        A ``(colbert, checkpoint)`` tuple: the model in eval mode and whatever
        ``load_checkpoint`` returned.
    """
    # Parameters used for training. The base model name is read from PARAMETERS
    # instead of being repeated as a literal, so the two cannot drift apart.
    colbert = ColBERT.from_pretrained(PARAMETERS["model_name"], **PARAMETERS)
    colbert.to(device)

    checkpoint = load_checkpoint(path_to_checkpoint, colbert, do_print=True)

    # Inference only: put the model in evaluation mode.
    colbert.eval()

    return colbert, checkpoint

In [51]:
# Torch device selected for this notebook (CPU).
# NOTE(review): presumably meant as the `device` argument of load_colbert_from_checkpoint — confirm.
DEVICE = torch.device("cpu")

In [52]:
# Prepare for inference: load the trained checkpoint onto DEVICE and wrap the
# model in ModelInference.
# The original cell passed a lowercase `device`, which is never defined in this
# notebook (only DEVICE is), so it failed or used stale kernel state.
# NOTE(review): the RuntimeError captured further down (cpu vs cuda:0) suggests
# ColBERT's own device setting must agree with DEVICE — confirm in its params module.
colbert, checkpoint = load_colbert_from_checkpoint("/home/kenzosaki/mestrado/repos/ColBERT/experiments/ColBertimbau/train.py/colbertimbau/checkpoints/colbert-150000.dnn", DEVICE)
inference_model = ModelInference(colbert)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing ColBERT: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ColBERT were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['

[dez 01, 22:49:13] #> Loading checkpoint /home/kenzosaki/mestrado/repos/ColBERT/experiments/ColBertimbau/train.py/colbertimbau/checkpoints/colbert-150000.dnn ..
[dez 01, 22:49:13] #> checkpoint['epoch'] = 0
[dez 01, 22:49:13] #> checkpoint['batch'] = 150000
Using neuralmind/bert-base-portuguese-cased tokenizer as the QueryTokenizer!
Using neuralmind/bert-base-portuguese-cased tokenizer as the DocTokenizer!


In [54]:
# Depends on the DEVICE configured in ColBERT's params.py.
# NOTE(review): the RuntimeError below reports tensors on both cpu and cuda:0, so the
# library's device presumably differs from the one the model was moved to — confirm.
Q = inference_model.queryFromText([query], to_cpu=True)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking arugment for argument index in method wrapper_index_select)

In [None]:
# TODO: continue from colbert/evaluation/slow.py!