# Pyserini integration

Use Pyserini to retrieve the top 1000 documents per query with BM25 (with RM3 query expansion enabled) and re-rank them with ColBERT.

In [1]:
import os

# Root of the local experiment tree. Set the MESTRADO_ROOT environment variable
# to run the notebook on another machine; the default keeps the original
# absolute locations.
MESTRADO_ROOT = os.environ.get("MESTRADO_ROOT", "/home/kenzosaki/mestrado")

# Index directory opened by IndexReader and SimpleSearcher.
INDEX_PATH = f"{MESTRADO_ROOT}/data/pt_msmarco/indexes"
# Tab-separated queries file (query_id, query).
QUERIES_PATH = f"{MESTRADO_ROOT}/data/pt_msmarco/google_translate/portuguese.queries.dev.small.tsv"
# Tab-separated qrels file; its query ids select which queries are evaluated.
QRELS_PATH = f"{MESTRADO_ROOT}/data/en_msmarco/data/msmarco_ans_small/qrels.dev.small.tsv"
# Earlier checkpoint (ColBertimbau experiment, step 150000), kept for reference:
# CHECKPOINT_PATH = f"{MESTRADO_ROOT}/repos/ColBERT/experiments/ColBertimbau/train.py/colbertimbau/checkpoints/colbert-150000.dnn"
CHECKPOINT_PATH = f"{MESTRADO_ROOT}/repos/ColBERT/experiments/ColBertimbau-linear_scheduler/train.py/colbertimbau-linear_scheduler/checkpoints/colbert-200000.dnn"

In [2]:
# Keyword arguments forwarded to searcher.set_bm25().
BM25_PARAMETERS = dict(k1=1.25, b=0.74)

# Keyword arguments forwarded to searcher.set_rm3().
RM3_PARAMETERS = dict(
    fb_docs=10,
    fb_terms=30,
    original_query_weight=0.4,
)

In [3]:
# Keyword arguments passed to ColBERT.from_pretrained() when the model is
# rebuilt before loading the checkpoint weights.
COLBERT_PARAMETERS = dict(
    model_name="neuralmind/bert-base-portuguese-cased",
    query_maxlen=32,
    doc_maxlen=180,
    dim=128,
    similarity_metric="cosine",
    mask_punctuation=True,
)

In [4]:
# Number of hits requested from the searcher for every query.
TOPK = 1_000

# Loading the MS MARCO-PT index

In [5]:
from pyserini.index import IndexReader
from pyserini.analysis import  get_lucene_analyzer

In [6]:
# Open the index at INDEX_PATH; used below for statistics and query analysis.
index_reader = IndexReader(INDEX_PATH)

In [7]:
# Show the index statistics (term and document counts).
index_reader.stats()

{'total_terms': 335050083,
 'documents': 8829003,
 'non_empty_documents': 8829003,
 'unique_terms': 1824588}

# Preparing the BM25 retrieval

In [8]:
from pyserini.search import SimpleSearcher
from typing import List

In [9]:
# Searcher over the same index as index_reader, using the "pt" analyzer and the
# BM25 / RM3 settings defined in the configuration cells.
searcher = SimpleSearcher(INDEX_PATH)
analyser = get_lucene_analyzer("pt")
searcher.set_analyzer(analyser) # for Portuguese text
searcher.set_bm25(**BM25_PARAMETERS)
# NOTE(review): with RM3 enabled the retrieved lists are BM25+RM3, not plain BM25.
searcher.set_rm3(**RM3_PARAMETERS)

In [10]:
# Smoke test: run one sample query through the configured searcher.
query = "o que pode detectar o exame de urina"
hits = searcher.search(query, TOPK)

In [11]:
# Show the tokens the Portuguese analyzer produces for the sample query.
index_reader.analyze(query, analyser)

['pode', 'detectar', 'exam', 'urin']

In [12]:
# TODO: more stopwords could be added here, e.g. from NLTK?
# Display the analyzer's current stopword set as a string.
analyser.stopwordSet.toString()

'[tua, tenho, tínhamos, éramos, aos, isso, fora, minhas, seja, são, terá, hei, como, houver, tu, estiver, vocês, suas, até, estejam, tém, elas, tivéramos, está, tem, seríamos, for, terei, nas, esta, para, teu, isto, pela, estivéramos, hajamos, ao, hão, era, as, estivesse, estejamos, aquela, meus, temos, pelo, aquele, tivera, pelos, me, houveria, sua, estamos, estas, fôssemos, tiver, aquilo, dos, formos, estava, estiveram, sejamos, tivessem, qual, fosse, fossem, se, por, estivessem, houverei, tenham, aqueles, sem, estávamos, entre, mas, você, a, seu, e, houveram, será, houvesse, lhe, depois, houvéssemos, houvera, não, serei, o, seus, te, tivesse, tuas, houvemos, esteja, essas, seria, há, fui, tenhamos, essa, delas, houveríamos, esse, de, estivéssemos, teria, com, nossos, num, pelas, estivemos, do, esteve, da, no, estive, às, teriam, eles, das, houvessem, um, sou, ela, numa, quando, tiveram, ele, este, nossa, houveremos, teríamos, tivéssemos, estavam, em, havemos, tivemos, nosso, tinha, 

# Loading queries

In [13]:
import pandas as pd

# Both inputs are header-less, tab-separated files, so column names are given
# explicitly. Only "query_id" from the qrels is used by the next cell; "x" and
# "y" are placeholder names for the other two columns.
queries_df = pd.read_csv(
    QUERIES_PATH,
    sep="\t",
    header=None,
    names=["query_id", "query"],
)
qrels_df = pd.read_csv(
    QRELS_PATH,
    sep="\t",
    header=None,
    names=["query_id", "x", "pid", "y"],
)

In [14]:
# Keep only the queries that appear in the qrels, indexed by query_id.
# The set_index step is guarded so that re-running this cell does not raise a
# KeyError once "query_id" has already been moved into the index.
if "query_id" in queries_df.columns:
    queries_df = queries_df.set_index("query_id")
queries_df = queries_df.loc[qrels_df["query_id"].unique()]

In [15]:
# Sanity check: number of selected queries and absence of nulls.
queries_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105 entries, 352818 to 188714
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   query   105 non-null    object
dtypes: object(1)
memory usage: 1.6+ KB


In [16]:
# Peek at the first few selected queries.
queries_df.head()

Unnamed: 0_level_0,query
query_id,Unnamed: 1_level_1
352818,como cozinhar vagem
1089760,a lesão ligamentar grave mais comum do joelho ...
1089312,tipos de fontes java
1087904,com que idade as toupeiras aparecem
1087589,quais são os gases usados ​​em sinais de néon?


# BM25 retrieval

In [17]:
from tqdm import tqdm

def run_bm25(queries_df: pd.DataFrame, searcher: SimpleSearcher, output_path: str, k: int = 1000) -> None:
    """
    Run first-stage retrieval for every query and write a run file.

    Args:
        queries_df: frame indexed by query id with a "query" text column.
        searcher: configured searcher; its settings decide the ranking model.
        output_path: destination of the run file. Missing parent directories
            are created.
        k: maximum number of hits requested per query.

    Each output line is ``query_id<TAB>docid<TAB>rank``; rank starts at 1 and
    follows the order in which the searcher returned the hits.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Create the run directory up front so a fresh checkout does not fail on open().
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w") as f:
        for query_id, row in tqdm(queries_df.iterrows(), total=len(queries_df), desc="- Running BM25"):
            hits = searcher.search(row["query"], k)
            f.writelines(
                "{}\t{}\t{}\n".format(query_id, hit.docid, rank)
                for rank, hit in enumerate(hits, start=1)
            )

In [18]:
# Write the first-stage run (TOPK hits per query) to data/runs/bm25_run.tsv.
run_bm25(queries_df, searcher, "data/runs/bm25_run.tsv", k=TOPK)

- Running BM25: 100%|██████████| 105/105 [00:18<00:00,  5.72it/s]


# Loading ColBERT

In [19]:
from colbert.modeling.inference import ModelInference
from colbert.modeling.colbert import ColBERT
from colbert.utils.utils import load_checkpoint
import torch

In [20]:
def load_colbert_from_checkpoint(path_to_checkpoint: str, device: torch.device):
    """
    Load the ColBERT model from a checkpoint.

    Args:
        path_to_checkpoint: path of the checkpoint file to load.
        device: device the model is moved to before the weights are loaded.

    Returns:
        A ``(colbert, checkpoint)`` tuple: the model switched to eval mode and
        the value returned by ``load_checkpoint``.
    """
    # Parameters used for training. The backbone name is read from
    # COLBERT_PARAMETERS so it cannot drift from the "model_name" entry that is
    # forwarded in the keyword arguments.
    colbert = ColBERT.from_pretrained(COLBERT_PARAMETERS["model_name"], **COLBERT_PARAMETERS)
    colbert.to(device)

    checkpoint = load_checkpoint(path_to_checkpoint, colbert, do_print=True)

    # Inference only: switch the model to evaluation mode.
    colbert.eval()

    return colbert, checkpoint

In [21]:
# The notebook is hard-wired to CUDA; no CPU fallback is configured here.
device = torch.device("cuda")

In [22]:
# Prepare for inference: rebuild the model, load the checkpoint and wrap it.
colbert, checkpoint = load_colbert_from_checkpoint(CHECKPOINT_PATH, device)
# No extra colbert.to(device) is needed: the loader already moves the model.
inference_model = ModelInference(colbert)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing ColBERT: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ColBERT were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['

[dez 20, 16:51:37] #> Loading checkpoint /home/kenzosaki/mestrado/repos/ColBERT/experiments/ColBertimbau-linear_scheduler/train.py/colbertimbau-linear_scheduler/checkpoints/colbert-200000.dnn ..
[dez 20, 16:51:38] #> checkpoint['epoch'] = 0
[dez 20, 16:51:38] #> checkpoint['batch'] = 200000
Using neuralmind/bert-base-portuguese-cased tokenizer as the QueryTokenizer!
Using neuralmind/bert-base-portuguese-cased tokenizer as the DocTokenizer!


# Evaluating ColBERT

In [23]:
import numpy as np
import torch

def get_docs_and_pids_from_hits(hits):
    """
    Split search hits into two parallel lists.

    Returns:
        ``(docs, pids)`` where ``docs[i]`` is ``hits[i].contents`` and
        ``pids[i]`` is ``hits[i].docid``.
    """
    docs, pids = [], []
    for hit in hits:
        docs.append(hit.contents)
        pids.append(hit.docid)
    return docs, pids

def get_scores(inference_model, query, docs, bs):
    """
    Score every document in ``docs`` against ``query`` with the ColBERT model.

    Args:
        inference_model: object exposing ``queryFromText``, ``docFromText``
            and a ``colbert.score`` method.
        query: query text.
        docs: list of document texts.
        bs: batch size forwarded to ``docFromText`` as ``bsize``.

    Returns:
        A CPU tensor of scores as returned by ``colbert.score``; an empty
        tensor when ``docs`` is empty.
    """
    if len(docs) == 0:
        # There is nothing to encode; return "no scores" so callers can still
        # argsort and iterate over the result.
        return torch.empty(0)

    # No gradients are needed for inference.
    with torch.no_grad():
        # queryFromText receives a one-element batch; drop that batch dimension.
        query_embedding = inference_model.queryFromText([query]).squeeze(0)
        doc_embeddings = inference_model.docFromText(docs, bsize=bs)
        return inference_model.colbert.score(query_embedding, doc_embeddings).cpu()


def run_colbert(queries_df: pd.DataFrame,
                searcher: SimpleSearcher,
                inference_model: ModelInference, 
                output_path: str, 
                bs: int = 32,
                k: int = 1000) -> None:
    """
    Retrieve candidates per query, re-rank them with ColBERT and write a run file.

    Args:
        queries_df: frame indexed by query id with a "query" text column.
        searcher: configured first-stage searcher.
        inference_model: ColBERT inference wrapper used for scoring.
        output_path: destination of the run file. Missing parent directories
            are created.
        bs: batch size used when encoding the candidate documents.
        k: maximum number of candidates requested per query.

    Each output line is ``query_id<TAB>pid<TAB>rank``; rank starts at 1 and
    follows descending ColBERT score. Queries with no candidates write nothing.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Create the run directory up front so a fresh checkout does not fail on open().
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w") as f:
        for query_id, row in tqdm(queries_df.iterrows(), total=len(queries_df), desc="- Running ColBERT"):
            # First-stage retrieval
            query = row["query"]
            hits = searcher.search(query, k)
            if len(hits) == 0:
                # Nothing was retrieved, so there is nothing to encode or re-rank.
                continue

            docs, pids = get_docs_and_pids_from_hits(hits)
            scores = get_scores(inference_model, query, docs, bs)

            # Convert to plain ints so the pid list is not indexed with 0-d tensors.
            ranking = torch.argsort(scores, descending=True).tolist()

            for rank, index in enumerate(ranking, start=1):
                f.write("{}\t{}\t{}\n".format(query_id, pids[index], rank))

In [24]:
# File name of the checkpoint (text after the last "/"), used to name the run file.
checkpoint_name = CHECKPOINT_PATH.rsplit("/", 1)[-1]
checkpoint_name

'colbert-200000.dnn'

In [25]:
# Re-rank TOPK candidates per query; the run file is named after the checkpoint.
# NOTE(review): the "linear_scheduler" suffix is hard-coded; keep it in sync with CHECKPOINT_PATH.
run_colbert(queries_df, searcher, inference_model, f"data/runs/{checkpoint_name}.linear_scheduler.run.tsv", bs=512, k=TOPK)

- Running ColBERT: 100%|██████████| 105/105 [06:49<00:00,  3.90s/it]
